In [29]:
import argparse
import pandas as pd
import sqlalchemy
import numpy as np
from numpy import mean
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform


import xgboost as xgb 
import lightgbm as lgb
from sklearn.externals import joblib
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,\
                             AdaBoostClassifier, GradientBoostingClassifier,\
                             GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor

In [30]:
def postgres_connector(host, port, database, user, password=None):
    """Create a SQLAlchemy engine for a PostgreSQL database.

    Args:
        host: database server host name or address.
        port: TCP port of the server; formatted with ``%d``, so it must be
            an integer.
        database: name of the database to connect to.
        user: login role.
        password: optional password; left out of the URL when ``None``.

    Returns:
        The engine produced by ``sqlalchemy.create_engine``.
    """
    # Local import: only this function needs it, and it is standard library.
    from urllib.parse import quote

    # Percent-encode the credentials so that characters such as '@', ':' or
    # '/' cannot be mistaken for URL delimiters.
    user_info = quote(user, safe='')
    if password is not None:
        user_info += ':' + quote(password, safe='')

    # 'postgresql' is the dialect name; the legacy 'postgres' alias is no
    # longer accepted by SQLAlchemy 1.4 and later.
    url = 'postgresql://%s@%s:%d/%s' % (user_info, host, port, database)
    return sqlalchemy.create_engine(url, client_encoding='utf-8')

def read_data(table, con=None):
    """Load every row of a database table into a DataFrame.

    Args:
        table: name of the table to read. It is interpolated into the SQL
            text as-is, so it must come from trusted code, never from user
            input.
        con: connection or engine handed to ``pandas.read_sql``. When
            omitted, the module-level ``engine`` is used, which keeps
            existing ``read_data(table)`` calls working.

    Returns:
        DataFrame holding the result of ``select * from <table>``.
    """
    if con is None:
        # Backward-compatible fallback to the global created by the script.
        con = engine
    query = 'select * from {}'.format(table)
    df = pd.read_sql(query, con)
    print('{} table successfully loaded!'.format(table))
    return df

def little_merge(post_table, t2, table_name, n_hours=10):
    """Build per-post hourly activity features from one event table.

    ``post_table`` is left-joined with ``t2`` on ``post_key``. Both frames
    carry a ``created_at_hour`` column, so after the merge the post's value
    is ``created_at_hour_x`` and the event's value is ``created_at_hour_y``.
    Events are kept when they happened at least 0 and fewer than ``n_hours``
    hours after the post was created, then pivoted into one column per hour
    offset.

    Args:
        post_table: posts frame with ``post_key`` and ``created_at_hour``.
        t2: event frame with ``post_key``, ``created_at_hour`` and ``count``.
        table_name: label used in the output column names and the log line.
        n_hours: size of the observation window in hours (default 10).

    Returns:
        DataFrame indexed by ``post_key`` with exactly ``n_hours`` columns
        named ``'<h> hour after <table_name>'`` for h = 0 .. n_hours-1.
        Offsets with no data are filled with 0. Posts whose events all fall
        outside the window are absent from the result.
    """
    merged = post_table.merge(t2, on='post_key', how='left')

    # A post without any event has no right-hand row: treat it as a single
    # zero-count event at creation time so the post is still represented.
    merged['created_at_hour_y'] = merged['created_at_hour_y'].fillna(merged['created_at_hour_x'])
    merged['count'] = merged['count'].fillna(0)

    # Hours elapsed between post creation and the event, computed once.
    elapsed = (merged['created_at_hour_y']
               - merged['created_at_hour_x']).dt.total_seconds() / 3600
    in_window = (elapsed >= 0) & (elapsed < n_hours)
    events = merged.loc[in_window].assign(time_after_create=elapsed[in_window])

    # One row per post, one column per hour offset. pivot_table's default
    # aggregation (mean) is kept as in the original code.
    hourly = pd.pivot_table(events, values='count', index='post_key',
                            columns='time_after_create')
    hourly = hourly.fillna(0)

    # Force a fixed 0 .. n_hours-1 column grid so the schema does not depend
    # on which offsets happen to occur in the data, then rename by value
    # rather than by position. Offsets that are not whole hours are not part
    # of the grid and are therefore not kept.
    offsets = [float(h) for h in range(n_hours)]
    hourly = hourly.reindex(columns=offsets, fill_value=0)
    hourly = hourly.rename(columns={
        float(h): '{} hour after {}'.format(h, table_name) for h in range(n_hours)
    })

    print('posts table and {} table successfully merge!'.format(table_name))
    return hourly

def post_table_feature_engineer(table, trending_threshold=1000):
    """Derive the categorical features and the label from the posts table.

    The input frame is not modified; a new frame is returned.

    Args:
        table: posts frame with a datetime ``created_at_hour`` column and a
            numeric ``like_count_36_hour`` column.
        trending_threshold: a post is labelled trending (1) when its
            ``like_count_36_hour`` is strictly greater than this value,
            otherwise 0. Defaults to 1000.

    Returns:
        Copy of ``table`` with ``created_weekday``, ``created_time`` and
        ``is_trending`` appended (in that order) and with
        ``created_at_hour`` and ``like_count_36_hour`` removed.
    """
    created = table['created_at_hour']

    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original columns and re-running the call gives the same result.
    engineered = table.assign(
        created_weekday=created.dt.weekday,
        created_time=created.dt.hour,
        is_trending=(table['like_count_36_hour'] > trending_threshold).astype(int),
    )

    # Drop the raw columns that must not be used as training features.
    engineered = engineered.drop(['created_at_hour', 'like_count_36_hour'], axis=1)

    print('posts table finish feature engineering!')
    return engineered

def big_merge(t1, t2, t3, t4, post_table):
    """Combine the four feature tables with the posts table.

    ``t1`` .. ``t4`` are outer-joined to each other on ``post_key``; the
    result is then left-joined onto ``post_table`` so that every post is
    kept. Missing values are replaced by 0 and every column except the first
    one is cast to ``int``.

    Returns:
        The merged DataFrame.
    """
    # Chain the outer joins: t1 <-> t2 <-> t3 <-> t4.
    features = t1
    for other in (t2, t3, t4):
        features = features.merge(other, on='post_key', how='outer')

    # Keep every post; posts without feature rows get zeros.
    combined = post_table.merge(features, on='post_key', how='left').fillna(0)

    # Everything after the first column is converted from float to int.
    value_columns = list(combined.columns[1:])
    combined[value_columns] = combined[value_columns].astype(int)

    print('all table merged successfully!')
    return combined

def create_dummy(data, dummy_features):
    """One-hot encode the listed columns of ``data``.

    Args:
        data: DataFrame to encode.
        dummy_features: names of the columns to replace by indicator columns.

    Returns:
        The DataFrame produced by ``pandas.get_dummies`` with
        ``drop_first=True``, i.e. the first level of every encoded column
        has no indicator column of its own.
    """
    encoded = pd.get_dummies(data, drop_first=True, columns=dummy_features)
    print('dummy variable created successfully!')
    return encoded

def train_test(train, test):
    """Split the train and test frames into feature matrices and labels.

    For each frame the features are all columns from the third position
    onward (positional slice ``[:, 2:]``) and the label is the
    ``is_trending`` column.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    def _features_and_label(frame):
        # Positional slice: the first two columns are not used as features.
        return frame.iloc[:, 2:], frame['is_trending']

    x_train, y_train = _features_and_label(train)
    x_test, y_test = _features_and_label(test)
    print('training and testing data successfully generate!')
    return x_train, y_train, x_test, y_test

def lgb_hypertuned_parameters_model(x, y, random_state=0):
    """Tune a LightGBM binary classifier with a randomized search.

    Args:
        x: training feature matrix.
        y: binary training labels.
        random_state: seed used for both the classifier and the parameter
            sampling of the search, so repeated runs draw the same 30
            candidates. Defaults to 0, the seed the classifier already used.

    Returns:
        The fitted ``RandomizedSearchCV`` object (30 sampled candidates,
        5-fold cross-validation, scored by F1).
    """
    # Initiate the Light-GBM Classifier.
    # subsample_freq=1 turns bagging on at every iteration; with the default
    # of 0 bagging is disabled and the 'subsample' values sampled below
    # would have no effect on the model.
    LGB_clf = lgb.LGBMClassifier(objective='binary', n_estimators=150, max_depth=-1,
                                 subsample_freq=1, random_state=random_state)

    # Construct ranges for each parameter
    param_grid = {'num_leaves': sp_randint(6, 50),
                  'min_child_samples': sp_randint(100, 500),
                  'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
                  'subsample': sp_uniform(loc=0.2, scale=0.8),
                  'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
                  'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
                  'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

    # Do the randomized grid searching 30 times
    # Find the optimal combination of parameters such that the highest f1 score attained
    grid_obj = RandomizedSearchCV(LGB_clf, param_distributions=param_grid, cv=5,
                                  scoring='f1', n_iter=30, random_state=random_state)
    grid_obj.fit(x, y)
    print('model finish training!')
    return grid_obj

def save_model(model, filepath=None):
    """Serialize a fitted model to disk with joblib.

    Args:
        model: the object to persist. This argument is what gets written;
            the function no longer reaches for the global ``grid_obj``.
        filepath: destination path. When omitted, the path parsed from the
            command line (``args.save_filepath``) is used, which keeps
            existing ``save_model(model)`` calls working.
    """
    if filepath is None:
        # Backward-compatible fallback to the script's parsed arguments.
        filepath = args.save_filepath
    # NOTE(review): `joblib` is the name imported at the top of the notebook
    # from `sklearn.externals`; recent scikit-learn releases no longer ship
    # that module -- confirm the import works with the installed version.
    joblib.dump(model, filepath)
    print('model successfully saved!')

In [None]:
if __name__ == '__main__':
    # Local import: only this entry point needs the environment lookup.
    import os

    parser = argparse.ArgumentParser(description='Train a Light-gbm classifier for Dcard posts')
    parser.add_argument('host', help='input database host')
    parser.add_argument('save_filepath', help='filepath where model saved')
    args = parser.parse_args()

    # Access the database. The password is read from the DCARD_DB_PASSWORD
    # environment variable when it is set.
    # NOTE(review): the literal below is kept only as a fallback so existing
    # runs keep working; credentials should not live in source control.
    engine = postgres_connector(
        args.host,
        5432,
        "intern_task",
        "candidate",
        os.environ.get('DCARD_DB_PASSWORD', "dcard-data-intern-2020"),
    )

    # (table name prefix, label used in the generated feature names)
    activity_sources = [
        ('post_shared', 'share'),
        ('post_comment_created', 'comment'),
        ('post_liked', 'like'),
        ('post_collected', 'collect'),
    ]

    def _build_dataset(split):
        """Load the '<name>_<split>' tables and return the encoded frame."""
        # Load the posts table and the four activity tables.
        posts = read_data('posts_{}'.format(split))
        activity = [(label, read_data('{}_{}'.format(prefix, split)))
                    for prefix, label in activity_sources]

        # Join each activity table with the posts table and pivot to hourly
        # features. This happens before the posts table is feature
        # engineered, as in the original flow.
        share, comment, like, collect = [
            little_merge(posts, events, label) for label, events in activity
        ]

        # Target table feature engineering.
        posts_features = post_table_feature_engineer(posts)

        # Merge all tables.
        merged = big_merge(share, comment, like, collect, posts_features)

        # Deal with categorical features.
        return create_dummy(merged, ['created_weekday', 'created_time'])

    # Same pipeline for both splits instead of two copy-pasted sections.
    df_train = _build_dataset('train')
    df_test = _build_dataset('test')

    # The two splits are one-hot encoded independently, so a category that
    # is missing from one of them would give different columns. Align the
    # test frame to the training columns (absent indicators become 0).
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

    # Generate train and test data.
    x_train, y_train, x_test, y_test = train_test(df_train, df_test)

    # Fit model.
    grid_obj = lgb_hypertuned_parameters_model(x_train, y_train)

    # Save model.
    save_model(grid_obj)