Data Processing and feature engineering

In [0]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
import scipy.stats as st
import gc
import xgboost as xgb
import warnings
from sklearn import ensemble, tree
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
warnings.filterwarnings('ignore')
%matplotlib inline

In [0]:
#Defining the machine learning models
def Train_Decision_tree(X_training, y_training, X_valid, y_valid):
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_training, y_training)
    pred_dt = dt_clf.predict(X_valid)
    Decision_tree_accuracy = roc_auc_score(y_valid, pred_dt)
    print('Decision_tree_AUC_ROC=\n', Decision_tree_accuracy)
    Trained_Decision_tree_Model = dt_clf

    return Trained_Decision_tree_Model, Decision_tree_accuracy
    
def Train_Random_forest(X_training, y_training, X_valid, y_valid):
    rf_clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    rf_model = rf_clf.fit(X_training, y_training)
    pred_rf = rf_clf.predict(X_valid)
    Random_forest_accuracy = roc_auc_score(y_valid, pred_rf)
    print('Random_forest_AUC_ROC=\n', Random_forest_accuracy)
    Trained_Random_forest_Model = rf_clf
    return Trained_Random_forest_Model, Random_forest_accuracy
    
def Train_logistic_regression(X_training, y_training, X_valid, y_valid):
    logreg_clf = LogisticRegression()
    logreg_clf.fit(X_training, y_training)
    pred_logreg = logreg_clf.predict(X_valid)
    logistic_regression_accuracy = roc_auc_score(y_valid, pred_logreg)
    print('logistic_regression_AUC_ROC=\n', logistic_regression_accuracy)
    Trained_logistic_regression_Model = logreg_clf
    return Trained_logistic_regression_Model, logistic_regression_accuracy
    
def Train_support_vector_machine(X_training, y_training, X_valid, y_valid):
    linsvc_clf = LinearSVC()
    linsvc_clf.fit(X_training, y_training)
    pred_linsvc = linsvc_clf.predict(X_valid)
    support_vector_machine_accuracy = roc_auc_score(y_valid, pred_linsvc)
    print('support_vector_AUC_ROC=\n', support_vector_machine_accuracy)
    Trained_support_vector_machine_Model = linsvc_clf
    return Trained_support_vector_machine_Model, support_vector_machine_accuracy
    
def Train_XGBoost(X_training, y_training, X_valid, y_valid):

    XGB_model = XGBClassifier(learning_rate =0.1,
                              n_estimators=1000,
                              max_depth=15,
                              min_child_weight=1,
                              gamma=0,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              objective= 'binary:logistic',
                              nthread=4,
                              scale_pos_weight=1,
                              seed=27)
     
    XGB_model.fit(X_training, y_training)
    pred_XGB = XGB_model.predict(X_valid)
    pred_XGB = [round(value) for value in pred_XGB]
    XGBoost_accuracy = roc_auc_score(y_valid, pred_XGB)
    print('XGBoost_AUC_ROC=\n', XGBoost_accuracy)
    Trained_XGBoost_Model = XGB_model
    return Trained_XGBoost_Model, XGBoost_accuracy

In [0]:
df = pd.read_csv('/content/gdrive/My Drive/Big_data_project/train/mnt/ssd/kaggle-talkingdata2/competition_files/train.csv', parse_dates=['click_time'])
print(df.shape)
df.dtypes

In [0]:
def upsample(df): 
    y=df['is_attributed']
    X=df.drop('is_attributed',axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
    X = pd.concat([X_train, y_train], axis=1)
    download=X[X['is_attributed']==1]
    not_download=X[X['is_attributed']==0]
    download_upsampled = resample(download, replace=True,  n_samples=len(not_download), random_state=27) 
    # combine majority and upsampled minority
    upsampled = pd.concat([not_download, download_upsampled])
    # check new class counts
    print(upsampled.is_attributed.value_counts())
    y_train=upsampled['is_attributed']
    X_train=upsampled.drop('is_attributed',axis=1)
    
    return X_train,X_test,y_train, y_test

def downsample(df):
    y=df['is_attributed']
    X=df.drop('is_attributed',axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
    X = pd.concat([X_train, y_train], axis=1)
    download=X[X['is_attributed']==1]
    not_download=X[X['is_attributed']==0]
    not_download_downsampled = resample(not_download,replace = False,  n_samples = len(download), random_state = 27) 
    # combine minority and downsampled majority
    downsampled = pd.concat([not_download_downsampled, download])
    # checking counts
    print(downsampled.is_attributed.value_counts())
    y_train=downsampled['is_attributed']
    X_train=downsampled.drop('is_attributed',axis=1)

    return X_train,X_test,y_train, y_test

from imblearn.over_sampling import SMOTE

# Separate input features and target
def SMOTE_data(df):
    y=df['is_attributed']
    X=df.drop('is_attributed',axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
    col_names = ['attributed_time','click_time', 'ip']
    X_train = X_train.drop(col_names, axis=1)
    X_test = X_test.drop(col_names, axis=1)
    sm = SMOTE(random_state=27, ratio=1.0)
    X_train, y_train = sm.fit_sample(X_train, y_train)

    return X_train,X_test,y_train, y_test

In [0]:
#Upsample
X_train,X_test,y_train, y_test=upsample(df)
col_names = ['attributed_time','click_time', 'ip']
X_train = X_train.drop(col_names, axis=1)
X_test = X_test.drop(col_names, axis=1)
Trained_Decision_tree_Model, Decision_tree_accuracy = Train_Decision_tree(X_train, y_train, X_test, y_test)
Trained_support_vector_machine_Model, support_vector_machine_accuracy = Train_support_vector_machine(X_train, y_train, X_test, y_test)
Trained_Random_forest_Model, Random_forest_accuracy = Train_Random_forest(X_train, y_train, X_test, y_test)
Trained_logistic_regression_Model, logistic_regression_accuracy = Train_logistic_regression(X_train, y_train, X_test, y_test)
Trained_XGBoost_Model, XGBoost_accuracy = Train_XGBoost(X_train, y_train, X_test, y_test)

1    74815
0    74815
Name: is_attributed, dtype: int64
Decision_tree_AUC_ROC=
 0.7881030607611264
support_vector_AUC_ROC=
 0.6874949915858644
Random_forest_AUC_ROC=
 0.811952651883736
logistic_regression_AUC_ROC=
 0.6921418459201936
XGBoost_AUC_ROC=
 0.811972685540279


In [0]:
#Downsample
X_train,X_test,y_train, y_test=downsample(df)
col_names = ['attributed_time','click_time', 'ip']
X_train = X_train.drop(col_names, axis=1)
X_test = X_test.drop(col_names, axis=1)
Trained_Decision_tree_Model, Decision_tree_accuracy = Train_Decision_tree(X_train, y_train, X_test, y_test)
Trained_support_vector_machine_Model, support_vector_machine_accuracy = Train_support_vector_machine(X_train, y_train, X_test, y_test)
Trained_Random_forest_Model, Random_forest_accuracy = Train_Random_forest(X_train, y_train, X_test, y_test)
Trained_logistic_regression_Model, logistic_regression_accuracy = Train_logistic_regression(X_train, y_train, X_test, y_test)
Trained_XGBoost_Model, XGBoost_accuracy = Train_XGBoost(X_train, y_train, X_test, y_test)

Decision_tree_AUC_ROC=
 0.8822831881370227
support_vector_AUC_ROC=
 0.5263347185175857
Random_forest_AUC_ROC=
 0.9087476484303153
logistic_regression_AUC_ROC=
 0.6899581773570075
XGBoost_AUC_ROC=
 0.9127190823440523


In [0]:
#Not used in final because of performance
def feature_engineering(df1):

  df1['day'] = df1['click_time'].dt.day.astype('uint8')
  df1['hour'] = df1['click_time'].dt.hour.astype('uint8')
  df1['minute'] = df1['click_time'].dt.minute.astype('uint8')
  df1['second'] = df1['click_time'].dt.second.astype('uint8')

  print('grouping by ip-channel combination')
  # Count the number of clicks by ip
  gp = df1[['ip','channel']].groupby(['ip'])[['channel']].count().reset_index().rename(index=str, columns={'channel':'ip_clickcount'})
  df1 = df1.merge(gp, on='ip', how='left', sort=False)
  del gp

  print('grouping by ip-day-hour combination')
  gp = df1[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_tcount'})
  df1 = df1.merge(gp, on=['ip','day','hour'], how='left',sort=False)
  del gp

  print('grouping by ip-app combination')
  gp = df1[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
  df1 = df1.merge(gp, on=['ip','app'], how='left',sort=False)
  del gp

  print('grouping by ip-app-os combination')
  gp = df1[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
  df1 = df1.merge(gp, on=['ip','app', 'os'], how='left',sort=False)
  del gp

  colmn_names = ['attributed_time','click_time', 'ip']
  df1 = df1.drop(colmn_names, axis=1)

  return df1

In [0]:
X_train,X_test,y_train, y_test=downsample(df)
X_train=feature_engineering(X_train)
X_test=feature_engineering(X_test)

1    185
0    185
Name: is_attributed, dtype: int64
grouping by ip-channel combination...
grouping by ip-day-hour combination...
grouping by ip-app combination...
grouping by ip-app-os combination...
grouping by ip-app-chl-mean-hour  combination...
vars and data type: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 370 entries, 0 to 369
Data columns (total 16 columns):
ip                          370 non-null int64
app                         370 non-null int64
device                      370 non-null int64
os                          370 non-null int64
channel                     370 non-null int64
click_time                  370 non-null datetime64[ns]
attributed_time             185 non-null object
day                         370 non-null uint8
hour                        370 non-null uint8
minute                      370 non-null uint8
second                      370 non-null uint8
ip_clickcount               370 non-null int64
ip_tcount                   370 non-null int64
ip_a

In [0]:
Trained_Decision_tree_Model, Decision_tree_accuracy = Train_Decision_tree(X_train, y_train, X_test, y_test)
Trained_support_vector_machine_Model, support_vector_machine_accuracy = Train_support_vector_machine(X_train, y_train, X_test, y_test)  
Trained_Random_forest_Model, Random_forest_accuracy = Train_Random_forest(X_train, y_train, X_test, y_test)
Trained_logistic_regression_Model, logistic_regression_accuracy = Train_logistic_regression(X_train, y_train, X_test, y_test)
Trained_XGBoost_Model, XGBoost_accuracy = Train_XGBoost(X_train, y_train, X_test, y_test)

Decision_tree_AUC_ROC=
 0.7836174296627858
support_vector_AUC_ROC=
 0.5648718418371435
Random_forest_AUC_ROC=
 0.8786427865480675
logistic_regression_AUC_ROC=
 0.6655619536058673
XGBoost_AUC_ROC=
 0.895180093032485


In [0]:
df_train = pd.read_csv('/content/gdrive/My Drive/Big_data_project/train_sample/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv', parse_dates=['click_time'])

In [0]:
#Used for final Version:
#Some feature dropped in final version
def feature_extraction(df_train):
  df_train['dow'] = df_train['click_time'].dt.dayofweek.astype('uint16')
  df_train['doy'] = df_train['click_time'].dt.dayofyear.astype('uint16')
  df_train['hour'] = df_train['click_time'].dt.hour.astype('uint16')
  features_clicks = ['ip', 'app', 'os', 'device']

  for col in features_clicks:
      col_count_dict = dict(df_train[[col]].groupby(col).size().sort_index())
      # train
      df_train['{}_clicks'.format(col)] = df_train[col].map(col_count_dict).astype('uint16')

  features_comb_list = [('app', 'device'), ('ip', 'app'), ('app', 'os')]
  for (col_a, col_b) in features_comb_list:
      df = df_train.groupby([col_a, col_b]).size().astype('uint16')
      df = pd.DataFrame(df, columns=['{}_{}_comb_clicks'.format(col_a, col_b)]).reset_index()      
      # train
      df_train = df_train.merge(df, how='left', on=[col_a, col_b])
    
  print(df_train.columns)
  new_features = [
    'app',
    'device',
    'os',
    'channel',
    'hour',
    'dow',
    'doy',
    'ip_clicks',
    'app_clicks',
    'os_clicks',
    'device_clicks',
    'app_device_comb_clicks',
    'ip_app_comb_clicks',
    'app_os_comb_clicks',
    'is_attributed'
  ]
  df=df_train[new_features]

  return df

In [0]:
df1=feature_extraction(df_train)
X_train,X_test,y_train, y_test=downsample(df1)

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed', 'dow', 'doy', 'hour', 'ip_clicks', 'app_clicks',
       'os_clicks', 'device_clicks', 'app_device_comb_clicks',
       'ip_app_comb_clicks', 'app_os_comb_clicks'],
      dtype='object')
1    185
0    185
Name: is_attributed, dtype: int64


In [0]:
Trained_Decision_tree_Model, Decision_tree_accuracy = Train_Decision_tree(X_train, y_train, X_test, y_test)
Trained_support_vector_machine_Model, support_vector_machine_accuracy = Train_support_vector_machine(X_train, y_train, X_test, y_test)  
Trained_Random_forest_Model, Random_forest_accuracy = Train_Random_forest(X_train, y_train, X_test, y_test)
Trained_logistic_regression_Model, logistic_regression_accuracy = Train_logistic_regression(X_train, y_train, X_test, y_test)
Trained_XGBoost_Model, XGBoost_accuracy = Train_XGBoost(X_train, y_train, X_test, y_test)

Decision_tree_AUC_ROC=
 0.8929811607309803
support_vector_AUC_ROC=
 0.5605617437294654
Random_forest_AUC_ROC=
 0.9338498200786844
logistic_regression_AUC_ROC=
 0.8801243231486039
XGBoost_AUC_ROC=
 0.9216693569005452
