In [13]:
import pandas as pd

In [15]:
# Number of leading records to read from the training file.
n_rows = 30000

# Load only the first n_rows rows; pandas infers the compression from the
# ".gz" file extension.
df = pd.read_csv("train.gz", nrows=n_rows)

# Plain-text preview of the first five rows (head() defaults to n=5).
print(df.head())

                     id  click      hour    C1  banner_pos   site_id  \
0   1000009418151094273      0  14102100  1005           0  1fbe01fe   
1  10000169349117863715      0  14102100  1005           0  1fbe01fe   
2  10000371904215119486      0  14102100  1005           0  1fbe01fe   
3  10000640724480838376      0  14102100  1005           0  1fbe01fe   
4  10000679056417042096      0  14102100  1005           1  fe8cc448   

  site_domain site_category    app_id app_domain  ... device_type  \
0    f3845767      28905ebd  ecad2386   7801e8d9  ...           1   
1    f3845767      28905ebd  ecad2386   7801e8d9  ...           1   
2    f3845767      28905ebd  ecad2386   7801e8d9  ...           1   
3    f3845767      28905ebd  ecad2386   7801e8d9  ...           1   
4    9166c161      0569f928  ecad2386   7801e8d9  ...           1   

  device_conn_type    C14  C15  C16   C17  C18  C19     C20  C21  
0                2  15706  320   50  1722    0   35      -1   79  
1                0

In [16]:
# Target vector: the 'click' column as a NumPy array.
Y = df['click'].values

# Feature matrix: every column except the target and the four columns
# dropped below, as a NumPy array.
X = df.drop(columns=['click', 'id', 'hour', 'device_id', 'device_ip']).values

print(X.shape)

(30000, 19)


In [18]:
# Ordered (non-shuffled) split: the first 90% of the rows are used for
# training and the remaining rows are held out for testing.
n_train = int(n_rows * 0.9)

X_train, X_test = X[:n_train], X[n_train:]
Y_train, Y_test = Y[:n_train], Y[n_train:]

In [19]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': a category that was not seen while fitting is
# encoded as all zeros at transform time instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')

# Learn the category vocabulary from the training rows only and encode them
# into a sparse matrix.
X_train_enc = enc.fit_transform(X_train)

# Show the encoded first training sample.
print(X_train_enc[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 19 stored elements and shape (1, 3929)>
  Coords	Values
  (0, 2)	1.0
  (0, 6)	1.0
  (0, 71)	1.0
  (0, 1002)	1.0
  (0, 1024)	1.0
  (0, 1460)	1.0
  (0, 1508)	1.0
  (0, 1529)	1.0
  (0, 2016)	1.0
  (0, 3257)	1.0
  (0, 3261)	1.0
  (0, 3361)	1.0
  (0, 3605)	1.0
  (0, 3609)	1.0
  (0, 3644)	1.0
  (0, 3735)	1.0
  (0, 3740)	1.0
  (0, 3775)	1.0
  (0, 3915)	1.0


In [20]:
# Encode the held-out rows with the already-fitted encoder (transform only,
# no refit), so the test set does not influence the learned categories.
X_test_enc = enc.transform(X_test)

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths to compare; None lets the tree grow until the other
# stopping criteria apply.
parameters = {'max_depth': [3, 10, None]}

# random_state is fixed so that ties between equally good splits are broken
# the same way on every run, making the search reproducible.
decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30, random_state=42)

# 3-fold cross-validated search over max_depth, scored by ROC AUC and run on
# all available cores.
grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3, scoring='roc_auc')

grid_search.fit(X_train_enc, Y_train)
print(grid_search.best_params_)

{'max_depth': 10}


In [22]:
from sklearn.metrics import roc_auc_score

# Best estimator found by the preceding grid search.
decision_tree_best = grid_search.best_estimator_

# Column 1 of predict_proba is the predicted probability of the positive
# (click) class.
pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1]

print(f'The ROC AUC on testing set is: {roc_auc_score(Y_test, pos_prob):.3f}')

The ROC AUC on testing set is: 0.679


In [24]:
import numpy as np

# Baseline: flag a random subset of test samples as clicks and score it, to
# show what ROC AUC an uninformed predictor achieves.

# Seeded generator so the baseline score is reproducible across runs.
rng = np.random.default_rng(42)

# Use the click rate observed in the training labels instead of a hard-coded
# ratio, so the baseline stays correct whatever number of rows was loaded.
click_rate = Y_train.mean()
n_clicks = int(len(Y_test) * click_rate)

pos_prob = np.zeros(len(Y_test))
click_index = rng.choice(len(Y_test), n_clicks, replace=False)
pos_prob[click_index] = 1

print(f'The ROC AUC on testing set using random selection is: {roc_auc_score(Y_test, pos_prob):.3f}')

The ROC AUC on testing set using random selection is: 0.504


### Ensembling decision trees – random forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so the bootstrap samples and per-split feature
# subsets are the same on every run, making the search reproducible.
random_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    min_samples_split=30,
    n_jobs=-1,
    random_state=42,
)

# Reuses the max_depth grid defined for the decision tree; 3-fold
# cross-validation scored by ROC AUC.
grid_search = GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc')
grid_search.fit(X_train_enc, Y_train)

print(grid_search.best_params_)

{'max_depth': None}


In [27]:
# Best forest found by the preceding grid search.
random_forest_best = grid_search.best_estimator_

# Predicted probability of the positive (click) class for each test row.
pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1]

print(f'The ROC AUC on testing set using random forest is: {roc_auc_score(Y_test, pos_prob):.3f}')

The ROC AUC on testing set using random forest is: 0.706
