In [1]:
import pandas as pd
import time
import csv
import numpy as np
import os
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.linear_model import SGDClassifier, LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sknn.mlp import Classifier, Layer
from sklearn.metrics import explained_variance_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score



In [2]:
features = ['hour', 'day', 'dow', 'C1', 'banner_pos', 'device_type', 'device_conn_type',
            'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'site_id', 'site_domain',
            'site_category', 'app_id', 'app_domain', 'app_category', 'device_model',
            'device_id', 'device_ip']

# Load data

train = pd.read_csv('data/train-100000R.csv', nrows=1000, dtype={'id': pd.np.string_})
test = pd.read_csv('data/train-100000R.csv', nrows=1000, dtype={'id': pd.np.string_})
    
train.shape, test.shape

((1000, 24), (1000, 24))

In [3]:
# Pre-processing non-number values
le = LabelEncoder()
for col in ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_model',
            'device_id', 'device_ip']:
    le.fit(list(train[col]) + list(test[col]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

train.shape, test.shape

((1000, 24), (1000, 24))

In [4]:
# Stochastic Gradient Descent is sensitive to feature scaling, so it is highly recommended to scale your data.
scaler = StandardScaler()
for col in ['C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']:
    ch = np.hstack([train[col], test[col]]).reshape(-1, 1)
#     print(ch.shape,'\n================\n', ch[:10], '\n================\n', ch_reshape.shape, '\n',ch_reshape[:10])
    scaler.fit(ch)
    train[col] = scaler.transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))
    
train.shape, test.shape, train.columns



((1000, 24),
 (1000, 24),
 Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
        'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
        'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
        'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
       dtype='object'))

In [5]:
# Add new features:
train['day'] = train['hour'].apply(lambda x: (x - x % 10000) / 1000000)  # day
train['dow'] = train['hour'].apply(lambda x: ((x - x % 10000) / 1000000) % 7)  # day of week
train['hour'] = train['hour'].apply(lambda x: x % 10000 / 100)  # hour
 
test['day'] = test['hour'].apply(lambda x: (x - x % 10000) / 1000000)  # day
test['dow'] = test['hour'].apply(lambda x: ((x - x % 10000) / 1000000) % 7)  # day of week
test['hour'] = test['hour'].apply(lambda x: x % 10000 / 100)  # hour

train.shape, test.shape, type(train), type(test)

((1000, 26),
 (1000, 26),
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [6]:
# Remove outliner
for col in ['C18', 'C20', 'C21']:
    # keep only the ones that are within +3 to -3 standard deviations in the column col,
    train = train[np.abs(train[col] - train[col].mean()) <= (3 * train[col].std())]
       
train.shape, test.shape

((1000, 26), (1000, 26))

In [24]:
# Define classifiers

classifiers = [
    LogisticRegression(random_state=0),
#     LassoCV(),
    KNeighborsClassifier(n_neighbors=100, weights='uniform', algorithm='auto',
                         leaf_size=100, p=2, metric='minkowski'),
    LinearDiscriminantAnalysis(n_components=3),
    GaussianNB(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    SGDClassifier(loss='log', n_iter=30, verbose=5, learning_rate='invscaling', eta0=0.0000000001), #'invscaling'
#     Classifier(
#         layers=[
# #             Layer('Rectifier', units=100),
#             Layer("Softmax")],
#         learning_rate=0.0001000,
#         learning_rule='momentum',
#         learning_momentum=0.9,
#         batch_size=25,
#         valid_size=0.1,
#         # valid_set=(X_test, y_test),
#         n_stable=10,
#         n_iter=10,
#         verbose=True),
    ExtraTreesClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100)

#         TODO : https://github.com/dmlc/xgboost/issues/2334
#         XGBClassifier(n_estimators=512, max_depth=4),
]

classifiers    

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 KNeighborsClassifier(algorithm='auto', leaf_size=100, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=100, p=2,
            weights='uniform'),
 LinearDiscriminantAnalysis(n_components=3, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
 GaussianNB(priors=None),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 GradientBoostingClassifier(criterion='f

In [25]:
# Train - kfold diverging for NN
time_taken = []
k_fold_splits = 2
cv_score = np.array((len(classifiers), k_fold_splits))

for classifier in classifiers:
    print("=====================================================================================================")
    start = time.time()
    
    if classifier.__class__.__name__ not in ['Classifier', 'SGDClassifier']: #not NN
        k_fold = KFold(n_splits=k_fold_splits)
        for X_train_idx, X_test_idx in k_fold.split(train):
            classifier.fit(train[features], train.click).score(train[features], train.click)
        cvs = cross_val_score(classifier, train[features], train.click, cv=k_fold, n_jobs=-1) #all cpu's
        print("cvs shape", cvs.shape)
        cv_score = np.vstack((cv_score, cvs))
            
    
    else:
        classifier.fit(train[features], train.click)
        cvs = np.zeros((k_fold_splits))
    
    print("cv score", cv_score)
    time_taken.append(time.time() - start)
    print('{} \n -> Training time: {}'.format(classifier.__class__.__name__, time.time() - start))
    print("=====================================================================================================")
    
    
time_taken

cvs shape (2,)
cv score [[9.    2.   ]
 [0.814 0.828]]
LogisticRegression 
 -> Training time: 3.2990105152130127
cvs shape (2,)
cv score [[9.    2.   ]
 [0.814 0.828]
 [0.828 0.836]]
KNeighborsClassifier 
 -> Training time: 3.0221245288848877




cvs shape (2,)
cv score [[9.    2.   ]
 [0.814 0.828]
 [0.828 0.836]
 [0.828 0.808]]
LinearDiscriminantAnalysis 
 -> Training time: 2.974665880203247
cvs shape (2,)
cv score [[9.    2.   ]
 [0.814 0.828]
 [0.828 0.836]
 [0.828 0.808]
 [0.586 0.468]]
GaussianNB 
 -> Training time: 2.885432481765747
cvs shape (2,)
cv score [[9.    2.   ]
 [0.814 0.828]
 [0.828 0.836]
 [0.828 0.808]
 [0.586 0.468]
 [0.72  0.71 ]]
DecisionTreeClassifier 
 -> Training time: 2.9995338916778564
cvs shape (2,)
cv score [[9.    2.   ]
 [0.814 0.828]
 [0.828 0.836]
 [0.828 0.808]
 [0.586 0.468]
 [0.72  0.71 ]
 [0.802 0.82 ]]
GradientBoostingClassifier 
 -> Training time: 3.5013458728790283




-- Epoch 1
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 1000, Avg. loss: 0.693033
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 2000, Avg. loss: 0.692935
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 3000, Avg. loss: 0.692870
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 4000, Avg. loss: 0.692818
Total training time: 0.03 seconds.
-- Epoch 5
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 5000, Avg. loss: 0.692773
Total training time: 0.04 seconds.
-- Epoch 6
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 6000, Avg. loss: 0.692733
Total training time: 0.05 seconds.
-- Epoch 7
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 7000, Avg. loss: 0.692697
Total training time: 0.06 seconds.
-- Epoch 8
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 8000, Avg. loss: 0.692663
Total training time: 0.07 seconds.
-- Epoch 9
Norm: 0.00, NNZs: 24, Bias: -0.000000, T: 9000, Avg. loss: 0.692631
Total tra

[3.2990105152130127,
 3.0221245288848877,
 2.974665880203247,
 2.885432481765747,
 2.9995338916778564,
 3.5013458728790283,
 0.2626473903656006,
 3.9500389099121094,
 3.810330867767334]

In [9]:
# Evaluation 

log_losses = []
rmse_loss = []
accuracy = []
y_expected = test.click.values

print('{:<26} | {:<25} | {:<25} | {:<10} | {}'.format('Classifier', 'Log loss', 'Rmse loss', 'Accuracy', 'Time'))

for idx, classifier in enumerate(classifiers):
    if classifier.__class__.__name__ != 'LassoCV':
        y_predicted = classifier.predict_proba(test[features])
        accu_score = accuracy_score(y_expected, classifier.predict(test[features]))
    log_loss_class = log_loss(y_expected, y_predicted)
    rmse_loss_class = mean_squared_error(y_expected, np.compress([False, True], y_predicted, axis=1))**0.5
    
    
    log_losses.append(log_loss_class)
    rmse_loss.append(rmse_loss_class)
    accuracy.append(accu_score)

    print('{:<26} | {:<25} | {:<25} | {:<10} | {}'.format(classifier.__class__.__name__, log_loss_class, rmse_loss_class, accu_score, time_taken[idx]))


Classifier                 | Log loss                  | Rmse loss                 | Accuracy   | Time
LogisticRegression         | 0.40144727326001545       | 0.3537067386221846        | 0.834      | 3.044811725616455
KNeighborsClassifier       | 0.44502315158365513       | 0.3711123819006852        | 0.832      | 2.9766688346862793
LinearDiscriminantAnalysis | 0.6145220917464038        | 0.3919325348491192        | 0.832      | 3.0550663471221924
GaussianNB                 | 1.4593109756307105        | 0.5568927816905266        | 0.555      | 2.871941566467285
DecisionTreeClassifier     | 0.002772588722240776      | 0.03162277660168379       | 0.998      | 2.9717466831207275
GradientBoostingClassifier | 0.2709133653722514        | 0.28197951622968914       | 0.891      | 3.4896512031555176
SGDClassifier              | 0.6921718115546823        | 0.49951207800750835       | 0.832      | 0.33983302116394043
ExtraTreesClassifier       | 0.002772588722240776      | 0.03162277660168379   

In [26]:
print("Best")
print('Log loss (Lowest)\n ', classifiers[log_losses.index(min(log_losses))])
print('Rmse (Lowest) \n ', classifiers[rmse_loss.index(min(rmse_loss))])
print('Accuracy (Highest)\n', classifiers[accuracy.index(max(accuracy))])
print('Time (Least)\n', classifiers[time_taken.index(min(time_taken))])

Best
Log loss (Lowest)
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Rmse (Lowest) 
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Accuracy (Highest)
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_sp

In [11]:
#TODO: 
# 1. Grid search 
# 2. Plot this to see time vs accuracy vs error
# 3. nn and sgd parameter
# 4. Integrate the feature selection part - choose best feat