In [57]:
# Import libraries and set desired options
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, TimeSeriesSplit, GridSearchCV
from tqdm import tqdm_notebook
sns.set()

In [6]:
##a helper function for writing predictions to a file 

def write_to_submission_file(predicted_labels, out_file, target = 'target',
                            index_label = 'session_id'):
    predicted_df = pd.DataFrame(predicted_labels,
                               index= np.arange(1, predicted_labels.shape[0]+1),
                               columns = [target])
    predicted_df.to_csv(out_file, index_label = index_label)

In [8]:
cd catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/

/Users/Igor/Desktop/MICourse/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2


## Read training and test sets, sort train set by session start time

In [9]:

train_df = pd.read_csv('train_sessions.csv', index_col= 'session_id')

test_df = pd.read_csv('test_sessions.csv', index_col='session_id')

# convert time1...,time10 columns to datetime type 
times = ['time%s' % i for i in range(1,11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# sort the data by time
train_df = train_df.sort_values(by= 'time1')

train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [None]:
test_df.head()

In [None]:
cv = CountVectorizer()

In [None]:
cv.fit_transform(['site_1 site_17 site_2',
                 'site_2 site_2 site_1']).todense()

In [None]:
x_sparse =cv.fit_transform(['this movie is awful',
                  'enjoyed this movie, this movie is']).todense()

In [None]:
x_sparse

In [None]:
test_df[sites].head()

In [None]:
test_df[sites].fillna(0).astype('int').head()

## Transform data into format which can be fed into CountVectorizer

In [10]:
sites = ['site%s'%i for i in range (1,11)]
train_df[sites].fillna(0).astype('int').to_csv('train_session_text.txt', sep = ' ',
                                              index = None , header = None)
test_df[sites].fillna(0).astype('int').to_csv('test_session_text.txt', sep = ' ',
                                            index = None, header = None)

In [11]:
!head -5 train_session_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [12]:
!head -5 test_session_text.txt

29 35 22 321 23 2211 6730 21 44582 15336
782 782 782 782 782 782 782 782 782 782
55 55 55 55 55 55 55 55 1445 1445
1023 1022 50 222 202 3374 50 48 48 3374
301 301 301 66 67 69 70 68 71 167


 ### Fit CountVectorizer and transform data with it
 

In [13]:
%%time
cv = CountVectorizer()
with open('train_session_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
    
with open('test_session_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
    
print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)
CPU times: user 6.58 s, sys: 183 ms, total: 6.76 s
Wall time: 6.97 s


In [14]:
y_train = train_df['target'].astype('int')

In [None]:
X_train.todense()

In [None]:
X_test.todense()

## train logistic regression

In [15]:
logit = LogisticRegression(C=1, random_state =17)

In [16]:
cv_scores = cross_val_score(logit, X_train, y_train, cv=5, scoring = 'roc_auc')

In [17]:
cv_scores

array([0.91381466, 0.82974635, 0.87640134, 0.892229  , 0.91358429])

In [18]:
cv_scores.mean()

0.8851551295408328

In [19]:
%time
logit.fit(X_train, y_train)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 10 µs


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
 test_pred_logit1 = logit.predict_proba(X_test)[:,1]

In [21]:
test_pred_logit1

array([2.42986662e-03, 5.10598819e-09, 1.88419038e-08, ...,
       8.84325834e-03, 4.74556372e-04, 2.26304798e-05])

In [None]:
write_to_submission_file(predicted_labels=test_pred_logit1, out_file='logit_subm1.txt') ## ROC AUC 0.908

## Time features 
 - hour when the session starts
 - morning
 - day
 - evening 
 - night

In [23]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour>=7) & (hour <=11)).astype('int')
    day = ((hour >=11) & (hour <= 18)).astype('int')
    evening = ((hour >= 19)& (hour <=23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1,1),
               day.values.reshape(-1,1),
               evening.values.reshape(-1,1),
               night.values.reshape(-1,1)])
    return X

In [24]:
%time
X_train_with_time = add_time_features(train_df.fillna(0), X_train)
X_test_with_time = add_time_features(test_df.fillna(0), X_test)

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 14.8 µs


In [25]:
X_train_with_time.shape, X_test_with_time.shape

((253561, 41596), (82797, 41596))

In [26]:
X_train.shape

(253561, 41592)

In [15]:
logit = LogisticRegression(C=1, random_state =17)

In [34]:
cv_scores = cross_val_score(logit, X_train_with_time, y_train, cv=7, scoring = 'roc_auc')

In [35]:
cv_scores

array([0.91446142, 0.92132894, 0.89654102, 0.95614852, 0.96167935,
       0.96122944, 0.94573113])

In [36]:
cv_scores.mean()

0.9367314026041084

In [37]:
%time
logit.fit(X_train_with_time, y_train)

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 49.8 µs


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
 test_pred_logit3 = logit.predict_proba(X_test_with_time)[:,1]

In [40]:
test_pred_logit3

array([4.69157321e-05, 4.11393537e-08, 3.83703570e-08, ...,
       2.07509336e-04, 1.70342837e-05, 7.69981563e-07])

In [33]:
# CV 0.9304
write_to_submission_file(predicted_labels=test_pred_logit2, out_file='logit_subm2.txt') ## ROC AUC PL 0.93567

In [41]:
# CV 0.9367
write_to_submission_file(predicted_labels=test_pred_logit3, out_file='logit_subm3.txt') ## ROC AUC PL 0.93567 
                                                                                ## the same as the previous one

## Add feature Hour

In [42]:
def add_time_hour_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour>=7) & (hour <=11)).astype('int')
    day = ((hour >=11) & (hour <= 18)).astype('int')
    evening = ((hour >= 19)& (hour <=23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, hour.values.reshape(-1,1),
            morning.values.reshape(-1,1),
               day.values.reshape(-1,1),
               evening.values.reshape(-1,1),
               night.values.reshape(-1,1)])
    return X

In [43]:
%time
X_train_with_time_hour = add_time_hour_features(train_df.fillna(0), X_train)
X_test_with_time_hour = add_time_hour_features(test_df.fillna(0), X_test)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 31.9 µs


In [45]:
X_train_with_time_hour.shape, X_test_with_time_hour.shape

((253561, 41597), (82797, 41597))

In [46]:
X_train.shape

(253561, 41592)

In [47]:
cv_scores = cross_val_score(logit, X_train_with_time_hour, y_train, cv=5, scoring = 'roc_auc')

In [48]:
cv_scores.mean()

0.9326275776225563

In [49]:
%time
logit.fit(X_train_with_time_hour, y_train)

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 14.8 µs


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
 test_pred_logit4 = logit.predict_proba(X_test_with_time_hour)[:,1]

In [51]:
test_pred_logit4

array([3.56604251e-05, 4.56891541e-08, 5.09355604e-08, ...,
       1.27335183e-04, 2.38961132e-05, 7.39919639e-07])

In [52]:
# CV 0.9326
write_to_submission_file(predicted_labels=test_pred_logit4, out_file='logit_subm4.txt') ## ROC AUC PL 0.92587
  ## worse than without feature Hour

In [54]:
train_df.head(10)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0
242171,952,2013-01-12 08:50:22,947.0,2013-01-12 08:50:23,953.0,2013-01-12 08:50:23,946.0,2013-01-12 08:50:23,947.0,2013-01-12 08:50:24,...,2013-01-12 08:50:24,953.0,2013-01-12 08:50:24,955.0,2013-01-12 08:50:24,946.0,2013-01-12 08:50:25,947.0,2013-01-12 08:50:25,0
57157,953,2013-01-12 08:50:25,947.0,2013-01-12 08:50:26,946.0,2013-01-12 08:50:26,953.0,2013-01-12 08:50:26,955.0,2013-01-12 08:50:26,...,2013-01-12 08:50:27,953.0,2013-01-12 08:50:27,946.0,2013-01-12 08:50:27,953.0,2013-01-12 08:50:28,1033.0,2013-01-12 08:50:28,0
240201,946,2013-01-12 08:50:28,947.0,2013-01-12 08:50:28,954.0,2013-01-12 08:50:28,953.0,2013-01-12 08:50:29,946.0,2013-01-12 08:50:29,...,2013-01-12 08:50:29,946.0,2013-01-12 08:50:30,956.0,2013-01-12 08:50:30,957.0,2013-01-12 08:50:31,956.0,2013-01-12 08:50:31,0
210686,946,2013-01-12 08:50:31,956.0,2013-01-12 08:50:32,946.0,2013-01-12 08:50:32,946.0,2013-01-12 08:50:33,955.0,2013-01-12 08:50:33,...,2013-01-12 08:50:33,946.0,2013-01-12 08:50:34,946.0,2013-01-12 08:50:35,946.0,2013-01-12 08:50:36,948.0,2013-01-12 08:50:36,0
98804,948,2013-01-12 08:50:37,946.0,2013-01-12 08:50:37,948.0,2013-01-12 08:50:38,784.0,2013-01-12 08:50:49,49.0,2013-01-12 08:50:59,...,2013-01-12 08:51:03,812.0,2013-01-12 08:51:03,982.0,2013-01-12 08:51:03,52.0,2013-01-12 08:51:03,52.0,2013-01-12 08:51:04,0


## Tune the logit model by GridSearchCV and Time Series Split

In [58]:
time_split = TimeSeriesSplit(n_splits=10)

In [59]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train_with_time)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [60]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [61]:
cv_scores = cross_val_score(logit, X_train_with_time, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=1)

In [62]:
cv_scores.mean()

0.9151897430356163

In [63]:
%time
logit.fit(X_train_with_time, y_train)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 35 µs


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
 test_pred_logit5 = logit.predict_proba(X_test_with_time)[:,1]

In [65]:
test_pred_logit5

array([4.69157321e-05, 4.11393537e-08, 3.83703570e-08, ...,
       2.07509336e-04, 1.70342837e-05, 7.69981563e-07])

In [66]:
# CV 0.9151
write_to_submission_file(predicted_labels=test_pred_logit5, out_file='logit_subm5.txt') ## ROC AUC PL 0.93567
    # the same score as the best one
  
    
    # BoW for sites, 4 more features(morning, day, evening, night)
    # Logistic Regression(solver = liblinear)
    # TimesSeriesSplit =10  CV score 0.9151

In [67]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1)

In [68]:
%%time
logit_grid_searcher.fit(X_train_with_time, y_train) # WTF? Locally, it's 3min 30s

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  9.3min finished


CPU times: user 16min 23s, sys: 30.6 s, total: 16min 54s
Wall time: 9min 20s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [69]:
logit_grid_searcher.best_params_, logit_grid_searcher.best_estimator_

({'C': 0.5994842503189409},
 LogisticRegression(C=0.5994842503189409, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=17,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False))

In [70]:
logit_grid_searcher.best_score_

0.9153561692349115

In [71]:
 test_pred_logit6 = logit_grid_searcher.predict_proba(X_test_with_time)[:,1]

In [72]:
## CV GridSearchCV best score 0.9153
write_to_submission_file(predicted_labels=test_pred_logit6,out_file='logit_subm6.txt') ## ROC AUC PubL 0.93740
### increased the score by 