In [76]:
import pandas as pd
from scipy.sparse import hstack
import numpy as np

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [78]:
# Load the training sessions with `time1` parsed as a datetime, replace
# missing values with 0 and order the rows by `time1`.
train_df_raw = (
    pd.read_csv('train_sessions.csv', parse_dates=['time1'])
    .fillna(0)
    .sort_values(by=['time1'])
)

In [79]:
# Load the test sessions with `time1` parsed as a datetime, replace missing
# values with 0 and order the rows by `time1`.
test_df_raw = (
    pd.read_csv('test_sessions.csv', parse_dates=['time1'])
    .fillna(0)
    .sort_values(by=['time1'])
)

In [80]:
# Pull out the label column of the training frame and the id column of the
# test frame (both in the time-sorted row order of their frames).
y_train, sessions_ids_test = train_df_raw['target'], test_df_raw['session_id']

In [81]:
# Names of every training-frame column whose label starts with 'site'.
site_columns = list(
    filter(lambda name: name.startswith('site'), train_df_raw.columns)
)

In [82]:
# Narrow both frames to the site columns and cast the values to integers.
# NOTE: this rebinds the names, so the other columns are no longer reachable
# through `train_df_raw` / `test_df_raw` after this cell has run.
train_df_raw = train_df_raw.loc[:, site_columns].astype('int')
test_df_raw = test_df_raw.loc[:, site_columns].astype('int')

In [83]:
# Dump each frame as space-separated text, one line per row, without the
# index and without a header row.
train_df_raw.to_csv(
    'train_sessions_text.txt',
    sep=' ',
    index=None,
    header=None,
)
test_df_raw.to_csv(
    'test_sessions_text.txt',
    sep=' ',
    index=None,
    header=None,
)

In [84]:
# TF-IDF over single tokens and pairs of adjacent tokens (1- and 2-grams).
# NOTE(review): the vectorizer's default token pattern is documented as
# keeping only tokens of two or more characters, so single-character values in
# the text lines are presumably ignored — confirm this is intended.
tfidf = TfidfVectorizer(ngram_range=(1, 2))

In [85]:
# Read the dumped training rows back as one string per line (line endings
# kept), then fit the vectorizer on them and build the training matrix.
with open('train_sessions_text.txt') as train_file:
    train = list(train_file)
X_train = tfidf.fit_transform(train)

In [86]:
# Read the dumped test rows back as one string per line (line endings kept)
# and project them with the already-fitted vectorizer (no re-fitting).
with open('test_sessions_text.txt') as test_file:
    test = list(test_file)
X_test = tfidf.transform(test)

In [87]:
# Sanity check: both matrices must have the same number of columns.
(X_train.shape, X_test.shape)

((253561, 358456), (82797, 358456))

In [115]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler

In [116]:
# Scaler instance with default settings; it is fitted in the next cell.
maxabs_scalar = MaxAbsScaler()

In [117]:
# Scale every feature column by its maximum absolute value.
# The scaler is fitted on the training matrix only; the test matrix is then
# transformed with those same training statistics. Re-fitting on the test
# matrix (as `fit_transform` would) scales each column by a different factor
# than the one the model is trained with, making train and test features
# inconsistent.
X_train = maxabs_scalar.fit_transform(X_train)
X_test = maxabs_scalar.transform(X_test)

In [88]:
# Second copy of the training sessions, ordered by `time1`; unlike
# `train_df_raw` it is loaded without the fillna step.
train_df = (
    pd.read_csv('train_sessions.csv', parse_dates=['time1'])
    .sort_values(by=['time1'])
)

In [89]:
# Second copy of the test sessions, ordered by `time1`; unlike `test_df_raw`
# it is loaded without the fillna step.
test_df = (
    pd.read_csv('test_sessions.csv', parse_dates=['time1'])
    .sort_values(by=['time1'])
)

In [90]:
# Inspect the parsed, time-sorted `time1` column of the test frame.
test_df.time1

65539   2014-05-01 17:14:03
64198   2014-05-02 07:52:08
2267    2014-05-02 07:57:51
29733   2014-05-02 08:05:16
77047   2014-05-02 08:05:32
                ...        
73505   2014-12-05 20:07:54
59131   2014-12-05 20:55:17
78587   2014-12-05 21:54:46
26998   2014-12-05 22:26:40
60140   2014-12-05 23:26:53
Name: time1, Length: 82797, dtype: datetime64[ns]

In [91]:
def add_time_features(df, X_sparse):
    """Append four part-of-day indicator columns to a feature matrix.

    The hour of ``df['time1']`` is bucketed into morning (7-11), day (12-18),
    evening (19-23) and night (0-6), bounds inclusive. Each bucket becomes
    one 0/1 column, appended in that order to the right of ``X_sparse``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``time1`` column with one row per row
        of ``X_sparse``, in the same row order.
    X_sparse : sparse matrix
        Existing features; it is not modified.

    Returns
    -------
    scipy sparse matrix
        ``X_sparse`` with the four indicator columns stacked on the right,
        in ``hstack``'s default output format.
    """
    # Vectorised accessor instead of a Python-level lambda call per row.
    hour = df['time1'].dt.hour

    # Inclusive (first_hour, last_hour) bounds, listed in output-column order:
    # morning, day, evening, night.
    hour_ranges = ((7, 11), (12, 18), (19, 23), (0, 6))
    indicator_columns = [
        ((hour >= first) & (hour <= last)).astype('int').values.reshape(-1, 1)
        for first, last in hour_ranges
    ]
    return hstack([X_sparse] + indicator_columns)

In [118]:
# Append the part-of-day indicator columns to the scaled TF-IDF matrices.
# The frames and the matrices are both ordered by `time1`, so rows line up.
X_train_new = add_time_features(df=train_df.fillna(0), X_sparse=X_train)
X_test_new = add_time_features(df=test_df.fillna(0), X_sparse=X_test)

In [119]:
# Labels taken from the time-sorted training frame (rebinds `y_train`).
y_train = train_df.loc[:, 'target']

In [120]:
from sklearn.model_selection import TimeSeriesSplit,GridSearchCV

In [121]:
# Time-ordered cross-validation splitter with 10 splits.
time_split = TimeSeriesSplit(n_splits=10)

In [122]:
from sklearn.linear_model import LogisticRegression

In [131]:
from sklearn.svm import SVC

In [123]:
# Ten candidate values spaced evenly on a log scale from 1e-2 to 1e2.
c = np.logspace(-2, 2, num=10)

In [124]:
# Search space for the grid searches: only `C` is varied.
grid_param = dict(C=c)

In [132]:
# Base estimators handed to the grid searches; both use a fixed seed of 42.
logit = LogisticRegression(
    random_state=42,
    solver='liblinear',
)
svc = SVC(random_state=42)

In [133]:
# One grid search per estimator, sharing the same `C` grid, time-ordered
# splitter and ROC AUC scoring, run on all available cores.
grid_logit = GridSearchCV(
    logit,
    grid_param,
    cv=time_split,
    verbose=True,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_svc = GridSearchCV(
    svc,
    grid_param,
    cv=time_split,
    verbose=True,
    scoring='roc_auc',
    n_jobs=-1,
)

In [127]:
# Run the logistic-regression grid search on the combined feature matrix.
grid_logit.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
 

In [128]:
# Best mean cross-validated ROC AUC and the `C` value that achieved it.
(grid_logit.best_score_, grid_logit.best_params_)

(0.9200540193372906, {'C': 1.6681005372000592})

In [129]:
# Second column of `predict_proba`, one value per test row.
test_probabilities = grid_logit.predict_proba(X_test_new)
predictions = test_probabilities[:, 1]

In [130]:
# Pair each test session id with its predicted value and write the result
# to CSV without the index column.
submission_df = pd.DataFrame(
    {
        'session_id': sessions_ids_test,
        'target': predictions,
    }
)
submission_df.to_csv('submission3.csv', index=None)