In [66]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [67]:
# загрузим обучающую и тестовую выборки
train_df = pd.read_csv('~/Documents/Stepik + ODS/data/train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('~/Documents/Stepik + ODS/data/test_sessions.csv',
                      index_col='session_id')

(253561, 21)

In [69]:
add_indexes = np.arange(253562, 276532, 1)
add_alice = pd.DataFrame(pd.np.tile(train_df[train_df['target'] == 1], (10, 1)),
                         columns=train_df.columns,
                        index=add_indexes)

  """Entry point for launching an IPython kernel.


In [70]:
train_df = train_df.append(add_alice)
train_df.shape

(276531, 21)

In [3]:
# приведем колонки time1, ..., time10 к временному формату
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# отсортируем данные по времени
train_df = train_df.sort_values(by='time1')

# посмотрим на заголовок обучающей выборки
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
# приведем колонки site1, ..., site10 к целочисленному формату и заменим пропуски нулями
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')
# загрузим словарик сайтов
with open(r"/home/vlad/Documents/Stepik + ODS/data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# датафрейм словарика сайтов
sites_dict_df = pd.DataFrame(list(site_dict.keys()), 
                          index=list(site_dict.values()), 
                          columns=['site'])
print(u'всего сайтов:', sites_dict_df.shape[0])

всего сайтов: 48371


In [5]:
# наша целевая переменная
y_train = train_df['target']

# объединенная таблица исходных данных
full_df = pd.concat([train_df.drop('target', axis=1), test_df])

# индекс, по которому будем отделять обучающую выборку от тестовой
idx_split = train_df.shape[0]
# табличка с индексами посещенных сайтов в сессии
full_sites = full_df[sites]
full_sites.shape

(336358, 10)

In [6]:
full_sites

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947
...,...,...,...,...,...,...,...,...,...,...
82793,812,1039,676,0,0,0,0,0,0,0
82794,300,302,302,300,300,1222,302,1218,1221,1216
82795,29,33,35,22,37,6779,30,21,23,6780
82796,5828,23,21,804,21,3350,23,894,21,961


In [7]:
from scipy.sparse import csr_matrix

In [8]:
sites_flatten = full_sites.values.flatten()
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                sites_flatten,
                                range(0, sites_flatten.shape[0] + 10, 10)))[:, 1:]


In [9]:
X_train_sparse = full_sites_sparse[:idx_split]
X_test_sparse = full_sites_sparse[idx_split:]

In [10]:
def new_feature(df, X_sparse):
    scaler = StandardScaler()
    year = scaler.fit_transform(df['time1'].apply(lambda ts: ts.year).astype('int').values.reshape(-1, 1))
    month = scaler.fit_transform(df['time1'].apply(lambda ts: ts.month).astype('int').values.reshape(-1, 1))
    day_of_month = scaler.fit_transform(df['time1'].apply(lambda ts: ts.day).astype('int').values.reshape(-1, 1))
    day_of_week = scaler.fit_transform(df['time1'].apply(lambda ts: ts.dayofweek).astype('int').values.reshape(-1, 1))
    time_of_session = scaler.fit_transform((df[times].apply(lambda x: max(x), axis=1) - df[times]\
                                            .apply(lambda x: min(x), axis=1)).apply(lambda ts: ts.seconds)\
                                           .values.reshape(-1, 1))
    number_of_sites = scaler.fit_transform(df[times].apply(lambda x: pd.Series.count(x), axis=1).values.reshape(-1, 1))
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int').values.reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').values.reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').values.reshape(-1, 1)
    night = ((hour >= 0) & (hour <= 6)).astype('int').values.reshape(-1, 1)
    X_sparse = hstack([X_sparse, year, month, day_of_month, day_of_week, time_of_session, number_of_sites, 
                                                   morning, day, evening, night]).tocsr()
                        
    
    return X_sparse

In [11]:
X_train_sparse = new_feature(train_df, X_train_sparse)
X_test_sparse = new_feature(test_df, X_test_sparse)

In [12]:
X_train_sparse.shape,X_test_sparse.shape, X_train_sparse.min(), X_train_sparse.max(),X_test_sparse.min(), X_test_sparse.max()

((253561, 48381),
 (82797, 48381),
 -4.904938312000599,
 10.0,
 -4.522269113455141,
 10.0)

In [13]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV

In [22]:
c_values = np.logspace(-3, 3, 10)
time_split = TimeSeriesSplit(n_splits=10)
logit = LogisticRegression()
logit_grid_searcher = GridSearchCV(estimator=logit, 
                                         param_grid={'C': c_values}, 
                                         scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=0)

In [18]:
%%time
logit_grid_searcher.fit(X_train_sparse, y_train)

CPU times: user 10 s, sys: 1.04 s, total: 11.1 s
Wall time: 2min 28s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.00000000e-03, 4.64158883e-03, 2.15443469e-02, 1.00000000e-01,
       4.64158883e-01, 2.15443469e+00, 1.00000000e+01, 4.64158883e+01,
       2.15443469e+02, 1.00000000e+03])},
             scoring='f1')

In [19]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.22269352431206751, {'C': 215.44346900318823})

In [20]:
test_pred = logit_grid_searcher.predict_proba(X_test_sparse)[:, 1]

In [21]:
pd.Series(test_pred, index=range(1, test_pred.shape[0] + 1), name='target')\
                .to_csv('benchmark_2.csv', header=True, index_label="session_id")