In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from tqdm import tqdm_notebook

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

# 1. Загрузка и преобразование данных

In [2]:
# Load the training and test session tables, indexed by session id.
train_df = pd.read_csv(
    "D:/Jupyter_projects/data/train_sessions.csv", index_col="session_id"
)
test_df = pd.read_csv(
    "D:/Jupyter_projects/data/test_sessions.csv", index_col="session_id"
)

# Convert the time1 ... time10 columns to datetimes.
times = [f"time{i}" for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Order the training sessions by their first timestamp (test_df is left as loaded).
train_df = train_df.sort_values(by="time1")

# Peek at the first rows of the training set.
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [3]:
# Cast site1 ... site10 to integers, filling missing entries with 0.
sites = [f"site{i}" for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype("int")
test_df[sites] = test_df[sites].fillna(0).astype("int")

# Load the site dictionary.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open(r"D:/Jupyter_projects/data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Data frame view of the dictionary: dictionary values form the index,
# dictionary keys go into the "site" column.
sites_dict_df = pd.DataFrame(
    {"site": list(site_dict)}, index=list(site_dict.values())
)
print(u"всего сайтов:", sites_dict_df.shape[0])
sites_dict_df.head()

всего сайтов: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [4]:
# Target variable.
y_train = train_df["target"]

# Train (without the target column) and test rows stacked into one table, train first.
full_df = pd.concat([train_df.drop(columns="target"), test_df])

# Row position at which the training part ends and the test part begins.
idx_split = len(train_df)

In [5]:
# Table with the ids of the sites visited in each session (columns site1 ... site10).
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [6]:
from scipy.sparse import csr_matrix

In [7]:
# Flatten the session-by-site table into one long sequence of site ids.
sites_flatten = full_sites.values.flatten()

# Build the matrix directly from its CSR triplet: every entry has value 1, the
# column of an entry is its site id, and the row pointer advances in steps of 10
# (one step per session). Column 0, the id used to fill missing sites, is then
# removed by the [:, 1:] slice.
n_visits = sites_flatten.shape[0]
full_sites_sparse = csr_matrix(
    (
        np.ones(n_visits, dtype=int),
        sites_flatten,
        np.arange(0, n_visits + 10, 10),
    )
)[:, 1:]

In [8]:
full_sites_sparse.shape

(336358, 48371)

In [9]:
# Split the stacked matrix back into its training rows (first idx_split) and test rows.
X_train_sparse = full_sites_sparse[:idx_split]
X_test_sparse = full_sites_sparse[idx_split:]

In [10]:
X_train_sparse.shape, y_train.shape

((253561, 48371), (253561,))

In [11]:
X_test_sparse.shape

(82797, 48371)

# 2. Построение первой модели

In [12]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    """
    Fit a logistic regression on the leading part of the sample and
    return the ROC AUC on the remaining hold-out part.

    X, y – sample (features and labels), split by row position, no shuffling
    ratio – fraction of rows, counted from the start, used for training
    C, seed – regularization coefficient and random_state
              of the logistic regression
    """
    split_at = int(ratio * X.shape[0])

    model = LogisticRegression(C=C, n_jobs=-1, random_state=seed)
    model.fit(X[:split_at, :], y[:split_at])

    # Column 1 of predict_proba is used as the score for the ROC AUC.
    holdout_scores = model.predict_proba(X[split_at:, :])[:, 1]
    return roc_auc_score(y[split_at:], holdout_scores)
    

In [13]:
%%time
get_auc_lr_valid(X_train_sparse, y_train)

Wall time: 15.5 s


0.9197954065422085

## Обучите модель на всей выборке, сделайте прогноз для тестовой выборки и сделайте посылку в соревновании.

In [14]:
%%time
# Fit the sites-only model on the whole training set (default C).
logit = LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(X_train_sparse, y_train)

Wall time: 7.62 s


LogisticRegression(n_jobs=-1, random_state=17)

In [15]:
# Score every test session with column 1 of predict_proba.
test_pred = logit.predict_proba(X_test_sparse)[:,1]

In [16]:
test_pred.shape

(82797,)

# Accuracy

In [17]:
# Score of the fitted model on the same data it was trained on (not a hold-out estimate).
print(f'Accuracy - : {logit.score(X_train_sparse, y_train):.3f}')

Accuracy - : 0.993


In [18]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """
    Write predictions to a CSV file with an id column and a prediction column.

    predicted_labels – array-like of predictions (ndarray, list, tuple, Series);
                       row k of the output gets id k + 1
    out_file – path or writable text buffer, passed on to DataFrame.to_csv
    target – name of the prediction column
    index_label – name of the id column
    """
    # Coerce to an ndarray so that plain lists work and so that a pandas index
    # on the input cannot be aligned against the 1..n ids.
    labels = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(labels,
                                index=np.arange(1, labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [19]:
#write_to_submission_file(test_pred, 'benchmark1.csv')

# 3. Улучшение модели, построение новых признаков

## Создайте такой признак, который будет представлять собой число вида ГГГГММ от той даты, когда проходила сессия, например 201407 -- 2014 год и 7 месяц. Таким образом, мы будем учитывать помесячный линейный тренд за весь период предоставленных данных.


In [20]:
# Empty frames for engineered features, sharing the index of train_df / test_df.
new_feat_train = pd.DataFrame(index=train_df.index)
new_feat_test = pd.DataFrame(index=test_df.index)

In [21]:
# Encode the first timestamp of each session as the integer 100 * year + month
# (e.g. 201407 for July 2014).
new_feat_train['year_month'] = train_df['time1'].map(lambda ts: ts.year * 100 + ts.month)
new_feat_test['year_month'] = test_df['time1'].map(lambda ts: ts.year * 100 + ts.month)

In [22]:
new_feat_train.head()

Unnamed: 0_level_0,year_month
session_id,Unnamed: 1_level_1
21669,201301
54843,201301
77292,201301
114021,201301
146670,201301


In [23]:
# Standardize year_month; the scaler is fitted on the training column only and
# the same transform is then applied to the test column.
scaler = StandardScaler()
new_feat_train['year_month_scaled'] = scaler.fit_transform(new_feat_train[['year_month']].values)
new_feat_test['year_month_scaled'] = scaler.transform(new_feat_test[['year_month']].values)


In [24]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21669,201301,-1.744405
54843,201301,-1.744405
77292,201301,-1.744405
114021,201301,-1.744405
146670,201301,-1.744405


## Добавьте новый признак, предварительно отмасштабировав его с помощью StandardScaler, и снова посчитайте ROC AUC на отложенной выборке.

In [25]:
X_train_sparse.shape, new_feat_train['year_month_scaled'].values.reshape(-1,1).shape

((253561, 48371), (253561, 1))

In [26]:
# Append the scaled year_month column to the training site matrix (result in CSR form).
X_train_sparse_new = hstack(
    [X_train_sparse, new_feat_train[['year_month_scaled']].values]
).tocsr()

In [27]:
# Append the scaled year_month column to the test site matrix (result in CSR form).
X_test_sparse_new = hstack(
    [X_test_sparse, new_feat_test[['year_month_scaled']].values]
).tocsr()

In [28]:
X_train_sparse.shape, X_train_sparse_new.shape

((253561, 48371), (253561, 48372))

In [29]:
X_train_sparse_new.shape, y_train.shape

((253561, 48372), (253561,))

In [30]:
%%time
get_auc_lr_valid(X_train_sparse_new, y_train)

Wall time: 6.24 s


0.9198903563591923

# Обучите модель на выборке, сделайте прогноз для тестовой выборки и сделайте посылку в соревновании.

In [31]:
%%time
# Fit the sites + year_month_scaled model on the whole training set (default C).
logit1 = LogisticRegression(n_jobs=-1, random_state=17)
logit1.fit(X_train_sparse_new, y_train)

Wall time: 6.96 s


LogisticRegression(n_jobs=-1, random_state=17)

In [32]:
# Score every test session with column 1 of predict_proba.
test_pred1 = logit1.predict_proba(X_test_sparse_new)[:,1]

In [33]:
test_pred1.shape

(82797,)

# Accuracy

In [34]:
# Score of the fitted model on the same data it was trained on (not a hold-out estimate).
print(f'Accuracy - : {logit1.score(X_train_sparse_new, y_train):.3f}')

Accuracy - : 0.993


In [35]:
#write_to_submission_file(test_pred1, 'benchmark2.csv')

## Добавьте два новых признака: start_hour и morning.

In [36]:
# Hour of the first timestamp of each session.
new_feat_train['start_hour'] = train_df['time1'].map(lambda ts: ts.hour)
new_feat_test['start_hour'] = test_df['time1'].map(lambda ts: ts.hour)

In [37]:
# Standardize start_hour; the scaler is fitted on the training column only.
# NOTE(review): the feature matrices built later in this notebook stack the raw
# start_hour column, so start_hour_scaler is not used by the models shown here.
scaler2 = StandardScaler()
new_feat_train['start_hour_scaler'] = scaler2.fit_transform(new_feat_train[['start_hour']].values)
new_feat_test['start_hour_scaler'] = scaler2.transform(new_feat_test[['start_hour']].values)

In [38]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,start_hour_scaler
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21669,201301,-1.744405,8,-1.357366
54843,201301,-1.744405,8,-1.357366
77292,201301,-1.744405,8,-1.357366
114021,201301,-1.744405,8,-1.357366
146670,201301,-1.744405,8,-1.357366


### morning

In [39]:
# Binary flag: 1 when the session starts at hour 11 or earlier, otherwise 0.
new_feat_train['morning'] = train_df['time1'].apply(lambda st: int(st.hour <= 11))
new_feat_test['morning'] = test_df['time1'].apply(lambda st: int(st.hour <= 11))

In [40]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,start_hour_scaler,morning
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21669,201301,-1.744405,8,-1.357366,1
54843,201301,-1.744405,8,-1.357366,1
77292,201301,-1.744405,8,-1.357366,1
114021,201301,-1.744405,8,-1.357366,1
146670,201301,-1.744405,8,-1.357366,1


### start_month

In [41]:
# Calendar month (1-12) of the first timestamp of each session.
# NOTE(review): no later cell in this notebook reads the start_month column; the
# matrices whose names contain "start_month" are built from year_month_scaled.
new_feat_train['start_month'] = train_df['time1'].apply(lambda st: st.month)
new_feat_test['start_month'] = test_df['time1'].apply(lambda st: st.month)

In [42]:
new_feat_train.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,start_hour_scaler,morning,start_month
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21669,201301,-1.744405,8,-1.357366,1,1
54843,201301,-1.744405,8,-1.357366,1,1
77292,201301,-1.744405,8,-1.357366,1,1
114021,201301,-1.744405,8,-1.357366,1,1
146670,201301,-1.744405,8,-1.357366,1,1


## Посчитайте ROC AUC на отложенной выборке для выборки с:

### сайтами, start_month и start_hour

In [43]:
# Extend the sites + year_month_scaled training matrix with the session start hour.
# NOTE(review): the raw start_hour column is stacked here, not start_hour_scaler.
X_train_sparse_start_month_start_hour = hstack(
    [X_train_sparse_new, new_feat_train[['start_hour']].values]
).tocsr()

In [44]:
# Extend the sites + year_month_scaled test matrix with the session start hour.
# NOTE(review): the raw start_hour column is stacked here, not start_hour_scaler.
X_test_sparse_start_month_start_hour = hstack(
    [X_test_sparse_new, new_feat_test[['start_hour']].values]
).tocsr()

In [45]:
X_train_sparse_new.shape, X_train_sparse_start_month_start_hour.shape

((253561, 48372), (253561, 48373))

In [46]:
%%time
# Hold-out ROC AUC with sites + year_month_scaled + start_hour.
auc_train_start_month_start_hour = get_auc_lr_valid(X_train_sparse_start_month_start_hour, y_train)
# 0.9198903563591923 is the hold-out ROC AUC displayed earlier for sites + year_month_scaled.
# NOTE(review): it is pasted as a literal, so it goes stale if that score changes.
# The printed label says "Точность" (accuracy), but the value is a ROC AUC difference.
print('Точность увеличилась на: ', auc_train_start_month_start_hour - 0.9198903563591923)

Точность увеличилась на:  0.03663296778408576
Wall time: 7.82 s


In [47]:
auc_train_start_month_start_hour

0.956523324143278

### сайтами, start_month и morning

In [48]:
# Extend the sites + year_month_scaled training matrix with the morning flag.
X_train_sparse_start_month_morning = hstack(
    [X_train_sparse_new, new_feat_train[['morning']].values]
).tocsr()

In [49]:
X_train_sparse_new.shape, X_train_sparse_start_month_morning.shape

((253561, 48372), (253561, 48373))

In [50]:
%%time
# Hold-out ROC AUC with sites + year_month_scaled + morning.
auc_train_start_month_morning = get_auc_lr_valid(X_train_sparse_start_month_morning, y_train)
# Compare against the start_hour variant through its stored score instead of a
# pasted literal, so the difference stays correct when the earlier cell is re-run.
# The printed label says "Точность" (accuracy), but the value is a ROC AUC difference.
print('Точность увеличилась на: ', auc_train_start_month_morning - auc_train_start_month_start_hour)

Точность увеличилась на:  -0.008772517749124908
Wall time: 7.31 s


In [51]:
auc_train_start_month_morning

0.9477508063941531

### сайтами, start_month, start_hour и morning

In [52]:
# Training matrix with sites + year_month_scaled + raw start_hour + morning flag.
X_train_sparse_start_month_hour_morning = hstack(
    [X_train_sparse_start_month_start_hour, new_feat_train[['morning']].values]
).tocsr()

In [53]:
# Test matrix with sites + year_month_scaled + raw start_hour + morning flag.
X_test_sparse_start_month_hour_morning = hstack(
    [X_test_sparse_start_month_start_hour, new_feat_test[['morning']].values]
).tocsr()

In [54]:
X_train_sparse_new.shape, X_train_sparse_start_month_hour_morning.shape

((253561, 48372), (253561, 48374))

In [55]:
# Hold-out ROC AUC with sites + year_month_scaled + start_hour + morning.
auc_train_start_month_hour_morning = get_auc_lr_valid(X_train_sparse_start_month_hour_morning, y_train)
# Gain from adding the morning flag on top of the start_hour variant; the printed
# label says "Точность" (accuracy), but the value is a ROC AUC difference.
print('Точность увеличилась на: ', auc_train_start_month_hour_morning - auc_train_start_month_start_hour)

Точность увеличилась на:  0.0037988984009784676


In [56]:
auc_train_start_month_hour_morning

0.9603222225442565

# Построение модели

In [57]:
X_test_sparse_start_month_hour_morning.shape

(82797, 48374)

In [58]:
# Fit the model with all engineered features on the whole training set (default C).
logit_96 = LogisticRegression(n_jobs=-1, random_state=17)
logit_96.fit(X_train_sparse_start_month_hour_morning, y_train)

LogisticRegression(n_jobs=-1, random_state=17)

In [59]:
# Score every test session with column 1 of predict_proba.
test_pred_96 = logit_96.predict_proba(X_test_sparse_start_month_hour_morning)[:,1]

In [60]:
test_pred_96.shape

(82797,)

# Accuracy

In [61]:
# Score of the fitted model on the same data it was trained on (not a hold-out estimate).
print(f'Accuracy - : {logit_96.score(X_train_sparse_start_month_hour_morning, y_train):.3f}')

Accuracy - : 0.992


In [62]:
#write_to_submission_file(test_pred_96, 'alice_submit_96.csv')

In [63]:
# base_score = get_auc_lr_valid(X_train_sparse_start_month_hour_morning, y_train)
# base_score

# 4. Подбор коэффициента регуляризации

## Посчитайте качество на отложенной выборке с коэффициентом регуляризации, который по умолчанию C=1:

In [64]:
# %%time
# C=1
# logit_4= LogisticRegression(C=C, n_jobs=-1, random_state=17)
# logit_4.fit(X_train_sparse, y_train)

In [65]:
#get_auc_lr_valid(X_train_sparse, y_train)

In [66]:
#test_4_pred = logit.predict_proba(X_test_sparse)[:,1]

In [67]:
#test_4_pred.shape

## Найдите C из np.logspace(-3, 1, 10), при котором ROC AUC на отложенной выборке максимален.

In [68]:
def get_auc_lr_valid_c(X, y, C, ratio = 0.9, seed=17):
    '''
    Hold-out ROC AUC of a logistic regression for an explicitly given C.

    Same procedure as get_auc_lr_valid, with C as a required argument; the
    work is delegated to that function so the validation logic lives in one place.

    X, y – sample (features and labels), split by row position, no shuffling
    ratio – fraction of rows, counted from the start, used for training
    C, seed – regularization coefficient and random_state
              of the logistic regression
    '''
    return get_auc_lr_valid(X, y, C=C, ratio=ratio, seed=seed)

In [69]:
np.logspace(-3,1,10)

array([1.00000000e-03, 2.78255940e-03, 7.74263683e-03, 2.15443469e-02,
       5.99484250e-02, 1.66810054e-01, 4.64158883e-01, 1.29154967e+00,
       3.59381366e+00, 1.00000000e+01])

In [70]:
%%time
# Scan the candidate C values and keep the one with the highest hold-out ROC AUC.
# NOTE(review): the name `max` shadows the builtin for the rest of the notebook;
# it is kept because the next cell displays `max, max_c`.
max, max_c = 0, 0
for i in np.logspace(-3, 1, 10):
    mb = get_auc_lr_valid_c(X_train_sparse_start_month_hour_morning, y_train, i)
    if mb > max:
        max, max_c = mb, i

Wall time: 1min


In [71]:
max, max_c

(0.9616045734111529, 0.1668100537200059)

## Наконец, обучите модель с найденным оптимальным значением коэффициента регуляризации и с построенными признаками start_hour, start_month и morning. Если вы все сделали правильно и загрузите это решение, то повторите второй бенчмарк соревнования.

In [72]:
# logit_5 = LogisticRegression(C=max_c, random_state=17, n_jobs=-1)
# logit_5.fit(X_train_sparse_start_month_hour_morning, y_train)
# test_predict_5_new_fet = logit_5.predict_proba(X_test_sparse_start_month_hour_morning)[:,1]

In [73]:
#test_predict_5_new_fet.shape

In [74]:
#write_to_submission_file(test_predict_5_new_fet, 'alice_submit_finall.csv')

# Hyperparameter optimization

In [75]:
#logModel = LogisticRegression()

In [76]:
# param_grid = [    
#     {
#     'C' : np.logspace(-3,1,10)
#     }
# ]

In [77]:
# param_grid = [    
#     {'penalty' : ['l1', 'l2'],
#     'C' : np.logspace(-3,1,10),
#     'solver' : ['newton-cg', 'sag'],
#     'max_iter' : [100, 1000]
#     }
# ]

In [78]:
#clf = GridSearchCV(logModel, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

In [79]:
#best_clf = clf.fit(X_train_sparse_start_month_hour_morning, y_train)

In [80]:
#best_clf.best_estimator_

In [81]:
#print (f'Accuracy - : {best_clf.score(X_train_sparse_start_month_hour_morning, y_train):.3f}')

In [82]:
#test_grid_pred = best_clf.best_estimator_.predict_proba(X_test_sparse_start_month_hour_morning)[:,1]

In [83]:
#test_grid_pred.shape

In [84]:
#write_to_submission_file(test_grid_pred, 'alice_grid_pred.csv')

In [85]:
full_sites_sparse.shape, y_train.shape

((336358, 48371), (253561,))

In [86]:
# Sites-only model fitted with the C selected by the search above; max_c is used
# directly instead of a pasted copy of its value (0.1668100537200059).
# NOTE(review): max_c was tuned on the matrix with the engineered features, while
# this model is trained on the sites-only matrix — confirm this is intended.
logit_wtf = LogisticRegression(C=max_c, random_state=17, n_jobs=-1)
logit_wtf.fit(X_train_sparse, y_train)
test_predict_6 = logit_wtf.predict_proba(X_test_sparse)[:,1]

In [87]:
# Score of the fitted model on the same data it was trained on (not a hold-out estimate).
print(f'Accuracy - : {logit_wtf.score(X_train_sparse, y_train):.3f}')

Accuracy - : 0.992


In [88]:
test_predict_6.shape

(82797,)

In [89]:
# Save the test predictions of the sites-only model (logit_wtf) as a submission CSV.
write_to_submission_file(test_predict_6, 'alice_submit_wtf.csv')