In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm_notebook

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [11]:
train_df = pd.read_csv("train_sessions.csv", index_col="session_id")
test_df = pd.read_csv("test_sessions.csv", index_col="session_id")

In [19]:
train_df["site3"].nunique()

15759

In [20]:
# приведем колонки time1, ..., time10 к временному формату
times = ["time%s" % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)


In [24]:
train_df.shape

(253561, 21)

In [26]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 1 to 253561
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   site1   253561 non-null  int64         
 1   time1   253561 non-null  datetime64[ns]
 2   site2   250098 non-null  float64       
 3   time2   250098 non-null  datetime64[ns]
 4   site3   246919 non-null  float64       
 5   time3   246919 non-null  datetime64[ns]
 6   site4   244321 non-null  float64       
 7   time4   244321 non-null  datetime64[ns]
 8   site5   241829 non-null  float64       
 9   time5   241829 non-null  datetime64[ns]
 10  site6   239495 non-null  float64       
 11  time6   239495 non-null  datetime64[ns]
 12  site7   237297 non-null  float64       
 13  time7   237297 non-null  datetime64[ns]
 14  site8   235224 non-null  float64       
 15  time8   235224 non-null  datetime64[ns]
 16  site9   233084 non-null  float64       
 17  time9   233084 non-null  date

In [27]:
# отсортируем данные по времени
train_df = train_df.sort_values(by="time1")

In [29]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [30]:
# приведем колонки site1, ..., site10 к целочисленному формату и заменим пропуски нулями
sites = ["site%s" % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype("int")
test_df[sites] = test_df[sites].fillna(0).astype("int")

In [31]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [33]:
# загрузим словарик сайтов
with open(r"site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

In [39]:
# датафрейм словарика сайтов
sites_dict_df = pd.DataFrame(
    list(site_dict.keys()), index=list(site_dict.values()), columns=["site"]
)
sites_dict_df.sort_index(inplace=True)
print(u"всего сайтов:", sites_dict_df.shape[0])
sites_dict_df.head()

всего сайтов: 48371


Unnamed: 0,site
1,fpdownload2.macromedia.com
2,hotmail.fr
3,login.live.com
4,mail.live.com
5,dub122.mail.live.com


In [40]:
# наша целевая переменная
y_train = train_df["target"]

In [68]:
# объединенная таблица исходных данных
full_df = pd.concat([train_df.drop("target", axis=1), test_df])
full_df.shape

(336358, 20)

In [42]:
# индекс, по которому будем отделять обучающую выборку от тестовой
idx_split = train_df.shape[0]

In [43]:
idx_split

253561

In [44]:
# табличка с индексами посещенных сайтов в сессии
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [46]:
full_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0,NaT
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,945,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,946,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,952,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22


In [47]:
from scipy.sparse import csr_matrix

In [48]:
csr_matrix?

In [49]:
# последовательность с индексами
sites_flatten = full_sites.values.flatten()

In [50]:
sites_flatten

array([  56,   55,    0, ..., 1098, 1098, 1098])

In [55]:
sites_flatten.shape

(3363580,)

In [56]:
# искомая матрица
full_sites_sparse = csr_matrix(
    (
        [1] * sites_flatten.shape[0],
        sites_flatten,
        range(0, sites_flatten.shape[0] + 10, 10),
    )
)[:, 1:]

In [60]:
full_sites_sparse.shape

(336358, 48371)

In [169]:
X_train_sparse = full_sites_sparse[:idx_split]
X_test_sparse = full_sites_sparse[idx_split:]

In [67]:
X_train_sparse.shape

(253561, 48371)

In [90]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    """
    X, y – выборка
    ratio – в каком отношении поделить выборку
    C, seed – коэф-т регуляризации и random_state 
              логистической регрессии
    """
    train_len = int(ratio * X.shape[0])
    X_train = X[:train_len, :]
    X_holdout = X[train_len:, :]
    y_train1 = y[:train_len]
    y_holdout = y[train_len:]
    logit = LogisticRegression(C=C, n_jobs=-1,random_state=seed)
    logit.fit(X_train, y_train1)
    holdout_pred = logit.predict_proba(X_holdout)[:,1]
    return roc_auc_score(y_holdout, holdout_pred)

In [91]:
%%time
logit = LogisticRegression(n_jobs=-1,random_state=17)
logit.fit(X_train_sparse, y_train)

CPU times: user 24.9 ms, sys: 29.8 ms, total: 54.7 ms
Wall time: 2.98 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(n_jobs=-1, random_state=17)

In [92]:
logit.predict_proba(X_test_sparse[0:5,:])

array([[9.97780636e-01, 2.21936381e-03],
       [9.99999997e-01, 2.51890707e-09],
       [9.99999994e-01, 6.16001107e-09],
       [9.99999987e-01, 1.32266119e-08],
       [9.99972709e-01, 2.72908248e-05]])

In [93]:
logit.predict_proba(X_test_sparse[0:5,:])[:,1]

array([2.21936381e-03, 2.51890707e-09, 6.16001107e-09, 1.32266119e-08,
       2.72908248e-05])

In [94]:
%%time
get_auc_lr_valid(X_train_sparse, y_train)

CPU times: user 36 ms, sys: 18.8 ms, total: 54.9 ms
Wall time: 2.73 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9197952555886043

In [98]:
test_pred = logit.predict_proba(X_test_sparse)[:,1]

In [99]:
test_pred.shape

(82797,)

In [100]:
# функция для записи прогнозов в файл
def write_to_submission_file(
    predicted_labels, out_file, target="target", index_label="session_id"
):
    predicted_df = pd.DataFrame(
        predicted_labels,
        index=np.arange(1, predicted_labels.shape[0] + 1),
        columns=[target],
    )
    predicted_df.to_csv(out_file, index_label=index_label)

In [208]:
    pd.Series(test_pred, index=range(1, test_pred.shape[0]+1), name="target").to_csv(
        'benchmark1.csv', header=True, index_label="session_id")

In [209]:
!head benchmark1.csv

session_id,target
1,0.002219363806458602
2,2.5189070719913367e-09
3,6.160011074673938e-09
4,1.322661186415569e-08
5,2.7290824830065995e-05
6,0.0001511784746722188
7,0.0004423739851154876
8,0.00010124653100774808
9,0.0007771572100006697


In [111]:
times = ['time%d' % i for i in range(1,11)]

In [114]:
train_df[times].head()

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,2013-01-12 08:05:57,2013-01-12 08:05:57,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
54843,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,NaT,NaT,NaT,NaT,NaT,NaT
77292,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114021,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146670,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22


In [125]:
test_df["time1"].apply(lambda ts: 100 * ts.year + ts.month).head()

session_id
1    201410
2    201407
3    201412
4    201411
5    201405
Name: time1, dtype: int64

In [187]:
new_feat_train = pd.DataFrame(index=train_df.index)
new_feat_test = pd.DataFrame(index=test_df.index)

In [188]:
new_feat_train['year_month'] = train_df["time1"].apply(lambda ts: 100 * ts.year + ts.month)
new_feat_test['year_month'] = test_df["time1"].apply(lambda ts: 100 * ts.year + ts.month)

In [189]:
scaler = StandardScaler()
scaler_test = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1,1))
new_feat_train['year_month_scaled'] = scaler.transform(new_feat_train['year_month'].values.reshape(-1,1))
scaler_test.fit(new_feat_test['year_month'].values.reshape(-1,1))
new_feat_test['year_month_scaled'] = scaler_test.transform(new_feat_test['year_month'].values.reshape(-1,1))

In [190]:
new_feat_train.median(),new_feat_test['year_month_scaled'].shape

(year_month           201402.000000
 year_month_scaled         0.634518
 dtype: float64,
 (82797,))

In [241]:
new_feat_train[(new_feat_train["year_month_scaled"]<=5)&(new_feat_train["year_month_scaled"]>=0.9)]

Unnamed: 0_level_0,year_month,year_month_scaled,start_day,start_month,start_hour,month,morning,day,evening,night,start_hour_scaled,start_day_scaled,start_month_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


In [192]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, new_feat_train["year_month_scaled"].values.reshape(-1,1)]))


In [193]:
X_train_sparse_new.shape[0], X_train_sparse.shape[0], full_sites_sparse.shape[0], X_test_sparse.shape[0], new_feat_test.shape[0]

(253561, 253561, 336358, 82797, 82797)

In [194]:
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, new_feat_test["year_month_scaled"].values.reshape(-1,1)]))

In [195]:
X_test_sparse.shape

(82797, 48371)

In [196]:
%%time
get_auc_lr_valid(X_train_sparse_new, y_train)

CPU times: user 60.2 ms, sys: 126 ms, total: 187 ms
Wall time: 3.58 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9198902054055882

In [201]:
test_pred_1 = logit1.predict_proba(X_test_sparse_new)[:,1]

In [199]:
%%time
logit1 = LogisticRegression(n_jobs=-1,random_state=17)
logit1.fit(X_train_sparse_new, y_train)

CPU times: user 24 ms, sys: 22.6 ms, total: 46.6 ms
Wall time: 3 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(n_jobs=-1, random_state=17)

In [206]:
pd.Series(test_pred, index=range(1, test_pred_1.shape[0]+1), name="target").to_csv(
    'benchmark2.csv', header=True, index_label="session_id")

In [207]:
!head "benchmark2.csv"

session_id,target
1,0.002219363806458602
2,2.5189070719913367e-09
3,6.160011074673938e-09
4,1.322661186415569e-08
5,2.7290824830065995e-05
6,0.0001511784746722188
7,0.0004423739851154876
8,0.00010124653100774808
9,0.0007771572100006697


In [211]:
train_df["time1"].apply(lambda ts:  ts.day).nunique()

25

In [212]:
new_feat_train['start_day'] = train_df["time1"].apply(lambda ts:  ts.day)
new_feat_test['start_day'] = test_df["time1"].apply(lambda ts:  ts.day)

In [216]:
new_feat_train['month'] = train_df["time1"].apply(lambda ts:  ts.month)
new_feat_test['month'] = test_df["time1"].apply(lambda ts:  ts.month)

In [214]:
new_feat_train['start_hour'] = train_df["time1"].apply(lambda ts:  ts.hour)
new_feat_test['start_hour'] = test_df["time1"].apply(lambda ts:  ts.hour)

In [224]:
new_feat_train['morning'] = train_df['time1'].apply(lambda st: 1 if (st.hour <= 11)&(st.hour >5) else 0)
new_feat_test['morning'] = test_df['time1'].apply(lambda st: 1 if (st.hour <= 11)&(st.hour > 5) else 0) 

In [225]:
new_feat_train['day'] = train_df['time1'].apply(lambda st: 1 if (st.hour > 11)&(st.hour <=17 ) else 0)
new_feat_test['day'] = test_df['time1'].apply(lambda st: 1 if (st.hour > 11)&(st.hour <= 17) else 0) 

In [226]:
new_feat_train['evening'] = train_df['time1'].apply(lambda st: 1 if (st.hour <= 23)&(st.hour >17 ) else 0)
new_feat_test['evening'] = test_df['time1'].apply(lambda st: 1 if (st.hour <=23)&(st.hour > 17) else 0) 

In [227]:
new_feat_train['night'] = train_df['time1'].apply(lambda st: 1 if (st.hour <= 5 ) else 0)
new_feat_test['night'] = test_df['time1'].apply(lambda st: 1 if (st.hour <= 5) else 0) 

In [229]:
new_feat_train.tail()

Unnamed: 0_level_0,year_month,year_month_scaled,start_day,start_month,start_hour,month,morning,day,evening,night
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12224,201404,0.681626,30,4,23,4,0,0,1,0
164438,201404,0.681626,30,4,23,4,0,0,1,0
12221,201404,0.681626,30,4,23,4,0,0,1,0
156968,201404,0.681626,30,4,23,4,0,0,1,0
204762,201404,0.681626,30,4,23,4,0,0,1,0


In [232]:
scaler2 = StandardScaler()
scaler2.fit(new_feat_train['start_hour'].values.reshape(-1,1))
new_feat_train['start_hour_scaled'] = scaler2.transform(new_feat_train['start_hour'].values.reshape(-1,1))
new_feat_test['start_hour_scaled'] = scaler2.transform(new_feat_test['start_hour'].values.reshape(-1,1))

In [234]:
scaler3 = StandardScaler()
scaler3.fit(new_feat_train['start_day'].values.reshape(-1,1))
new_feat_train['start_day_scaled'] = scaler3.transform(new_feat_train['start_day'].values.reshape(-1,1))
new_feat_test['start_day_scaled'] = scaler3.transform(new_feat_test['start_day'].values.reshape(-1,1))

In [235]:
scaler4 = StandardScaler()
scaler4.fit(new_feat_train['start_month'].values.reshape(-1,1))
new_feat_train['start_month_scaled'] = scaler4.transform(new_feat_train['start_month'].values.reshape(-1,1))
new_feat_test['start_month_scaled'] = scaler4.transform(new_feat_test['start_month'].values.reshape(-1,1))

In [236]:
new_feat_train

Unnamed: 0_level_0,year_month,year_month_scaled,start_day,start_month,start_hour,month,morning,day,evening,night,start_hour_scaled,start_day_scaled,start_month_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
21669,201301,-1.744405,12,1,8,1,1,0,0,0,-1.357366,-0.857231,-0.943567
54843,201301,-1.744405,12,1,8,1,1,0,0,0,-1.357366,-0.857231,-0.943567
77292,201301,-1.744405,12,1,8,1,1,0,0,0,-1.357366,-0.857231,-0.943567
114021,201301,-1.744405,12,1,8,1,1,0,0,0,-1.357366,-0.857231,-0.943567
146670,201301,-1.744405,12,1,8,1,1,0,0,0,-1.357366,-0.857231,-0.943567
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12224,201404,0.681626,30,4,23,4,0,0,1,0,3.390349,1.502272,-0.137395
164438,201404,0.681626,30,4,23,4,0,0,1,0,3.390349,1.502272,-0.137395
12221,201404,0.681626,30,4,23,4,0,0,1,0,3.390349,1.502272,-0.137395
156968,201404,0.681626,30,4,23,4,0,0,1,0,3.390349,1.502272,-0.137395


In [238]:
new_feat_train[new_feat_train['start_month_scaled']>0]

Unnamed: 0_level_0,year_month,year_month_scaled,start_day,start_month,start_hour,month,morning,day,evening,night,start_hour_scaled,start_day_scaled,start_month_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
27876,201305,-1.650190,12,5,8,5,1,0,0,0,-1.357366,-0.857231,0.131329
113470,201305,-1.650190,12,5,8,5,1,0,0,0,-1.357366,-0.857231,0.131329
248880,201305,-1.650190,12,5,8,5,1,0,0,0,-1.357366,-0.857231,0.131329
168463,201305,-1.650190,12,5,8,5,1,0,0,0,-1.357366,-0.857231,0.131329
60553,201305,-1.650190,12,5,8,5,1,0,0,0,-1.357366,-0.857231,0.131329
...,...,...,...,...,...,...,...,...,...,...,...,...,...
137692,201312,-1.485314,28,12,21,12,0,0,1,0,2.757320,1.240105,2.012399
170829,201312,-1.485314,28,12,22,12,0,0,1,0,3.073835,1.240105,2.012399
197571,201312,-1.485314,28,12,22,12,0,0,1,0,3.073835,1.240105,2.012399
171583,201312,-1.485314,28,12,23,12,0,0,1,0,3.390349,1.240105,2.012399


In [242]:
X_train_sparse_new

<253561x48372 sparse matrix of type '<class 'numpy.float64'>'
	with 1683237 stored elements in Compressed Sparse Row format>

### start_month + start_hour

In [246]:
X_train_sparse_new_1 = csr_matrix(hstack([X_train_sparse, new_feat_train["start_month"].values.reshape(-1,1)]))
X_train_sparse_new_1 = csr_matrix(hstack([X_train_sparse_new_1, new_feat_test["start_hour"].values.reshape(-1,1)]))

In [247]:
X_test_sparse_new_1 = csr_matrix(hstack([X_test_sparse, new_feat_test["start_month"].values.reshape(-1,1)]))
X_test_sparse_new_1 = csr_matrix(hstack([X_test_sparse_new_1, new_feat_test["start_hour"].values.reshape(-1,1)]))

In [248]:
%%time
get_auc_lr_valid(X_train_sparse_new_1, y_train)

CPU times: user 67.5 ms, sys: 138 ms, total: 206 ms
Wall time: 3.94 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9561679793592079

In [260]:
%%time
logit_1 = LogisticRegression(n_jobs=-1,random_state=17)
logit_1.fit(X_train_sparse_new_1, y_train)
test_pred_1 = logit_1.predict_proba(X_test_sparse_new_1)[:,1]

CPU times: user 34.5 ms, sys: 40.5 ms, total: 75 ms
Wall time: 3.24 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [261]:
pd.Series(test_pred_1, index=range(1, test_pred_1.shape[0]+1), name="target").to_csv(
    'benchmark_md.csv', header=True, index_label="session_id")

In [262]:
!head 'benchmark_md.csv'

session_id,target
1,0.002045161499296301
2,7.404737622814331e-17
3,2.8041663159359286e-12
4,1.0740095641989812e-07
5,1.7739616767522177e-05
6,0.0001522755286591329
7,0.0008009626563439108
8,0.0005631713286019478
9,0.0003626054391721216


In [256]:
test_pred_1[0:6]

array([1.60067112e-03, 8.85470616e-09, 1.82445834e-09, 4.85686236e-09,
       5.41302732e-05, 7.75215810e-05])

### -||-  + full scale 

In [266]:
%%time
X_train_sparse_new_1_sc = csr_matrix(hstack([X_train_sparse, new_feat_train["start_month_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_1_sc = csr_matrix(hstack([X_train_sparse_new_1_sc, new_feat_train["start_hour_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_1_sc = csr_matrix(hstack([X_test_sparse, new_feat_test["start_month_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_1_sc = csr_matrix(hstack([X_test_sparse_new_1_sc, new_feat_test["start_hour_scaled"].values.reshape(-1,1)]))
logit_1_sc = LogisticRegression(n_jobs=-1,random_state=17)
logit_1_sc.fit(X_train_sparse_new_1_sc, y_train)
test_pred_1_sc = logit_1_sc.predict_proba(X_test_sparse_new_1_sc)[:,1]
pd.Series(test_pred_1_sc, index=range(1, test_pred_1_sc.shape[0]+1), name="target").to_csv(
    'benchmark_md_sc.csv', header=True, index_label="session_id")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CPU times: user 398 ms, sys: 198 ms, total: 596 ms
Wall time: 6.55 s


In [267]:
get_auc_lr_valid(X_train_sparse_new_1_sc, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9568427419695702

In [268]:
get_auc_lr_valid(X_train_sparse_new_1, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9561679793592079

### month + time of a day

In [270]:
%%time
X_train_sparse_new_2_sc = csr_matrix(hstack([X_train_sparse, new_feat_train["start_month"].values.reshape(-1,1)]))
X_train_sparse_new_2_sc = csr_matrix(hstack([X_train_sparse_new_2_sc, new_feat_train["morning"].values.reshape(-1,1)]))
X_train_sparse_new_2_sc = csr_matrix(hstack([X_train_sparse_new_2_sc, new_feat_train["day"].values.reshape(-1,1)]))
X_train_sparse_new_2_sc = csr_matrix(hstack([X_train_sparse_new_2_sc, new_feat_train["evening"].values.reshape(-1,1)]))
X_train_sparse_new_2_sc = csr_matrix(hstack([X_train_sparse_new_2_sc, new_feat_train["night"].values.reshape(-1,1)]))
X_test_sparse_new_2_sc = csr_matrix(hstack([X_test_sparse, new_feat_test["start_month"].values.reshape(-1,1)]))
X_test_sparse_new_2_sc = csr_matrix(hstack([X_test_sparse_new_2_sc, new_feat_test["morning"].values.reshape(-1,1)]))
X_test_sparse_new_2_sc = csr_matrix(hstack([X_test_sparse_new_2_sc, new_feat_test["day"].values.reshape(-1,1)]))
X_test_sparse_new_2_sc = csr_matrix(hstack([X_test_sparse_new_2_sc, new_feat_test["evening"].values.reshape(-1,1)]))
X_test_sparse_new_2_sc = csr_matrix(hstack([X_test_sparse_new_2_sc, new_feat_test["night"].values.reshape(-1,1)]))
logit_2_sc = LogisticRegression(n_jobs=-1,random_state=17)
logit_2_sc.fit(X_train_sparse_new_2_sc, y_train)
test_pred_2_sc = logit_2_sc.predict_proba(X_test_sparse_new_2_sc)[:,1]
pd.Series(test_pred_2_sc, index=range(1, test_pred_2_sc.shape[0]+1), name="target").to_csv(
    'benchmark_md_sc.csv', header=True, index_label="session_id")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CPU times: user 483 ms, sys: 171 ms, total: 654 ms
Wall time: 4.03 s


In [271]:
get_auc_lr_valid(X_train_sparse_new_2_sc, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9467687022458273

### month + time of a day + scale

In [303]:
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse, new_feat_train["start_month_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse_new_2, new_feat_train["morning"].values.reshape(-1,1)]))
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse_new_2, new_feat_train["day"].values.reshape(-1,1)]))
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse_new_2, new_feat_train["evening"].values.reshape(-1,1)]))
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse_new_2, new_feat_train["night"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse, new_feat_test["start_month_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse_new_2, new_feat_test["morning"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse_new_2, new_feat_test["day"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse_new_2, new_feat_test["evening"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse_new_2, new_feat_test["night"].values.reshape(-1,1)]))
logit_2 = LogisticRegression(n_jobs=-1,random_state=17, C=0.46415888336127775)
logit_2.fit(X_train_sparse_new_2, y_train)
test_pred_2 = logit_2.predict_proba(X_test_sparse_new_2)[:,1]
pd.Series(test_pred_2, index=range(1, test_pred_2.shape[0]+1), name="target").to_csv(
    'benchmark_md_sc.csv', header=True, index_label="session_id")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [304]:
get_auc_lr_valid(X_train_sparse_new_2, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9469832073172647

In [276]:
get_auc_lr_valid(X_train_sparse_new_2, y_train)-get_auc_lr_valid(X_train_sparse_new_2_sc, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.0002145050714373875

## Scale better than without scale

In [301]:
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse_new_2, new_feat_train["start_hour_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse_new_2, new_feat_test["start_hour_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse_new_2, new_feat_train["start_day_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse_new_2, new_feat_test["start_day_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_2 = csr_matrix(hstack([X_train_sparse_new_2, new_feat_train["year_month_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_2 = csr_matrix(hstack([X_test_sparse_new_2, new_feat_test["year_month_scaled"].values.reshape(-1,1)]))
logit_2 = LogisticRegression(n_jobs=-1,random_state=17, C=0.46415888336127775)
logit_2.fit(X_train_sparse_new_2, y_train)
test_pred_2 = logit_2.predict_proba(X_test_sparse_new_2)[:,1]
pd.Series(test_pred_2, index=range(1, test_pred_2.shape[0]+1), name="target").to_csv(
    'benchmark_top_sc.csv', header=True, index_label="session_id")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [302]:
get_auc_lr_valid(X_train_sparse_new_2, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9627897856338059

In [282]:
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse, new_feat_train["start_month_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["morning"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["day"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["evening"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["night"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse, new_feat_test["start_month_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["morning"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["day"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["evening"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["night"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["start_hour_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["start_hour_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["start_day_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["start_day_scaled"].values.reshape(-1,1)]))



In [283]:
logit_3 = LogisticRegression(n_jobs=-1,random_state=17)
logit_3.fit(X_train_sparse_new_3, y_train)
test_pred_3 = logit_3.predict_proba(X_test_sparse_new_3)[:,1]
pd.Series(test_pred_3, index=range(1, test_pred_3.shape[0]+1), name="target").to_csv(
    'benchmark_top1_sc.csv', header=True, index_label="session_id")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [284]:
get_auc_lr_valid(X_train_sparse_new_3, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9627173279038341

In [285]:
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse, new_feat_train["start_month_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["morning"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["day"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["evening"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["night"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse, new_feat_test["start_month_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["morning"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["day"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["evening"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["night"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["start_day_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["start_day_scaled"].values.reshape(-1,1)]))




In [295]:
logit_3 = LogisticRegression(n_jobs=-1,random_state=17)
logit_3.fit(X_train_sparse_new_3, y_train)
test_pred_3 = logit_3.predict_proba(X_test_sparse_new_3)[:,1]
pd.Series(test_pred_3, index=range(1, test_pred_3.shape[0]+1), name="target").to_csv(
    'benchmark_top1_sc.csv', header=True, index_label="session_id")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [294]:
get_auc_lr_valid(X_train_sparse_new_3, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9574430089763051

In [293]:
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse, new_feat_train["start_month_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse, new_feat_test["start_month_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["start_day_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["start_day_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["start_hour_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["start_hour_scaled"].values.reshape(-1,1)]))
X_train_sparse_new_3 = csr_matrix(hstack([X_train_sparse_new_3, new_feat_train["year_month_scaled"].values.reshape(-1,1)]))
X_test_sparse_new_3 = csr_matrix(hstack([X_test_sparse_new_3, new_feat_test["year_month_scaled"].values.reshape(-1,1)]))


