In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler


%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
train_df=pd.read_csv(r'C:\\Users\\01\\Desktop\\GENERAL_ACCES\\train_sessions.csv', index_col="session_id")
test_df=pd.read_csv(r'C:\\Users\\01\\Desktop\\GENERAL_ACCES\\test_sessions.csv', index_col="session_id")

In [3]:
times = ["time%s" % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# отсортируем данные по времени
train_df = train_df.sort_values(by="time1")

# посмотрим на заголовок обучающей выборки
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [4]:
# приведем колонки site1, ..., site10 к целочисленному формату и заменим пропуски нулями
sites = ["site%s" % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype("int")
test_df[sites] = test_df[sites].fillna(0).astype("int")

# загрузим словарик сайтов
with open(r'C:\\Users\\01\\Desktop\\GENERAL_ACCES\\site_dic.pkl', 'rb') as input_file:
    site_dict = pickle.load(input_file)

# датафрейм словарика сайтов
sites_dict_df = pd.DataFrame(
    list(site_dict.keys()), index=list(site_dict.values()), columns=["site"]
)
print(u"всего сайтов:", sites_dict_df.shape[0])
sites_dict_df.head()

всего сайтов: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [5]:
# наша целевая переменная
y_train = train_df["target"]

# объединенная таблица исходных данных
full_df = pd.concat([train_df.drop("target", axis=1), test_df])

# индекс, по которому будем отделять обучающую выборку от тестовой
idx_split = train_df.shape[0]

In [6]:
full_sites = full_df[sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [7]:
from scipy.sparse import csr_matrix

In [8]:
# последовательность с индексами
sites_flatten = full_sites.values.flatten()

# искомая матрица
full_sites_sparse = csr_matrix(
    (
        [1] * sites_flatten.shape[0],
        sites_flatten,
        range(0, sites_flatten.shape[0] + 10, 10),
    )
)[:, 1:]

In [9]:
full_sites_sparse

<336358x48371 sparse matrix of type '<class 'numpy.intc'>'
	with 3195430 stored elements in Compressed Sparse Row format>

In [10]:
X_train_sparse = full_sites_sparse[:idx_split]
X_test_sparse = full_sites_sparse[idx_split:]

In [11]:
X_train_sparse

<253561x48371 sparse matrix of type '<class 'numpy.intc'>'
	with 2412880 stored elements in Compressed Sparse Row format>

In [12]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    x_len = int(X.shape[0]*ratio)
    X_train = X[:x_len, :]
    X_valid = X[x_len:, :]
    y_train = y[:x_len]
    y_valid = y[x_len:]
    logit = LogisticRegression(C=C, random_state=seed, n_jobs=-1)
    logit.fit(X_train, y_train)
    
    valid_pred = logit.predict_proba(X_valid)[:,1]
    return roc_auc_score(y_valid, valid_pred)

In [13]:
%%time
get_auc_lr_valid(X_train_sparse, y_train)

Wall time: 4.72 s


0.9197951046350002

In [14]:
def write_to_submission_file(
    predicted_labels, out_file, target="target", index_label="session_id"
):
    predicted_df = pd.DataFrame(
        predicted_labels,
        index=np.arange(1, predicted_labels.shape[0] + 1),
        columns=[target],
    )
    predicted_df.to_csv(out_file, index_label=index_label)

In [15]:
%%time 
logit = LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(X_train_sparse, y_train)

Wall time: 4.31 s


LogisticRegression(n_jobs=-1, random_state=17)

In [17]:
prediction = logit.predict_proba(X_test_sparse)

In [20]:
write_to_submission_file(
    prediction[:,1], r'C:\\Users\\01\\Desktop\\GENERAL_ACCES\\submission.csv'
)

In [22]:
time = ['time%d' % i for i in range(1,11)]
train_df[time].head()

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,2013-01-12 08:05:57,2013-01-12 08:05:57,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
54843,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,NaT,NaT,NaT,NaT,NaT,NaT
77292,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114021,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146670,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22


In [49]:
train_df['time1'].apply(lambda ts: 100*ts.year+ts.month).head()

session_id
21669     201301
54843     201301
77292     201301
114021    201301
146670    201301
Name: time1, dtype: int64

In [50]:
new_feat_train = pd.DataFrame(index=train_df.index)
new_feat_test = pd.DataFrame(index=test_df.index)


In [60]:
new_feat_train['year_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year+ts.month)

In [61]:
new_feat_test['year_month'] = test_df['time1'].apply(lambda ts: ts.year*100+ts.month)

In [59]:
new_feat_train.head()

Unnamed: 0_level_0,year_month
session_id,Unnamed: 1_level_1
21669,201301.0
54843,201301.0
77292,201301.0
114021,201301.0
146670,201301.0


In [62]:
scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1,1))
new_feat_train['year_month_scaled'] = scaler.transform(new_feat_train['year_month'].values.reshape(-1,1))
new_feat_test['year_month_scaled'] = scaler.transform(new_feat_test['year_month'].values.reshape(-1,1))

In [97]:
new_feat_test

Unnamed: 0_level_0,year_month,year_month_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,201410,0.822948
2,201407,0.752287
3,201412,0.870055
4,201411,0.846501
5,201405,0.705179
...,...,...
82793,201410,0.822948
82794,201405,0.705179
82795,201405,0.705179
82796,201405,0.705179


In [98]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, new_feat_train['year_month_scaled'].values.reshape(-1,1)]))


In [99]:
X_train_sparse_new.shape

(253561, 48372)

In [100]:
%%time
get_auc_lr_valid(X_train_sparse_new, y_train)

Wall time: 3.59 s


0.9198902054055882

In [114]:
new_feat_train['hour'] = train_df['time1'].apply(lambda ts: ts.hour)
new_feat_test['hour'] = test_df['time1'].apply(lambda ts: ts.hour)

new_feat_train['morning'] = (new_feat_train['hour']<11).astype(int)
new_feat_test['morning'] = (new_feat_test['hour']<11).astype(int)

new_feat_train['month'] = train_df['time1'].apply(lambda ts: ts.month)
new_feat_test['month'] = test_df['time1'].apply(lambda ts: ts.month)

In [115]:
new_feat_train

Unnamed: 0_level_0,year_month,year_month_scaled,hour,morning,month
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21669,201301,-1.744405,8,1,1
54843,201301,-1.744405,8,1,1
77292,201301,-1.744405,8,1,1
114021,201301,-1.744405,8,1,1
146670,201301,-1.744405,8,1,1
...,...,...,...,...,...
12224,201404,0.681626,23,0,4
164438,201404,0.681626,23,0,4
12221,201404,0.681626,23,0,4
156968,201404,0.681626,23,0,4


In [120]:
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, new_feat_train['hour'].values.reshape(-1,1),\
                                        new_feat_train['morning'].values.reshape(-1,1),\
                                       new_feat_train['month'].values.reshape(-1,1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, new_feat_test['hour'].values.reshape(-1,1),\
                                        new_feat_test['morning'].values.reshape(-1,1),\
                                       new_feat_test['month'].values.reshape(-1,1)]))

In [121]:
X_train_sparse_new

<253561x48374 sparse matrix of type '<class 'numpy.int64'>'
	with 2027925 stored elements in Compressed Sparse Row format>

In [122]:
%%time
get_auc_lr_valid(X_train_sparse_new, y_train)

Wall time: 4.71 s


0.9549766535155886

In [138]:
%%time 
logit = LogisticRegression(n_jobs=-1, random_state=17, C=3.593813663804626)
logit.fit(X_train_sparse_new, y_train)

Wall time: 4.33 s


LogisticRegression(C=3.593813663804626, n_jobs=-1, random_state=17)

In [139]:
prediction = logit.predict_proba(X_test_sparse_new)

In [140]:
write_to_submission_file(
    prediction[:,1], r'C:\\Users\\01\\Desktop\\GENERAL_ACCES\\submission5.csv'
)

In [132]:
%%time
roc = {}
for i in np.logspace(-3,1,10):
    roc[str(i)] = get_auc_lr_valid(X_train_sparse_new, y_train, C=i)
    

Wall time: 35.1 s


In [133]:
max(roc)

'3.593813663804626'

In [134]:
roc

{'0.001': 0.935651422164095,
 '0.0027825594022071257': 0.9436975511702528,
 '0.007742636826811269': 0.9517743237580443,
 '0.021544346900318832': 0.9580535408281194,
 '0.05994842503189409': 0.9601374553328286,
 '0.1668100537200059': 0.958833970961357,
 '0.46415888336127775': 0.9551177951354295,
 '1.2915496650148828': 0.9510006865369915,
 '3.593813663804626': 0.9521135165064748,
 '10.0': 0.9562585515216728}