In [134]:
import pickle
import pandas as pd
import os
from scipy.sparse import csr_matrix, hstack
import numpy as np
from datetime import datetime
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import roc_auc_score,make_scorer
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.svm import LinearSVC

In [135]:
PATH_TO_DATA = ('/Users/Roman/Documents/Machine_Learning_and_Data_Analysis.MPTI' +
                '/6_course/User_Identification/Week_1/capstone_user_identification/kaggle')

In [136]:
#обучающая и тестовая выборка
train_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
                       index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
                      index_col='session_id')

In [137]:
#объединение выборок
train_test_df = pd.concat([train_df, test_df])

In [138]:
train_test_df_sites = train_test_df[['site%d' % i for i in range(1, 11)]].fillna(0).astype('int')

In [139]:
#функция для слияния списка списков
def listmerge(lstlst):
    whole_lst=[]
    for lst in lstlst:
        whole_lst.extend(lst)
    return whole_lst

#функция, подготавлювающая параметры для создания разреженной матрицы
def param_for_csr(X,session_length=10):
    indices = np.array(listmerge(X))
    indptr = [i*session_length for i in range(int(indices.size/session_length+1))]
    data = np.array([1]*indices.size)
    return data,indices,indptr

In [140]:
#создания разреженной матрицы частот сайтов
train_test_sparse = csr_matrix(param_for_csr(train_test_df_sites.values,session_length=10))[:,1:]

In [141]:
#функция для гененрации признаков
def more_features(frame):
    sites = frame[['site%d' % i for i in range(1, 11)]].fillna(0).astype('int').values
    times_str = frame[['time%d' % i for i in range(1, 11)]].fillna(0).values
    
    times_format = []
    for session in times_str:
        times_format.append([datetime.strptime(t,"%Y-%m-%d %H:%M:%S") for t in session if t!=0])
     
    session_timespan = []
    start_hour = []
    day_of_week = []
    max_delta = []
    num_of_small_time = []
    for session in times_format:
        deltas_list = [(session[i+1]-session[i]).seconds for i in range(len(session)-1)]
        
        session_timespan.append(sum(deltas_list))
        start_hour.append(session[0].hour)
        day_of_week.append(session[0].weekday())
        if len(deltas_list)!= 0:
            max_delta.append(max(deltas_list))
        else: 
            max_delta.append(0)
            
        num_of_small_time.append(sum([1  for d in deltas_list if (d<20 and d>5)]))
    
    unique_sites = [] 
    top_site_shares = [] 

    
    for session in sites:
        unique_sites.append(np.unique([session[i]  for i in range(len(session)) if session[i]!=0]).size)   
        top_site_shares.append(sum([1  for top in (range(1,31)) if list(session).count(top)!=0])/np.unique([session[i]  for i in range(len(session)) if session[i]!=0]).size)
       
    return np.hstack(
        [np.array(session_timespan)[:,np.newaxis],np.array(start_hour)[:,np.newaxis],
        np.array(day_of_week)[:,np.newaxis], np.array(unique_sites)[:,np.newaxis],
        np.array(top_site_shares)[:,np.newaxis], np.array(max_delta)[:,np.newaxis],
        np.array(num_of_small_time)[:,np.newaxis]]
        
    )

In [142]:
features = more_features(train_test_df)

In [143]:
#функция для нормирования признаков
def norm(X):
    m = np.mean(X)
    st = np.std(X)
    return [(x-m)/st for x in X]

In [144]:
features_norm = pd.DataFrame(features).apply(norm)

In [145]:
#добавление признаков
train_test_sparse_extended = hstack([train_test_sparse,features_norm.values]).tocsr()

In [146]:
X_train_sparse_extended  = train_test_sparse[:len(train_df)]
X_test_sparse_extended  = train_test_sparse[len(train_df):]
y = train_df['target'].values

In [147]:
#создание обучающей и валидационной выборки
train_share = int(.7 * X_train_sparse_extended.shape[0])
X_train, y_train = X_train_sparse_extended[:train_share, :], y[:train_share]
X_valid, y_valid  = X_train_sparse_extended[train_share:, :], y[train_share:]

In [148]:
#логит с l2
logit = LogisticRegression(random_state=17,n_jobs=-1,penalty='l2')
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [149]:
logit_proba = logit.predict_proba(X_valid)

In [150]:
ROC_AUC_valid = roc_auc_score(y_valid, logit_proba[:,1])
ROC_AUC_valid

0.95831249715783184

In [157]:
C_values = np.linspace(1,2,20)

In [152]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)

In [153]:
def roc_auc_score_proba(y_true, proba):
    return roc_auc_score(y_true, proba[:, 1])

auc = make_scorer(roc_auc_score_proba, needs_proba=True)

In [154]:
#подбор l2
logitCV = LogisticRegressionCV(Cs = C_values,random_state=17,n_jobs=-1,penalty='l2',scoring=auc ,cv=skf)
logitCV.fit(X_train, y_train)

LogisticRegressionCV(Cs=array([ 1.     ,  1.05263,  1.10526,  1.15789,  1.21053,  1.26316,
        1.31579,  1.36842,  1.42105,  1.47368,  1.52632,  1.57895,
        1.63158,  1.68421,  1.73684,  1.78947,  1.84211,  1.89474,
        1.94737,  2.     ]),
           class_weight=None,
           cv=StratifiedKFold(n_splits=3, random_state=17, shuffle=True),
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=-1, penalty='l2',
           random_state=17, refit=True,
           scoring=make_scorer(roc_auc_score_proba, needs_proba=True),
           solver='lbfgs', tol=0.0001, verbose=0)

In [164]:
logitCV_proba = logitCV.predict_proba(X_valid)

In [165]:
#результат на валидационной выборки 
ROC_AUC_valid = roc_auc_score(y_valid, logitCV_proba[:,1])
ROC_AUC_valid

0.95848061163994092

In [168]:
#обучение на всей обучающей выборки с оптимальным коэффициентом С
logit1 = LogisticRegression(C = 1.21,random_state=17,n_jobs=-1,penalty='l2')
logit1.fit(X_train_sparse_extended, y)

LogisticRegression(C=1.21, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [80]:
#подбор l1
logitCV2 = LogisticRegressionCV(Cs = C_values,random_state=17,n_jobs=-1,penalty='l1',
                                scoring=auc ,cv=skf,solver='liblinear')
logitCV2.fit(X_train, y_train)

LogisticRegressionCV(Cs=array([ 1.     ,  1.05263,  1.10526,  1.15789,  1.21053,  1.26316,
        1.31579,  1.36842,  1.42105,  1.47368,  1.52632,  1.57895,
        1.63158,  1.68421,  1.73684,  1.78947,  1.84211,  1.89474,
        1.94737,  2.     ]),
           class_weight=None,
           cv=StratifiedKFold(n_splits=3, random_state=17, shuffle=True),
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=-1, penalty='l1',
           random_state=17, refit=True,
           scoring=make_scorer(roc_auc_score_proba, needs_proba=True),
           solver='liblinear', tol=0.0001, verbose=0)

In [158]:
logitCV2_proba = logitCV2.predict_proba(X_valid)

In [159]:
#результат на валидационной выборки 
ROC_AUC_valid = roc_auc_score(y_valid, logitCV2_proba[:,1])
ROC_AUC_valid

0.95718107461522317

In [89]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [170]:
#обучение на всей обучающей выборки с оптимальным коэффициентом С
logit2 = LogisticRegression(C = 1.52631579, random_state=17,n_jobs=-1,penalty='l1')
logit2.fit(X_train_sparse_extended, y)

LogisticRegression(C=1.52631579, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=17,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [173]:
logit_test_proba1 = logit1.predict_proba(X_test_sparse_extended)[:,1]
logit_test_proba2 = logit2.predict_proba(X_test_sparse_extended)[:,1]

In [174]:
write_to_submission_file(logit_test_proba1, os.path.join(PATH_TO_DATA,'predictions1.csv'))
write_to_submission_file(logit_test_proba2, os.path.join(PATH_TO_DATA,'predictions2.csv'))