In [24]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [40]:
def mean_roc_auc(y,x,estimator):
    rocs = []
    for i in range(150):
        y_binary = [1 if user_id==i+1 else 0 for user_id in y]
        pred_for_y_binary = estimator.predict_proba(x)[:,i]
        rocs.append(roc_auc_score(y_binary,pred_for_y_binary))
    return sum(rocs)/150

In [3]:
PATH_TO_DATA = ('/Users/Roman/Documents/Machine_Learning_and_Data_Analysis.MPTI' +
                '/6_course/User_Identification/Week_1/capstone_user_identification')

In [7]:
#загрука матриц частот сайтов ("мешка слов") для выборки из 150 пользователей
with open(os.path.join(PATH_TO_DATA, 'X_sparse_150users.pkl'), 'rb') as X_sparse_150users_pkl:
    X_sparse_150users = pickle.load(X_sparse_150users_pkl)
with open(os.path.join(PATH_TO_DATA, 'y_150users.pkl'), 'rb') as y_150users_pkl:
    y_150users = pickle.load(y_150users_pkl)

In [198]:
#логит на "мешке слов"

X_train, X_valid, y_train, y_valid = train_test_split(X_sparse_150users, y_150users, 
                                                      test_size=0.3, 
                                                      random_state=17, stratify=y_150users)

logit = LogisticRegression(C=1,multi_class='ovr',n_jobs=-1,random_state=17)
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [201]:
mean_roc_auc_base = mean_roc_auc(y_valid,X_valid,logit)
mean_roc_auc_base

0.95495671895934875

In [205]:
#логит на TF-IDF

vectorizer = TfidfTransformer(use_idf=True,smooth_idf=False)
vectorizer.fit(X_sparse_150users)

X_sparse_tf_idf = vectorizer.transform(X_sparse_150users)

X_train, X_valid, y_train, y_valid = train_test_split(X_sparse_tf_idf, y_150users, 
                                                      test_size=0.3, 
                                                      random_state=17, stratify=y_150users)

logit_tf_idf = LogisticRegression(C=1,multi_class='ovr',n_jobs=-1,random_state=17)
logit_tf_idf.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [206]:
mean_roc_auc_tf_idf = mean_roc_auc(y_valid,X_valid,logit_tf_idf)
mean_roc_auc_tf_idf

0.94594848990675073

In [6]:
#загрузка основных признаков
with open(os.path.join(PATH_TO_DATA,'new_features_150users.pkl'), 'rb') as f:
    main_features = pickle.load(f)

main_features.head()

Unnamed: 0,session_timespan,#unique_sites,start_hour,day_of_week
0,57,5,8,4
1,1618,8,8,4
2,2010,5,8,4
3,2006,4,9,4
4,10,5,9,4


In [7]:
#нормализация признаков
def norm(X):
    m = np.mean(X)
    st = np.std(X)
    return [(x-m)/st for x in X]

main_features_norm = main_features.apply(norm)
main_features_norm.head()

Unnamed: 0,session_timespan,#unique_sites,start_hour,day_of_week
0,-0.164929,-0.580513,-1.471501,1.358234
1,-0.020534,0.745713,-1.471501,1.358234
2,0.015727,-0.580513,-1.471501,1.358234
3,0.015357,-1.022588,-1.167433,1.358234
4,-0.169276,-0.580513,-1.167433,1.358234


In [223]:
#логит на матрице частот слов и основных признаках

X_sparse_150user_extended = hstack([X_sparse_150users,main_features_norm]).tocsr()
X_train, X_valid, y_train, y_valid = train_test_split(X_sparse_150user_extended, y_150users, 
                                                      test_size=0.3, 
                                                      random_state=17, stratify=y_150users)

logit_extended = LogisticRegression(C=1,multi_class='ovr',n_jobs=-1,random_state=17)
logit_extended.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [224]:
mean_roc_auc_extended = mean_roc_auc(y_valid,X_valid,logit_extended)
mean_roc_auc_extended

0.95983936496093136

In [8]:
#загрузка дополнительных признаков
with open(os.path.join(PATH_TO_DATA,'selected_features_150users.pkl'), 'rb') as f:
    additional_features = pickle.load(f)

additional_features.head()

Unnamed: 0,top_site_share,max_diff,top_site_time_share,num_of_small_time
0,0.6,49,1.0,0
1,0.25,1504,0.0,1
2,0.4,1917,0.048841,3
3,0.25,1460,0.004978,1
4,0.2,4,0.8,0


In [9]:
additional_features_norm = additional_features.apply(norm)
additional_features_norm.head()

Unnamed: 0,top_site_share,max_diff,top_site_time_share,num_of_small_time
0,0.777572,-0.170535,1.591245,-0.695077
1,-0.432798,-0.03367,-0.964353,0.067126
2,0.085932,0.005179,-0.839536,1.591531
3,-0.432798,-0.037809,-0.951632,0.067126
4,-0.605708,-0.174768,1.080126,-0.695077


In [236]:
#логит на матрице частот слов, основных признаках и дополнительных
X_sparse_150user_extended2 = hstack([X_sparse_150user_extended,additional_features_norm]).tocsr()
X_train, X_valid, y_train, y_valid = train_test_split(X_sparse_150user_extended2, y_150users, 
                                                      test_size=0.3, 
                                                      random_state=17, stratify=y_150users)

logit_extended2 = LogisticRegression(C=1,multi_class='ovr',n_jobs=-1,random_state=17)
logit_extended2.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [238]:
mean_roc_auc_extended2 = mean_roc_auc(y_valid,X_valid,logit_extended2)
mean_roc_auc_extended2

0.95984048013781431

In [28]:
#загрука матриц частот сайтов ("мешка слов") для ширины окна 5 и длины сессии 15
with open(os.path.join(PATH_TO_DATA, 'X_sparse_150users_s15_w5.pkl'), 'rb') as X_sparse_150users_pkl:
    X_sparse_150users_s15_w5 = pickle.load(X_sparse_150users_pkl)
with open(os.path.join(PATH_TO_DATA, 'y_150users_s15_w5.pkl'), 'rb') as y_150users_pkl:
    y_150users_s15_w5 = pickle.load(y_150users_pkl)

In [31]:
#загрузка основных признаков для ширины окна 5 и длины сессии 15
with open(os.path.join(PATH_TO_DATA,'new_features_150users_s15_w5.pkl'), 'rb') as f:
    main_features_s15_w5 = pickle.load(f)

main_features_s15_w5.head()

Unnamed: 0,session_timespan,#unique_sites,start_hour,day_of_week
0,84,7,8,4
1,1668,10,8,4
2,1637,10,8,4
3,3593,7,8,4
4,2029,7,8,4


In [32]:
#загрузка дополнительных признаков для ширины окна 5 и длины сессии 15
with open(os.path.join(PATH_TO_DATA,'selected_features_150users_s15_w5.pkl'), 'rb') as f:
    additional_features_s15_w5 = pickle.load(f)

additional_features_s15_w5.head()

Unnamed: 0,top_site_share,max_diff,top_site_time_share,num_of_small_time
0,0.571429,49,0.606383,1
1,0.4,1504,0.02994,1
2,0.3,1504,0.013309,3
3,0.285714,1917,0.027424,3
4,0.285714,1917,0.046577,3


In [33]:
#нормализация признаков
main_features_s15_w5_norm = main_features_s15_w5.apply(norm)
additional_features_s15_w5_norm = additional_features_s15_w5.apply(norm)

In [34]:
#добавление основных признаков
X_sparse_150users15_w5_extended = hstack([X_sparse_150users_s15_w5,main_features_s15_w5_norm]).tocsr()

In [35]:
#добавление дополнительных признаков
X_sparse_150users15_w5_extended2 = hstack([X_sparse_150users15_w5_extended,additional_features_s15_w5_norm]).tocsr()

In [38]:
#логит на матрице частот слов, основных признаках и дополнительных, для ширины окна 5 и длины сессии 15
X_train, X_valid, y_train, y_valid = train_test_split(X_sparse_150users15_w5_extended2, y_150users_s15_w5, 
                                                      test_size=0.3, 
                                                      random_state=17, stratify=y_150users_s15_w5)

logit_extended_s15_w5= LogisticRegression(C=1,multi_class='ovr',n_jobs=-1,random_state=17)
logit_extended_s15_w5.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=17, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
mean_roc_auc_extended_s15_w5 = mean_roc_auc(y_valid,X_valid,logit_extended_s15_w5)
mean_roc_auc_extended_s15_w5

0.98884204305798373