In [1]:
# Nativos
import random as rn
import os
import sys
import gc

#calculo
import numpy as np
import pandas as pd
import scipy as sp

#grafico
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline
sns.set(style="whitegrid")

#warning ignore future
import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
gc.collect()

BASE_DIR = os.path.dirname(os.getcwd())
if BASE_DIR not in sys.path: sys.path.append(BASE_DIR)

from utils import *
from graphs import *

SEED = 29082013
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
rn.seed(SEED)

subfolder = "data"
os.listdir(subfolder)

['CONTENT_CATEGORY.csv',
 '.ipynb_checkpoints',
 'device_data.csv',
 'sampleSubmission.csv',
 'SITE_ID.csv',
 'PAGE.csv',
 'CONTENT_CATEGORY_TOP.csv',
 'conversiones.csv',
 'CONTENT_CATEGORY_BOTTOM.csv',
 'pageviews.csv']

## LOAD PAGE VIEWS INFO

In [2]:
col_user = 'USER_ID'
pageview = pd.read_csv("{}/pageviews.csv".format(subfolder), parse_dates=["FEC_EVENT"]).sort_values([col_user, "FEC_EVENT"])
pageview.shape

(17936934, 8)

In [3]:
null_verificator(pageview)

Unnamed: 0,0
msje,DATA LIMPIA DE NULOS


In [4]:
pageview.dtypes

FEC_EVENT                  datetime64[ns]
PAGE                                int64
CONTENT_CATEGORY                    int64
CONTENT_CATEGORY_TOP                int64
CONTENT_CATEGORY_BOTTOM             int64
SITE_ID                             int64
ON_SITE_SEARCH_TERM                 int64
USER_ID                             int64
dtype: object

In [5]:
for col in pageview.columns:
    print(col, pageview[col].unique().shape[0])

FEC_EVENT 10496454
PAGE 1725
CONTENT_CATEGORY 65
CONTENT_CATEGORY_TOP 13
CONTENT_CATEGORY_BOTTOM 65
SITE_ID 4
ON_SITE_SEARCH_TERM 294
USER_ID 11676


In [6]:
cols_select = ['PAGE', 'CONTENT_CATEGORY', 'CONTENT_CATEGORY_TOP', 'SITE_ID', 'ON_SITE_SEARCH_TERM', 'weekday', 'schedule']

def get_schedule(val):
    if val < 6:
        return 0
    elif val < 9:
        return 1
    elif val < 13:
        return 2
    elif val < 16:
        return 3
    elif val < 19:
        return 4
    else:
        return 5
    
def add_schedule_weekday(data):
    data['weekday'] = data['FEC_EVENT'].dt.weekday
    data['hour'] = data['FEC_EVENT'].dt.hour
    data['schedule'] = data['hour'].apply(get_schedule) 

    for col in ['CONTENT_CATEGORY_BOTTOM', 'hour', 'month']:
        if col in data.columns:
            del data[col]
    
    return data

def get_format_data(data):
    X_data = []
    
    for col in cols_select:
        print("-----> ", col)

        temp = pd.crosstab(
            data.USER_ID, data[col]
        )
        temp.columns = [col + "_" + str(v) for v in temp.columns]
        X_data.append(
            temp.apply(lambda x: x / x.sum(), axis=1)
        )

    return pd.concat(X_data, axis=1)

def get_trimestre(val):
    if val <= 3:
        return 1
    elif val <= 6:
        return 2
    elif val <= 9:
        return 3
    elif val <= 12:
        return 4
    return 0

pageview['month'] = pageview['FEC_EVENT'].dt.month
pageview['trimestre'] = pageview['month'].apply(get_trimestre)
## del pageview['month']
pageview['trimestre'].unique()

array([1, 2, 3, 4])

In [7]:
pageview.columns[1:]

Index(['PAGE', 'CONTENT_CATEGORY', 'CONTENT_CATEGORY_TOP',
       'CONTENT_CATEGORY_BOTTOM', 'SITE_ID', 'ON_SITE_SEARCH_TERM', 'USER_ID',
       'month', 'trimestre'],
      dtype='object')

In [8]:
pageview[list(pageview.columns[1:])].corr()

Unnamed: 0,PAGE,CONTENT_CATEGORY,CONTENT_CATEGORY_TOP,CONTENT_CATEGORY_BOTTOM,SITE_ID,ON_SITE_SEARCH_TERM,USER_ID,month,trimestre
PAGE,1.0,0.175382,0.14158,0.175382,0.040502,0.007231,-0.004054,0.089182,0.086931
CONTENT_CATEGORY,0.175382,1.0,0.643041,1.0,-0.207466,0.019454,-0.005654,-0.079462,-0.076358
CONTENT_CATEGORY_TOP,0.14158,0.643041,1.0,0.643041,0.061494,0.02528,-0.002259,0.025535,0.02447
CONTENT_CATEGORY_BOTTOM,0.175382,1.0,0.643041,1.0,-0.207466,0.019454,-0.005654,-0.079462,-0.076358
SITE_ID,0.040502,-0.207466,0.061494,-0.207466,1.0,-0.010708,0.000961,0.146864,0.141987
ON_SITE_SEARCH_TERM,0.007231,0.019454,0.02528,0.019454,-0.010708,1.0,0.002691,-0.006056,-0.005437
USER_ID,-0.004054,-0.005654,-0.002259,-0.005654,0.000961,0.002691,1.0,-0.086435,-0.11047
month,0.089182,-0.079462,0.025535,-0.079462,0.146864,-0.006056,-0.086435,1.0,0.975801
trimestre,0.086931,-0.076358,0.02447,-0.076358,0.141987,-0.005437,-0.11047,0.975801,1.0


In [9]:
if 'CONTENT_CATEGORY_BOTTOM' in pageview.columns:
    del pageview['CONTENT_CATEGORY_BOTTOM']

if 'month' in pageview.columns:
    del pageview['month']

In [10]:
pageview.groupby(['trimestre']).count()[col_user]

trimestre
1    4861441
2    5677125
3    2497836
4    4900532
Name: USER_ID, dtype: int64

In [11]:
all_user = set(list(pageview[col_user].unique()))
len(all_user)

11676

In [12]:
X_test_k = get_format_data(add_schedule_weekday(pageview.copy()))
X_test_k.shape

----->  PAGE
----->  CONTENT_CATEGORY
----->  CONTENT_CATEGORY_TOP
----->  SITE_ID
----->  ON_SITE_SEARCH_TERM
----->  weekday
----->  schedule


(11676, 2114)

In [13]:
all_columns = list(X_test_k.columns)
X_test_k_mini = X_test_k[list(X_test_k.columns)[:2]]
display(X_test_k_mini.head())

len(all_columns), type(X_test_k)

Unnamed: 0_level_0,PAGE_1,PAGE_2
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0283,0.12844
1,0.103742,0.157271
2,0.032517,0.168074
3,0.005168,0.192765
4,0.010628,0.120451


(2114, pandas.core.frame.DataFrame)

In [14]:
data_view = []
cols_common = set()

for val in list(pageview['trimestre'].unique())[:-1]:
    print("/"*20, val, end=' ')
    page_trim = pageview[pageview['trimestre'] <= val]
    unique_users = set(list(page_trim[col_user].unique()))
    
    print(page_trim.shape, len(unique_users))

    # FORMAT DATA
    X_test = get_format_data(add_schedule_weekday(page_trim))
    print("X_test: ", X_test.shape)
    
    # SELECT COMMON COLUMNS
    cols_common = set(X_test.columns) if not cols_common else cols_common.intersection(set(X_test.columns))
    print("cols_common:: ", len(cols_common))
    
    data_view.append(X_test.copy())
    del X_test
    del page_trim

# data_view = pd.concat(data_view, axis=0)
del pageview

//////////////////// 1 (4861441, 8) 10936
----->  PAGE
----->  CONTENT_CATEGORY
----->  CONTENT_CATEGORY_TOP
----->  SITE_ID
----->  ON_SITE_SEARCH_TERM
----->  weekday
----->  schedule
X_test:  (10936, 1606)
cols_common::  1606
//////////////////// 2 (10538566, 8) 11314
----->  PAGE
----->  CONTENT_CATEGORY
----->  CONTENT_CATEGORY_TOP
----->  SITE_ID
----->  ON_SITE_SEARCH_TERM
----->  weekday
----->  schedule
X_test:  (11314, 1843)
cols_common::  1606
//////////////////// 3 (13036402, 8) 11387
----->  PAGE
----->  CONTENT_CATEGORY
----->  CONTENT_CATEGORY_TOP
----->  SITE_ID
----->  ON_SITE_SEARCH_TERM
----->  weekday
----->  schedule
X_test:  (11387, 1945)
cols_common::  1606


In [15]:
for dat in data_view:
    display(dat.head())

Unnamed: 0_level_0,PAGE_1,PAGE_2,PAGE_3,PAGE_4,PAGE_5,PAGE_6,PAGE_7,PAGE_8,PAGE_9,PAGE_10,...,weekday_3,weekday_4,weekday_5,weekday_6,schedule_0,schedule_1,schedule_2,schedule_3,schedule_4,schedule_5
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.058121,0.067898,0.0239,0.008148,0.001086,0.006518,0.000543,0.000543,0.042368,0.177621,...,0.217816,0.180337,0.047257,0.023357,0.0,0.1195,0.456274,0.109723,0.112439,0.202064
1,0.125201,0.120385,0.027287,0.014446,0.001605,0.001605,0.0,0.0,0.046549,0.176565,...,0.163724,0.300161,0.051364,0.072231,0.014446,0.022472,0.536116,0.150883,0.083467,0.192616
2,0.038186,0.157518,0.076372,0.02148,0.02148,0.0,0.0,0.0,0.045346,0.145585,...,0.202864,0.24821,0.019093,0.009547,0.004773,0.0,0.288783,0.582339,0.073986,0.050119
3,0.0,0.115607,0.023121,0.00289,0.00289,0.0,0.0,0.0,0.0,0.0,...,0.141618,0.144509,0.106936,0.063584,0.023121,0.037572,0.147399,0.317919,0.361272,0.112717
4,0.019516,0.096019,0.07338,0.0,0.018735,0.021077,0.0,0.000781,0.002342,0.270882,...,0.311475,0.071819,0.029664,0.035129,0.215457,0.051522,0.188134,0.196721,0.111632,0.236534


Unnamed: 0_level_0,PAGE_1,PAGE_2,PAGE_3,PAGE_4,PAGE_5,PAGE_6,PAGE_7,PAGE_8,PAGE_9,PAGE_10,...,weekday_3,weekday_4,weekday_5,weekday_6,schedule_0,schedule_1,schedule_2,schedule_3,schedule_4,schedule_5
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.049499,0.090226,0.022556,0.007832,0.00188,0.004699,0.000313,0.000313,0.067043,0.176065,...,0.194862,0.180764,0.073622,0.015038,0.0,0.098371,0.432331,0.159148,0.12312,0.18703
1,0.120096,0.120897,0.016813,0.007206,0.001601,0.000801,0.0,0.0,0.07526,0.184147,...,0.178543,0.217774,0.039231,0.048038,0.064852,0.054444,0.427542,0.1249,0.080064,0.248199
2,0.031083,0.168274,0.078242,0.013934,0.018221,0.001072,0.0,0.0,0.063237,0.130761,...,0.199357,0.144695,0.016077,0.039657,0.007503,0.013934,0.222937,0.601286,0.108253,0.046088
3,0.0,0.169557,0.108863,0.032755,0.030829,0.006744,0.0,0.0,0.0,0.0,...,0.154143,0.106936,0.100193,0.088632,0.007707,0.012524,0.199422,0.259152,0.288054,0.233141
4,0.01453,0.092891,0.076803,0.002076,0.028023,0.02439,0.0,0.000519,0.001557,0.243902,...,0.261546,0.051375,0.035288,0.046186,0.217955,0.063311,0.197198,0.186819,0.090815,0.243902


Unnamed: 0_level_0,PAGE_1,PAGE_2,PAGE_3,PAGE_4,PAGE_5,PAGE_6,PAGE_7,PAGE_8,PAGE_9,PAGE_10,...,weekday_3,weekday_4,weekday_5,weekday_6,schedule_0,schedule_1,schedule_2,schedule_3,schedule_4,schedule_5
USER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.037052,0.109324,0.042142,0.020562,0.009161,0.004886,0.000204,0.000204,0.058225,0.150041,...,0.209283,0.17101,0.075733,0.027077,0.0,0.07329,0.45399,0.170806,0.116042,0.185871
1,0.113219,0.130967,0.03366,0.011016,0.002448,0.001836,0.0,0.0,0.070991,0.165851,...,0.171971,0.247246,0.034884,0.050184,0.059364,0.082007,0.399021,0.141983,0.084455,0.23317
2,0.032615,0.172976,0.110076,0.022132,0.023879,0.004077,0.0,0.0,0.038439,0.082702,...,0.217239,0.139779,0.029703,0.056494,0.043098,0.027956,0.300524,0.395457,0.126383,0.106581
3,0.001287,0.187902,0.132561,0.039254,0.039897,0.01094,0.0,0.0,0.0,0.0,...,0.151866,0.143501,0.093951,0.094595,0.006435,0.010296,0.21686,0.259974,0.272201,0.234234
4,0.013317,0.103309,0.080307,0.004843,0.037934,0.021388,0.0,0.000404,0.001211,0.221146,...,0.262308,0.062954,0.028249,0.046408,0.184019,0.053672,0.274818,0.156174,0.114205,0.217111


In [16]:
for dat in data_view:
    print(dat.shape)

(10936, 1606)
(11314, 1843)
(11387, 1945)


## LOAD CONVERTIONS

In [17]:
convertion = pd.read_csv("{}/conversiones.csv".format(subfolder)).drop_duplicates()[['USER_ID','mes']].sort_values(['USER_ID','mes']).reset_index(drop=True)

display(convertion.head())
convertion.shape

Unnamed: 0,USER_ID,mes
0,13.0,8.0
1,22.0,12.0
2,40.0,1.0
3,57.0,6.0
4,92.0,3.0


(1564, 2)

In [18]:
for col in convertion.columns:
    convertion[col] = convertion[col].astype(int)

convertion.dtypes

USER_ID    int64
mes        int64
dtype: object

In [19]:
convertion['trimestre'] = convertion['mes'].apply(get_trimestre)
for col in convertion.columns:
    print(col, convertion[col].unique().shape)

del convertion['mes']
convertion.head()

USER_ID (1086,)
mes (12,)
trimestre (4,)


Unnamed: 0,USER_ID,trimestre
0,13,3
1,22,4
2,40,1
3,57,2
4,92,1


In [20]:
len(cols_common), type(cols_common)

(1606, set)

In [21]:
data_conv = []
col_target = 'TARGET'


def get_convertions(data_trim, data_analysis):
    users_data = data_analysis.index
    current = pd.Series(0, index=users_data)
    
    idx = set(
        data_trim.USER_ID.unique()
    ).intersection(
        set(users_data)
    )

    current.loc[list(idx)] = 1
    return current

for val in sorted(list(convertion['trimestre'].unique())):
    print("/"*20, val)
    conv_trim = convertion[convertion['trimestre'] == val].drop_duplicates().reset_index(drop=True)
    
    if val <= len(data_view):
        data_view[val-1]['current'] = get_convertions(conv_trim, data_view[val-1])
    elif val == len(data_view) + 1:
        X_test_k['current'] = get_convertions(conv_trim, X_test_k)  
        
    if val > 1:
        data_view[val-2][col_target] = get_convertions(conv_trim, data_view[val-2])
            

    
cols_common.add('current')
len(cols_common)

//////////////////// 1
//////////////////// 2
//////////////////// 3
//////////////////// 4


1607

In [22]:
data = pd.concat([_[list(cols_common) + [col_target]] for _ in data_view], axis=0)
X_test_k = X_test_k[list(cols_common)]
data.shape, X_test_k.shape

((33637, 1608), (11676, 1607))

In [23]:
set(data.columns) - set(X_test_k.columns)

{'TARGET'}

In [24]:
set(X_test_k.columns) - set(data.columns)

set()

In [25]:
col_user in X_test_k.columns, col_user in data.columns

(False, False)

In [26]:
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,PAGE_699,PAGE_99,PAGE_496,PAGE_845,PAGE_618,PAGE_869,PAGE_870,PAGE_662,PAGE_989,ON_SITE_SEARCH_TERM_110,...,PAGE_312,PAGE_910,PAGE_330,PAGE_624,PAGE_1012,ON_SITE_SEARCH_TERM_233,ON_SITE_SEARCH_TERM_101,PAGE_423,PAGE_665,TARGET
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.004684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [27]:
X_test_k = X_test_k.reset_index(drop=True)
X_test_k.head()

Unnamed: 0,PAGE_699,PAGE_99,PAGE_496,PAGE_845,PAGE_618,PAGE_869,PAGE_870,PAGE_662,PAGE_989,ON_SITE_SEARCH_TERM_110,...,ON_SITE_SEARCH_TERM_185,PAGE_312,PAGE_910,PAGE_330,PAGE_624,PAGE_1012,ON_SITE_SEARCH_TERM_233,ON_SITE_SEARCH_TERM_101,PAGE_423,PAGE_665
0,0.0,0.001244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000845,0.0,0.0,0.0,0.0,0.0
3,0.0,0.00155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.004187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
data[col_target].value_counts(dropna=False)

0    32490
1     1147
Name: TARGET, dtype: int64

## PREPARATE DATA

In [29]:
index_test = X_test_k.index
X_test_k = sp.sparse.csr_matrix(X_test_k.values)
y_train = data[col_target]
X_train = sp.sparse.csr_matrix(data.drop([col_target], axis=1).values)

del data_view
del data

In [30]:
X_train.shape, type(X_train), y_train.shape, type(y_train), X_test_k.shape, type(X_test_k)

((33637, 1607),
 scipy.sparse.csr.csr_matrix,
 (33637,),
 pandas.core.series.Series,
 (11676, 1607),
 scipy.sparse.csr.csr_matrix)

In [31]:
from lightgbm import LGBMClassifier
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

In [50]:
intent = 0

In [32]:
param_grid = {
    'n_estimators': [5000],
    'feature_fraction': [0.5, 0.7],
    'bagging_fraction': [0.5, 0.7],
    'learning_rate': [0.05, 0.15],
    'max_depth': [50, 100]
}
kfold_off = StratifiedKFold(
    n_splits=10, 
    shuffle=False, 
    random_state=SEED
)
time_split = TimeSeriesSplit(
    n_splits=11
)
model_lgbm = LGBMClassifier(seed=SEED)

grid = GridSearchCV(
    model_lgbm, param_grid, cv=kfold_off, 
    n_jobs=-1, scoring='roc_auc', verbose=1
)
grid.fit(X_train, y_train)

best_params_ = grid.best_params_
model_ = grid.best_estimator_
score_ = grid.best_score_

print(score_, best_params_)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 126.5min finished


0.8066692968229289 {'feature_fraction': 0.5, 'lambda_l2': 0.01, 'learning_rate': 0.01, 'n_estimators': 5000}


In [33]:
predict_test = grid.best_estimator_.predict_proba(X_test_k)[:,1]   # 0.7757083940054187
predict_test, predict_test.shape

(array([8.53433505e-04, 5.63361521e-04, 1.98607958e-05, ...,
        7.60813694e-05, 1.82662479e-04, 4.41205509e-03]), (11676,))

In [34]:
predict_test = pd.DataFrame(
    {'USER_ID':index_test, 'SCORE':predict_test} 
)
predict_test.head()

Unnamed: 0,USER_ID,SCORE
0,0,0.000853
1,1,0.000563
2,2,2e-05
3,3,0.001569
4,4,0.000535


In [35]:
print(predict_test[predict_test['SCORE'] < 0.01].shape)
print(predict_test[predict_test['SCORE'] > 0.5].shape)
print(predict_test[predict_test['SCORE'] > 0.99].shape)
predict_test.to_csv('analysis_lgbm_{}.csv'.format(intent), index=False)

(9654, 2)
(47, 2)
(0, 2)


In [43]:
c_values = np.logspace(0.01, 0.75, 20)
c_values

array([1.02329299, 1.11930223, 1.22431942, 1.33918971, 1.46483758,
       1.60227421, 1.75260566, 1.91704178, 2.09690593, 2.29364561,
       2.50884415, 2.74423344, 3.00170785, 3.28333949, 3.59139487,
       3.92835318, 4.2969262 , 4.70008014, 5.14105952, 5.62341325])

In [51]:
from sklearn.linear_model import LogisticRegression

param_log = {
    'C': c_values,
    'class_weight': ['balanced'],
    'solver': ['saga', 'newton-cg', 'sag'],
    'penalty': ['l2']
}
kfold_off = StratifiedKFold(
    n_splits=10, 
    shuffle=False, 
    random_state=SEED
)
time_split = TimeSeriesSplit(
    n_splits=11
)
model_log = LogisticRegression(random_state=SEED, n_jobs = 4)

grid = GridSearchCV(
    model_log, param_log, cv=time_split, 
    n_jobs=-1, scoring='roc_auc', verbose=1
)
grid.fit(X_train, y_train)

best_params_ = grid.best_params_
model_ = grid.best_estimator_
score_ = grid.best_score_

print(score_, best_params_)

Fitting 11 folds for each of 20 candidates, totalling 220 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 220 out of 220 | elapsed:  3.2min finished


0.7821863717117165 {'C': 3.001707850482483, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}


In [52]:
predict_test_log = grid.best_estimator_.predict_proba(X_test_k)[:,1]   # 0.7757083940054187
predict_test_log, predict_test_log.shape

(array([0.00582846, 0.0017522 , 0.01191862, ..., 0.06527961, 0.02452655,
        0.07277416]), (11676,))

In [53]:
predict_test_log = pd.DataFrame(
    {'USER_ID':index_test, 'SCORE':predict_test_log} 
)
predict_test_log.head()

Unnamed: 0,USER_ID,SCORE
0,0,0.005828
1,1,0.001752
2,2,0.011919
3,3,0.013372
4,4,0.030038


In [54]:
print(predict_test_log[predict_test_log['SCORE'] < 0.01].shape)
print(predict_test_log[predict_test_log['SCORE'] > 0.5].shape)
print(predict_test_log[predict_test_log['SCORE'] > 0.99].shape)
predict_test_log.to_csv('analysis_log_{}.csv'.format(intent), index=False)

(1641, 2)
(573, 2)
(297, 2)


In [None]:
"""
(2033, 2)
(330, 2)
(8, 2)

(31, 2)
(6123, 2)
(153, 2)

"""