In [1]:
# Nativos
import random as rn
import os
import sys
import gc

#calculo
import numpy as np
import pandas as pd
import scipy as sp

#grafico
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline
sns.set(style="whitegrid")

#warning ignore future
import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
gc.collect()

BASE_DIR = os.path.dirname(os.getcwd())
if BASE_DIR not in sys.path: sys.path.append(BASE_DIR)

from utils import *
from graphs import *

SEED = 29082013
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)
rn.seed(SEED)

subfolder = "data"
os.listdir(subfolder)

['CONTENT_CATEGORY.csv',
 '.ipynb_checkpoints',
 'device_data.csv',
 'sampleSubmission.csv',
 'SITE_ID.csv',
 'PAGE.csv',
 'CONTENT_CATEGORY_TOP.csv',
 'conversiones.csv',
 'CONTENT_CATEGORY_BOTTOM.csv',
 'pageviews.csv']

In [2]:
def get_schedule(val):
    if val < 6:
        return 'madrugada'
    elif val < 9:
        return 'antes_del_trabajo'
    elif val < 13:
        return 'trabajo_manana'
    elif val < 16:
        return 'almuerzo'
    elif val < 19:
        return 'trabajo_tarde'
    else:
        return 'luego_del_trabajo'

In [3]:
col_user = 'USER_ID'
device_data = pd.read_csv(
    "{}/device_data.csv".format(subfolder), parse_dates=["FEC_EVENT"]
).sort_values(
    [col_user, "FEC_EVENT"]
)
device_data.shape

(2871406, 4)

In [4]:
device_data.columns

Index(['FEC_EVENT', 'CONNECTION_SPEED', 'IS_MOBILE_DEVICE', 'USER_ID'], dtype='object')

In [5]:
data = pd.read_csv(
    "{}/pageviews.csv".format(subfolder), parse_dates=["FEC_EVENT"]
).sort_values(
    [col_user, "FEC_EVENT"]
)
data.shape

(17936934, 8)

In [6]:
data.columns

Index(['FEC_EVENT', 'PAGE', 'CONTENT_CATEGORY', 'CONTENT_CATEGORY_TOP',
       'CONTENT_CATEGORY_BOTTOM', 'SITE_ID', 'ON_SITE_SEARCH_TERM', 'USER_ID'],
      dtype='object')

In [7]:
del data['CONTENT_CATEGORY_BOTTOM']

data['weekday_view'] = data['FEC_EVENT'].dt.weekday
device_data['weekday_device'] = device_data['FEC_EVENT'].dt.weekday

data['hour'] = data['FEC_EVENT'].dt.hour
device_data['hour'] = device_data['FEC_EVENT'].dt.hour
data['schedule_view'] = data['hour'].apply(get_schedule)
device_data['schedule_device'] = device_data['hour'].apply(get_schedule)

del data['hour']
del device_data['hour']

In [8]:
cols = list(data.columns)[1:-3] + ['weekday_view', 'schedule_view']
cols_device = list(device_data.columns)[1:-3] + ['weekday_device', 'schedule_device']

cols, cols_device

(['PAGE',
  'CONTENT_CATEGORY',
  'CONTENT_CATEGORY_TOP',
  'SITE_ID',
  'ON_SITE_SEARCH_TERM',
  'weekday_view',
  'schedule_view'],
 ['CONNECTION_SPEED', 'IS_MOBILE_DEVICE', 'weekday_device', 'schedule_device'])

In [9]:
trimestre_data = data[data.FEC_EVENT.dt.month > 0]
trimestre_data_device = device_data[device_data.FEC_EVENT.dt.month > 0]

X_test = []

for c in cols:
    print("-->", c)
    temp = pd.crosstab(trimestre_data.USER_ID, trimestre_data[c]).sort_index()
    temp.columns = [c + "_" + str(v) for v in temp.columns]
    X_test.append(temp.apply(lambda x: x / x.sum(), axis=1))
    del temp

for c in cols_device:
    print("-->", c)
    temp = pd.crosstab(trimestre_data_device.USER_ID, trimestre_data_device[c]).sort_index()
    temp.columns = [c + "_" + str(v) for v in temp.columns]
    X_test.append(temp.apply(lambda x: x / x.sum(), axis=1))
    del temp
    
X_test = pd.concat(X_test, axis=1)
del trimestre_data
del trimestre_data_device
X_test.shape

--> PAGE
--> CONTENT_CATEGORY
--> CONTENT_CATEGORY_TOP
--> SITE_ID
--> ON_SITE_SEARCH_TERM
--> weekday_view
--> schedule_view
--> CONNECTION_SPEED
--> IS_MOBILE_DEVICE
--> weekday_device
--> schedule_device


(11676, 2133)

In [10]:
X_test.isnull().sum().sum()

0

In [11]:
trimestre_data = data[data.FEC_EVENT.dt.month < 10]
trimestre_data_device = device_data[device_data.FEC_EVENT.dt.month < 10]

X_train = []
X_train_device = []

for c in cols:
    print("-->", c)
    temp = pd.crosstab(trimestre_data.USER_ID, trimestre_data[c]).sort_index()
    temp.columns = [c + "_" + str(v) for v in temp.columns]
    X_train.append(temp.apply(lambda x: x / x.sum(), axis=1))
    del temp

for c in cols_device:
    print("-->", c)
    temp = pd.crosstab(trimestre_data_device.USER_ID, trimestre_data_device[c]).sort_index()
    temp.columns = [c + "_" + str(v) for v in temp.columns]
    X_train_device.append(temp.apply(lambda x: x / x.sum(), axis=1))
    del temp
    
X_train = pd.concat(X_train, axis=1)
X_train_device = pd.concat(X_train_device, axis=1)

X_train[col_user] = X_train.index
del X_train.index.name
X_train_device[col_user] = X_train_device.index
del X_train_device.index.name
print(X_train.shape, X_train_device.shape)

X_train = pd.merge(X_train, X_train_device, on=col_user, how='inner')


del X_train_device
del trimestre_data
del trimestre_data_device
del data
X_train.shape

--> PAGE
--> CONTENT_CATEGORY
--> CONTENT_CATEGORY_TOP
--> SITE_ID
--> ON_SITE_SEARCH_TERM
--> weekday_view
--> schedule_view
--> CONNECTION_SPEED
--> IS_MOBILE_DEVICE
--> weekday_device
--> schedule_device
(11387, 1946) (11528, 20)


(11386, 1965)

In [12]:
X_train.isnull().sum().sum(), X_train.shape, X_test.shape

(0, (11386, 1965), (11676, 2133))

In [13]:
list(X_test.columns[-10:]), list(X_train.columns[-10:])

(['weekday_device_3',
  'weekday_device_4',
  'weekday_device_5',
  'weekday_device_6',
  'schedule_device_almuerzo',
  'schedule_device_antes_del_trabajo',
  'schedule_device_luego_del_trabajo',
  'schedule_device_madrugada',
  'schedule_device_trabajo_manana',
  'schedule_device_trabajo_tarde'],
 ['weekday_device_3',
  'weekday_device_4',
  'weekday_device_5',
  'weekday_device_6',
  'schedule_device_almuerzo',
  'schedule_device_antes_del_trabajo',
  'schedule_device_luego_del_trabajo',
  'schedule_device_madrugada',
  'schedule_device_trabajo_manana',
  'schedule_device_trabajo_tarde'])

In [14]:
features = list(set(X_train.columns).intersection(set(X_test.columns)))
print("features: ", len(features))
X_train = X_train[features]
X_test = X_test[features]

features:  1964


In [15]:
y_prev = pd.read_csv("data/conversiones.csv")
y_train = pd.Series(0, index=X_train.index)
idx = set(y_prev[y_prev.mes >= 10].USER_ID.unique()).intersection(
        set(X_train.index))
y_train.loc[list(idx)] = 1

In [16]:
X_train.shape, X_test.shape, y_train.shape

((11386, 1964), (11676, 1964), (11386,))

In [17]:
convertion = pd.read_csv("{}/conversiones.csv".format(subfolder)).drop_duplicates()[['USER_ID','mes']].sort_values(['USER_ID','mes']).reset_index(drop=True)
for col in convertion.columns:
    convertion[col] = convertion[col].astype(int)

convertion.dtypes
def get_trimestre(val):
    if val <= 3:
        return 1
    elif val <= 6:
        return 2
    elif val <= 9:
        return 3
    elif val <= 12:
        return 4
    return 0

convertion['trimestre'] = convertion['mes'].apply(get_trimestre)
for col in convertion.columns:
    print(col, convertion[col].unique().shape)

del convertion['mes']
convertion.head()


USER_ID (1086,)
mes (12,)
trimestre (4,)


Unnamed: 0,USER_ID,trimestre
0,13,3
1,22,4
2,40,1
3,57,2
4,92,1


In [18]:

col_user = 'USER_ID'
pageview = pd.read_csv("{}/pageviews.csv".format(subfolder), parse_dates=["FEC_EVENT"]).sort_values([col_user, "FEC_EVENT"])
all_user = set(list(pageview[col_user].unique()))
print(len(all_user))
del pageview

data_conv = []

for val in sorted(list(convertion['trimestre'].unique())):
    col_target = 'TARGET_{}'.format(val)
    conv_trim = convertion[convertion['trimestre'] == val].drop_duplicates().reset_index(drop=True)
    unique_users_conv = set(list(conv_trim[col_user].unique()))
    conv_trim.index = conv_trim[col_user]
    
    conv_trim[col_target] = 1
    
    del conv_trim.index.name
    del conv_trim[col_user]
    del conv_trim['trimestre']

    temp0 = pd.DataFrame(0, index=list(all_user - unique_users_conv), columns=[col_target])
    X_test_intern = pd.concat([conv_trim, temp0]).sort_index()

    print("X_test_intern post:: ", X_test_intern.shape)
    
    data_conv.append(X_test_intern.copy())
    del X_test_intern
    del conv_trim


data_conv = pd.concat(data_conv, axis=1)
del convertion

display(data_conv.head(15))


11676
X_test_intern post::  (11676, 1)
X_test_intern post::  (11676, 1)
X_test_intern post::  (11676, 1)
X_test_intern post::  (11676, 1)


Unnamed: 0,TARGET_1,TARGET_2,TARGET_3,TARGET_4
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0
8,0,0,0,0
9,0,0,0,0


In [19]:

current_col = 'current_convertion'
previous_col = 'previous_convertion'
before_previous_col = 'before_previous_convertion'

X_train[current_col] = data_conv['TARGET_3']
X_train[previous_col] = data_conv['TARGET_2']
X_train[before_previous_col] = data_conv['TARGET_1']

X_test[current_col] = data_conv['TARGET_4']
X_test[previous_col] = data_conv['TARGET_3']
X_test[before_previous_col] = data_conv['TARGET_2']

del data_conv


In [20]:
X_train.shape, X_test.shape, y_train.shape

((11386, 1967), (11676, 1967), (11386,))

In [21]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
num_cols = [_*int(X_train.shape[1] / 7) for _ in range(1,7)]
num_cols

[281, 562, 843, 1124, 1405, 1686]

In [22]:
index_test = X_test.index
prediction_group = []

for num_col in num_cols:
    sb = SelectKBest(f_classif, num_col)
    
    X_train_part = sb.fit_transform(X_train, y_train)
    X_test_part = sb.transform(X_test)
    print(X_train_part.shape, X_test_part.shape, y_train.shape)
    
    prediction_group.append(
        (X_train_part, X_test_part)
    )

(11386, 281) (11676, 281) (11386,)
(11386, 562) (11676, 562) (11386,)
(11386, 843) (11676, 843) (11386,)
(11386, 1124) (11676, 1124) (11386,)
(11386, 1405) (11676, 1405) (11386,)
(11386, 1686) (11676, 1686) (11386,)


In [23]:
y_train[y_train == 0].shape[0], y_train[y_train == 1].shape[0]

(10998, 388)

In [24]:
ratio_pos = np.sum(y_train == 1) / np.sum(y_train == 0)
ratio_pos

0.035279141662120386

In [47]:
from lightgbm import LGBMClassifier
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

#   QUIT  'scale_pos_weight': [ratio_pos], AND SET is_unbalance = True

param_grid_lgbm = {
    'n_estimators': [500],
    'feature_fraction': [0.1],
    'bagging_fraction': [0.95, 0.85],
    'max_depth':[55],
    'learning_rate': [0.025],
    'boosting': ['dart'],
    'min_gain_to_split': [1],
    'is_unbalance': [True],
    'metric': ['auc'],
    'tree_learner': ['serial'],
    'xgboost_dart_mode': [False],
    'skip_drop': [0.15, 0.25],
    'num_threads': [4]
}
    
kfold_off = StratifiedKFold(
    n_splits=10, 
    shuffle=False, 
    random_state=SEED
)
model_lgbm = LGBMClassifier(
    seed=SEED, 
    feature_fraction_seed=SEED, 
    drop_seed=SEED
)

for X_train_part, X_test_part in prediction_group:
    print("="*50, X_train_part.shape, X_test_part.shape)
    grid = GridSearchCV(
        model_lgbm, param_grid_lgbm, cv=kfold_off, 
        n_jobs=4, scoring='roc_auc', verbose=1
    )
    grid.fit(X_train_part, y_train)

    best_params_ = grid.best_params_
    model_ = grid.best_estimator_
    score_ = grid.best_score_
    print(score_, best_params_)
    print("#"*100)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed: 11.5min finished


0.8325793608981065 {'bagging_fraction': 0.95, 'boosting': 'dart', 'feature_fraction': 0.1, 'is_unbalance': True, 'learning_rate': 0.025, 'max_depth': 55, 'metric': 'auc', 'min_gain_to_split': 1, 'n_estimators': 500, 'num_threads': 4, 'skip_drop': 0.15, 'tree_learner': 'serial', 'xgboost_dart_mode': False}
####################################################################################################
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [77]:
X_train_part, X_test_part = prediction_group[2]

param_grid_final = {
    'n_estimators': [500],
    'feature_fraction': [0.1],
    'bagging_fraction': [0.95],
    'max_depth':[55],
    'learning_rate': [0.05],
    'boosting': ['dart'],
    'min_gain_to_split': [1],
    'is_unbalance': [True],
    'metric': ['auc'],
    'tree_learner': ['serial'],
    'xgboost_dart_mode': [False],
    'skip_drop': [0.45]
}

grid = GridSearchCV(
    model_lgbm, param_grid_final, cv=kfold_off, 
    n_jobs=-1, scoring='roc_auc', verbose=1
)
grid.fit(X_train_part, y_train)

best_params_ = grid.best_params_
model_ = grid.best_estimator_
score_ = grid.best_score_
print(score_, best_params_)

predict_test = grid.best_estimator_.predict_proba(X_test_part)[:,1]
print(predict_test, predict_test.shape)
predict_test = pd.DataFrame(
    {'USER_ID':index_test, 'SCORE':predict_test} 
)
display(predict_test.head())

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.8min finished


0.8174835229258388 {'bagging_fraction': 0.95, 'boosting': 'dart', 'feature_fraction': 0.1, 'is_unbalance': True, 'learning_rate': 0.05, 'max_depth': 55, 'metric': 'auc', 'min_gain_to_split': 1, 'n_estimators': 500, 'skip_drop': 0.45, 'tree_learner': 'serial', 'xgboost_dart_mode': False}
[0.04836903 0.05744049 0.04874279 ... 0.10705118 0.06378427 0.06132863] (11676,)


Unnamed: 0,USER_ID,SCORE
0,0,0.048369
1,1,0.05744
2,2,0.048743
3,3,0.094482
4,4,0.202073


In [78]:
predict_test.shape

(11676, 2)

In [79]:
print(predict_test[predict_test['SCORE'] < 0.01].shape)
print(predict_test[predict_test['SCORE'] > 0.5].shape)
print(predict_test[predict_test['SCORE'] > 0.99].shape)

(14, 2)
(875, 2)
(0, 2)


In [66]:
"""
# 1 / 7
(0, 2)
(1444, 2)
(0, 2)

# 2 / 7
(0, 2)
(1359, 2)
(0, 2)

# 3 / 7
(0, 2)
(1277, 2)
(0, 2)

# 4 / 7
(0, 2)
(1299, 2) -----------> 0.82098
(0, 2)

# 5 / 7
(0, 2)
(1263, 2)
(0, 2)

REPLACE is_umbalanced --> scale_pos_weight 

# 6 / 7 - 5 / 7
(0, 2)
(3578, 2)
(0, 2)

# 4 / 7 
(0, 2)
(4501, 2)
(0, 2)

RETURN umbalanced add more parameter dart

# 1 / 7
(0, 2)
(1894, 2) ----------->  0.80580
(0, 2)

# 2 / 7
(0, 2)
(1879, 2)
(0, 2)

# 3 / 7
(0, 2)
(1823, 2)
(0, 2)

# 4 / 7
(0, 2)
(1775, 2)
(0, 2)

# REACTIVATE skip_drop

# 4 / 7  0.5
(0, 2)
(1241, 2)  --------->  0.81764
(0, 2)

# 4 / 7  0.6
(0, 2)
(1077, 2)
(0, 2)

# 
"""

'\n# 1 / 7\n(0, 2)\n(1444, 2)\n(0, 2)\n\n# 2 / 7\n(0, 2)\n(1359, 2)\n(0, 2)\n\n# 3 / 7\n(0, 2)\n(1277, 2)\n(0, 2)\n\n# 4 / 7\n(0, 2)\n(1299, 2) -----------> 0.82098\n(0, 2)\n\n# 5 / 7\n(0, 2)\n(1263, 2)\n(0, 2)\n\nREPLACE is_umbalanced --> scale_pos_weight \n\n# 6 / 7 - 5 / 7\n(0, 2)\n(3578, 2)\n(0, 2)\n\n# 4 / 7 \n(0, 2)\n(4501, 2)\n(0, 2)\n\nRETURN umbalanced add more parameter dart\n\n# 1 / 7\n(0, 2)\n(1894, 2) ----------->  0.80580\n(0, 2)\n\n# 2 / 7\n(0, 2)\n(1879, 2)\n(0, 2)\n\n# 3 / 7\n(0, 2)\n(1823, 2)\n(0, 2)\n\n'

In [81]:
predict_test.to_csv('add_vars_scale_is_umbalanced_skd045__slr005_fm3-7.csv', index=False)