In [64]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pandas_summary import DataFrameSummary
from pathlib import Path

import os, math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm import tqdm

# Notebook display options: show wide frames and long listings without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()
# Work from the project root so the relative `data/` paths used below resolve
# (presumably the `src` package import relies on this too — TODO confirm).
# Set ANALYSIS_PROJECT_ROOT to point at another checkout instead of editing the
# notebook; the default keeps the original location.
PROJECT_ROOT = Path(os.environ.get('ANALYSIS_PROJECT_ROOT',
                                   '/home/krivas/projects/analysis-project/'))
os.chdir(PROJECT_ROOT)
from src.utils import convert_int, convert_date,\
                        add_median_features, add_date_diffs, add_days_count, add_date_features,\
                        add_embeds_features, make_set, run_model, apply_cats


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Directory layout, relative to the current working directory.
DATA = Path('data')
RAW, PROCESSED = DATA.joinpath('raw'), DATA.joinpath('processed')

# Reading data

In [3]:
# Load the processed HistoricoJob table into a DataFrame.
csv = pd.read_csv(PROCESSED.joinpath('HistoricoJob.csv'))

In [4]:
# List the column names of the loaded table.
csv.columns

Index(['Id_HistoricoJob', 'Id_Job', 'Id_Malla', 'Fecha_Carga_Scheduler',
       'Fecha_Ejec_Inicio', 'Fecha_Ejec_Fin', 'Duracion', 'Promedio', 'Agente',
       'Mxrc', 'Maxcmpc', 'Grupo', 'Force_Complete', 'CCF', 'Estado',
       'duracion_int', 'promedio_int', 'Fecha_Ejec_Inicio_Int',
       'Hora_Ejec_Inicio_Int', 'Fecha_Ejec_Fin_Int', 'Hora_Ejec_Fin_Int',
       'DiaSemana', 'hubo_error'],
      dtype='object')

In [5]:
# First execution date to include, as an integer (reads as YYYYMMDD — presumably the
# same encoding as Fecha_Ejec_Inicio_Int; TODO confirm).
start_date = 20180401
# Latest execution date present in the data; its frame ends up last in days_data.
target_date = int(csv['Fecha_Ejec_Inicio_Int'].max())

In [62]:
%%time
# Build one frame per execution date in [start_date, target_date], in ascending order.
# Only dates that actually occur in the data are visited: most integers between two
# YYYYMMDD bounds are not calendar dates, and the previous loop also recomputed
# `.unique()` for every one of them.
# Assumes Fecha_Ejec_Inicio_Int holds integer-valued dates — TODO confirm.
date_col = csv['Fecha_Ejec_Inicio_Int']
run_dates = sorted({
    int(d) for d in date_col.dropna().unique()
    if start_date <= d <= target_date
})
# Selection of target-day rows with hubo_error == 0. It is identical for every date, so
# the mask is computed once; the frame is still re-sliced per call so that make_set
# receives a fresh object each time, exactly as before.
target_ok_mask = (date_col == target_date) & (csv['hubo_error'] == 0)

days_data = []
for date in run_dates:
    print(date)
    days_data.append(make_set(csv.loc[date_col == date], csv.loc[target_ok_mask]))

20180401
20180402
20180403
20180404
20180405
20180406
20180407
20180408
20180409
20180410
20180411
20180412
20180413
20180414
20180415
20180416
20180417
20180418
20180419
20180420
20180421
20180422
20180423
20180424
20180425
20180426
20180427
20180428
20180429
20180430
20180501
20180502
20180503
20180504
20180505
20180506
20180507
20180508
20180509
20180510
20180511
20180514
20180515
CPU times: user 52.6 s, sys: 20 ms, total: 52.6 s
Wall time: 52.6 s


# Preprocessing

In [65]:
%%time
# Run the feature helpers on every per-day frame. Their return values are ignored, so
# they presumably add columns to the frame in place — TODO confirm in src.utils.
for day_df in tqdm(days_data):
    add_date_diffs(day_df, csv)
    add_days_count(day_df, csv)
    add_date_features(day_df)
    add_median_features(day_df, csv, 3)

100%|██████████| 43/43 [09:26<00:00, 13.18s/it]

CPU times: user 7min 39s, sys: 1min 47s, total: 9min 26s
Wall time: 9min 26s





## Testing with SVD

In [66]:
# Reading dumps
from surprise import dump

# Load the persisted SVD dump and keep element [1] of the loaded pair
# (per the Surprise docs this is the fitted algorithm; [0] would be the predictions).
svd = dump.load(PROCESSED.joinpath('svd_20.dump'))[1]

In [67]:
%%time
# Store the SVD estimate for each row's (Id_Job, Id_Malla) pair as the `fail_bias` column.
for day_df in tqdm(days_data):
    day_df['fail_bias'] = day_df.apply(
        lambda row: svd.predict(row['Id_Job'], row['Id_Malla']).est,
        axis=1)

100%|██████████| 43/43 [00:14<00:00,  2.93it/s]

CPU times: user 15 s, sys: 1.26 s, total: 16.3 s
Wall time: 14.7 s





### Model

In [68]:
#from src.utils import run_model
from lightgbm import LGBMRegressor

In [69]:
# Column roles used when slicing the per-day frames for modelling.
id_cols = ['Fecha_Ejec_Inicio_Int', 'Id_Job', 'Id_Malla']  # dropped from the features
target_col = 'duracion_int'                                # regression target
pred_col = 'Prediccion'                                    # placeholder prediction column
metric_names = ['mse']

# Tag the last day (used as the test set) with a placeholder prediction column; the
# training loop checks for this column to recognise the test frame.
days_data[-1][pred_col] = 0

In [70]:
%%time
# Expanding-window training: for every day after the first, fit on all earlier days,
# validate on that day and predict the last (test) day. Each round's test predictions
# and fitted model are collected in `output`.
feature_drop = id_cols + [target_col]
results, output = None, []
for n_train_days, val in enumerate(days_data[1:], start=1):
    train = pd.concat(days_data[:n_train_days], axis=0)
    test = days_data[-1]

    X_train = train.drop(feature_drop, axis=1)
    y_train = train[target_col]

    # pred_col marks the test frame; when it comes up as the validation day,
    # no validation data is passed to run_model.
    has_validation = pred_col not in val.columns
    X_val = val.drop(feature_drop, axis=1) if has_validation else None
    y_val = val[target_col] if has_validation else None

    X_test = test.drop(feature_drop + [pred_col], axis=1)
    y_test, _, results, model = run_model(
        LGBMRegressor(n_estimators=120),
        X_train, y_train, X_val, y_val, X_test,
        metric_names, results,
        early_stopping=True)
    output.append([y_test, model])

[1]	valid_0's l2: 1.31686e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.08239e+08
[3]	valid_0's l2: 8.93386e+07
[4]	valid_0's l2: 7.42434e+07
[5]	valid_0's l2: 6.18628e+07
[6]	valid_0's l2: 5.19552e+07
[7]	valid_0's l2: 4.38677e+07
[8]	valid_0's l2: 3.73719e+07
[9]	valid_0's l2: 3.21133e+07
[10]	valid_0's l2: 2.78738e+07
[11]	valid_0's l2: 2.44763e+07
[12]	valid_0's l2: 2.1719e+07
[13]	valid_0's l2: 1.95288e+07
[14]	valid_0's l2: 1.77412e+07
[15]	valid_0's l2: 1.63139e+07
[16]	valid_0's l2: 1.51979e+07
[17]	valid_0's l2: 1.42804e+07
[18]	valid_0's l2: 1.35795e+07
[19]	valid_0's l2: 1.30335e+07
[20]	valid_0's l2: 1.25759e+07
[21]	valid_0's l2: 1.22451e+07
[22]	valid_0's l2: 1.19644e+07
[23]	valid_0's l2: 1.17452e+07
[24]	valid_0's l2: 1.15978e+07
[25]	valid_0's l2: 1.14655e+07
[26]	valid_0's l2: 1.13824e+07
[27]	valid_0's l2: 1.13064e+07
[28]	valid_0's l2: 1.12687e+07
[29]	valid_0's l2: 1.12301e+07
[30]	valid_0's l2: 1.12117e+07
[31]	valid_0's l2:

In [71]:
# Feature importances of `model`, i.e. the model fitted in the last loop iteration.
model.feature_importances_

array([573, 391, 588,  76, 252, 535, 603, 582])

In [72]:
# Average the test-day predictions of every fitted model into one ensemble prediction.
y_test = np.mean([preds for preds, _fitted in output], axis=0)

In [73]:
from sklearn.metrics import mean_squared_error

In [20]:
# MSE of the averaged predictions against the test day's target.
# NOTE(review): unlabeled cell; the output shown comes from an earlier run (out-of-order
# execution count) — presumably the 3-factor dump, TODO confirm.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

384473.58492286416

In [16]:
# 10 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

407903.3582005827

In [44]:
# 15 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

379839.54668777814

In [74]:
# 20 factors: MSE of the averaged predictions against the test day's target.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

382186.9104456504

## Testing with NMF

In [75]:
# Load the persisted NMF dump and keep element [1] of the loaded pair
# (per the Surprise docs this is the fitted algorithm; [0] would be the predictions).
nmf = dump.load(PROCESSED.joinpath('nmf_20.dump'))[1]

In [76]:
%%time
# Overwrite `fail_bias` with the NMF estimate for each row's (Id_Job, Id_Malla) pair.
for day_df in tqdm(days_data):
    day_df['fail_bias'] = day_df.apply(
        lambda row: nmf.predict(row['Id_Job'], row['Id_Malla']).est,
        axis=1)

100%|██████████| 43/43 [00:14<00:00,  3.06it/s]

CPU times: user 14.2 s, sys: 1.18 s, total: 15.4 s
Wall time: 14.1 s





### Model

In [77]:
%%time
# Expanding-window training: for every day after the first, fit on all earlier days,
# validate on that day and predict the last (test) day. Each round's test predictions
# and fitted model are collected in `output`.
feature_drop = id_cols + [target_col]
results, output = None, []
for n_train_days, val in enumerate(days_data[1:], start=1):
    train = pd.concat(days_data[:n_train_days], axis=0)
    test = days_data[-1]

    X_train = train.drop(feature_drop, axis=1)
    y_train = train[target_col]

    # pred_col marks the test frame; when it comes up as the validation day,
    # no validation data is passed to run_model.
    has_validation = pred_col not in val.columns
    X_val = val.drop(feature_drop, axis=1) if has_validation else None
    y_val = val[target_col] if has_validation else None

    X_test = test.drop(feature_drop + [pred_col], axis=1)
    y_test, _, results, model = run_model(
        LGBMRegressor(n_estimators=120),
        X_train, y_train, X_val, y_val, X_test,
        metric_names, results,
        early_stopping=True)
    output.append([y_test, model])

[1]	valid_0's l2: 1.31686e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.08239e+08
[3]	valid_0's l2: 8.93373e+07
[4]	valid_0's l2: 7.42371e+07
[5]	valid_0's l2: 6.18542e+07
[6]	valid_0's l2: 5.19461e+07
[7]	valid_0's l2: 4.38586e+07
[8]	valid_0's l2: 3.73628e+07
[9]	valid_0's l2: 3.2103e+07
[10]	valid_0's l2: 2.78643e+07
[11]	valid_0's l2: 2.44613e+07
[12]	valid_0's l2: 2.17048e+07
[13]	valid_0's l2: 1.95126e+07
[14]	valid_0's l2: 1.7733e+07
[15]	valid_0's l2: 1.63366e+07
[16]	valid_0's l2: 1.51925e+07
[17]	valid_0's l2: 1.42838e+07
[18]	valid_0's l2: 1.3584e+07
[19]	valid_0's l2: 1.30378e+07
[20]	valid_0's l2: 1.25815e+07
[21]	valid_0's l2: 1.22255e+07
[22]	valid_0's l2: 1.19701e+07
[23]	valid_0's l2: 1.17509e+07
[24]	valid_0's l2: 1.16015e+07
[25]	valid_0's l2: 1.14692e+07
[26]	valid_0's l2: 1.13857e+07
[27]	valid_0's l2: 1.131e+07
[28]	valid_0's l2: 1.12678e+07
[29]	valid_0's l2: 1.12261e+07
[30]	valid_0's l2: 1.12135e+07
[31]	valid_0's l2: 1.1

In [78]:
# Feature importances of `model`, i.e. the model fitted in the last loop iteration.
model.feature_importances_

array([580, 421, 570,  91, 246, 554, 654, 484])

In [79]:
# Average the test-day predictions of every fitted model into one ensemble prediction.
y_test = np.mean([preds for preds, _fitted in output], axis=0)

In [27]:
# With 3 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

393907.00309290644

In [22]:
# 10 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

383913.14897290384

In [50]:
# 15 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

383975.47492162086

In [80]:
# 20 factors: MSE of the averaged predictions against the test day's target.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

390343.37071155617

# Test with fail_bias plus per-job and per-mesh bias features

### SVD

In [81]:
%%time
# Add the SVD per-job (`bu`) and per-mesh (`bi`) bias terms as features.
# Inner ids are resolved through the public Trainset API instead of the private
# `_raw2inner_id_*` dicts; a raw id the trainset has never seen raises ValueError.
# NOTE(review): `fail_bias` is not refreshed here, so at this point it still holds the
# NMF estimates written by the preceding NMF section — confirm that is intended for
# the SVD experiment.
svd_trainset = svd.trainset

for day_df in tqdm(days_data):
    day_df['id_job_bias'] = day_df['Id_Job'].map(
        lambda job_id: svd.bu[svd_trainset.to_inner_uid(job_id)])
    day_df['id_mesh_bias'] = day_df['Id_Malla'].map(
        lambda mesh_id: svd.bi[svd_trainset.to_inner_iid(mesh_id)])


100%|██████████| 43/43 [00:00<00:00, 99.29it/s]

CPU times: user 428 ms, sys: 4 ms, total: 432 ms
Wall time: 435 ms





In [82]:
# Column roles used when slicing the per-day frames for modelling
# (same values as the earlier model-setup cell).
id_cols = ['Fecha_Ejec_Inicio_Int', 'Id_Job', 'Id_Malla']  # dropped from the features
target_col = 'duracion_int'                                # regression target
pred_col = 'Prediccion'                                    # placeholder prediction column
metric_names = ['mse']

# Tag the last day (used as the test set) with a placeholder prediction column; the
# training loop checks for this column to recognise the test frame.
days_data[-1][pred_col] = 0

In [83]:
%%time
# Expanding-window training: for every day after the first, fit on all earlier days,
# validate on that day and predict the last (test) day. Each round's test predictions
# and fitted model are collected in `output`.
feature_drop = id_cols + [target_col]
results, output = None, []
for n_train_days, val in enumerate(days_data[1:], start=1):
    train = pd.concat(days_data[:n_train_days], axis=0)
    test = days_data[-1]

    X_train = train.drop(feature_drop, axis=1)
    y_train = train[target_col]

    # pred_col marks the test frame; when it comes up as the validation day,
    # no validation data is passed to run_model.
    has_validation = pred_col not in val.columns
    X_val = val.drop(feature_drop, axis=1) if has_validation else None
    y_val = val[target_col] if has_validation else None

    X_test = test.drop(feature_drop + [pred_col], axis=1)
    y_test, _, results, model = run_model(
        LGBMRegressor(n_estimators=120),
        X_train, y_train, X_val, y_val, X_test,
        metric_names, results,
        early_stopping=True)
    output.append([y_test, model])

[1]	valid_0's l2: 1.31686e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.08238e+08
[3]	valid_0's l2: 8.93394e+07
[4]	valid_0's l2: 7.4239e+07
[5]	valid_0's l2: 6.18564e+07
[6]	valid_0's l2: 5.19406e+07
[7]	valid_0's l2: 4.38531e+07
[8]	valid_0's l2: 3.73563e+07
[9]	valid_0's l2: 3.20956e+07
[10]	valid_0's l2: 2.78566e+07
[11]	valid_0's l2: 2.4457e+07
[12]	valid_0's l2: 2.16982e+07
[13]	valid_0's l2: 1.95066e+07
[14]	valid_0's l2: 1.77247e+07
[15]	valid_0's l2: 1.63311e+07
[16]	valid_0's l2: 1.51876e+07
[17]	valid_0's l2: 1.43082e+07
[18]	valid_0's l2: 1.35799e+07
[19]	valid_0's l2: 1.30036e+07
[20]	valid_0's l2: 1.25751e+07
[21]	valid_0's l2: 1.22152e+07
[22]	valid_0's l2: 1.1961e+07
[23]	valid_0's l2: 1.17437e+07
[24]	valid_0's l2: 1.15971e+07
[25]	valid_0's l2: 1.149e+07
[26]	valid_0's l2: 1.13894e+07
[27]	valid_0's l2: 1.13314e+07
[28]	valid_0's l2: 1.12801e+07
[29]	valid_0's l2: 1.12512e+07
[30]	valid_0's l2: 1.12239e+07
[31]	valid_0's l2: 1.1

In [84]:
# Feature importances of `model`, i.e. the model fitted in the last loop iteration.
model.feature_importances_

array([501, 352, 481,  60, 185, 492, 526, 256, 321, 426])

In [85]:
# Average the test-day predictions of every fitted model into one ensemble prediction.
y_test = np.mean([preds for preds, _fitted in output], axis=0)

In [33]:
# 3 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

412534.3886767157

In [28]:
# 10 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

412148.3109664779

In [56]:
# 15 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

430025.82863678905

In [86]:
# 20 factors: MSE of the averaged predictions against the test day's target.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

397217.2693600183

### NMF

In [87]:
%%time
# Replace the per-job (`bu`) and per-mesh (`bi`) bias features with the NMF model's values.
# Inner ids are resolved through the public Trainset API instead of the private
# `_raw2inner_id_*` dicts; a raw id the trainset has never seen raises ValueError.
# NOTE(review): presumably these NMF biases are all zero unless the model was fitted
# with biased=True — TODO confirm how nmf_20.dump was trained.
nmf_trainset = nmf.trainset

for day_df in tqdm(days_data):
    day_df['id_job_bias'] = day_df['Id_Job'].map(
        lambda job_id: nmf.bu[nmf_trainset.to_inner_uid(job_id)])
    day_df['id_mesh_bias'] = day_df['Id_Malla'].map(
        lambda mesh_id: nmf.bi[nmf_trainset.to_inner_iid(mesh_id)])


100%|██████████| 43/43 [00:00<00:00, 114.40it/s]

CPU times: user 376 ms, sys: 4 ms, total: 380 ms
Wall time: 377 ms





In [88]:
%%time
# Expanding-window training: for every day after the first, fit on all earlier days,
# validate on that day and predict the last (test) day. Each round's test predictions
# and fitted model are collected in `output`.
feature_drop = id_cols + [target_col]
results, output = None, []
for n_train_days, val in enumerate(days_data[1:], start=1):
    train = pd.concat(days_data[:n_train_days], axis=0)
    test = days_data[-1]

    X_train = train.drop(feature_drop, axis=1)
    y_train = train[target_col]

    # pred_col marks the test frame; when it comes up as the validation day,
    # no validation data is passed to run_model.
    has_validation = pred_col not in val.columns
    X_val = val.drop(feature_drop, axis=1) if has_validation else None
    y_val = val[target_col] if has_validation else None

    X_test = test.drop(feature_drop + [pred_col], axis=1)
    y_test, _, results, model = run_model(
        LGBMRegressor(n_estimators=120),
        X_train, y_train, X_val, y_val, X_test,
        metric_names, results,
        early_stopping=True)
    output.append([y_test, model])

[1]	valid_0's l2: 1.31689e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.08239e+08
[3]	valid_0's l2: 8.93408e+07
[4]	valid_0's l2: 7.42405e+07
[5]	valid_0's l2: 6.18586e+07
[6]	valid_0's l2: 5.19501e+07
[7]	valid_0's l2: 4.38626e+07
[8]	valid_0's l2: 3.73663e+07
[9]	valid_0's l2: 3.21016e+07
[10]	valid_0's l2: 2.78619e+07
[11]	valid_0's l2: 2.44502e+07
[12]	valid_0's l2: 2.16961e+07
[13]	valid_0's l2: 1.95011e+07
[14]	valid_0's l2: 1.77257e+07
[15]	valid_0's l2: 1.63276e+07
[16]	valid_0's l2: 1.51851e+07
[17]	valid_0's l2: 1.43042e+07
[18]	valid_0's l2: 1.35774e+07
[19]	valid_0's l2: 1.30314e+07
[20]	valid_0's l2: 1.25792e+07
[21]	valid_0's l2: 1.22466e+07
[22]	valid_0's l2: 1.19635e+07
[23]	valid_0's l2: 1.17458e+07
[24]	valid_0's l2: 1.15989e+07
[25]	valid_0's l2: 1.1468e+07
[26]	valid_0's l2: 1.1386e+07
[27]	valid_0's l2: 1.13275e+07
[28]	valid_0's l2: 1.12719e+07
[29]	valid_0's l2: 1.12451e+07
[30]	valid_0's l2: 1.12171e+07
[31]	valid_0's l2: 

In [89]:
# Feature importances of `model`, i.e. the model fitted in the last loop iteration.
model.feature_importances_

array([479, 358, 480,  76, 222, 511, 564, 233, 275, 402])

In [90]:
# Average the test-day predictions of every fitted model into one ensemble prediction.
y_test = np.mean([preds for preds, _fitted in output], axis=0)

In [38]:
# 3 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

392298.0333496007

In [33]:
# 10 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

411413.9851800173

In [61]:
# 15 factors. NOTE(review): the output shown comes from an earlier run (out-of-order
# execution count); re-running this cell scores the currently loaded model.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

410887.0531918605

In [91]:
# 20 factors: MSE of the averaged predictions against the test day's target.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

403592.64593918994