In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pandas_summary import DataFrameSummary
from pathlib import Path

import os, math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Notebook display options: allow wide and long frames to render untruncated.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()
# Switch to the project root; presumably this is what lets the relative
# `data/` paths and the `src` package import below resolve.
# The root can be overridden with the ANALYSIS_PROJECT_ROOT environment
# variable so the notebook is not tied to one machine; the original
# absolute path stays as the default, so existing setups behave as before.
os.chdir(os.environ.get('ANALYSIS_PROJECT_ROOT',
                        '/home/krivas/projects/analysis-project/'))
from src.utils import convert_int, convert_date,\
                        add_mean_features, add_date_diffs, add_days_count, add_date_features,\
                        add_embeds_features, make_set, run_model, apply_cats


In [2]:
# Data locations, all relative to the current working directory.
DATA = Path('data')
RAW = DATA.joinpath('raw')              # not referenced by the cells shown here
PROCESSED = DATA.joinpath('processed')  # the CSV and the model dumps are read from here

# Reading data

In [3]:
# Load HistoricoJob.csv from the processed-data folder into a DataFrame.
# NOTE(review): the name `csv` shadows the standard-library `csv` module;
# it is kept as-is because later cells refer to it by this name.
csv = pd.read_csv(PROCESSED/'HistoricoJob.csv')

In [4]:
# List the columns of the loaded table.
csv.columns

Index(['Id_HistoricoJob', 'Id_Job', 'Id_Malla', 'Fecha_Carga_Scheduler',
       'Fecha_Ejec_Inicio', 'Fecha_Ejec_Fin', 'Duracion', 'Promedio', 'Agente',
       'Mxrc', 'Maxcmpc', 'Grupo', 'Force_Complete', 'CCF', 'Estado',
       'duracion_int', 'promedio_int', 'Fecha_Ejec_Inicio_Int',
       'Hora_Ejec_Inicio_Int', 'Fecha_Ejec_Fin_Int', 'Hora_Ejec_Fin_Int',
       'DiaSemana', 'hubo_error'],
      dtype='object')

In [5]:
# First start-date value to build a per-day set for (the column appears to
# hold dates encoded as YYYYMMDD numbers — confirm against the data).
start_date = 20180401
# Largest start-date value present in the table; the frame built for this
# day is the last one in `days_data` and is used as the test day later on.
target_date = int(csv['Fecha_Ejec_Inicio_Int'].max())

In [6]:
%%time
# Build one set per start-date value that actually occurs in the data.
# The range walks every integer between the two bounds, so most candidates
# are not present in the table and must be skipped.
# The set of present values is computed once instead of calling `.unique()`
# on every iteration; int/float values that compare equal also hash equal,
# so membership matches the original array check.
present_dates = set(csv['Fecha_Ejec_Inicio_Int'].unique())
# Mask selecting target-day rows with hubo_error == 0. Only the mask is
# hoisted: the frame itself is re-selected per call so make_set still gets a
# fresh object each time, in case it modifies its argument.
target_ok_mask = (csv['Fecha_Ejec_Inicio_Int'] == target_date) & (csv['hubo_error'] == 0)

days_data = []
for date in range(start_date, target_date + 1):
    if date in present_dates:
        day_rows = csv.loc[csv['Fecha_Ejec_Inicio_Int'] == date]
        days_data.append(make_set(day_rows, csv.loc[target_ok_mask]))

CPU times: user 52.4 s, sys: 32 ms, total: 52.4 s
Wall time: 52.4 s


# Preprocessing

In [7]:
from tqdm import tqdm

In [8]:
%%time
# Enrich every per-day frame with date-based and aggregate features.
# The helpers' return values are discarded, so they are relied on to modify
# each frame in place — TODO confirm in src.utils.
for day_frame in tqdm(days_data):
    add_date_diffs(day_frame, csv)
    add_days_count(day_frame, csv)
    add_date_features(day_frame)
    # NOTE(review): the meaning of the literal 3 is defined in src.utils —
    # consider naming it.
    add_mean_features(day_frame, csv, 3)

100%|██████████| 43/43 [09:30<00:00, 13.27s/it]

CPU times: user 7min 43s, sys: 1min 46s, total: 9min 30s
Wall time: 9min 30s





In [9]:
# Peek at the first per-day frame to check the columns added by preprocessing.
days_data[0].head()

Unnamed: 0,Fecha_Ejec_Inicio_Int,Id_Job,Id_Malla,duracion_int,DaysSinceMainframeOp,DaysCountMainframeOp,DiaSemana,promedio_por_dia,promedio_por_semana
0,20180401.0,@AK2ZF29,02FBFCL2,0.0,292,204,6,,1.0
1,20180401.0,@DF2FCL2,02FBFCL4,0.0,292,204,6,,53.0
2,20180401.0,@DF2FCL4,02FBFCL2,0.0,292,204,6,,24.0
3,20180401.0,@DF2IACV,02FBFCL2,0.0,292,247,6,1.5,1.0
4,20180401.0,@DF2LIAB,00FBFCL2,7.0,292,246,6,,7.5


# Testing with SVD

In [10]:
# Reading dumps
from surprise import dump

# dump.load returns a tuple and element 1 is kept here — presumably the
# fitted algorithm, following Surprise's (predictions, algo) convention;
# confirm against the Surprise docs.
# NOTE(review): Surprise dumps are pickle-based, so only load files from a
# trusted source.
svd = dump.load(PROCESSED/'svd.dump')[1]

In [11]:
%%time
# Attach the embedding and bias columns derived from the SVD object to every
# per-day frame (return value unused, so the helper is relied on to modify
# each frame in place).
for day_frame in tqdm(days_data):
    add_embeds_features(day_frame, svd)

100%|██████████| 43/43 [00:19<00:00,  2.19it/s]

CPU times: user 19.6 s, sys: 216 ms, total: 19.8 s
Wall time: 19.6 s





## Model

In [12]:
#from src.utils import run_model
from lightgbm import LGBMRegressor

In [13]:
# Column roles shared by the modelling cells.
id_cols = ['Fecha_Ejec_Inicio_Int', 'Id_Job', 'Id_Malla']  # dropped before fitting
target_col = 'duracion_int'  # value the regressor is fitted on
pred_col = 'Prediccion'      # placeholder column added to the last (test) frame
metric_names = ['mse']       # handed to run_model unchanged

In [14]:
# Add a zero-filled prediction column to the last frame only. The training
# loop uses the presence of this column to recognise the test frame.
days_data[-1][pred_col] = 0

In [15]:
%%time
# Expanding-window training: for every frame after the first, fit on all
# earlier frames, use that frame for validation, and predict the last (test)
# frame. One [predictions, model] pair is collected per window.
# NOTE(review): this cell is repeated verbatim in the NMF section further
# down; a shared helper in src.utils would remove the duplication.
non_feature_cols = id_cols + [target_col]
results = None
output = []
for n_train_days, val in enumerate(days_data[1:], start=1):
    train = pd.concat(days_data[:n_train_days], axis=0)
    test = days_data[-1]

    X_train = train.drop(non_feature_cols, axis=1)
    y_train = train[target_col]
    if pred_col not in val.columns:
        X_val = val.drop(non_feature_cols, axis=1)
        y_val = val[target_col]
    else:
        # The last window validates on the test frame itself, so no
        # validation set is passed.
        X_val, y_val = None, None
    X_test = test.drop(non_feature_cols + [pred_col], axis=1)
    # run_model's first return value is treated downstream as the
    # predictions for X_test; `results` is threaded through successive calls.
    y_test, _, results, model = run_model(
            LGBMRegressor(n_estimators=120),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results,
            early_stopping=True)
    output.append([y_test, model])

[1]	valid_0's l2: 1.31688e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.0824e+08
[3]	valid_0's l2: 8.93405e+07
[4]	valid_0's l2: 7.42499e+07
[5]	valid_0's l2: 6.18691e+07
[6]	valid_0's l2: 5.19485e+07
[7]	valid_0's l2: 4.38725e+07
[8]	valid_0's l2: 3.73902e+07
[9]	valid_0's l2: 3.21387e+07
[10]	valid_0's l2: 2.79124e+07
[11]	valid_0's l2: 2.45033e+07
[12]	valid_0's l2: 2.17466e+07
[13]	valid_0's l2: 1.9554e+07
[14]	valid_0's l2: 1.77746e+07
[15]	valid_0's l2: 1.6376e+07
[16]	valid_0's l2: 1.52313e+07
[17]	valid_0's l2: 1.43489e+07
[18]	valid_0's l2: 1.36227e+07
[19]	valid_0's l2: 1.30731e+07
[20]	valid_0's l2: 1.26156e+07
[21]	valid_0's l2: 1.22841e+07
[22]	valid_0's l2: 1.20028e+07
[23]	valid_0's l2: 1.18072e+07
[24]	valid_0's l2: 1.16361e+07
[25]	valid_0's l2: 1.15256e+07
[26]	valid_0's l2: 1.14245e+07
[27]	valid_0's l2: 1.13679e+07
[28]	valid_0's l2: 1.13086e+07
[29]	valid_0's l2: 1.12797e+07
[30]	valid_0's l2: 1.12648e+07
[31]	valid_0's l2: 1

In [16]:
# Importances of `model` as left by the final loop iteration only; earlier
# window models are kept in `output`. One value per feature column,
# presumably in training-column order — confirm.
model.feature_importances_

array([487, 428, 199, 487, 510, 259, 226, 130, 124, 151, 134, 152, 117,
       196])

In [17]:
# Column names of a per-day frame, for mapping the importance positions above
# back to features (the id and target columns are not model inputs).
days_data[0].columns

Index(['Fecha_Ejec_Inicio_Int', 'Id_Job', 'Id_Malla', 'duracion_int',
       'DaysSinceMainframeOp', 'DaysCountMainframeOp', 'DiaSemana',
       'promedio_por_dia', 'promedio_por_semana', 'id_job_bias',
       'id_mesh_bias', 'fail_bias', 'f_job_0', 'f_mesh_0', 'f_job_1',
       'f_mesh_1', 'f_job_2', 'f_mesh_2'],
      dtype='object')

In [18]:
# Element-wise average of the test-frame predictions from all window models.
y_test = np.mean([preds for preds, _model in output], axis=0)

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
# With 3 factors: MSE of the averaged predictions against the test frame's
# target column.
mean_squared_error(days_data[-1][target_col], y_test)

434083.6687607116

In [82]:
# With 15 factors
# NOTE(review): this cell's execution count is out of sequence, so its saved
# output presumably comes from an earlier run with a different dump; re-running
# it as-is evaluates the same expression as the 3-factor cell above.
mean_squared_error(days_data[-1][target_col], y_test)

3068806.5515911337

# Testing with NMF

In [21]:
# Load the NMF dump; element 1 of the tuple returned by dump.load is kept
# (presumably the fitted algorithm — confirm against the Surprise docs).
# NOTE(review): Surprise dumps are pickle-based, so only load files from a
# trusted source.
nmf = dump.load(PROCESSED/'nmf.dump')[1]

In [22]:
%%time
# Attach the embedding and bias columns derived from the NMF object.
# NOTE(review): the frames already carry the SVD-derived columns at this
# point; the importance arrays have 14 entries in both sections, which
# suggests the columns are overwritten rather than appended — confirm.
for day_frame in tqdm(days_data):
    add_embeds_features(day_frame, nmf)

100%|██████████| 43/43 [00:19<00:00,  2.18it/s]

CPU times: user 19.6 s, sys: 152 ms, total: 19.8 s
Wall time: 19.7 s





## Model

In [23]:
%%time
# Expanding-window training on the NMF-based features: for every frame after
# the first, fit on all earlier frames, use that frame for validation, and
# predict the last (test) frame.
# NOTE(review): this cell repeats the SVD section's training cell verbatim;
# a shared helper in src.utils would remove the duplication.
non_feature_cols = id_cols + [target_col]
results = None
output = []
for n_train_days, val in enumerate(days_data[1:], start=1):
    train = pd.concat(days_data[:n_train_days], axis=0)
    test = days_data[-1]

    X_train = train.drop(non_feature_cols, axis=1)
    y_train = train[target_col]
    if pred_col not in val.columns:
        X_val = val.drop(non_feature_cols, axis=1)
        y_val = val[target_col]
    else:
        # The last window validates on the test frame itself, so no
        # validation set is passed.
        X_val, y_val = None, None
    X_test = test.drop(non_feature_cols + [pred_col], axis=1)
    # run_model's first return value is treated downstream as the
    # predictions for X_test; `results` is threaded through successive calls.
    y_test, _, results, model = run_model(
            LGBMRegressor(n_estimators=120),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results,
            early_stopping=True)
    output.append([y_test, model])

[1]	valid_0's l2: 1.31688e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.08261e+08
[3]	valid_0's l2: 8.93595e+07
[4]	valid_0's l2: 7.42694e+07
[5]	valid_0's l2: 6.18849e+07
[6]	valid_0's l2: 5.19657e+07
[7]	valid_0's l2: 4.38876e+07
[8]	valid_0's l2: 3.73965e+07
[9]	valid_0's l2: 3.2135e+07
[10]	valid_0's l2: 2.78991e+07
[11]	valid_0's l2: 2.44893e+07
[12]	valid_0's l2: 2.17299e+07
[13]	valid_0's l2: 1.95373e+07
[14]	valid_0's l2: 1.77602e+07
[15]	valid_0's l2: 1.63611e+07
[16]	valid_0's l2: 1.52217e+07
[17]	valid_0's l2: 1.43079e+07
[18]	valid_0's l2: 1.36088e+07
[19]	valid_0's l2: 1.30325e+07
[20]	valid_0's l2: 1.2605e+07
[21]	valid_0's l2: 1.22457e+07
[22]	valid_0's l2: 1.19895e+07
[23]	valid_0's l2: 1.17976e+07
[24]	valid_0's l2: 1.1628e+07
[25]	valid_0's l2: 1.14912e+07
[26]	valid_0's l2: 1.14075e+07
[27]	valid_0's l2: 1.1331e+07
[28]	valid_0's l2: 1.12897e+07
[29]	valid_0's l2: 1.12475e+07
[30]	valid_0's l2: 1.12339e+07
[31]	valid_0's l2: 1.

In [24]:
# Importances of `model` as left by the final loop iteration of the NMF run
# only; one value per feature column, presumably in training-column order —
# confirm.
model.feature_importances_

array([520, 437, 226, 489, 491, 215, 157, 119, 151, 152, 134, 175, 191,
       143])

In [25]:
# Element-wise average of the test-frame predictions from all window models.
y_test = np.mean([preds for preds, _model in output], axis=0)

In [26]:
# 3 factors: MSE of the averaged predictions against the test frame's target
# column.
mean_squared_error(days_data[-1][target_col], y_test)

425641.63476211013

In [54]:
# 5 factors
# NOTE(review): this cell's execution count is out of sequence, so its saved
# output presumably comes from an earlier run with a different NMF dump;
# re-running it as-is evaluates the same expression as the 3-factor cell.
mean_squared_error(days_data[-1][target_col], y_test)

417082.4579756669

In [100]:
# 15 factors
# NOTE(review): this cell's execution count is out of sequence, so its saved
# output presumably comes from an earlier run with a different NMF dump;
# re-running it as-is evaluates the same expression as the 3-factor cell.
mean_squared_error(days_data[-1][target_col], y_test)

3071225.6929158787