In [373]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pandas_summary import DataFrameSummary
from pathlib import Path

import os, math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()
# Work from the project root so that the `src` package below is importable.
# The root can be overridden through the ANALYSIS_PROJECT_ROOT environment
# variable; the original machine-specific path is kept as the default so the
# notebook behaves exactly as before when the variable is not set.
os.chdir(os.environ.get('ANALYSIS_PROJECT_ROOT', '/home/krivas/projects/analysis-project/'))
from src.utils import (
    convert_int, convert_date,
    add_mean_features, add_date_diffs, add_days_count, add_date_features,
    make_set, run_model, apply_cats,
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
# Data directories, relative to the current working directory.
DATA = Path('data')
RAW, PROCESSED = DATA / 'raw', DATA / 'processed'

# Reading data

In [303]:
# Load HistoricoJob.csv from the processed-data directory into the `csv` frame
# that every later cell reads from.
csv = pd.read_csv(PROCESSED/'HistoricoJob.csv')

In [552]:
# Persist the frame back to the file the loading cell reads.
# index=False keeps the row index out of the file; otherwise the next
# read_csv would pick it up as an extra unnamed column.
csv.to_csv(PROCESSED/'HistoricoJob.csv', index=False)

In [553]:
# Display the column index of the loaded frame.
csv.columns

Index(['Id_HistoricoJob', 'Id_Job', 'Id_Malla', 'Fecha_Carga_Scheduler',
       'Fecha_Ejec_Inicio', 'Fecha_Ejec_Fin', 'Duracion', 'Promedio', 'Agente',
       'Mxrc', 'Maxcmpc', 'Grupo', 'Force_Complete', 'CCF', 'Estado',
       'duracion_int', 'promedio_int', 'Fecha_Ejec_Inicio_Int',
       'Hora_Ejec_Inicio_Int', 'Fecha_Ejec_Fin_Int', 'Hora_Ejec_Fin_Int',
       'DiaSemana', 'Termino'],
      dtype='object')

In [446]:
# Integer date codes compared against the Fecha_Ejec_Inicio_Int column
# (presumably YYYYMMDD — confirm). start_date is the first day considered;
# target_date is the day selected as the second argument to make_set.
start_date, target_date = 20180401, 20180521

In [447]:
%%time
days_data = []
for date in range(start_date, target_date):
    if date in csv.Fecha_Ejec_Inicio_Int.unique():
        days_data.append(make_set(csv.loc[csv['Fecha_Ejec_Inicio_Int'] == date], csv.loc[csv['Fecha_Ejec_Inicio_Int'] == target_date]))

CPU times: user 1min 12s, sys: 52 ms, total: 1min 12s
Wall time: 1min 18s


# Preprocessing

In [441]:
from tqdm import tqdm

In [475]:
%%time
for temp in tqdm(days_data):
    add_date_diffs(temp, csv)
    add_days_count(temp, csv)
    add_date_features(temp)
    add_mean_features(temp, csv, 3)

100%|██████████| 43/43 [09:56<00:00, 13.87s/it]

CPU times: user 8min 5s, sys: 1min 50s, total: 9min 56s
Wall time: 9min 56s





In [501]:
# Preview the first per-day frame after preprocessing.
days_data[0].head()

Unnamed: 0,Fecha_Ejec_Inicio_Int,Id_Job,Id_Malla,duracion_int,DaysSinceMainframeOp,DaysCountMainframeOp,DiaSemana,promedio_por_dia,promedio_por_semana,DaysCountMainframeOpPerWeek
0,20180401.0,@DF2LIAB,00FBFCL2,7.0,292,246,6,,7.5,4
1,20180401.0,@DF2NO92,02FMIH92,0.0,292,593,6,0.0,0.0,14
2,20180401.0,@E0WA949,02WA192H,0.0,121,2,6,,,0
3,20180401.0,@E0WA969,02WA192H,0.0,121,2,6,,,0
4,20180401.0,@E2A6961,02A6H993,4.0,292,296,6,5.0,5.0,7


In [468]:
# cat_cols = ['DiaSemana']

In [469]:
# %%time
# for w in days_data:
#     for col in cat_cols:
#         w[col] = w[col].astype('category').cat.codes

# Model

In [502]:
#from src.utils import run_model
from lightgbm import LGBMRegressor

In [503]:
# Column roles used when assembling the model inputs.
id_cols = ['Fecha_Ejec_Inicio_Int', 'Id_Job', 'Id_Malla']  # dropped from the feature matrices
target_col, pred_col = 'duracion_int', 'Prediccion'
metric_names = ['mse']  # forwarded to run_model

In [504]:
# Add a zero-filled prediction column to the last frame; the training loop
# recognises the test frame by the presence of this column.
days_data[-1][pred_col] = 0

In [505]:
%%time
results = None
output = []
for i, w in enumerate(days_data[1:]):
    train, val, test = pd.concat(days_data[:i+1], axis=0), w, days_data[-1]
    
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
    X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
    y_test, _, results, model = run_model(
            LGBMRegressor(n_estimators=120),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results,
            early_stopping=True)
    output.append([y_test, model])

[1]	valid_0's l2: 1.48065e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.21024e+08
[3]	valid_0's l2: 9.90256e+07
[4]	valid_0's l2: 8.14216e+07
[5]	valid_0's l2: 6.71241e+07
[6]	valid_0's l2: 5.57149e+07
[7]	valid_0's l2: 4.6457e+07
[8]	valid_0's l2: 3.90957e+07
[9]	valid_0's l2: 3.31369e+07
[10]	valid_0's l2: 2.84231e+07
[11]	valid_0's l2: 2.46168e+07
[12]	valid_0's l2: 2.16164e+07
[13]	valid_0's l2: 1.92061e+07
[14]	valid_0's l2: 1.73213e+07
[15]	valid_0's l2: 1.58139e+07
[16]	valid_0's l2: 1.46468e+07
[17]	valid_0's l2: 1.3723e+07
[18]	valid_0's l2: 1.30167e+07
[19]	valid_0's l2: 1.2463e+07
[20]	valid_0's l2: 1.20504e+07
[21]	valid_0's l2: 1.17289e+07
[22]	valid_0's l2: 1.14937e+07
[23]	valid_0's l2: 1.13185e+07
[24]	valid_0's l2: 1.1195e+07
[25]	valid_0's l2: 1.11137e+07
[26]	valid_0's l2: 1.10536e+07
[27]	valid_0's l2: 1.10261e+07
[28]	valid_0's l2: 1.10065e+07
[29]	valid_0's l2: 1.10005e+07
[30]	valid_0's l2: 1.10153e+07
[31]	valid_0's l2: 1.

In [458]:
# Feature importances of `model`, i.e. the model from the last executed
# training iteration.
model.feature_importances_

array([ 246, 1611,  403,  694,  646])

In [507]:
# Ensemble by averaging the per-window predictions element-wise
# (each entry of `output` is [predictions, model]).
window_predictions = [preds for preds, _fitted in output]
y_test = np.mean(window_predictions, axis=0)

In [349]:
from sklearn.metrics import mean_squared_error

In [479]:
# MSE of the averaged predictions against the observed durations in the last frame.
mean_squared_error(days_data[-1]['duracion_int'], y_test)

367096.15862610535

In [512]:
# MSE of the averaged predictions divided by the number of predictions.
# NOTE(review): mean_squared_error already averages over samples, so this
# divides by the sample count a second time — confirm what this figure is
# meant to report (e.g. RMSE may have been intended).
mean_squared_error(days_data[-1:][0].duracion_int, y_test) / len(y_test)

179.34868969857297