In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pandas_summary import DataFrameSummary
from pathlib import Path

import os, math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Display options: allow very wide / long frames to render while exploring.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()
# Switch to the project root. The relative `data/` paths defined later resolve
# against this directory, and the `src.utils` import that follows presumably
# depends on it as well. The location can be overridden with the
# ANALYSIS_PROJECT_ROOT environment variable; the original path stays the default.
PROJECT_ROOT = os.environ.get('ANALYSIS_PROJECT_ROOT',
                              '/home/krivas/projects/analysis-project/')
os.chdir(PROJECT_ROOT)
from src.utils import convert_int, convert_date,\
                        add_mean_features, add_date_diffs, add_days_count, add_date_features,\
                        make_set, run_model, apply_cats


In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW = DATA.joinpath('raw')
PROCESSED = DATA.joinpath('processed')

# Reading data

In [3]:
# Load the HistoricoJob table from the processed-data folder.
# NOTE(review): the name `csv` would shadow the standard-library `csv` module
# if that module were ever imported in this notebook.
csv = pd.read_csv(PROCESSED/'HistoricoJob.csv')

In [4]:
# List the columns available in the loaded table.
csv.columns

Index(['Id_HistoricoJob', 'Id_Job', 'Id_Malla', 'Fecha_Carga_Scheduler',
       'Fecha_Ejec_Inicio', 'Fecha_Ejec_Fin', 'Duracion', 'Promedio', 'Agente',
       'Mxrc', 'Maxcmpc', 'Grupo', 'Force_Complete', 'CCF', 'Estado',
       'duracion_int', 'promedio_int', 'Fecha_Ejec_Inicio_Int',
       'Hora_Ejec_Inicio_Int', 'Fecha_Ejec_Fin_Int', 'Hora_Ejec_Fin_Int',
       'DiaSemana', 'hubo_error'],
      dtype='object')

In [5]:
# Dates are handled as plain integers. The window runs from a fixed start
# value through the largest Fecha_Ejec_Inicio_Int present in the data.
start_date = 20180401
target_date = int(csv['Fecha_Ejec_Inicio_Int'].max())

In [6]:
# Display the end of the window (the largest start date found in the data).
target_date

20180515

In [7]:
%%time
# Build one frame per day in [start_date, target_date].
# The dates are integers, so the range also produces values that are absent
# from the data; only values actually present in Fecha_Ejec_Inicio_Int are kept.
# Both lookups below are loop-invariant and were previously recomputed on every
# iteration of the range.
known_dates = csv.Fecha_Ejec_Inicio_Int.unique()
target_ok_mask = (csv['Fecha_Ejec_Inicio_Int'] == target_date) & (csv['hubo_error'] == 0)

days_data = []
for date in range(start_date, target_date + 1):
    if date in known_dates:
        # First argument: the rows of this day. Second argument: the rows of
        # target_date with hubo_error == 0. The second slice is re-taken per call
        # so each make_set invocation still receives its own frame, as before.
        days_data.append(make_set(csv.loc[csv['Fecha_Ejec_Inicio_Int'] == date],
                                  csv.loc[target_ok_mask]))

CPU times: user 52.6 s, sys: 0 ns, total: 52.6 s
Wall time: 52.6 s


In [8]:
# Peek at the frame built for the last day of the window.
days_data[-1].head()

Unnamed: 0,Fecha_Ejec_Inicio_Int,Id_Job,Id_Malla,duracion_int
0,20180515.0,@AK2ZF29,02FBFCL2,0.0
1,20180515.0,@DF2FCL2,02FBFCL4,57.0
2,20180515.0,@DF2FCL4,02FBFCL2,18.0
3,20180515.0,@DF2IACV,02FBFCL2,1.0
4,20180515.0,@DF2LIAB,00FBFCL2,11.0


# Preprocessing

In [9]:
from tqdm import tqdm

In [32]:
%%time
# Enrich every per-day frame with engineered features. The helpers' return
# values are discarded, so they are expected to modify `temp` in place —
# NOTE(review): confirm against src.utils, and whether call order matters.
for temp in tqdm(days_data):
    add_date_diffs(temp, csv)
    add_days_count(temp, csv)
    add_date_features(temp)
    # NOTE(review): the meaning of the literal 3 is not visible here; consider a named constant.
    add_mean_features(temp, csv, 3)

100%|██████████| 43/43 [02:55<00:00,  4.09s/it]

CPU times: user 2min 3s, sys: 53.1 s, total: 2min 56s
Wall time: 2min 55s





In [11]:
# Inspect the first day's frame after the feature-engineering pass.
days_data[0].head()

Unnamed: 0,Fecha_Ejec_Inicio_Int,Id_Job,Id_Malla,duracion_int,DaysSinceMainframeOp,DaysSinceMainframeOpFail,DaysCountMainframeOp,DaysCountMainframeOpFail,DiaSemana,promedio_por_dia,promedio_por_semana
0,20180401.0,@AK2ZF29,02FBFCL2,0.0,292,225,204,1,6,,1.0
1,20180401.0,@DF2FCL2,02FBFCL4,0.0,292,274,204,0,6,,53.0
2,20180401.0,@DF2FCL4,02FBFCL2,0.0,292,274,204,0,6,,24.0
3,20180401.0,@DF2IACV,02FBFCL2,0.0,292,274,247,0,6,1.5,1.0
4,20180401.0,@DF2LIAB,00FBFCL2,7.0,292,274,246,0,6,,7.5


# Model

In [24]:
#from src.utils import run_model
from lightgbm import LGBMRegressor

In [25]:
# Column roles used when assembling the train / validation / test matrices.
target_col = "duracion_int"    # regression target
pred_col = "Prediccion"        # marker column added only to the test frame
id_cols = [                    # identifier columns, dropped before fitting
    "Fecha_Ejec_Inicio_Int",
    "Id_Job",
    "Id_Malla",
]
metric_names = ["mse"]         # forwarded to run_model

In [26]:
# Add a zero-filled prediction column to the last day's frame; the training
# loop checks for this column to recognise the test frame.
days_data[-1][pred_col] = 0

In [27]:
%%time
# Expanding-window training: at step i the model is fit on days_data[:i+1],
# validated on the following day, and always predicts the last day.
results = None
output = []
non_feature_cols = id_cols + [target_col]
for i, w in enumerate(days_data[1:]):
    train = pd.concat(days_data[:i+1], axis=0)
    val = w
    test = days_data[-1]

    X_train = train.drop(non_feature_cols, axis=1)
    y_train = train[target_col]

    # Only the last frame carries pred_col, so on the final step the test frame
    # is also the validation candidate and no validation set is passed.
    if pred_col not in val.columns:
        X_val = val.drop(non_feature_cols, axis=1)
        y_val = val[target_col]
    else:
        X_val = y_val = None

    X_test = test.drop(non_feature_cols + [pred_col], axis=1)
    y_test, _, results, model = run_model(
        LGBMRegressor(n_estimators=120),
        X_train, y_train, X_val, y_val, X_test,
        metric_names, results,
        early_stopping=True,
    )
    # Keep the test predictions and the fitted model of every step.
    output.append([y_test, model])

[1]	valid_0's l2: 1.31688e+08
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 1.08241e+08
[3]	valid_0's l2: 8.93411e+07
[4]	valid_0's l2: 7.42452e+07
[5]	valid_0's l2: 6.18631e+07
[6]	valid_0's l2: 5.19417e+07
[7]	valid_0's l2: 4.38561e+07
[8]	valid_0's l2: 3.73684e+07
[9]	valid_0's l2: 3.21092e+07
[10]	valid_0's l2: 2.78684e+07
[11]	valid_0's l2: 2.44698e+07
[12]	valid_0's l2: 2.17121e+07
[13]	valid_0's l2: 1.95213e+07
[14]	valid_0's l2: 1.77341e+07
[15]	valid_0's l2: 1.63065e+07
[16]	valid_0's l2: 1.51872e+07
[17]	valid_0's l2: 1.43084e+07
[18]	valid_0's l2: 1.3581e+07
[19]	valid_0's l2: 1.30007e+07
[20]	valid_0's l2: 1.25705e+07
[21]	valid_0's l2: 1.22125e+07
[22]	valid_0's l2: 1.19547e+07
[23]	valid_0's l2: 1.17605e+07
[24]	valid_0's l2: 1.1591e+07
[25]	valid_0's l2: 1.14814e+07
[26]	valid_0's l2: 1.13818e+07
[27]	valid_0's l2: 1.13219e+07
[28]	valid_0's l2: 1.12666e+07
[29]	valid_0's l2: 1.12273e+07
[30]	valid_0's l2: 1.12102e+07
[31]	valid_0's l2: 

In [28]:
# `model` is whatever the training loop bound last, i.e. the model from its
# final iteration; the importances shown belong to that model only.
model.feature_importances_

array([679, 528, 817, 269, 628, 679])

In [30]:
# Ensemble prediction: element-wise mean of the test predictions collected
# from every training step (first item of each [prediction, model] pair).
y_test = np.mean([preds for preds, _fitted in output], axis=0)

In [18]:
from sklearn.metrics import mean_squared_error

In [61]:
# MSE of the averaged prediction against the last day's duracion_int values.
mean_squared_error(days_data[-1].duracion_int, y_test)

389074.1739319008

In [62]:
# NOTE(review): mean_squared_error already averages over the samples, so
# dividing by len(y_test) again yields neither MSE nor RMSE — confirm which
# metric was intended (e.g. a square root for RMSE) before relying on this value.
mean_squared_error(days_data[-1:][0].duracion_int, y_test) / len(y_test)

183.17993122970847