In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pandas_summary import DataFrameSummary
from pathlib import Path

import os, math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Raise pandas' display limits so wide/long frames are rendered without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()
# Work from the project root so the relative `data/` paths and the `src` package
# import resolve. The location can be overridden through the
# ANALYSIS_PROJECT_ROOT environment variable; the original path stays the default.
os.chdir(os.environ.get('ANALYSIS_PROJECT_ROOT', '/home/krivas/projects/analysis-project/'))
from src.utils import convert_int, convert_date,\
                        add_median_features, add_date_diffs, add_days_count, add_date_features,\
                        make_set, run_model, apply_cats


In [2]:
from sklearn.metrics import mean_squared_error

In [3]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, PROCESSED = DATA / 'raw', DATA / 'processed'

# Reading data

In [4]:
# Load HistoricoJob.csv from the processed data folder.
historico_job_path = PROCESSED / 'HistoricoJob.csv'
csv = pd.read_csv(historico_job_path)

In [5]:
# Display the column names of the loaded frame.
csv.columns

Index(['Id_HistoricoJob', 'Id_Job', 'Id_Malla', 'Fecha_Carga_Scheduler',
       'Fecha_Ejec_Inicio', 'Fecha_Ejec_Fin', 'Duracion', 'Promedio', 'Agente',
       'Mxrc', 'Maxcmpc', 'Grupo', 'Force_Complete', 'CCF', 'Estado',
       'duracion_int', 'promedio_int', 'Fecha_Ejec_Inicio_Int',
       'Hora_Ejec_Inicio_Int', 'Fecha_Ejec_Fin_Int', 'Hora_Ejec_Fin_Int',
       'DiaSemana', 'hubo_error'],
      dtype='object')

In [6]:
# Day range to process, expressed as integer day keys from `Fecha_Ejec_Inicio_Int`
# (values look like YYYYMMDD, e.g. 20180401 — TODO confirm the encoding).
start_date = 20180401
# Latest day key present in the data.
target_date = int(csv['Fecha_Ejec_Inicio_Int'].max())

In [7]:
%%time
# Build one working frame per day key that is actually present in the data.
# Loop-invariant work is hoisted: the set of available day keys and the mask
# selecting error-free rows of the target day are computed once instead of on
# every iteration.
available_dates = set(csv['Fecha_Ejec_Inicio_Int'].dropna().unique())
target_ok_mask = (csv['Fecha_Ejec_Inicio_Int'] == target_date) & (csv['hubo_error'] == 0)

days_data = []
for date in range(start_date, target_date + 1):
    # The integer range may also walk through values that never occur in the
    # data; the membership test skips them.
    if date not in available_dates:
        continue
    day_rows = csv.loc[csv['Fecha_Ejec_Inicio_Int'] == date]
    # Re-slice with the precomputed mask so make_set still receives a fresh
    # frame on every call, exactly as before.
    target_ok_rows = csv.loc[target_ok_mask]
    days_data.append(make_set(day_rows, target_ok_rows))

CPU times: user 50.3 s, sys: 68 ms, total: 50.3 s
Wall time: 50.3 s


# Preprocessing

In [8]:
from tqdm import tqdm

In [9]:
%%time
# Run the feature helpers imported from src.utils on every per-day frame.
# Their return values are discarded, so they are presumably mutating `temp`
# in place — TODO confirm against src/utils.
for temp in tqdm(days_data):
    add_date_diffs(temp, csv)
    add_days_count(temp, csv)
    add_date_features(temp)
    # The trailing 3 is an opaque argument of add_median_features — TODO document its meaning.
    add_median_features(temp, csv, 3)

100%|██████████| 43/43 [15:56<00:00, 22.24s/it]

CPU times: user 13min 51s, sys: 2min 5s, total: 15min 56s
Wall time: 15min 56s





In [10]:
# Reading dumps
from surprise import dump

# NOTE: surprise dumps are pickle files — only load files from a trusted source.
# dump.load returns a pair; only its second element is kept here.
svd_dump = dump.load(PROCESSED/'svd_15.dump')
svd = svd_dump[1]

In [11]:
# Adding SVD-derived features: latent-factor products and the model's estimate.
# Raw id -> inner id lookup tables of the trainset the SVD model was fitted on
# (private surprise attributes, kept as in the original).
uid = svd.trainset._raw2inner_id_users
iid = svd.trainset._raw2inner_id_items

for temp in tqdm(days_data):
    # Latent factor vector of every row's job (looked up in `pu`) and mesh (in `qi`).
    fact_job = np.array([svd.pu[uid[job_id]] for job_id in temp['Id_Job']])
    fact_mesh = np.array([svd.qi[iid[mesh_id]] for mesh_id in temp['Id_Malla']])
    # Collapse each factor vector to a single scalar by multiplying its components.
    temp['f_job'] = fact_job.prod(axis=1)
    temp['f_mesh'] = fact_mesh.prod(axis=1)
    # Model estimate for every (job, mesh) pair. `.est` names the field that the
    # original read positionally as index 3 of the prediction tuple, and zipping
    # the two id columns avoids building a Series per row via apply(axis=1).
    temp['fail_bias'] = [
        svd.predict(job_id, mesh_id).est
        for job_id, mesh_id in zip(temp['Id_Job'], temp['Id_Malla'])
    ]
    

100%|██████████| 43/43 [00:16<00:00,  2.63it/s]


In [12]:
# Preview the first rows of the first per-day frame, including the new feature columns.
days_data[0].head()

Unnamed: 0,Fecha_Ejec_Inicio_Int,Id_Job,Id_Malla,duracion_int,DaysSinceMainframeOp,DaysSinceMainframeOpFail,DaysCountMainframeOp,DaysCountMainframeOpFail,DiaSemana,promedio_por_dia,promedio_por_semana,promedio_por_mes,promedio_por_semana_anterior,promedio_por_dia_anterior,f_job,f_mesh,fail_bias
0,20180401.0,@DF2LIAB,00FBFCL2,7.0,292,274,246,0,6,,7.5,9.0,,12.0,2.3600199999999997e-19,6.554049e-22,0.046843
1,20180401.0,@DF2NO92,02FMIH92,0.0,292,274,593,0,6,0.0,0.0,0.0,0.0,0.0,-5.027472e-22,-7.863664e-19,0.068794
2,20180401.0,@E0WA949,02WA192H,0.0,121,274,2,0,6,,,1.0,,,1.603025e-16,7.009739999999999e-20,0.108905
3,20180401.0,@E0WA969,02WA192H,0.0,121,274,2,0,6,,,1.0,,,4.318834e-18,7.009739999999999e-20,0.023435
4,20180401.0,@E2A6961,02A6H993,4.0,292,274,296,0,6,5.0,5.0,4.0,6.0,6.0,1.0728790000000001e-17,1.411955e-20,0.001501


# Grid Search

In [13]:
from src.utils import run_model
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import lightgbm

In [14]:
# Column roles and metric names.
id_cols = ['Fecha_Ejec_Inicio_Int', 'Id_Job', 'Id_Malla']
target_col = 'duracion_int'
pred_col = 'Prediccion'
metric_names = ['mse']

# Initialise the prediction column with 0 on the last per-day frame only.
days_data[-1][pred_col] = 0

In [15]:
%%time

# Every day except the last one is used for training; the last day is held out.
# NOTE(review): `val` and `test` are bound to the very same frame (the last day).
train = pd.concat(days_data[:-1], axis=0)
val = test = days_data[-1]

# Features are whatever columns remain once identifiers and the target are removed.
X_train = train.drop(id_cols + [target_col], axis=1)
y_train = train[target_col]
# The held-out frame additionally carries the prediction column, which is dropped too.
X_test = test.drop(id_cols + [target_col, pred_col], axis=1)

CPU times: user 72 ms, sys: 56 ms, total: 128 ms
Wall time: 127 ms


In [21]:
# Hyper-parameter candidates explored exhaustively (2 x 2 x 2 x 2 combinations).
param_grid = dict(
    n_estimators=[200, 400],
    boosting_type=['dart', 'gbdt'],
    num_leaves=[31, 50],
    max_depth=[30, 50],
)

# Base regressor: single-threaded, with a fixed seed.
estimator = LGBMRegressor(n_jobs=1, random_state=42)

# 5-fold grid search; no `scoring` is passed, so the estimator's default score is used.
gbm = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)
gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'boosting_type': 'gbdt', 'max_depth': 30, 'n_estimators': 400, 'num_leaves': 50}


In [22]:
# Display the parameter combination selected by the grid search.
gbm.best_params_

{'boosting_type': 'gbdt',
 'max_depth': 30,
 'n_estimators': 400,
 'num_leaves': 50}

# Cross validation with best params

In [2]:
from sklearn.metrics import mean_absolute_error
# Training matrix in LightGBM's native Dataset format, keeping the feature column names.
dftrainLGB = lightgbm.Dataset(data = X_train, label = y_train, feature_name = list(X_train))

# Reuse the grid-search winner instead of re-typing it: the hand-copied dict used
# max_depth=50 although the search selected max_depth=30.
# A new dict is built, so later changes to `params` do not touch gbm.best_params_.
params = {**gbm.best_params_, 'objective': 'regression', 'random_state': 42}

In [47]:
# 3-fold, non-stratified cross-validation reporting MAE, with early stopping
# configured at 30 rounds.
# NOTE(review): `params` also carries `n_estimators`, an alias of the boosting-round
# count — confirm which of it and `num_boost_round` takes effect in the installed LightGBM.
cv_settings = dict(
    num_boost_round=100,
    nfold=3,
    metrics='mae',
    stratified=False,
    early_stopping_rounds=30,
)
cv_results = lightgbm.cv(params, dftrainLGB, **cv_settings)

In [48]:
# Display the dictionary returned by lightgbm.cv (per-round metric values).
cv_results

{'l1-mean': [2375.773512545981,
  2144.7419894305726,
  1937.4104068137551,
  1750.9921634585419,
  1583.5519959484702,
  1433.0686383181173,
  1297.9906916096327,
  1176.7763674805028,
  1067.8997514619234,
  970.1451456398308,
  882.5124877242997,
  803.7671725916639,
  733.0707843010324,
  669.628441388504,
  612.7016093796874,
  561.7172411978004,
  515.9886044894183,
  475.0457262117722,
  438.3683006392809,
  405.37024668939284,
  375.8359974063321,
  349.39802145005706,
  325.89979062585263,
  304.65901355260866,
  285.65990914977357,
  268.618930902525,
  253.49575044104367,
  239.8967056680467,
  227.58579720372418,
  216.66152637740194,
  207.01557484753525,
  198.3190678595308,
  190.49348894503035,
  183.46574112911358,
  177.2300945379925,
  171.5914896825358,
  166.68846900148682,
  162.29405641105464,
  158.26121277375708,
  154.790559904289,
  151.71532024798535,
  148.8275779813142,
  146.15392300944401,
  143.96682891654152,
  141.8800810407672,
  140.10531725959606,
