In [52]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pandas_summary import DataFrameSummary
from pathlib import Path

import os, math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Widen the pandas display limits so wide/long frames are shown without truncation,
# and apply seaborn's default plot styling.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# The notebook uses paths relative to the project root (e.g. Path('data')), so the
# working directory is switched to it here. The location can be overridden with the
# ANALYSIS_PROJECT_ROOT environment variable; the default is the original hard-coded
# path, so existing setups behave exactly as before.
PROJECT_ROOT = os.environ.get('ANALYSIS_PROJECT_ROOT',
                              '/home/krivas/projects/analysis-project/')
os.chdir(PROJECT_ROOT)
from src.utils import convertInt, convertDate, add_datediffs, add_dayscount, make_set


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
# Data directories, expressed relative to the current working directory.
DATA = Path('data')
RAW = DATA.joinpath('raw')
PROCESSED = DATA.joinpath('processed')

# Reading data

In [3]:
# Load HistoricoJob.csv from the processed-data directory.
# NOTE: the variable name `csv` would shadow the standard-library module of the same
# name if that were ever imported; it is kept because later cells reference it.
history_csv_path = PROCESSED / 'HistoricoJob.csv'
csv = pd.read_csv(history_csv_path)

In [17]:
# Show the column labels of the loaded frame.
csv.columns

Index(['Id_HistoricoJob', 'Id_Job', 'Id_Malla', 'Fecha_Carga_Scheduler',
       'Fecha_Ejec_Inicio', 'Fecha_Ejec_Fin', 'Duracion', 'Promedio', 'Agente',
       'Mxrc', 'Maxcmpc', 'Grupo', 'Force_Complete', 'CCF', 'Estado',
       'duracion_int', 'promedio_int', 'Fecha_Ejec_Inicio_Int',
       'Hora_Ejec_Inicio_Int', 'Fecha_Ejec_Fin_Int', 'Hora_Ejec_Fin_Int'],
      dtype='object')

In [25]:
# Dates are encoded as YYYYMMDD integers, listed here in chronological order.
start_date = 20180420   # first day considered when building the per-day sets
stop_date = 20180511    # not referenced by the cells visible in this notebook section
target_date = 20180521  # day passed as the second argument to make_set; exclusive end of the day loop

In [59]:
%%time
# Build one set per day that has rows in `csv`, pairing that day's rows with the
# rows of `target_date` via make_set (defined in src.utils).
days_data = []

# Loop-invariant work, hoisted so it is computed once instead of on every iteration:
# the distinct start dates present in the data and the row mask for the target day.
available_dates = csv['Fecha_Ejec_Inicio_Int'].unique()
is_target_day = csv['Fecha_Ejec_Inicio_Int'] == target_date

# Dates are YYYYMMDD integers, so this range also visits values that are not real
# calendar dates (e.g. 20180431); the membership test below filters those out.
for date in range(start_date, target_date):
    if date in available_dates:
        print(date)
        # .loc with a boolean mask yields a fresh frame on every iteration, exactly
        # as in the original code, so each make_set call receives its own objects.
        days_data.append(make_set(csv.loc[csv['Fecha_Ejec_Inicio_Int'] == date],
                                  csv.loc[is_target_day]))

20180420
20180421
20180422
20180423
20180424
20180425
20180426
20180427
20180428
20180429
20180430
20180501
20180502
20180503
20180504
20180505
20180506
20180507
20180508
20180509
20180510
20180511
20180514
20180515
CPU times: user 28.9 s, sys: 16 ms, total: 28.9 s
Wall time: 28.9 s


# Preprocessing

In [60]:
%%time
# Run the date-difference and day-count helpers from src.utils on every per-day set.
# Their return values are discarded, so they presumably modify each frame in place
# -- TODO confirm against src.utils.
for day_frame in days_data:
    add_datediffs(day_frame, csv)
    add_dayscount(day_frame, csv)
    # Disabled step (add_datefeatures is not among the names imported from src.utils):
    # add_datefeatures(day_frame)

CPU times: user 16 s, sys: 0 ns, total: 16 s
Wall time: 16 s


In [61]:
# Preview the first rows of the first per-day set after preprocessing.
days_data[0].head()

Unnamed: 0,Fecha_Ejec_Inicio_Int,Id_Job,Id_Malla,duracion_int,DaysSinceMainframeOp,DaysCountMainframeOp
0,20180420.0,@AK2ZF29,02FBFCL2,0.0,244,1
1,20180420.0,@D2MKV99,02MKH993,0.0,293,0
2,20180420.0,@D2MKV99,04MKH993,2.0,293,0
3,20180420.0,@D2RPHF8,02REH992,13.0,37,32
4,20180420.0,@D2VNF49,02NZVNH2,0.0,293,0


# Model

In [110]:
import time, pprint    
from sklearn.metrics import mean_squared_error
# Shared pretty-printer used by calculate_metrics and run_model to print metric dictionaries.
pp = pprint.PrettyPrinter(indent=3)

def fit_model(model, X_trn, y_trn, X_val, y_val, early_stopping, cat_indices):
    """Fit ``model`` in place, optionally monitoring a validation set.

    Parameters
    ----------
    model
        Estimator exposing ``fit``. When a validation set is supplied, ``fit``
        must accept the ``eval_set``, ``early_stopping_rounds`` and
        ``eval_metric`` keyword arguments.
    X_trn, y_trn
        Training features and targets.
    X_val, y_val
        Validation features and targets. When ``X_val`` is ``None`` the model
        is fitted on the training data alone.
    early_stopping
        Truthy to stop training after 30 rounds without improvement on the
        validation set; ignored when there is no validation set.
    cat_indices
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    None
    """
    if X_val is None:
        model.fit(X_trn, y_trn)
        return

    # Use None (not 0) to switch early stopping off: 0 is not a documented
    # "disabled" value and, depending on the library version, may be treated as
    # a zero-round patience that halts training after the first iteration.
    stopping_rounds = 30 if early_stopping else None
    model.fit(X_trn, y_trn,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=stopping_rounds,
              eval_metric='mse')
        
def calculate_metrics(model, metrics, X_trn, y_trn, X_val, y_val):
    """Score ``model`` on the training and validation splits and record the results.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    metrics
        Mapping ``metric name -> {'trn': list, 'val': list}``. It is updated in
        place: one value is appended to each list per call. Only ``'mse'`` is
        supported; any other metric name raises ``KeyError`` when a split is
        scored.
    X_trn, y_trn
        Training features and targets.
    X_val, y_val
        Validation features and targets; ``X_val`` may be ``None``.

    Returns
    -------
    None. The updated ``metrics`` mapping is pretty-printed, followed by a
    blank line.
    """
    metric_function = {'mse': mean_squared_error}
    splits = {'trn': (X_trn, y_trn),
              'val': (X_val, y_val)}

    for split, (X, y) in splits.items():
        y_pred = model.predict(X) if X is not None else None
        for name in metrics:
            if y_pred is None:
                # No data for this split: record NaN rather than 0, because 0
                # would read as a perfect error score in the results.
                score = float('nan')
            else:
                score = metric_function[name](y, y_pred)
            metrics[name][split].append(score)

    pp.pprint(metrics)
    print()
    
def run_model(model, X_train, y_train, X_val, y_val, X_test,
              metric_names, results=None, params_desc='',
              early_stopping=False, cat_indices=None):
    """Fit ``model``, score it, predict on ``X_test`` and log one results row.

    Parameters
    ----------
    model
        Estimator passed to ``fit_model`` and ``calculate_metrics``; it must
        also expose ``predict``.
    X_train, y_train
        Training features and targets.
    X_val, y_val
        Validation features and targets; ``X_val`` may be ``None``.
    X_test
        Features to predict on.
    metric_names
        Names of the metrics to track (see ``calculate_metrics`` for the
        supported names).
    results
        DataFrame of previous runs to extend; a new empty frame is used when
        ``None``.
    params_desc
        Free-text description stored in the ``params`` column of the new row.
    early_stopping, cat_indices
        Forwarded to ``fit_model``.

    Returns
    -------
    tuple
        ``(y_test, metrics, results, model)``: the test predictions, the
        per-split metric lists, ``results`` with one row appended, and the
        fitted model.
    """
    if results is None:
        results = pd.DataFrame()
    metrics = {metric: {'trn': [], 'val': []} for metric in metric_names}

    # The recorded time covers fitting, metric computation and test prediction.
    start = time.time()
    fit_model(model, X_train, y_train, X_val, y_val, early_stopping, cat_indices)
    calculate_metrics(model, metrics, X_train, y_train, X_val, y_val)
    y_test = model.predict(X_test)
    end = time.time()

    means = {f'{d}_{m}_mean': np.mean(metrics[m][d])
             for m in metrics
             for d in metrics[m]}
    metadata = {'params': params_desc, 'time': round(end - start, 2)}
    pp.pprint(means)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat produces the same appended row on old and new versions alike.
    new_row = pd.DataFrame([{**metadata, **means}])
    results = pd.concat([results, new_row], ignore_index=True)
    return y_test, metrics, results, model

In [79]:
#from src.utils import run_model
from lightgbm import LGBMRegressor

In [123]:
# Add a placeholder prediction column to the last per-day set. The training cell
# treats a frame that carries this column as the test set and skips validation on it.
days_data[-1]['Prediccion'] = 0

In [125]:
# Column roles used by the training cell.
target_col = 'duracion_int'   # regression target
pred_col = 'Prediccion'       # placeholder column marking the test frame
id_cols = [                   # identifier columns, excluded from the features
    'Fecha_Ejec_Inicio_Int',
    'Id_Job',
    'Id_Malla',
]
metric_names = ['mse']        # metrics tracked by run_model

In [None]:
%%time
# Train one LightGBM regressor per consecutive pair of per-day sets: fit on day i,
# validate on day i+1, and always predict on the last set. Every (predictions, model)
# pair is collected in `output`.
N_ESTIMATORS = 120
non_feature_cols = id_cols + [target_col]

results = None
output = []
for i, w in enumerate(days_data[1:]):
    train = days_data[i]
    val = w
    test = days_data[-1]

    X_train = train.drop(non_feature_cols, axis=1)
    y_train = train[target_col]

    if pred_col not in val.columns:
        X_val = val.drop(non_feature_cols, axis=1)
        y_val = val[target_col]
    else:
        # The next day is the test frame itself, so there is nothing to validate on.
        X_val = y_val = None

    X_test = test.drop(non_feature_cols + [pred_col], axis=1)
    y_test, _, results, model = run_model(
        LGBMRegressor(n_estimators=N_ESTIMATORS),
        X_train, y_train, X_val, y_val, X_test,
        metric_names, results,
        params_desc=f'n_estimators={N_ESTIMATORS}',
        early_stopping=True,
    )
    output.append([y_test, model])

[1]	valid_0's l2: 7.82678e+06
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's l2: 7.84453e+06
[3]	valid_0's l2: 7.86498e+06
[4]	valid_0's l2: 7.88689e+06
[5]	valid_0's l2: 7.90672e+06
[6]	valid_0's l2: 7.92512e+06
[7]	valid_0's l2: 7.94392e+06
[8]	valid_0's l2: 7.96046e+06
[9]	valid_0's l2: 7.97927e+06
[10]	valid_0's l2: 7.99424e+06
[11]	valid_0's l2: 8.00827e+06
[12]	valid_0's l2: 8.02255e+06
[13]	valid_0's l2: 8.0355e+06
[14]	valid_0's l2: 8.04662e+06
[15]	valid_0's l2: 8.0582e+06
[16]	valid_0's l2: 8.06956e+06
[17]	valid_0's l2: 8.08065e+06
[18]	valid_0's l2: 8.09142e+06
[19]	valid_0's l2: 8.0987e+06
[20]	valid_0's l2: 8.10547e+06
[21]	valid_0's l2: 8.11128e+06
[22]	valid_0's l2: 8.11685e+06
[23]	valid_0's l2: 8.12043e+06
[24]	valid_0's l2: 8.12439e+06
[25]	valid_0's l2: 8.12857e+06
[26]	valid_0's l2: 8.13337e+06
[27]	valid_0's l2: 8.13636e+06
[28]	valid_0's l2: 8.13936e+06
[29]	valid_0's l2: 8.14187e+06
[30]	valid_0's l2: 8.14485e+06
[31]	valid_0's l2: 8

In [108]:
# Average the test predictions of all trained models element-wise.
per_model_predictions = [entry[0] for entry in output]
y_test = np.mean(per_model_predictions, axis=0)

In [109]:
# Display the averaged test predictions.
y_test

array([822.96218991, 974.07160106, 974.07160106, ..., 974.07160106,
       974.07160106, 974.07160106])