In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pandas_summary import DataFrameSummary
from pathlib import Path

import surprise
import os, math
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()
os.chdir('/home/krivas/projects/analysis-project/')

In [3]:
DATA        = Path('data')
RAW         = DATA/'raw'
PROCESSED   = DATA/'processed'

In [4]:
from surprise import SVD, NMF, KNNBasic, evaluate, Reader, Dataset
from surprise.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Reading data

In [5]:
csv = pd.read_csv(PROCESSED/'HistoricoJob.csv')

In [6]:
csv.columns

Index(['Id_HistoricoJob', 'Id_Job', 'Id_Malla', 'Fecha_Carga_Scheduler',
       'Fecha_Ejec_Inicio', 'Fecha_Ejec_Fin', 'Duracion', 'Promedio', 'Agente',
       'Mxrc', 'Maxcmpc', 'Grupo', 'Force_Complete', 'CCF', 'Estado',
       'duracion_int', 'promedio_int', 'Fecha_Ejec_Inicio_Int',
       'Hora_Ejec_Inicio_Int', 'Fecha_Ejec_Fin_Int', 'Hora_Ejec_Fin_Int',
       'DiaSemana', 'hubo_error'],
      dtype='object')

In [7]:
csv_group = csv.groupby(['Id_Job', 'Id_Malla'])['hubo_error'].max().reset_index()

In [8]:
csv_group.head()

Unnamed: 0,Id_Job,Id_Malla,hubo_error
0,$E2FS992,I176352,0
1,@AK2ZF29,02FBFCL2,1
2,@D2BYJQE,02DBWNH2,0
3,@D2BYVLB,02DBWNH2,1
4,@D2E6Y39,W4746233,0


In [9]:
csv_group.hubo_error.value_counts()

0    37871
1     5877
Name: hubo_error, dtype: int64

# Testing

In [10]:
from surprise import Reader, Dataset
reader = Reader(rating_scale=(0, 1))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(csv_group, reader)

In [None]:
data.split(n_folds=5)

In [83]:
%%time
# svd
algo = SVD()
evaluate(algo, data, measures=['RMSE'])



Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 0.3242
------------
Fold 2
RMSE: 0.3277
------------
Fold 3
RMSE: 0.3210
------------
Fold 4
RMSE: 0.3308
------------
Fold 5
RMSE: 0.3218
------------
------------
Mean RMSE: 0.3251
------------
------------
CPU times: user 11.6 s, sys: 52 ms, total: 11.6 s
Wall time: 11.6 s


In [84]:
%%time
# nmf
algo = NMF()
evaluate(algo, data, measures=['RMSE'])



Evaluating RMSE of algorithm NMF.

------------
Fold 1
RMSE: 0.3340
------------
Fold 2
RMSE: 0.3366
------------
Fold 3
RMSE: 0.3315
------------
Fold 4
RMSE: 0.3414
------------
Fold 5
RMSE: 0.3283
------------
------------
Mean RMSE: 0.3344
------------
------------
CPU times: user 20.5 s, sys: 24 ms, total: 20.5 s
Wall time: 20.5 s


# Testing NMF, SVD on train/test

In [11]:
reader = Reader(rating_scale=(0, 1))
data   = Dataset.load_from_df(csv_group, reader)
trainset = data.build_full_trainset()

In [12]:
csv_group.head()

Unnamed: 0,Id_Job,Id_Malla,hubo_error
0,$E2FS992,I176352,0
1,@AK2ZF29,02FBFCL2,1
2,@D2BYJQE,02DBWNH2,0
3,@D2BYVLB,02DBWNH2,1
4,@D2E6Y39,W4746233,0


### NMF

In [32]:
nmf = NMF(n_factors=20, n_epochs=50, biased=True, verbose=True)

In [33]:
%%time
nmf.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
CPU times:

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7fafc1345390>

In [34]:
%%time
from tqdm import tqdm
train_preds = []
for cIdx, iIdx in tqdm(zip(csv_group.Id_Job, csv_group.Id_Malla), total=len(csv_group)):
    train_preds.append(nmf.predict(cIdx, iIdx)[3])

100%|██████████| 43748/43748 [00:00<00:00, 144070.87it/s]

CPU times: user 332 ms, sys: 112 ms, total: 444 ms
Wall time: 306 ms





In [35]:
roc_auc_score(csv_group.hubo_error, train_preds)

0.8029172063728317

In [36]:
confusion_matrix(csv_group.hubo_error, np.round(train_preds))

array([[33260,  4611],
       [ 1725,  4152]])

In [37]:
tn, fp, fn, tp = confusion_matrix(csv_group.hubo_error, np.round(train_preds)).ravel()

In [38]:
tp, fp

(4152, 4611)

### SVD

In [39]:
svd = SVD(n_factors=20, n_epochs=50, biased=True, verbose=True)

In [40]:
%%time
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
CPU times:

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fafc1346160>

In [42]:
%%time
from tqdm import tqdm
train_preds = []
for cIdx, iIdx in tqdm(zip(csv_group.Id_Job, csv_group.Id_Malla), total=len(csv_group)):
    train_preds.append(svd.predict(cIdx, iIdx)[3])

100%|██████████| 43748/43748 [00:00<00:00, 111504.46it/s]

CPU times: user 428 ms, sys: 148 ms, total: 576 ms
Wall time: 397 ms





In [43]:
roc_auc_score(csv_group.hubo_error, train_preds)

0.9866918030894369

In [44]:
confusion_matrix(csv_group.hubo_error, np.round(train_preds))

array([[37776,    95],
       [ 3373,  2504]])

In [45]:
tn, fp, fn, tp = confusion_matrix(csv_group.hubo_error, np.round(train_preds)).ravel()

In [46]:
tp, fp

(2504, 95)

# Saving weights

In [47]:
from surprise.dump import dump

In [48]:
dump(PROCESSED/'svd_20.dump', algo=svd)

In [49]:
dump(PROCESSED/'nmf_20.dump', algo=nmf)