In [1]:
%load_ext kedro.extras.extensions.ipython

In [2]:
%reload_kedro

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import math

In [None]:
# Load the master table through the Kedro catalog entry "master_table@spark".
# NOTE(review): presumably this yields a Spark DataFrame, since the cells below
# use the pyspark column API on it — confirm against the catalog definition.
master_table = catalog.load("master_table@spark")

### Create cyclical features for the time

In [None]:
SECONDS_PER_DAY = 86400

# SEC2 removes the seconds accounted for by the (DAY - 1) preceding days.
# NOTE(review): presumably SEC is a running second counter and DAY starts at 1,
# which would make SEC2 the second within the day — confirm against the data.
second_of_day = F.col("SEC") - (F.col("DAY") - 1) * SECONDS_PER_DAY
master_table = master_table.withColumn("SEC2", second_of_day)

# Project SEC2 onto a circle with a period of one day so both ends of the
# period map to neighbouring sine/cosine values.
day_angle = 2 * math.pi * F.col("SEC2") / SECONDS_PER_DAY
master_table = master_table.withColumn("SEC_SIN", F.sin(day_angle))
master_table = master_table.withColumn("SEC_COS", F.cos(day_angle))

### Create dummies for DAY

In [None]:
# Build one 0/1 indicator expression per distinct value of the DAY column.
expresiones_columnas_no_numericas = []
columna = "DAY"

# Bring the distinct values to the driver and order them ascending so the
# generated columns follow a stable order.
unicos_columna = sorted(
    master_table.select(columna).distinct().rdd.flatMap(lambda fila: fila).collect()
)

expr_unicos_columna = []
for valor_unico in unicos_columna:
    # Skipping value 1 has an effect similar to drop_first=True.
    if valor_unico == 1:
        continue

    indicador = F.when(F.col(columna) == valor_unico, 1).otherwise(0)
    nombre_indicador = "{columna}_{valor}".format(
        columna=columna,
        valor=int(valor_unico),
    )
    expr_unicos_columna.append(indicador.alias(nombre_indicador))

expresiones_columnas_no_numericas.extend(expr_unicos_columna)

In [None]:
# Keep every existing column and append the DAY indicator expressions built above.
# Not idempotent: because master_table is reassigned, re-running this cell adds
# the indicator columns a second time.
master_table = master_table.select("*", *expresiones_columnas_no_numericas)

In [None]:
# Final column layout of the featured table: cyclical time features, the raw
# feature columns, the DAY indicators (DAY_2 .. DAY_30), the lagged labels and
# finally the target.
time_feature_columns = ['SEC_SIN', 'SEC_COS']
raw_feature_columns = [
    'Ph1', 'Ph2', 'Ir1', 'Fo1', 'Fo2',
    'Di3', 'Di4', 'Ph3', 'Ph4', 'Ph5',
    'Ph6', 'Co1', 'Co2', 'Co3', 'So1',
    'So2', 'Di1', 'Di2', 'Te1', 'Fo3',
]
day_indicator_columns = [f'DAY_{day}' for day in range(2, 31)]
lagged_label_columns = ['Lag3_LABEL', 'Lag2_LABEL', 'Lag1_LABEL']

ordered_columns = (
    time_feature_columns
    + raw_feature_columns
    + day_indicator_columns
    + lagged_label_columns
    + ['LABEL']
)

master_table = master_table.select(*ordered_columns)

In [None]:
# Persist the feature table through the Kedro catalog entry
# "featured_master_table@spark" so later cells can reload it.
catalog.save("featured_master_table@spark", master_table)

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, train_test_split, KFold
import xgboost as xgb
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

### Train and test split

In [5]:
# Load the feature table through the catalog entry "featured_master_table@pandas".
# NOTE(review): presumably a pandas DataFrame holding the table saved via the
# "@spark" entry — confirm against the catalog definition.
X_aras = catalog.load("featured_master_table@pandas")

In [6]:
# Pull the target column out of the loaded frame; X_aras keeps only features.
# Re-running this cell without reloading X_aras fails because the column is gone.
target_column = 'LABEL'
y_aras = X_aras[target_column]
X_aras = X_aras.drop(columns=[target_column])

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs.
X_aras_train, X_aras_test, y_aras_train, y_aras_test = train_test_split(
    X_aras,
    y_aras,
    test_size=0.3,
    random_state=0,
)

In [8]:
# Sanity check: display the shapes of the feature and target partitions.
X_aras_train.shape, X_aras_test.shape, y_aras_train.shape, y_aras_test.shape

[1m([0m[1m([0m[1;36m1811203[0m, [1;36m54[0m[1m)[0m, [1m([0m[1;36m776230[0m, [1;36m54[0m[1m)[0m, [1m([0m[1;36m1811203[0m,[1m)[0m, [1m([0m[1;36m776230[0m,[1m)[0m[1m)[0m

## Experiments

### Cross Validation

### 1. Model with XGBoost and encoded activities in one label. No windowing.

In [9]:
# Hyperopt search space handed to fmin(): the first group holds the sampled
# dimensions, the second group holds constant settings that are passed through
# unchanged with every trial.
tuned_dimensions = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
}

fixed_settings = {
    'n_estimators': 180,
    'seed': 0,
    'tree_method': 'hist',
    'device': 'cuda:1',
    'predictor': 'gpu_predictor',
    'n_jobs': -1,
}

# Merge in the same key order as before: tuned dimensions first, constants after.
space = {**tuned_dimensions, **fixed_settings}

In [10]:
def objective(space):
    """Train one XGBoost classifier for a sampled configuration and score it.

    The training and validation partitions are not parameters: the function
    reads the notebook-level names ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``, which must be assigned before hyperopt calls it.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Must contain ``n_estimators``,
        ``max_depth``, ``gamma``, ``reg_alpha``, ``min_child_weight``,
        ``colsample_bytree``, ``tree_method``, ``device`` and ``n_jobs``.
        ``reg_lambda`` and ``seed`` are used when present.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``; the accuracy is negated
        because hyperopt minimises the loss.
    """
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        # quniform samples whole numbers as floats; these parameters are cast to int.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # Previously sampled but never forwarded; None lets XGBoost use its default.
        reg_lambda=space.get('reg_lambda'),
        min_child_weight=int(space['min_child_weight']),
        # Must stay a float: this is a fraction, and int() truncates every
        # value below 1 to 0, which is not a valid column-sampling ratio.
        colsample_bytree=float(space['colsample_bytree']),
        tree_method=space['tree_method'],
        device=space['device'],
        n_jobs=space['n_jobs'],
        # Forward the configured seed so trials are repeatable.
        random_state=space.get('seed'),
        eval_metric="auc",
        early_stopping_rounds=10,
    )

    # The validation fold is listed last on purpose: with several evaluation
    # sets, early stopping monitors the last one.
    evaluation = [(X_train, y_train), (X_test, y_test)]

    clf.fit(
        X_train,
        y_train,
        eval_set=evaluation,
        verbose=False,
    )

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [11]:
# Run an independent hyperopt search on each of the 10 folds of the training
# partition and collect the best configuration found per fold.
kf = KFold(n_splits=10)
best_hyperparams = []

for i, (train_index, test_index) in enumerate(kf.split(X_aras_train)):
    print(f"Fold {i}:")

    # objective() reads these four notebook-level names, so they have to be
    # rebound for every fold before the search starts.
    X_train, X_test = X_aras_train.iloc[train_index], X_aras_train.iloc[test_index]
    y_train, y_test = y_aras_train.iloc[train_index], y_aras_train.iloc[test_index]

    # A fresh Trials object per fold keeps the search histories separate.
    trials = Trials()
    fold_best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=100,
        trials=trials,
    )
    best_hyperparams.append(fold_best)

Fold 0:
  0%|                                                                                                                             | 0/100 [00:00<?, ?trial/s, best loss=?]

  0%|                                                                                                                             | 0/100 [00:16<?, ?trial/s, best loss=?]


In [None]:
# Show the best configuration found for each fold (one entry per fold, in fold order).
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

### Not used ↓↓↓↓

In [None]:
# Day-based split: rows flagged by any of the DAY_26 .. DAY_30 indicators form
# the test set, all remaining rows form the training set.
holdout_day_columns = [f'DAY_{day}' for day in range(26, 31)]

# Start from the first indicator and OR in the others one by one.
in_holdout = X_aras[holdout_day_columns[0]] == 1
for day_column in holdout_day_columns[1:]:
    in_holdout = in_holdout | (X_aras[day_column] == 1)

X_aras_train = X_aras.loc[~in_holdout]
X_aras_test = X_aras.loc[in_holdout]

In [None]:
# Separate the target from the features for the day-based partitions.
#
# When the notebook runs top to bottom, the earlier cell that splits LABEL off
# into y_aras has already removed the column from X_aras, so indexing the
# partitions with 'LABEL' would raise KeyError. Both situations are handled:
# take the column when it is still present, otherwise look the labels up in
# y_aras through the row index.
if 'LABEL' in X_aras_train.columns:
    y_aras_train = X_aras_train['LABEL']
    y_aras_test = X_aras_test['LABEL']
else:
    # assumes the row index is unique and shared between X_aras and y_aras,
    # as it is when both come from the same loaded frame — TODO confirm
    y_aras_train = y_aras.loc[X_aras_train.index]
    y_aras_test = y_aras.loc[X_aras_test.index]

# errors='ignore' makes the drop a no-op when LABEL was already removed.
X_aras_train = X_aras_train.drop(columns=['LABEL'], errors='ignore')
X_aras_test = X_aras_test.drop(columns=['LABEL'], errors='ignore')

In [None]:
# Sanity check: display the shapes of the day-based train and test partitions.
X_aras_train.shape, y_aras_train.shape, X_aras_test.shape, y_aras_test.shape