In [28]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.metrics import balanced_accuracy_score,precision_score,recall_score,f1_score, roc_auc_score

In [29]:
df = pd.read_csv('../../data/crawler/unified-events-time-statistics.csv')
df.head()

Unnamed: 0,golId,time,result,blueTopGP,blueTopWR,blueTopKDA,blueJungleGP,blueJungleWR,blueJungleKDA,blueMidGP,...,RED:third_tower_top,RED:third_tower_mid,RED:third_tower_bot,RED:inhibitor_top,RED:inhibitor_mid,RED:inhibitor_bot,RED:baron,RED:elder_dragon,RED:nexus_tower,RED:nexus
0,35797,0,0,0,0.0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,35797,1,0,0,0.0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,35797,2,0,0,0.0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,35797,3,0,0,0.0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,35797,4,0,0,0.0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
y = df['result'].copy()
X = df.drop(['golId','result'],axis=1)
X.head()

Unnamed: 0,time,blueTopGP,blueTopWR,blueTopKDA,blueJungleGP,blueJungleWR,blueJungleKDA,blueMidGP,blueMidWR,blueMidKDA,...,RED:third_tower_top,RED:third_tower_mid,RED:third_tower_bot,RED:inhibitor_top,RED:inhibitor_mid,RED:inhibitor_bot,RED:baron,RED:elder_dragon,RED:nexus_tower,RED:nexus
0,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
def preprocess_input(X,y):
    X = X.copy()
    X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=42, stratify=y)
    scaler = StandardScaler()   
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train,X_test,y_train,y_test,scaler
X_train,X_test,y_train,y_test,scaler = preprocess_input(X,y)

In [32]:
X_train

array([[-0.51110497, -0.82497407, -0.99700583, ..., -0.08630673,
        -0.16005203, -0.12198181],
       [-1.0001392 , -0.82497407, -0.99700583, ..., -0.08630673,
        -0.16005203, -0.12198181],
       [ 0.17354294, -0.82497407, -0.99700583, ..., -0.08630673,
        -0.16005203, -0.12198181],
       ...,
       [-0.70671866, -0.82497407, -0.99700583, ..., -0.08630673,
        -0.16005203, -0.12198181],
       [-1.58698026, -0.82497407, -0.99700583, ..., -0.08630673,
        -0.16005203, -0.12198181],
       [-0.90233235, -0.52850402, -0.99700583, ..., -0.08630673,
        -0.16005203, -0.12198181]])

In [33]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = [600,900,1200]
max_depth = [50]
min_samples_split = [2]
min_samples_leaf = [1] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)
forest = RandomForestClassifier(random_state=42)
gridF = GridSearchCV(forest, hyperF, cv = 3, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [34]:
print("Best: %f using %s" % (bestF.best_score_, bestF.best_params_))

Best: 0.991100 using {'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1200}


In [35]:
forestOpt = RandomForestClassifier(random_state=42,max_depth=50,n_estimators=600,min_samples_split=2,min_samples_leaf=1)
                                   
modelOpt = forestOpt.fit(X_train, y_train)

scores_list = []
scores_list.append({
    'Balanced Accuracy': balanced_accuracy_score(y_test,modelOpt.predict(X_test)),
    'Precision':  precision_score(y_test,modelOpt.predict(X_test)),
    'Recall': recall_score(y_test,modelOpt.predict(X_test)),
    'F1-Score': f1_score(y_test,modelOpt.predict(X_test)),
    'ROC/AUC Score': roc_auc_score(y_test,modelOpt.predict(X_test))
})
scores = pd.DataFrame(scores_list)

In [36]:
scores

Unnamed: 0,Balanced Accuracy,Precision,Recall,F1-Score,ROC/AUC Score
0,0.99614,0.995752,0.996836,0.996294,0.99614


In [37]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Definindo o modelo
model = RandomForestClassifier(max_depth=50, n_estimators=600, min_samples_split=2, min_samples_leaf=1)

# Definindo as métricas
scoring = {
    'Balanced Accuracy': make_scorer(balanced_accuracy_score),
    'Precision': make_scorer(precision_score, average='weighted'),
    'Recall': make_scorer(recall_score, average='weighted'),
    'F1': make_scorer(f1_score, average='weighted'),
    'AUC': make_scorer(roc_auc_score, needs_proba=True, average='weighted')
}

# Realizando a validação cruzada
cv_results = cross_validate(model, X, y, cv=3, scoring=scoring)

# Extraindo os resultados
for metric, scores in cv_results.items():
    print(f'{metric}: {scores.mean()} (+/- {scores.std()})')


fit_time: 176.05710156758627 (+/- 2.5175256727150783)
score_time: 16.675747950871784 (+/- 0.07211719488442522)
test_Balanced Accuracy: 0.7133769585082362 (+/- 0.002748281837438643)
test_Precision: 0.7142767586083156 (+/- 0.0029496495838287683)
test_Recall: 0.714328103449514 (+/- 0.0029105108898230983)
test_F1: 0.7140559982300715 (+/- 0.0028219610474486995)
test_AUC: 0.7963109873167871 (+/- 0.003028988577841375)


In [38]:
import pickle

with open("../../models/eventos-colunas/rf.pkl", "wb") as f:
    pickle.dump(modelOpt, f)

with open("../../scalers/eventos-colunas/rf.pkl", "wb") as f:
    pickle.dump(scaler, f)