In [2]:
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeRegressor

In [5]:
# Load the raw training table and keep only the rows whose target value is known
train_data = pd.read_csv('../files/t1_win_NullCorretion.csv')
train_data = train_data.dropna(axis=0, subset=['t1_fl_match_win'])

# Split off the target variable, then strip it from the feature table
y = train_data['t1_fl_match_win']
train_data = train_data.drop(columns=['t1_fl_match_win'])

# Features are restricted to the int64 / float64 columns
numeric_cols = [
    cname
    for cname, col_dtype in train_data.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
X = train_data.loc[:, numeric_cols].copy()

print("Shape of input data: {} and shape of target variable: {}".format(X.shape, y.shape))

X.head()  # Show first 5 training examples

Shape of input data: (44090, 30) and shape of target variable: (44090,)


Unnamed: 0,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,t2_all_total10_fl_match_win,...,t2_away_total5_fl_match_lost,t2_away_total10_fl_match_win,t2_away_total10_fl_match_draw,t2_away_total10_fl_match_lost,t1_h2h_total5_fl_match_win,t1_h2h_total5_fl_match_draw,t1_h2h_total5_fl_match_lost,t1_h2h_total10_fl_match_win,t1_h2h_total10_fl_match_draw,t1_h2h_total10_fl_match_lost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def modelLinearRegression():
    """Cross-validate a LinearRegression on the notebook-level ``X`` / ``y``.

    Reads the module-level feature table ``X`` and target ``y``, runs a
    shuffled 20-fold cross-validation and prints the per-fold R2, negative
    mean squared error and (negated) max error arrays.

    Returns:
        None. The scores are only printed.
    """
    # No up-front fit is needed: the cross-validation helper clones and fits
    # its own copy of the estimator for every fold. The previous full-data fit
    # and the RFE fit produced objects that were never used.
    lm = LinearRegression()

    # create a KFold object with 20 splits
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print ("Resultados do modelo Linear Regression\n")

    # One cross-validation pass scores all three metrics on the same fitted
    # fold models, instead of refitting every fold once per metric.
    scores = cross_validate(
        lm, X, y,
        scoring=('r2', 'neg_mean_squared_error', 'max_error'),
        cv=folds,
    )
    print("\nResultado do r2 :{}".format(scores['test_r2']))
    print("\nResultado do NMSE :{}".format(scores['test_neg_mean_squared_error']))
    print("\nResultado do ME :{}".format(scores['test_max_error']))

In [7]:
def modelRandomForestRegressor():
    """Cross-validate a RandomForestRegressor on the notebook-level ``X`` / ``y``.

    Reads the module-level feature table ``X`` and target ``y``, runs a
    shuffled 20-fold cross-validation and prints the per-fold R2, negative
    mean squared error and (negated) max error arrays.

    Returns:
        None. The scores are only printed.
    """
    # Seeded so repeated runs give the same forest (same seed as the folds).
    # No up-front fit or RFE here: both results were discarded, and
    # cross-validation clones and fits its own estimator per fold. Recursive
    # feature elimination over a forest refits the forest once per removed
    # feature, which is very expensive for a result that was never read.
    lm = RandomForestRegressor(random_state=100)

    # create a KFold object with 20 splits
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print ("\nResultados do modelo Random Forest Regressor\n")

    # One cross-validation pass scores all three metrics on the same fitted
    # fold models, instead of refitting every fold once per metric.
    scores = cross_validate(
        lm, X, y,
        scoring=('r2', 'neg_mean_squared_error', 'max_error'),
        cv=folds,
    )
    print("\nResultado do r2 :{}".format(scores['test_r2']))
    print("\nResultado do NMSE :{}".format(scores['test_neg_mean_squared_error']))
    print("\nResultado do ME :{}".format(scores['test_max_error']))

In [8]:
def modelDecisionTreeRegressor():
    """Fit, cross-validate and persist a DecisionTreeRegressor.

    Reads the module-level feature table ``X`` and target ``y``. The tree is
    fitted on the full data, evaluated with a shuffled 20-fold
    cross-validation (per-fold R2, negative mean squared error and negated
    max error are printed), and the full-data model is written to
    ``modelo_final-kf.sav`` in the current working directory.

    Returns:
        None. Scores are printed and the model is saved to disk.
    """
    # Seeded so the persisted model is reproducible between runs.
    lm = DecisionTreeRegressor(random_state=100)
    # This full-data fit is the model that gets saved below; cross-validation
    # works on clones and leaves this instance untouched.
    lm.fit(X, y)

    # create a KFold object with 20 splits
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print ("\nResultados do modelo Decision Tree Regressor\n")

    # One cross-validation pass scores all three metrics on the same fitted
    # fold models, instead of refitting every fold once per metric.
    scores = cross_validate(
        lm, X, y,
        scoring=('r2', 'neg_mean_squared_error', 'max_error'),
        cv=folds,
    )
    print("\nResultado do r2 :{}".format(scores['test_r2']))
    print("\nResultado do NMSE :{}".format(scores['test_neg_mean_squared_error']))
    print("\nResultado do ME :{}".format(scores['test_max_error']))

    filename = 'modelo_final-kf.sav'
    # Context manager guarantees the file is flushed and closed after dumping.
    with open(filename, 'wb') as model_file:
        joblib.dump(lm, model_file)


In [9]:
# Run the evaluations in order: linear regression first, then the decision tree
# (the decision-tree call also writes the model file to disk).
modelLinearRegression()
modelDecisionTreeRegressor()
# modelRandomForestRegressor() is left disabled: per the original author's note it
# never gets to run because execution gets stuck ("loops") in the model above.

Resultados do modelo Linear Regression


Resultado do r2 :[0.05969749 0.04838249 0.0830501  0.06107574 0.05899927 0.06390341
 0.06454815 0.05614907 0.04965345 0.07288673 0.05691575 0.05690892
 0.05799665 0.07612536 0.04434285 0.06782187 0.04492739 0.06093386
 0.0664866  0.06183301]

Resultado do NMSE :[-0.22870469 -0.22923178 -0.22362562 -0.22905115 -0.22859039 -0.22849265
 -0.22773268 -0.22921055 -0.23107649 -0.22728536 -0.23228499 -0.23061448
 -0.22653675 -0.22691455 -0.23322868 -0.22668803 -0.23416563 -0.22734002
 -0.22672986 -0.2314267 ]

Resultado do ME :[-0.94716466 -0.95992393 -0.90725178 -0.92636637 -0.91849732 -0.92385873
 -0.88218965 -0.93205092 -0.97957271 -0.93186525 -0.94163633 -1.01523991
 -0.94467574 -0.89813819 -0.92777997 -0.90520902 -0.97148034 -0.98342307
 -0.94709358 -0.93196262]

Resultados do modelo Decision Tree Regressor


Resultado do r2 :[-0.82666296 -0.81942553 -0.82249777 -0.8495004  -0.88325776 -0.85854347
 -0.94375377 -0.79950922 -0.9359402  -0.86435115 -0

In [10]:
# Load the persisted model from disk.
# NOTE: joblib files are pickle-based; only load files produced by this notebook
# or another trusted source.
filename = 'modelo_final-kf.sav'
with open(filename, 'rb') as model_file:  # closes the handle once loading is done
    loaded_model = joblib.load(model_file)
# New record(s) to predict
dNew = pd.read_csv('../files/t1_win_Sample.csv')
dNew = dNew.drop('t1_fl_match_win', axis=1)   # Remove the target (win) column
# NOTE(review): the training features were limited to int64/float64 columns;
# presumably the sample file has the same columns — confirm before predicting.
dNew.head(2)     # Display the first rows of the sample

Unnamed: 0,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,t2_all_total10_fl_match_win,...,t2_away_total5_fl_match_lost,t2_away_total10_fl_match_win,t2_away_total10_fl_match_draw,t2_away_total10_fl_match_lost,t1_h2h_total5_fl_match_win,t1_h2h_total5_fl_match_draw,t1_h2h_total5_fl_match_lost,t1_h2h_total10_fl_match_win,t1_h2h_total10_fl_match_draw,t1_h2h_total10_fl_match_lost
0,3,0,2,6,1,3,0,4,1,2,...,0,2,4,0,0,0,0,0,0,0


In [11]:
# Make a prediction for the new record
yNew = pd.Series(loaded_model.predict(dNew))
# The model file is written from a regressor, so the prediction is a float and
# may be fractional. Exact equality with 0 would report every non-zero score
# (e.g. 0.2) as a win, so compare against the midpoint instead.
# NOTE(review): assumes the target is a 0/1 win flag — confirm.
if yNew[0] < 0.5:
    print("Equipa da casa perde")
else:
    print("Equipa da casa ganha")

Equipa da casa perde
