In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [9]:
# Load the raw table and discard every row that has no target value.
train_data = pd.read_csv('View_data_nullZero.csv')
train_data = train_data.dropna(subset=['t1_fl_match_win'])

# Split off the target: `pop` returns the column and removes it from the frame,
# so `train_data` holds only candidate features afterwards.
y = train_data.pop('t1_fl_match_win')

# Keep only int64/float64 columns as model inputs.
# NOTE(review): the displayed output shows an 'Unnamed: 0' column among the
# features; it looks like a saved row index rather than a real predictor -- confirm.
numeric_cols = [
    col_name
    for col_name, col_dtype in train_data.dtypes.items()
    if col_dtype in ('int64', 'float64')
]
X = train_data.loc[:, numeric_cols].copy()

shape_report = "Shape of input data: {} and shape of target variable: {}"
print(shape_report.format(X.shape, y.shape))

X.head()  # preview the first five training rows

Shape of input data: (43823, 49) and shape of target variable: (43823,)


Unnamed: 0,id_match_competition,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,...,t1_h2h_home_total5_fl_match_lost,t1_h2h_home_total10_fl_match_win,t1_h2h_home_total10_fl_match_draw,t1_h2h_home_total10_fl_match_lost,t2_h2h_away_total5_fl_match_win,t2_h2h_away_total5_fl_match_draw,t2_h2h_away_total5_fl_match_lost,t2_h2h_away_total10_fl_match_win,t2_h2h_away_total10_fl_match_draw,t2_h2h_away_total10_fl_match_lost
0,8910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def modelLinearRegression():
    """Cross-validate a LinearRegression model and print its per-fold scores.

    Reads the notebook-level globals ``X`` (feature frame) and ``y`` (target
    series). Prints, for each of the 20 shuffled folds, the R2 score, the
    negated mean squared error and the negated max error. Returns nothing.
    """
    lm = LinearRegression()

    # 20 shuffled folds with a fixed seed so the splits are reproducible.
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print("Resultados do modelo Linear Regression\n")

    # One cross_validate call fits each fold once and scores it with all three
    # metrics, instead of refitting the same folds once per metric.
    scores = cross_validate(
        lm, X, y,
        scoring=('r2', 'neg_mean_squared_error', 'max_error'),
        cv=folds,
    )
    print("\nResultado do r2 :{}".format(scores['test_r2']))
    print("\nResultado do NMSE :{}".format(scores['test_neg_mean_squared_error']))
    print("\nResultado do ME :{}".format(scores['test_max_error']))

In [11]:
def modelRandomForestRegressor():
    """Cross-validate a RandomForestRegressor and print its per-fold scores.

    Reads the notebook-level globals ``X`` (feature frame) and ``y`` (target
    series). Prints, for each of the 20 shuffled folds, the R2 score, the
    negated mean squared error and the negated max error. Returns nothing.
    """
    from sklearn import ensemble

    # Fixed seed: a random forest is stochastic, so without it every run gives
    # different scores. n_jobs=-1 builds the trees on all available cores and
    # does not change the fitted model for a given seed.
    lm = ensemble.RandomForestRegressor(random_state=100, n_jobs=-1)

    # The earlier full-data fit and the RFE pass were removed: their results
    # were never used, and RFE refits the whole forest once per eliminated
    # feature, which dominated the run time of this function.

    # 20 shuffled folds with a fixed seed so the splits are reproducible.
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print("\nResultados do modelo Random Forest Regressor\n")

    # One cross_validate call fits each fold once and scores it with all three
    # metrics, instead of refitting the same folds once per metric.
    scores = cross_validate(
        lm, X, y,
        scoring=('r2', 'neg_mean_squared_error', 'max_error'),
        cv=folds,
    )
    print("\nResultado do r2 :{}".format(scores['test_r2']))
    print("\nResultado do NMSE :{}".format(scores['test_neg_mean_squared_error']))
    print("\nResultado do ME :{}".format(scores['test_max_error']))

In [12]:
def modelDecisionTreeRegressor():
    """Cross-validate a DecisionTreeRegressor, print its scores and save it.

    Reads the notebook-level globals ``X`` (feature frame) and ``y`` (target
    series). Prints, for each of the 20 shuffled folds, the R2 score, the
    negated mean squared error and the negated max error. Afterwards fits the
    tree on the full data and writes it to ``modelo_final-kf.sav`` in the
    current working directory. Returns nothing.
    """
    from sklearn import tree

    # Fixed seed: ties between equally good splits are broken randomly, so an
    # unseeded tree (and therefore the saved model) can differ between runs.
    lm = tree.DecisionTreeRegressor(random_state=100)

    # 20 shuffled folds with a fixed seed so the splits are reproducible.
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print("\nResultados do modelo Decision Tree Regressor\n")

    # One cross_validate call fits each fold once and scores it with all three
    # metrics, instead of refitting the same folds once per metric.
    # Cross-validation works on clones, so `lm` itself is still unfitted here.
    scores = cross_validate(
        lm, X, y,
        scoring=('r2', 'neg_mean_squared_error', 'max_error'),
        cv=folds,
    )
    print("\nResultado do r2 :{}".format(scores['test_r2']))
    print("\nResultado do NMSE :{}".format(scores['test_neg_mean_squared_error']))
    print("\nResultado do ME :{}".format(scores['test_max_error']))

    # The persisted model is the one trained on every available row.
    lm.fit(X, y)
    filename = 'modelo_final-kf.sav'
    # Passing the path lets joblib open and close the file itself; the previous
    # bare open() handle was never closed.
    joblib.dump(lm, filename)


In [13]:
# Run the evaluations in order; the decision-tree run also writes the model
# file that the later cells load from disk.
modelLinearRegression()
modelDecisionTreeRegressor()
# modelRandomForestRegressor()  -- disabled: it never gets to finish; execution stays stuck (looping) in the model above

Resultados do modelo Linear Regression


Resultado do r2 :[0.05556662 0.05700194 0.06703828 0.06956073 0.07043157 0.07110507
 0.05288159 0.05391133 0.05732672 0.05936354 0.0586067  0.05729002
 0.05079548 0.07963323 0.04270014 0.07186515 0.04311359 0.06166924
 0.06825859 0.06017104]

Resultado do NMSE :[-0.23132492 -0.22967019 -0.2285151  -0.22894537 -0.22740387 -0.22612726
 -0.23049329 -0.22719895 -0.23054653 -0.22891582 -0.23295385 -0.22746491
 -0.22715708 -0.22662336 -0.23304207 -0.22471954 -0.23433953 -0.2255052
 -0.22661174 -0.2317245 ]

Resultado do ME :[-0.93495796 -0.95048809 -0.90188645 -0.94471345 -0.95834157 -0.936553
 -0.89435468 -1.02067837 -0.9494592  -0.90870755 -0.93443964 -0.92236509
 -0.97494936 -0.89697772 -0.94080846 -0.92558327 -0.98533653 -1.00908148
 -0.94873926 -0.93084087]

Resultados do modelo Decision Tree Regressor


Resultado do r2 :[-0.8873442  -0.88587469 -0.9043563  -0.82678861 -0.87330628 -0.83716768
 -0.87430984 -0.93075597 -0.87374236 -0.83536264 -0.87

In [14]:
# Load the persisted model from disk.
# NOTE: joblib files are pickles -- only load model files you produced yourself.
filename = 'modelo_final-kf.sav'
# Passing the path lets joblib open and close the file itself; the previous
# bare open() handle was never closed.
loaded_model = joblib.load(filename)

# New record(s) to predict.
dNew = pd.read_csv('../files/data_sample_match_win.csv')
# Drop the target column so only features reach the model; a genuinely new
# record may not carry the target at all, so its absence is not an error.
dNew = dNew.drop(columns=['t1_fl_match_win'], errors='ignore')
dNew.head(2)  # show the first rows of the data to be predicted

Unnamed: 0,id_match_competition,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,...,t1_h2h_home_total5_fl_match_lost,t1_h2h_home_total10_fl_match_win,t1_h2h_home_total10_fl_match_draw,t1_h2h_home_total10_fl_match_lost,t2_h2h_away_total5_fl_match_win,t2_h2h_away_total5_fl_match_draw,t2_h2h_away_total5_fl_match_lost,t2_h2h_away_total10_fl_match_win,t2_h2h_away_total10_fl_match_draw,t2_h2h_away_total10_fl_match_lost
0,328,5,0,0,6,2,0,0,0,4,...,0,4,0,0,0,0,2,0,0,4


In [15]:
# Predict the outcome for the new record.
yNew = pd.Series(loaded_model.predict(dNew))

# The saved model is a regressor, so its output is a score that can be
# fractional rather than exactly 0 or 1. Comparing with `== 0` sent every
# non-zero score (even 0.1) to the "win" branch; threshold at 0.5 instead.
# NOTE(review): a score below 0.5 presumably means "did not win" (loss or
# draw) given the target name -- confirm the wording of the message.
if yNew.iloc[0] < 0.5:
    print("Equipa da casa perde")
else:
    print("Equipa da casa ganha")

Equipa da casa ganha
