In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [17]:
# Read the training set; rows without a target value cannot be used for fitting.
train_data = pd.read_csv('../files/t1_win_NullCorretion.csv')
train_data = train_data.dropna(axis=0, subset=['t1_fl_match_win'])

# Split off the target so that train_data only holds predictor columns.
y = train_data['t1_fl_match_win']
train_data = train_data.drop(['t1_fl_match_win'], axis=1)

# Keep only the int64 / float64 columns as model inputs.
numeric_cols = [
    name
    for name, col_dtype in train_data.dtypes.items()
    if col_dtype in ['int64', 'float64']
]
X = train_data[numeric_cols].copy()

print("Shape of input data: {} and shape of target variable: {}".format(X.shape, y.shape))

X.head()  # Show the first 5 training examples

Shape of input data: (44090, 30) and shape of target variable: (44090,)


Unnamed: 0,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,t2_all_total10_fl_match_win,...,t2_away_total5_fl_match_lost,t2_away_total10_fl_match_win,t2_away_total10_fl_match_draw,t2_away_total10_fl_match_lost,t1_h2h_total5_fl_match_win,t1_h2h_total5_fl_match_draw,t1_h2h_total5_fl_match_lost,t1_h2h_total10_fl_match_win,t1_h2h_total10_fl_match_draw,t1_h2h_total10_fl_match_lost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
def modelLinearRegression():
    """Cross-validate an ordinary least-squares model and print the metrics.

    Reads the notebook-level feature frame ``X`` and target ``y``, runs a
    shuffled 20-fold cross-validation and prints the dictionary returned by
    ``cross_validate`` (fit/score times plus R^2, negative mean squared error
    and max error for every fold).

    Returns:
        None. The metrics are only printed.
    """
    # cross_validate clones the estimator and fits the clone on each training
    # fold, so fitting `lm` on the full data beforehand has no effect on the
    # scores. No RFE step is run here either: a selector fitted outside the
    # CV loop and never applied to X would only cost time and suggest the
    # scores refer to a reduced feature set.
    lm = LinearRegression()

    # 20 shuffled folds; the fixed seed keeps the fold assignment reproducible.
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print("Resultados do modelo Linear Regression\n")
    cv_scores = cross_validate(
        lm, X, y,
        scoring=['r2', 'neg_mean_squared_error', 'max_error'],
        cv=folds,
    )
    print("Metrics : ", cv_scores)



In [19]:
def modelRandomForestRegressor():
    """Cross-validate a random-forest regressor and print the metrics.

    Reads the notebook-level feature frame ``X`` and target ``y``, runs a
    shuffled 20-fold cross-validation and prints the dictionary returned by
    ``cross_validate`` (fit/score times plus R^2, negative mean squared error
    and max error for every fold).

    Returns:
        None. The metrics are only printed.
    """
    from sklearn.ensemble import RandomForestRegressor

    # Seeded so repeated runs give the same forest; n_jobs=-1 grows the trees
    # on all available cores because 20 forest fits on the full data are slow.
    lm = RandomForestRegressor(random_state=100, n_jobs=-1)

    # No RFE step: recursive elimination refits the forest once per removed
    # feature, and its selection was never applied to the data scored below,
    # so it only added a very long run time. cross_validate also clones and
    # fits the estimator itself, so no fit on the full data is needed.

    # 20 shuffled folds; the fixed seed keeps the fold assignment reproducible.
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print("\nResultados do modelo Random Forest Regressor\n")
    cv_scores = cross_validate(
        lm, X, y,
        scoring=['r2', 'neg_mean_squared_error', 'max_error'],
        cv=folds,
    )
    print("Metrics : ", cv_scores)


In [20]:
def modelDecisionTreeRegressor():
    """Cross-validate a decision-tree regressor, then persist a full-data fit.

    Reads the notebook-level feature frame ``X`` and target ``y``, runs a
    shuffled 20-fold cross-validation and prints the dictionary returned by
    ``cross_validate``. Afterwards the tree is fitted on all of ``X`` / ``y``
    and written to ``modelo_final-kf.sav`` in the working directory.

    Returns:
        None. The metrics are printed and the model is saved to disk.
    """
    from sklearn.tree import DecisionTreeRegressor

    # Seeded so the persisted tree is the same on every run.
    lm = DecisionTreeRegressor(random_state=100)

    # No RFE step: its selection was never applied to the data used below,
    # so it only added run time.

    # 20 shuffled folds; the fixed seed keeps the fold assignment reproducible.
    folds = KFold(n_splits=20, shuffle=True, random_state=100)
    print("\nResultados do modelo Decision Tree Regressor\n")
    cv_scores = cross_validate(
        lm, X, y,
        scoring=['r2', 'neg_mean_squared_error', 'max_error'],
        cv=folds,
    )
    print("Metrics : ", cv_scores)

    # cross_validate works on clones, so `lm` itself is still unfitted here;
    # fit it on the complete data set before saving it.
    lm.fit(X, y)
    filename = 'modelo_final-kf.sav'
    # The context manager closes the file handle even if dumping fails.
    with open(filename, 'wb') as model_file:
        joblib.dump(lm, model_file)


In [21]:
modelLinearRegression()
modelDecisionTreeRegressor()
# modelRandomForestRegressor() is left disabled: when it was tried it never finished running (it appeared to hang inside the model function)

Resultados do modelo Linear Regression

Metrics :  {'fit_time': array([0.05200028, 0.04799986, 0.05100179, 0.04699898, 0.05099773,
       0.0539999 , 0.04500222, 0.04999924, 0.05199862, 0.048002  ,
       0.05000281, 0.04900026, 0.0527935 , 0.06399894, 0.06213236,
       0.05199671, 0.05299926, 0.05700111, 0.05299926, 0.13500094]), 'score_time': array([0.00300002, 0.00399899, 0.00200033, 0.00300312, 0.00299931,
       0.0019989 , 0.00300002, 0.00400305, 0.00299954, 0.00299907,
       0.00299883, 0.00300169, 0.00299954, 0.00252151, 0.00251698,
       0.00300002, 0.00400186, 0.00399756, 0.0039978 , 0.00299978]), 'test_r2': array([0.0596363 , 0.04836825, 0.08306296, 0.06115538, 0.05897221,
       0.0638402 , 0.06458955, 0.05613249, 0.04962635, 0.07291467,
       0.05683938, 0.05685243, 0.0580411 , 0.07613269, 0.04427901,
       0.06782379, 0.04495537, 0.06101434, 0.0666734 , 0.06183431]), 'test_neg_mean_squared_error': array([-0.22871957, -0.22923521, -0.22362248, -0.22903172, -0.22859697

In [22]:
# Load the persisted model from disk.
# NOTE(security): joblib files are pickles and can execute code on load;
# only load a file that was produced by this notebook.
filename = 'modelo_final-kf.sav'
with open(filename, 'rb') as model_file:  # closes the handle after loading
    loaded_model = joblib.load(model_file)

# New record(s) to predict.
dNew = pd.read_csv('../files/t1_win_Sample.csv')
# Remove the outcome column when present; a sample for a match that has not
# been played may not contain it, so a missing column is not an error.
dNew = dNew.drop(columns=['t1_fl_match_win'], errors='ignore')
dNew.head(2)  # Display the first rows of the sample

Unnamed: 0,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,t2_all_total10_fl_match_win,...,t2_away_total5_fl_match_lost,t2_away_total10_fl_match_win,t2_away_total10_fl_match_draw,t2_away_total10_fl_match_lost,t1_h2h_total5_fl_match_win,t1_h2h_total5_fl_match_draw,t1_h2h_total5_fl_match_lost,t1_h2h_total10_fl_match_win,t1_h2h_total10_fl_match_draw,t1_h2h_total10_fl_match_lost
0,3,0,2,6,1,3,0,4,1,2,...,0,2,4,0,0,0,0,0,0,0


In [23]:
# Make a prediction for the new record.
yNew = pd.Series(loaded_model.predict(dNew))

# The model stored in 'modelo_final-kf.sav' is a regressor, so the prediction
# can be a fraction rather than exactly 0 or 1. Comparing with `== 0` would
# report a home win for any non-zero value (e.g. 0.2), so the value is
# thresholded at 0.5 instead.
# NOTE(review): assumes the target is coded 0 = no win, 1 = win — confirm.
# Only the first row of the sample is reported.
if yNew.iloc[0] < 0.5:
    print("Equipa da casa perde")
else:
    print("Equipa da casa ganha")

Equipa da casa perde
