In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [53]:
# Read the prepared training set from disk.
train_data = pd.read_csv('../files/t1_win_NullCorretion.csv')

# Keep only the rows that have a target value, then split the target off
# from the remaining columns.
train_data = train_data.dropna(axis=0, subset=['t1_fl_match_win'])
y = train_data['t1_fl_match_win']  # Target variable
train_data = train_data.drop(columns=['t1_fl_match_win'])

# Feature matrix: only the int64 / float64 columns are kept.
# NOTE(review): the displayed frame shows an 'Unnamed: 0' column among the
# features - it looks like a saved row index rather than a real predictor.
# Confirm, and if so drop it here AND in the prediction sample together.
wanted_dtypes = ['int64', 'float64']
numeric_cols = [col for col, col_dtype in train_data.dtypes.items() if col_dtype in wanted_dtypes]
X = train_data[numeric_cols].copy()

print("Shape of input data: {} and shape of target variable: {}".format(X.shape, y.shape))

X.head()  # Show first 5 training examples

Shape of input data: (44090, 30) and shape of target variable: (44090,)


Unnamed: 0,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,t2_all_total10_fl_match_win,...,t2_away_total5_fl_match_lost,t2_away_total10_fl_match_win,t2_away_total10_fl_match_draw,t2_away_total10_fl_match_lost,t1_h2h_total5_fl_match_win,t1_h2h_total5_fl_match_draw,t1_h2h_total5_fl_match_lost,t1_h2h_total10_fl_match_win,t1_h2h_total10_fl_match_draw,t1_h2h_total10_fl_match_lost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
def modelLinearRegression():
    """Cross-validate a LinearRegression on the notebook-level X and y.

    For KFold configurations of 5, 9, 13 and 17 shuffled splits
    (random_state=100) the per-fold negative mean squared error and max error
    are gathered into a dict, serialised with json.dumps and written to the
    notebook-level ``jsonFile`` handle as one line followed by a comma and a
    newline.

    Reads the globals ``X``, ``y`` and ``jsonFile``; ``jsonFile`` must be a
    writable text handle when this function is called. Returns None.
    """
    # Imported locally so the function does not rely on a later cell having
    # imported json before it is called.
    import json

    # cross_validate fits its own clone of the estimator on every fold, so the
    # estimator is handed over unfitted. The previous up-front fit and the RFE
    # fit were never read afterwards and have been removed.
    lm = LinearRegression()

    print ("Resultados do modelo Linear Regression\n")
    # Same sequence as the former counter loop: 5, 9, 13, 17 splits.
    for a in range(5, 20, 4):
        folds = KFold(n_splits=a, shuffle=True, random_state=100)
        print('KFold splits {:.2f}'.format(a))
        # r2 is scored as well, but only NMSE and ME are written out below.
        scoresR2 = cross_validate(lm, X, y, scoring=['r2','neg_mean_squared_error','max_error'], cv=folds)
        data = {
            'KFold': '{:.2f}'.format(a),
            'Modelo': 'Linear Regression',
            "NMSE ": scoresR2['test_neg_mean_squared_error'].tolist(),
            "ME ": scoresR2['test_max_error'].tolist(),
        }
        jsonFile.write(json.dumps(data) + ",\n")



In [55]:
def modelRandomForestRegressor():
    """Cross-validate a RandomForestRegressor on the notebook-level X and y.

    For KFold configurations of 5, 9, 13 and 17 shuffled splits
    (random_state=100) the per-fold negative mean squared error and max error
    are gathered into a dict, serialised with json.dumps and written to the
    notebook-level ``jsonFile`` handle as one line followed by a comma and a
    newline.

    Reads the globals ``X``, ``y`` and ``jsonFile``; ``jsonFile`` must be a
    writable text handle when this function is called. Returns None.
    """
    # Imported locally so the function does not rely on a later cell having
    # imported json before it is called.
    import json
    from sklearn import ensemble

    # cross_validate fits its own clone of the estimator on every fold, so the
    # estimator is handed over unfitted. The previous up-front fit and the RFE
    # fit (which refits the whole forest once per eliminated feature) were
    # never read afterwards and only added run time, so they have been removed.
    # Seeded so repeated runs give the same scores.
    lm = ensemble.RandomForestRegressor(random_state=100)

    print ("\nResultados do modelo Random Forest Regressor\n")
    # Same sequence as the former counter loop: 5, 9, 13, 17 splits.
    for a in range(5, 20, 4):
        folds = KFold(n_splits=a, shuffle=True, random_state=100)
        print('KFold splits {:.2f}'.format(a))
        # r2 is scored as well, but only NMSE and ME are written out below.
        scoresR2 = cross_validate(lm, X, y, scoring=['r2','neg_mean_squared_error','max_error'], cv=folds)
        data = {
            'KFold': '{:.2f}'.format(a),
            'Modelo': 'Random Forest Regressor',
            "NMSE ": scoresR2['test_neg_mean_squared_error'].tolist(),
            "ME ": scoresR2['test_max_error'].tolist(),
        }
        jsonFile.write(json.dumps(data) + ",\n")

In [56]:
def modelDecisionTreeRegressor():
    """Cross-validate a DecisionTreeRegressor and persist the full-data fit.

    The tree is first fitted on the whole notebook-level X and y; that fitted
    instance is what gets saved to 'modelo_final-kf.sav' at the end.

    For KFold configurations of 5, 9, 13 and 17 shuffled splits
    (random_state=100) the per-fold negative mean squared error and max error
    are gathered into a dict, serialised with json.dumps and written to the
    notebook-level ``jsonFile`` handle as one line followed by a comma and a
    newline.

    Reads the globals ``X``, ``y`` and ``jsonFile``; ``jsonFile`` must be a
    writable text handle when this function is called. Returns None.
    """
    # Imported locally so the function does not rely on a later cell having
    # imported json before it is called.
    import json
    from sklearn import tree

    # Seeded so the persisted model is the same on every run.
    lm = tree.DecisionTreeRegressor(random_state=100)
    # This fit is kept: cross_validate below works on clones, so `lm` itself
    # still holds the full-data fit when it is dumped. The former RFE fit was
    # never read afterwards and has been removed.
    lm.fit(X, y)

    print ("\nResultados do modelo Decision Tree Regressor\n")
    # Same sequence as the former counter loop: 5, 9, 13, 17 splits.
    for a in range(5, 20, 4):
        folds = KFold(n_splits=a, shuffle=True, random_state=100)
        print('KFold splits {:.2f}'.format(a))
        # r2 is scored as well, but only NMSE and ME are written out below.
        scoresR2 = cross_validate(lm, X, y, scoring=['r2','neg_mean_squared_error','max_error'], cv=folds)
        data = {
            'KFold': '{:.2f}'.format(a),
            'Modelo': 'Decision Tree Regressor',
            "NMSE ": scoresR2['test_neg_mean_squared_error'].tolist(),
            "ME ": scoresR2['test_max_error'].tolist(),
        }
        jsonFile.write(json.dumps(data) + ",\n")

    filename = 'modelo_final-kf.sav'
    # Context manager so the handle is flushed and closed once the dump is done.
    with open(filename, 'wb') as model_file:
        joblib.dump(lm, model_file)


In [57]:
import io
import json

# The model functions write one JSON object per line, each followed by ",\n",
# to the notebook-level `jsonFile`. Writing those straight between "[" and "]"
# leaves a trailing comma before the closing bracket, which is not valid JSON.
# Collect the lines in memory first, then emit a well-formed array.
jsonFile = io.StringIO()
modelLinearRegression()
modelDecisionTreeRegressor()
# modelRandomForestRegressor() is not run here: it never got to finish
# (execution stayed stuck inside that model).

record_lines = [line.rstrip(",") for line in jsonFile.getvalue().splitlines() if line.strip()]
json_payload = "[" + ",\n".join(record_lines) + "]"
json.loads(json_payload)  # sanity check: raises if the payload is not valid JSON

# Context manager so the file is closed even if the write fails.
with open("decision-tree-kfold.json", "w") as results_file:
    results_file.write(json_payload)

Resultados do modelo Linear Regression

KFold splits 5.00
KFold splits 9.00
KFold splits 13.00
KFold splits 17.00

Resultados do modelo Decision Tree Regressor

KFold splits 5.00
KFold splits 9.00
KFold splits 13.00
KFold splits 17.00


In [58]:
# Load the persisted model from disk.
# NOTE: joblib files are pickles - only load files from a trusted source.
filename = 'modelo_final-kf.sav'
# Pass the path rather than an open() handle so no file handle is left open.
loaded_model = joblib.load(filename)
# New record to predict
dNew = pd.read_csv('../files/t1_win_Sample.csv')
dNew = dNew.drop('t1_fl_match_win', axis=1)   # Remove the match-win (target) field
dNew.head(2)     # Display the first rows of the dataset

Unnamed: 0,t1_all_total5_fl_match_win,t1_all_total5_fl_match_draw,t1_all_total5_fl_match_lost,t1_all_total10_fl_match_win,t1_all_total10_fl_match_draw,t1_all_total10_fl_match_lost,t2_all_total5_fl_match_win,t2_all_total5_fl_match_draw,t2_all_total5_fl_match_lost,t2_all_total10_fl_match_win,...,t2_away_total5_fl_match_lost,t2_away_total10_fl_match_win,t2_away_total10_fl_match_draw,t2_away_total10_fl_match_lost,t1_h2h_total5_fl_match_win,t1_h2h_total5_fl_match_draw,t1_h2h_total5_fl_match_lost,t1_h2h_total10_fl_match_win,t1_h2h_total10_fl_match_draw,t1_h2h_total10_fl_match_lost
0,3,0,2,6,1,3,0,4,1,2,...,0,2,4,0,0,0,0,0,0,0


In [59]:
# Make a prediction for the new record
yNew = pd.Series(loaded_model.predict(dNew))
# The model saved under 'modelo_final-kf.sav' is a DecisionTreeRegressor, so
# the prediction is a float that can lie between the target values (presumably
# a 0/1 flag - confirm). Comparing with == 0 would report every fractional
# value, e.g. 0.1, as a win; threshold at 0.5 instead.
if yNew[0] < 0.5:
  print("Equipa da casa perde")
else:
  print("Equipa da casa ganha")

Equipa da casa perde
