# Machine Learning part
#### First predictions using basic machine learning algorithms (decision tree, random forest and svm) in three configurations of the dataset. Firstly with all the informations about the match, then without the half time result and lastly with the dataset containig only pre-match informations

### Loading and preparing the dataset in order to work with machine learnings algorithms

In [1]:
# Obsługa środowisk Python 2 i Python 3
from __future__ import division, print_function, unicode_literals

# Importowanie popularnych modułów
import numpy as np
import os

# W celu zachowania powtarzalności wyników w kolejnych przebiegach
np.random.seed(42)

# Generowanie ładnych wykresów
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Lokacja, w której będą zapisywane rysunki
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "preparing_dataset"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "pictures", CHAPTER_ID)

def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, "pictures", CHAPTER_ID, fig_id)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving an image", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
import os
import pandas as pd

FOOTBALL_PATH_SP = os.path.join("datasets", "spain")
football_path_sp = FOOTBALL_PATH_SP

FOOTBALL_PATH_EN = os.path.join("datasets", "england")
football_path_en = FOOTBALL_PATH_EN

FOOTBALL_PATH_FR = os.path.join("datasets", "france")
football_path_fr = FOOTBALL_PATH_FR

FOOTBALL_PATH_GE = os.path.join("datasets", "germany")
football_path_ge = FOOTBALL_PATH_GE

FOOTBALL_PATH_IT = os.path.join("datasets", "italy")
football_path_it = FOOTBALL_PATH_IT

def load_football_data(football_path, file):
    csv_path = os.path.join(football_path, file)
    return pd.read_csv(csv_path, error_bad_lines=False)

In [3]:
football_sp = load_football_data(FOOTBALL_PATH_SP, "spain.csv")
football_en = load_football_data(FOOTBALL_PATH_EN, "england.csv")
football_fr = load_football_data(FOOTBALL_PATH_FR, "france.csv")
football_ge = load_football_data(FOOTBALL_PATH_GE, "germany.csv")
football_it = load_football_data(FOOTBALL_PATH_IT, "italy.csv")

In [4]:
football = football_sp.copy()
football = football.dropna(subset=["Date"])
football = pd.DataFrame(football).fillna(0)

In [6]:
from sklearn.preprocessing import LabelEncoder

homeTeamList = football["HomeTeam"].tolist() 
awayTeamList = football["AwayTeam"].tolist()
fTRList = football["FTR"].tolist()
hTRList = football["HTR"].tolist()
divList = football["Div"].tolist()

labelEncoder = LabelEncoder()

labelEncoder.fit(homeTeamList)
label = labelEncoder.transform(homeTeamList)
football['homeTeam']=pd.Series(label)

labelEncoder.fit(awayTeamList)
label = labelEncoder.transform(awayTeamList)
football['awayTeam']=pd.Series(label)

labelEncoder.fit(hTRList)
label = labelEncoder.transform(hTRList)
football['hTR']=pd.Series(label)

labelEncoder.fit(fTRList)
label = labelEncoder.transform(fTRList)
football['fTR']=pd.Series(label)

labelEncoder.fit(divList)
label = labelEncoder.transform(divList)
football['div']=pd.Series(label)

In [7]:
import datetime

dates = pd.Series(football['Date'])
dates = pd.to_datetime(dates, format = '%d/%m/%y')
days = []
years = []

for i in dates:
    d = i.dayofyear
    days.append(d)
    y = i.year
    years.append(y)
    
x = pd.Series(days)
y = pd.Series(years)
football["DayOfTheYear"] = x
football["Year"] = y
football.head(20)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BWH,BWD,BWA,homeTeam,awayTeam,hTR,fTR,div,DayOfTheYear,Year
0,S1,27/08/05,Alaves,Barcelona,0,0,D,0,0,D,...,7.0,3.7,1.45,0,4,1,1,0,239,2005
1,S1,27/08/05,Ath Bilbao,Sociedad,3,0,H,0,0,D,...,1.95,3.15,3.65,2,31,1,2,0,239,2005
2,S1,27/08/05,Valencia,Betis,1,0,H,0,0,D,...,1.75,3.3,4.4,34,5,1,2,0,239,2005
3,S1,28/08/05,Ath Madrid,Zaragoza,0,0,D,0,0,D,...,1.65,3.4,4.9,3,39,1,1,0,240,2005
4,S1,28/08/05,Cadiz,Real Madrid,1,2,A,0,1,A,...,8.0,4.25,1.35,6,27,0,0,0,240,2005
5,S1,28/08/05,Celta,Malaga,2,0,H,1,0,H,...,2.1,3.1,3.3,7,22,2,2,0,240,2005
6,S1,28/08/05,Espanol,Getafe,0,2,A,0,0,D,...,1.65,3.5,4.75,11,12,1,0,0,240,2005
7,S1,28/08/05,Mallorca,La Coruna,0,1,A,0,1,A,...,2.65,3.1,2.5,23,18,0,0,0,240,2005
8,S1,28/08/05,Osasuna,Villarreal,2,1,H,1,0,H,...,2.7,3.15,2.4,26,37,2,2,0,240,2005
9,S1,28/08/05,Sevilla,Santander,1,0,H,1,0,H,...,1.65,3.5,4.7,30,29,2,2,0,240,2005


### Dropping the encoded or replaced attributes

In [8]:
football = football.drop(columns = ['div', 'Div','Date', 'HomeTeam', 'AwayTeam', 'HTR', 'FTR', 'FTHG', 'HTHG', 'FTAG', 'HTAG'], axis = 1)

In [9]:
football.head(5)

Unnamed: 0,HS,AS,HST,AST,HF,AF,HY,AY,HR,AR,...,B365A,BWH,BWD,BWA,homeTeam,awayTeam,hTR,fTR,DayOfTheYear,Year
0,5,17,0,10,17,19,0,1,0,0,...,1.5,7.0,3.7,1.45,0,4,1,1,239,2005
1,10,9,6,2,13,19,0,1,0,0,...,3.25,1.95,3.15,3.65,2,31,1,2,239,2005
2,9,14,2,3,18,14,2,3,0,0,...,3.25,1.75,3.3,4.4,34,5,1,2,239,2005
3,16,9,5,2,16,22,2,7,0,0,...,4.0,1.65,3.4,4.9,3,39,1,1,240,2005
4,15,17,5,6,19,25,2,2,0,0,...,1.44,8.0,4.25,1.35,6,27,0,0,240,2005


# machine learning predictions

## With HTR

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

x,y = football.loc[:,football.columns != 'fTR'], football.loc[:,'fTR']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 42)

### DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree_clf = DecisionTreeClassifier()

#### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'max_depth': list(range(5, 8)), 'min_samples_split': list(range(45, 50)), 'random_state': list(range(1, 100, 10))}
]
grid = GridSearchCV(tree_clf, param_grid, cv=5, scoring='accuracy')

grid.fit(x_train, y_train)

In [None]:
print(grid.best_score_, grid.best_params_)

In [None]:
tree_clf = DecisionTreeClassifier(max_depth=6, min_samples_split=48, random_state=1)
tree_clf.fit(x_train, y_train)
y_pred = tree_clf.predict(x_test)
print(accuracy_score(y_test, y_pred))

In [None]:
cross_val_score(tree_clf, x_train, y_train)

In [None]:
from sklearn.tree import export_graphviz

export_graphviz(
        tree_clf,
        out_file=image_path("football.dot"),
        #feature_names= [football.columns != 'fTR'],
        #class_names=football.columns[football.loc[:,'fTR']],
        rounded=True,
        filled=True
    )

#### Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

tree_clf = DecisionTreeClassifier()

param_dist = {"max_depth": sp_randint(1,22),
              "max_features": sp_randint(1, 22),
              "min_samples_split": sp_randint(2, 100),
              "random_state": sp_randint(2, 100),
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 1000
random_search = RandomizedSearchCV(tree_clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False)
random_search.fit(x_train, y_train)

In [None]:
print(random_search.best_score_)
print(random_search.best_params_)

### FINAL MODEL AND SCORES

In [None]:
tree_clf = DecisionTreeClassifier(max_depth=5, max_features=20, min_samples_split=11, 
                                  criterion='entropy', random_state=39)

tree_clf.fit(x_train,y_train)
y_pred = tree_clf.predict(x_test)

In [None]:
cross_val_score(tree_clf, x_train, y_train)

In [None]:
from sklearn.tree import export_graphviz

export_graphviz(
        tree_clf,
        out_file=image_path("football.dot"),
        #feature_names= [football.columns != 'fTR'],
        #class_names=football.columns[football.loc[:,'fTR']],
        rounded=True,
        filled=True
    )

#### Decission Tree Classfier - accuracy scores
<br> Spain: 0.6327788046826864
<br> England:  0.6457178065311152
<br> Germany:  0.6386831275720165
<br> Italy:  0.6378162450066578
<br> France:  0.6285046728971962
<br> All:  0.6239823375189734

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

In [None]:
rf = RandomForestClassifier(max_leaf_nodes=54, min_samples_split=0.015924609574019265, 
                            n_estimators=87, random_state=83)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

In [None]:
cross_val_score(rf, x_train, y_train, cv=3, scoring="accuracy")

In [None]:
from sklearn.ensemble import RandomForestClassifier
for name, score in zip(football.columns, rf.feature_importances_):
    print(name, score)

#### Random Forest Classifier - accuracy scores
<br> Spain: 0.6685150955021565
<br> England:  0.6543438077634011
<br> Germany:  0.6592592592592592
<br> Italy:  0.6511318242343542
<br> France:  0.6339563862928349
<br> All:  0.6255002069821995

### SVM_poly

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=2, coef0=6, C=3, gamma=0.003))
    ])
poly_kernel_svm_clf.fit(x_train,y_train)
y_pred = poly_kernel_svm_clf.predict(x_test)

In [None]:
print(accuracy_score(y_test, y_pred))

### SVM_poly -  accuracy scores
<br> Spain:  0.674060382008626
<br> England:  0.6789895255699322
<br> Germany:  0.6551440329218107
<br> Italy:  0.6857523302263648
<br> France:  0.6588785046728972
<br> All:  0.6094935835518146

In [None]:
cross_val_score(poly_kernel_svm_clf, x_train, y_train, cv=3, scoring="accuracy")

### SVM_rbf

In [None]:
rbf_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="rbf", gamma=0.001, C=8))
    ])
rbf_kernel_svm_clf.fit(x_train,y_train)

y_pred = rbf_kernel_svm_clf.predict(x_test)

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
cross_val_score(rbf_kernel_svm_clf, x_train, y_train, cv=3, scoring="accuracy")

### SVM_rbf - accuracy scores
<br> Spain:  0.6746765249537893
<br> England:  0.6765249537892791
<br> Germany:  0.6641975308641975
<br> Italy:  0.6870838881491345
<br> France:  0.6549844236760125
<br> All:  0.611563405547123

## Without HTR

In [11]:
from sklearn.model_selection import train_test_split

football = football.drop(columns = ['hTR'], axis = 1, errors='ignore')
x,y = football.loc[:,football.columns != 'fTR'], football.loc[:,'fTR']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)

In [None]:
tree_clf = DecisionTreeClassifier(max_depth=11, max_features=18, min_samples_split=97,
                                  random_state=68, criterion='entropy')

tree_clf.fit(x_train,y_train)
y_pred = tree_clf.predict(x_test)

### Decission Tree Classfier - accuracy scores
<br> Spain:  0.563154651879236
<br> England:  0.5403573629081947
<br> Germany:  0.5473251028806584
<br> Italy:  0.5512649800266312
<br> France:  0.5397196261682243
<br> All:  0.5407754933075756

In [None]:
rf = RandomForestClassifier(max_leaf_nodes=64, min_samples_split=0.018494868021880835, 
                            n_estimators=80, random_state=73)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

### RandomForestClassifier - accuracy scores
<br> Spain:  0.5834873690696242
<br> England:  0.5779420825631546
<br> Germany:  0.5604938271604938
<br> Italy:  0.5938748335552596
<br> France:  0.5654205607476636
<br> All:  0.5231130122809439

In [None]:
poly_kernel_svm_clf.fit(x_train,y_train)
y_pred = poly_kernel_svm_clf.predict(x_test)

### SVM_poly - accuracy scores
<br> Spain:  0.6056685150955021
<br> England:  0.5884165126309304
<br> Germany:  0.5958847736625514
<br> Italy:  0.618508655126498
<br> France:  0.6059190031152648
<br> All:  0.48254450117289915

In [None]:
rbf_kernel_svm_clf.fit(x_train,y_train)
y_pred = rbf_kernel_svm_clf.predict(x_test)

### SVM_rbf - accuracy scores
<br> Spain:  0.6087492298213185
<br> England:  0.5804066543438078
<br> Germany:  0.5876543209876544
<br> Italy:  0.6091877496671105
<br> France:  0.6051401869158879
<br> All:  0.4824065130398786

## only pre-match statistcs

In [12]:
from sklearn.model_selection import train_test_split

football = football.drop(columns = ['HST', 'AST', 'HS', 'AS', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR'], axis = 1, errors='ignore')
x,y = football.loc[:,football.columns != 'fTR'], football.loc[:,'fTR']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)

In [None]:
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=3, 
                                 min_samples_split=97, random_state=61)

tree_clf.fit(x_train,y_train)
y_pred = tree_clf.predict(x_test)

### Decission Tree Classfier - accuracy scores
<br> Spain:  0.5231053604436229
<br> England:  0.5452865064695009
<br> Germany:  0.4962962962962963
<br> Italy:  0.5419440745672437
<br> France:  0.5077881619937694
<br> All:  0.4986891127363047

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_leaf_nodes=59, min_samples_split=0.021240342539542535, 
                            n_estimators=91, random_state=74)

rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

In [15]:
for name, score in zip(football.columns, rf.feature_importances_):
    print(name, score)

B365H 0.16655119474858746
B365D 0.06762611844299588
B365A 0.14534322104632377
BWH 0.1608604296408546
BWD 0.07876490832855569
BWA 0.16426936289508584
homeTeam 0.04885358588648191
awayTeam 0.04239572453869703
fTR 0.0877329159285606
DayOfTheYear 0.03760253854385736


### RandomForestClassifier - accuracy scores
<br> Spain:  0.5175600739371534
<br> England:  0.5557609365372767
<br> Germany:  0.497119341563786
<br> Italy:  0.5439414114513982
<br> France:  0.4937694704049844
<br> All:  0.5308403477300953

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly"))
    ])

param_dist = {"svm_clf__degree": sp_randint(1, 10),
              "svm_clf__coef0": sp_randint(1, 8),
              "svm_clf__gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
              "svm_clf__C": sp_randint(1,8)}

# run randomized search
n_iter_search = 5
random_search = RandomizedSearchCV(poly_kernel_svm_clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5, iid=False, verbose=1)
random_search.fit(x_train, y_train)

In [None]:
print(random_search.best_score_)
print(random_search.best_params_)

In [None]:
poly_kernel_svm_clf.fit(x_train,y_train)
y_pred = poly_kernel_svm_clf.predict(x_test)

### SVM_poly - accuracy scores
<br>Spain:  0.51879235982748
<br>England:  0.5372766481823783
<br>Germany:  0.4831275720164609
<br>Italy:  0.5346205059920106
<br>France:  0.5007788161993769
<br>All:  0.4815785842417552

In [None]:
rbf_kernel_svm_clf.fit(x_train,y_train)
y_pred = rbf_kernel_svm_clf.predict(x_test)

### SVM_rbf - accuracy scores
<br>Spain:  0.5200246457178065
<br>England:  0.5360443622920518
<br>Germany:  0.48148148148148145
<br>Italy:  0.525965379494008
<br>France:  0.4945482866043614
<br>All:  0.48144059610873463