# Result predictions
### Predicting results for one week of league games by all of the used algorithms

### Data load

In [1]:
# Obsługa środowisk Python 2 i Python 3
from __future__ import division, print_function, unicode_literals

# Importowanie popularnych modułów
import numpy as np
import os

# W celu zachowania powtarzalności wyników w kolejnych przebiegach
np.random.seed(42)

# Generowanie ładnych wykresów
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Lokacja, w której będą zapisywane rysunki
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "preparing_dataset"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "pictures", CHAPTER_ID)

def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, "pictures", CHAPTER_ID, fig_id)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving an image", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
import os
import pandas as pd

FOOTBALL_PATH_SP = os.path.join("datasets", "spain")
football_path_sp = FOOTBALL_PATH_SP

FOOTBALL_PATH_EN = os.path.join("datasets", "england")
football_path_en = FOOTBALL_PATH_EN

FOOTBALL_PATH_FR = os.path.join("datasets", "france")
football_path_fr = FOOTBALL_PATH_FR

FOOTBALL_PATH_GE = os.path.join("datasets", "germany")
football_path_ge = FOOTBALL_PATH_GE

FOOTBALL_PATH_IT = os.path.join("datasets", "italy")
football_path_it = FOOTBALL_PATH_IT

def load_football_data(football_path, file):
    csv_path = os.path.join(football_path, file)
    return pd.read_csv(csv_path, error_bad_lines=False)

In [3]:
football_sp = load_football_data(FOOTBALL_PATH_SP, "spain.csv")
football_en = load_football_data(FOOTBALL_PATH_EN, "england.csv")
football_fr = load_football_data(FOOTBALL_PATH_FR, "france.csv")
football_ge = load_football_data(FOOTBALL_PATH_GE, "germany.csv")
football_it = load_football_data(FOOTBALL_PATH_IT, "italy.csv")

In [4]:
football = football_en.copy()
football = football.dropna(subset=["Date"])
football = pd.DataFrame(football).fillna(0)

In [5]:
football.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5410 entries, 0 to 5409
Data columns (total 26 columns):
Div         5410 non-null object
Date        5410 non-null object
HomeTeam    5410 non-null object
AwayTeam    5410 non-null object
FTHG        5410 non-null float64
FTAG        5410 non-null float64
FTR         5410 non-null object
HTHG        5410 non-null float64
HTAG        5410 non-null float64
HTR         5410 non-null object
HS          5410 non-null float64
AS          5410 non-null float64
HST         5410 non-null float64
AST         5410 non-null float64
HF          5410 non-null float64
AF          5410 non-null float64
HY          5410 non-null float64
AY          5410 non-null float64
HR          5410 non-null float64
AR          5410 non-null float64
B365H       5410 non-null float64
B365D       5410 non-null float64
B365A       5410 non-null float64
BWH         5410 non-null float64
BWD         5410 non-null float64
BWA         5410 non-null float64
dtypes: float64

In [6]:
from sklearn.preprocessing import LabelEncoder

homeTeamList = football["HomeTeam"].tolist() 
awayTeamList = football["AwayTeam"].tolist()
fTRList = football["FTR"].tolist()
hTRList = football["HTR"].tolist()
divList = football["Div"].tolist()

labelEncoder = LabelEncoder()

labelEncoder.fit(homeTeamList)
label = labelEncoder.transform(homeTeamList)
football['homeTeam']=pd.Series(label)

labelEncoder.fit(awayTeamList)
label = labelEncoder.transform(awayTeamList)
football['awayTeam']=pd.Series(label)

labelEncoder.fit(hTRList)
label = labelEncoder.transform(hTRList)
football['hTR']=pd.Series(label)

labelEncoder.fit(fTRList)
label = labelEncoder.transform(fTRList)
football['fTR']=pd.Series(label)

labelEncoder.fit(divList)
label = labelEncoder.transform(divList)
football['div']=pd.Series(label)

In [7]:
import datetime

dates = pd.Series(football['Date'])
dates = pd.to_datetime(dates, format = '%d/%m/%y')
days = []
years = []

for i in dates:
    d = i.dayofyear
    days.append(d)
    y = i.year
    years.append(y)
    
x = pd.Series(days)
y = pd.Series(years)
football["DayOfTheYear"] = x
football["Year"] = y

In [8]:
football = football.drop(columns = ['div', 'Div','Date', 'HomeTeam', 'AwayTeam', 'HTR', 'FTR', 'FTHG', 'HTHG', 'FTAG', 'HTAG', 'hTR', 'HST', 'AST', 'HS', 'AS', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR'], axis = 1)
football.shape

(5410, 11)

In [9]:
x,y = football.loc[:,football.columns != 'fTR'], football.loc[:,'fTR']

x_train = x[:-10]
x_test = x[-10:]
y_train = y[:-10]
y_test = y[-10:]

# DECISION TREE - IT

In [28]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=3, 
                                 min_samples_split=97, random_state=61)

tree_clf.fit(x_train,y_train)
y_pred = tree_clf.predict(x_test)

for actual, predicted in zip(y_pred, y_test):
    print(actual, predicted)

2 1
2 2
2 2
0 0
2 2
0 1
0 2
2 2
2 1
0 1


# RANDOM FOREST - EN

In [12]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_leaf_nodes=59, min_samples_split=0.021240342539542535, 
                            n_estimators=91, random_state=74)

rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

for actual, predicted in zip(y_pred, y_test):
    print(actual, predicted)

2 2
2 2
2 1
2 2
2 2
2 1
2 1
0 0
0 1
0 2


# SVM - EN

In [11]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=2, coef0=6, C=3, gamma=0.003))
    ])
poly_kernel_svm_clf.fit(x_train,y_train)
y_pred = poly_kernel_svm_clf.predict(x_test)

for actual, predicted in zip(y_pred, y_test):
    print(actual, predicted)

2 2
2 2
2 1
2 2
2 2
2 1
2 1
0 0
0 1
0 2


# MLPClassifier - SP

In [10]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(activation='tanh', alpha=1e-06, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(75, 264), learning_rate='constant',
              learning_rate_init=0.001, max_iter=164, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=51, shuffle=True, solver='adam', tol=1e-21,
              validation_fraction=0.1, verbose=22, warm_start=False)

mlp_clf.fit(x_train,y_train)
y_pred = mlp_clf.predict(x_test)

Iteration 1, loss = 1.08440654
Iteration 2, loss = 1.05046994
Iteration 3, loss = 1.03340231
Iteration 4, loss = 1.00939981
Iteration 5, loss = 1.01968888
Iteration 6, loss = 0.99195108
Iteration 7, loss = 0.99692753
Iteration 8, loss = 0.98798988
Iteration 9, loss = 0.98727816
Iteration 10, loss = 0.98329841
Iteration 11, loss = 0.98438162
Iteration 12, loss = 0.98733056
Iteration 13, loss = 0.97975204
Iteration 14, loss = 0.98111042
Iteration 15, loss = 0.97737514
Iteration 16, loss = 0.97877608
Iteration 17, loss = 0.97907255
Iteration 18, loss = 0.99359897
Iteration 19, loss = 0.97695831
Iteration 20, loss = 0.97900266
Iteration 21, loss = 0.97736140
Iteration 22, loss = 0.97652832
Iteration 23, loss = 0.97287492
Iteration 24, loss = 0.97695692
Iteration 25, loss = 0.99472708
Iteration 26, loss = 0.97966479
Iteration 27, loss = 0.97535546
Iteration 28, loss = 0.97806157
Iteration 29, loss = 0.99344511
Iteration 30, loss = 0.99381478
Iteration 31, loss = 0.98009940
Iteration 32, los

In [11]:
for actual, predicted in zip(y_pred, y_test):
    print(actual, predicted)

2 2
2 2
2 1
2 2
2 2
2 1
2 1
0 0
0 1
0 2


# DNNClassifier - EN

In [21]:
import tensorflow as tf

config = tf.contrib.learn.RunConfig(tf_random_seed=42) # nieukazane w konfiguracji
feature_cols = tf.contrib.learn.infer_real_valued_columns_from_input(x_train)
dnn_clf = tf.contrib.learn.DNNClassifier(hidden_units=[300,120,30], n_classes=3, 
                                         activation_fn=tf.nn.elu, feature_columns=feature_cols)
dnn_clf = tf.contrib.learn.SKCompat(dnn_clf) # jeśli wersja modułu TensorFlow >= 1.1


dnn_clf.fit(x_train, y_train, batch_size=50, steps=40000)
y_pred = dnn_clf.predict(x_test)

for actual, predicted in zip(y_pred['classes'], y_test):
    print(actual, predicted)

W1128 17:17:39.744534 139895037765440 data_feeder.py:283] float64 is not supported by many models, consider casting to float32.
W1128 17:17:39.750876 139895037765440 estimator.py:453] Using temporary folder as model directory: /tmp/tmpff57dn5u
W1128 17:17:39.753569 139895037765440 data_feeder.py:283] float64 is not supported by many models, consider casting to float32.
W1128 17:19:17.434558 139895037765440 data_feeder.py:283] float64 is not supported by many models, consider casting to float32.


2 2
2 2
2 1
2 2
2 2
2 1
2 1
0 0
0 1
0 2


# RNN

In [28]:
from keras.utils import np_utils

x_train = x_train.astype(float)
x_test = x_test.astype(float)
y_train = y_train.astype(float)
y_test = y_test.astype(float)
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# reshape input to be [samples, time steps, features]
x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1]))
x_test = np.reshape(x_test, (x_test.shape[0], 1, x_test.shape[1]))

y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

In [29]:
import tensorflow as tf

model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(256, input_shape=(1, 10), activation='relu', return_sequences=True))
model.add(tf.keras.layers.LSTM(128, activation='relu'))
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=200, batch_size=50, verbose=1)
model.save('rnn_all_model.h5')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [30]:
y_pred = model.predict_classes(x_test, batch_size=50)
val_loss, val_acc = model.evaluate(x_test, y_test, verbose=0)
print(val_acc, val_loss)

0.6 0.8601630926132202


In [31]:
for actual, predicted in zip(y_pred, y_test):
    print(actual, predicted)

2 [0. 1. 0.]
2 [0. 0. 1.]
2 [0. 0. 1.]
0 [1. 0. 0.]
2 [0. 0. 1.]
2 [0. 1. 0.]
2 [0. 0. 1.]
2 [0. 0. 1.]
2 [0. 1. 0.]
2 [0. 1. 0.]
