# RNNs part
### The same predictions as in the earlier parts, this time using a recurrent neural network built with the tools provided by the Keras library

### Data load

In [1]:
# Support both Python 2 and Python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# Make results repeatable across runs (this seeds NumPy's global RNG only)
np.random.seed(42)

# Nicer-looking plots
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Location where figures will be saved
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "preparing_dataset"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "pictures", CHAPTER_ID)

def image_path(fig_id):
    """Return the path of ``fig_id`` inside this chapter's ``pictures`` folder.

    The path is built from ``PROJECT_ROOT_DIR``, the literal ``"pictures"``
    folder name and ``CHAPTER_ID``; ``fig_id`` is appended unchanged.
    """
    path_parts = (PROJECT_ROOT_DIR, "pictures", CHAPTER_ID, fig_id)
    return os.path.join(*path_parts)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    # savefig does not create missing folders, so make sure the target exists
    # (isdir + makedirs keeps this usable on Python 2 as well).
    if not os.path.isdir(IMAGES_PATH):
        os.makedirs(IMAGES_PATH)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving an image", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
import os
import pandas as pd

# Folder of each league's CSV export, relative to the working directory.
# Every location is bound to both an upper-case constant and a lower-case alias.
FOOTBALL_PATH_SP = football_path_sp = os.path.join("datasets", "spain")
FOOTBALL_PATH_EN = football_path_en = os.path.join("datasets", "england")
FOOTBALL_PATH_FR = football_path_fr = os.path.join("datasets", "france")
FOOTBALL_PATH_GE = football_path_ge = os.path.join("datasets", "germany")
FOOTBALL_PATH_IT = football_path_it = os.path.join("datasets", "italy")

def load_football_data(football_path, file):
    """Read one CSV file into a DataFrame, skipping malformed rows.

    Parameters
    ----------
    football_path : str
        Folder that contains the CSV file.
    file : str
        Name of the CSV file inside ``football_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; rows with too many fields are skipped with a warning.
    """
    csv_path = os.path.join(football_path, file)
    try:
        # Current spelling: ``error_bad_lines`` was deprecated in pandas 1.3 and
        # removed in 2.0. "warn" mirrors the old skip-and-warn behaviour.
        return pd.read_csv(csv_path, on_bad_lines="warn")
    except TypeError:
        # Older pandas rejects the unknown ``on_bad_lines`` keyword; use the legacy flag.
        return pd.read_csv(csv_path, error_bad_lines=False)

In [3]:
# Read every league's results table; the order matches the names on the left.
football_sp, football_en, football_fr, football_ge, football_it = [
    load_football_data(folder, csv_name)
    for folder, csv_name in (
        (FOOTBALL_PATH_SP, "spain.csv"),
        (FOOTBALL_PATH_EN, "england.csv"),
        (FOOTBALL_PATH_FR, "france.csv"),
        (FOOTBALL_PATH_GE, "germany.csv"),
        (FOOTBALL_PATH_IT, "italy.csv"),
    )
]

In [4]:
# Work on a copy of the Italian league table so the loaded frame stays intact.
# Rows without a date are dropped, then remaining missing values become 0.
# The index is rebuilt as 0..n-1 after the drop: a column attached later as a
# freshly built Series is aligned by index label, so gaps left by dropped rows
# would otherwise turn into NaN values.
football = (
    football_it.copy()
    .dropna(subset=["Date"])
    .reset_index(drop=True)
    .fillna(0)
)

In [5]:
football.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5004 entries, 0 to 5003
Data columns (total 26 columns):
Div         5004 non-null object
Date        5004 non-null object
HomeTeam    5004 non-null object
AwayTeam    5004 non-null object
FTHG        5004 non-null float64
FTAG        5004 non-null float64
FTR         5004 non-null object
HTHG        5004 non-null float64
HTAG        5004 non-null float64
HTR         5004 non-null object
HS          5004 non-null float64
AS          5004 non-null float64
HST         5004 non-null float64
AST         5004 non-null float64
HF          5004 non-null float64
AF          5004 non-null float64
HY          5004 non-null float64
AY          5004 non-null float64
HR          5004 non-null float64
AR          5004 non-null float64
B365H       5004 non-null float64
B365D       5004 non-null float64
B365A       5004 non-null float64
BWH         5004 non-null float64
BWD         5004 non-null float64
BWA         5004 non-null float64
dtypes: float64

In [6]:
from sklearn.preprocessing import LabelEncoder

# Plain Python lists of every categorical column that gets an integer code.
homeTeamList = football["HomeTeam"].tolist()
awayTeamList = football["AwayTeam"].tolist()
fTRList = football["FTR"].tolist()
hTRList = football["HTR"].tolist()
divList = football["Div"].tolist()

labelEncoder = LabelEncoder()

# NOTE(review): in the displayed output the hTR codes are shifted by one relative
# to fTR (H -> 3 vs H -> 2), which suggests HTR holds one extra value - presumably
# the 0 written by fillna(0) for missing half-time results. Confirm.

# The encoder is refitted for each column in turn (home, away, half-time result,
# full-time result, division), so it ends up fitted on the division values.
for values, encoded_column in (
    (homeTeamList, "homeTeam"),
    (awayTeamList, "awayTeam"),
    (hTRList, "hTR"),
    (fTRList, "fTR"),
    (divList, "div"),
):
    labelEncoder.fit(values)
    label = labelEncoder.transform(values)
    # Assign the array itself (positional). Wrapping it in pd.Series would align
    # it on a fresh 0..n-1 index and produce NaN wherever football's own index
    # has gaps.
    football[encoded_column] = label

In [7]:
import datetime

# Parse the day/month/two-digit-year strings into timestamps.
dates = pd.to_datetime(football["Date"], format="%d/%m/%y")

# The .dt accessors are vectorised and keep football's own index, so the new
# columns line up row by row without going through intermediate Python lists.
football["DayOfTheYear"] = dates.dt.dayofyear
football["Year"] = dates.dt.year
football.head(20)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BWH,BWD,BWA,homeTeam,awayTeam,hTR,fTR,div,DayOfTheYear,Year
0,I1,27/08/05,Fiorentina,Sampdoria,2.0,1.0,H,2.0,0.0,H,...,2.25,2.9,3.2,13,30,3,2,0,239,2005
1,I1,27/08/05,Livorno,Lecce,2.0,1.0,H,1.0,1.0,D,...,1.9,3.2,3.8,20,19,2,2,0,239,2005
2,I1,28/08/05,Ascoli,Milan,1.0,1.0,D,0.0,0.0,D,...,9.0,4.5,1.3,0,22,2,1,0,240,2005
3,I1,28/08/05,Inter,Treviso,3.0,0.0,H,1.0,0.0,H,...,1.16,5.9,15.0,16,35,3,2,0,240,2005
4,I1,28/08/05,Juventus,Chievo,1.0,0.0,H,1.0,0.0,H,...,1.22,5.5,10.0,17,10,3,2,0,240,2005
5,I1,28/08/05,Lazio,Messina,1.0,0.0,H,1.0,0.0,H,...,1.9,3.1,4.0,18,21,3,2,0,240,2005
6,I1,28/08/05,Parma,Palermo,1.0,1.0,D,0.0,1.0,A,...,2.55,2.9,2.75,26,25,1,1,0,240,2005
7,I1,28/08/05,Reggina,Roma,0.0,3.0,A,0.0,1.0,A,...,3.9,3.15,1.9,28,29,1,0,0,240,2005
8,I1,28/08/05,Siena,Cagliari,2.0,1.0,H,1.0,1.0,D,...,2.2,2.9,3.3,32,6,2,2,0,240,2005
9,I1,28/08/05,Udinese,Empoli,1.0,0.0,H,1.0,0.0,H,...,1.6,3.4,5.5,36,12,3,2,0,240,2005


In [8]:
import datetime

# NOTE(review): this cell recomputes the same two date columns as the preceding
# cell; it is idempotent, so running it again is harmless - consider removing it.

# Parse the day/month/two-digit-year strings into timestamps.
dates = pd.to_datetime(football["Date"], format="%d/%m/%y")

# The .dt accessors are vectorised and keep football's own index, so the new
# columns line up row by row without going through intermediate Python lists.
football["DayOfTheYear"] = dates.dt.dayofyear
football["Year"] = dates.dt.year

In [9]:
# Remove the raw text columns, the goal counts and the encoded division column.
columns_to_drop = [
    'div', 'Div', 'Date', 'HomeTeam', 'AwayTeam',
    'HTR', 'FTR', 'FTHG', 'HTHG', 'FTAG', 'HTAG',
]
football = football.drop(columns=columns_to_drop)

In [10]:
football.head(5)

Unnamed: 0,HS,AS,HST,AST,HF,AF,HY,AY,HR,AR,...,B365A,BWH,BWD,BWA,homeTeam,awayTeam,hTR,fTR,DayOfTheYear,Year
0,15.0,9.0,9.0,4.0,18.0,23.0,1.0,4.0,0.0,0.0,...,3.25,2.25,2.9,3.2,13,30,3,2,239,2005
1,17.0,6.0,7.0,5.0,27.0,21.0,2.0,3.0,0.0,0.0,...,4.0,1.9,3.2,3.8,20,19,2,2,239,2005
2,8.0,16.0,3.0,9.0,22.0,16.0,2.0,1.0,0.0,0.0,...,1.4,9.0,4.5,1.3,0,22,2,1,240,2005
3,16.0,7.0,9.0,3.0,13.0,20.0,1.0,3.0,0.0,0.0,...,13.0,1.16,5.9,15.0,16,35,3,2,240,2005
4,16.0,2.0,7.0,0.0,16.0,12.0,1.0,2.0,0.0,0.0,...,13.0,1.22,5.5,10.0,17,10,3,2,240,2005


In [11]:
from sklearn.model_selection import train_test_split

# Every column except the encoded full-time result is a feature; that column is the target.
feature_mask = football.columns != 'fTR'
x = football.loc[:, feature_mask]
y = football.loc[:, 'fTR']

# Hold out 30% of the rows for testing, with a fixed seed for the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

### Encoding the input data so that it works with RNNs (the expected input shape is [batch_size, n_steps, n_inputs])

In [12]:
# Cast features and labels to float before converting them to arrays.
x_train, x_test = x_train.astype(float), x_test.astype(float)
y_train, y_test = y_train.astype(float), y_test.astype(float)

In [13]:
# Convert each split into a NumPy array.
x_train, x_test, y_train, y_test = [
    np.array(part) for part in (x_train, x_test, y_train, y_test)
]

In [14]:
# Insert a length-1 time-step axis: [samples, features] -> [samples, 1, features]
x_train = x_train[:, np.newaxis, :]
x_test = x_test[:, np.newaxis, :]

In [15]:
from keras.utils import np_utils

# One-hot encode the class labels (one column per class).
y_train, y_test = [
    np_utils.to_categorical(labels) for labels in (y_train, y_test)
]

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [37]:
import tensorflow as tf

# Layer sizes are read from the prepared arrays instead of being hard-coded, so
# the cell keeps working when the feature set changes.
n_steps, n_features = x_train.shape[1:]   # x_train is [samples, time steps, features]
n_classes = y_train.shape[1]              # width of the one-hot label matrix

# Two stacked LSTM layers followed by a softmax over the outcome classes.
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(256, input_shape=(n_steps, n_features), activation='relu', return_sequences=True))
model.add(tf.keras.layers.LSTM(128, activation='relu'))
model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=200, batch_size=20, verbose=1)
model.save('rnn_all_model.h5')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [38]:
# Predictions for the held-out rows.
y_pred = model.predict(x_test, batch_size=20)

# evaluate() returns the loss followed by the accuracy metric on the held-out split.
test_metrics = model.evaluate(x_test, y_test, verbose=0)
val_loss, val_acc = test_metrics
print(val_acc, val_loss)

0.66378164 0.7577571179196932


<br> Test accuracy per league (all features):
<br> France 0.6347352
<br> Spain 0.6623537
<br> England 0.66789895
<br> Germany 0.65761316
<br> Italy 0.66378164

In [16]:
# Second experiment: the same pipeline without the encoded half-time result.
football = football.drop(columns=['hTR'])
x = football.loc[:, football.columns != 'fTR']
y = football.loc[:, 'fTR']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

# Cast every split to float and convert it to a NumPy array.
x_train, x_test, y_train, y_test = [
    np.array(part.astype(float)) for part in (x_train, x_test, y_train, y_test)
]

# Insert a length-1 time-step axis: [samples, features] -> [samples, 1, features]
x_train = x_train[:, np.newaxis, :]
x_test = x_test[:, np.newaxis, :]

# One-hot encode the class labels.
y_train, y_test = [
    np_utils.to_categorical(labels) for labels in (y_train, y_test)
]

In [40]:
import tensorflow as tf

# Layer sizes are read from the prepared arrays instead of being hard-coded, so
# the cell keeps working when the feature set changes.
n_steps, n_features = x_train.shape[1:]   # x_train is [samples, time steps, features]
n_classes = y_train.shape[1]              # width of the one-hot label matrix

# Same architecture as the first model, trained without the half-time result.
model_no_htr = tf.keras.Sequential()
model_no_htr.add(tf.keras.layers.LSTM(256, input_shape=(n_steps, n_features), activation='relu', return_sequences=True))
model_no_htr.add(tf.keras.layers.LSTM(128, activation='relu'))
model_no_htr.add(tf.keras.layers.Dense(n_classes, activation='softmax'))
model_no_htr.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_no_htr.fit(x_train, y_train, epochs=200, batch_size=20, verbose=1)
model_no_htr.save('rnn_no_htr_model.h5')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [41]:
# Predictions for the held-out rows.
y_pred = model_no_htr.predict(x_test, batch_size=20)

# evaluate() returns the loss followed by the accuracy metric on the held-out split.
test_metrics = model_no_htr.evaluate(x_test, y_test, verbose=0)
val_loss, val_acc = test_metrics
print(val_acc, val_loss)

0.5985353 0.8756807754423901


<br> Test accuracy per league (half-time result removed):
<br> France 0.57476634
<br> Spain 0.6155268
<br> England 0.5902649
<br> Germany 0.5761317
<br> Italy 0.5985353

In [17]:
# Third experiment: additionally remove the in-match statistics columns.
match_stat_columns = ['HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']
football = football.drop(columns=match_stat_columns)
x = football.loc[:, football.columns != 'fTR']
y = football.loc[:, 'fTR']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

# Cast every split to float and convert it to a NumPy array.
x_train, x_test, y_train, y_test = [
    np.array(part.astype(float)) for part in (x_train, x_test, y_train, y_test)
]

# Insert a length-1 time-step axis: [samples, features] -> [samples, 1, features]
x_train = x_train[:, np.newaxis, :]
x_test = x_test[:, np.newaxis, :]

# One-hot encode the class labels.
y_train, y_test = [
    np_utils.to_categorical(labels) for labels in (y_train, y_test)
]

In [18]:
import tensorflow as tf

# Layer sizes are read from the prepared arrays instead of being hard-coded, so
# the cell keeps working when the feature set changes.
n_steps, n_features = x_train.shape[1:]   # x_train is [samples, time steps, features]
n_classes = y_train.shape[1]              # width of the one-hot label matrix

# Same architecture again, trained without half-time result and match statistics.
model_no_stats = tf.keras.Sequential()
model_no_stats.add(tf.keras.layers.LSTM(256, input_shape=(n_steps, n_features), activation='relu', return_sequences=True))
model_no_stats.add(tf.keras.layers.LSTM(128, activation='relu'))
model_no_stats.add(tf.keras.layers.Dense(n_classes, activation='softmax'))
model_no_stats.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_no_stats.fit(x_train, y_train, epochs=200, batch_size=20, verbose=1)
model_no_stats.save('rnn_no_stats_model.h5')

W1124 22:01:01.955901 139867182622528 deprecation.py:506] From /home/kub/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1124 22:01:02.684263 139867182622528 deprecation.py:323] From /home/kub/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/

In [19]:
from sklearn.metrics import accuracy_score

# Sequential.predict_classes is deprecated and absent from newer TensorFlow
# releases; taking the argmax over the softmax output gives the same class indices.
y_pred = np.argmax(model_no_stats.predict(x_test, batch_size=20), axis=-1)
# y_test is one-hot encoded at this point, so it has to be decoded before it can
# be compared with the predicted class indices:
# print(accuracy_score(np.argmax(y_test, axis=-1), y_pred))

In [20]:
# evaluate() returns the loss followed by the accuracy metric on the held-out split.
test_metrics = model_no_stats.evaluate(x_test, y_test, verbose=0)
val_loss, val_acc = test_metrics
print(val_acc)

0.5406125


<br> Test accuracy per league (half-time result and match statistics removed):
<br> France 0.50311524
<br> Spain 0.52002466
<br> England 0.54775107
<br> Germany 0.49465021
<br> Italy 0.5406125