In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import featuretools as ft

Import the data from the given soccer spreadsheets

In [117]:
global_rankings = pd.read_csv("./data/spi_global_rankings.csv")
global_rankings.head()

Unnamed: 0,rank,prev_rank,name,league,off,def,spi
0,1,1,Manchester City,Barclays Premier League,3.24,0.25,94.74
1,2,3,Bayern Munich,German Bundesliga,3.46,0.41,93.98
2,3,2,Liverpool,Barclays Premier League,2.9,0.29,92.43
3,4,4,Paris Saint-Germain,French Ligue 1,2.9,0.5,89.22
4,5,6,Barcelona,Spanish Primera Division,2.83,0.47,89.17


In [118]:
global_rankings_inter = pd.read_csv("./data/spi_global_rankings_intl.csv")
global_rankings_inter.head()

Unnamed: 0,rank,name,confed,off,def,spi
0,1,Spain,UEFA,3.52,0.56,91.92
1,2,Brazil,CONMEBOL,2.88,0.37,90.29
2,3,Germany,UEFA,3.24,0.61,89.43
3,4,Belgium,UEFA,3.05,0.59,88.29
4,5,Argentina,CONMEBOL,2.67,0.41,88.04


In [119]:
match_list = pd.read_csv("./data/spi_matches.csv")
match_list.head()

Unnamed: 0,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016-08-12,1843,French Ligue 1,Bastia,Paris Saint-Germain,51.16,85.68,0.0463,0.838,0.1157,...,32.4,67.7,0.0,1.0,0.97,0.63,0.43,0.45,0.0,1.05
1,2016-08-12,1843,French Ligue 1,AS Monaco,Guingamp,68.85,56.48,0.5714,0.1669,0.2617,...,53.7,22.9,2.0,2.0,2.45,0.77,1.75,0.42,2.1,2.1
2,2016-08-13,2411,Barclays Premier League,Hull City,Leicester City,53.57,66.81,0.3459,0.3621,0.2921,...,38.1,22.2,2.0,1.0,0.85,2.77,0.17,1.25,2.1,1.05
3,2016-08-13,2411,Barclays Premier League,Crystal Palace,West Bromwich Albion,55.19,58.66,0.4214,0.2939,0.2847,...,43.6,34.6,0.0,1.0,1.11,0.68,0.84,1.6,0.0,1.05
4,2016-08-13,2411,Barclays Premier League,Everton,Tottenham Hotspur,68.02,73.25,0.391,0.3401,0.2689,...,31.9,48.0,1.0,1.0,0.73,1.11,0.88,1.81,1.05,1.05


Let's merge the two datasets together. Lets focus on the data that we have plenty of information on so lets do this merge as an intersection of both sets. If the team has no global rankings its much tougher to predict

In particular for each team that is playing you need to get the offensive score in general to get a feel for how their offense is against the other team

You will also need the defense score for the other team. This is important because if you have a clash between a team with a good offense theoretically a team with a good defense equals out the playing field.

In [145]:
match_rankings_1 = pd.merge(match_list , global_rankings[['rank', 'name','off']], left_on = ['team1'] , right_on = ['name'], how = 'inner')
match_rankings.head()
match_rankings = pd.merge(match_rankings_1 , global_rankings[['name','def']], left_on = ['team2'] , right_on = ['name'], how = 'inner')
match_rankings.head()


Unnamed: 0,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,...,xg2,nsxg1,nsxg2,adj_score1,adj_score2,rank,name_x,off,name_y,def
0,2016-08-12,1843,French Ligue 1,AS Monaco,Guingamp,68.85,56.48,0.5714,0.1669,0.2617,...,0.77,1.75,0.42,2.1,2.1,82,AS Monaco,1.86,Guingamp,1.56
1,2017-11-04,1843,French Ligue 1,AS Monaco,Guingamp,77.29,57.31,0.6807,0.1249,0.1944,...,0.39,1.59,0.57,5.98,0.0,82,AS Monaco,1.86,Guingamp,1.56
2,2018-12-22,1843,French Ligue 1,AS Monaco,Guingamp,60.09,54.55,0.5027,0.2268,0.2705,...,1.06,0.88,0.79,0.0,1.98,82,AS Monaco,1.86,Guingamp,1.56
3,2017-02-19,1843,French Ligue 1,Bordeaux,Guingamp,59.49,58.21,0.4603,0.242,0.2977,...,1.0,2.23,1.16,3.14,0.0,128,Bordeaux,1.43,Guingamp,1.56
4,2017-09-23,1843,French Ligue 1,Bordeaux,Guingamp,67.44,58.08,0.5544,0.1869,0.2587,...,1.21,1.53,1.23,2.63,1.05,128,Bordeaux,1.43,Guingamp,1.56


Let's clean up our teams and process them with labels. This is important for the step where we want to build our regression models

In [121]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(match_rankings['team1'])
match_rankings['team1'] = le.transform(match_rankings['team1'])
le.fit(match_list['team2'])
match_rankings['team2'] = le.transform(match_rankings['team2'])



We have some rows with empty data, let's delete those.

In [127]:
match_rankings= match_rankings.dropna()

Create the object you want to predict, in this case we want to predict the score of the first team.

In [144]:
from sklearn.model_selection import train_test_split
my_pred = match_rankings['score1']
pd.DataFrame(my_pred).fillna(0).head()


Unnamed: 0,score1
0,2.0
1,6.0
2,0.0
3,3.0
4,3.0


Lets import the features we want to create the random forest classifier to care about. Obviously we care about the teams, the origianl match prediction score, and finally lets include how good the offense is and the defense of the other team they are playing and create a good relationship between not only the team win potential, but also the strength of its players in their respective positions

In [143]:
df_features = match_rankings[['team1', 'team2', 'spi1', 'spi2', 'prob1', 'prob2', 'probtie','proj_score1', 'proj_score2',
       'importance1', 'importance2', 'score1', 'score2', 'xg1', 'xg2', 'nsxg1',
       'nsxg2', 'adj_score1', 'adj_score2', 'off','def' ]]
df_features = df_features.dropna()
pd.DataFrame(df_features).fillna(0).head()


Unnamed: 0,team1,team2,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,...,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2,off,def
0,13,311,68.85,56.48,0.5714,0.1669,0.2617,1.82,0.86,53.7,...,2.0,2.0,2.45,0.77,1.75,0.42,2.1,2.1,1.86,1.56
1,13,311,77.29,57.31,0.6807,0.1249,0.1944,2.22,0.83,45.0,...,6.0,0.0,2.56,0.39,1.59,0.57,5.98,0.0,1.86,1.56
2,13,311,60.09,54.55,0.5027,0.2268,0.2705,1.53,0.93,45.0,...,0.0,2.0,0.45,1.06,0.88,0.79,0.0,1.98,1.86,1.56
3,86,311,59.49,58.21,0.4603,0.242,0.2977,1.48,1.02,1.2,...,3.0,0.0,2.27,1.0,2.23,1.16,3.14,0.0,1.43,1.56
4,86,311,67.44,58.08,0.5544,0.1869,0.2587,1.63,0.83,45.1,...,3.0,1.0,3.1,1.21,1.53,1.23,2.63,1.05,1.43,1.56


In [130]:
X_train, X_test, y_train, y_test = train_test_split(df_features, my_pred, test_size=0.20, random_state=42)

In [131]:
y_train

15386    0.0
19959    0.0
8329     3.0
3333     1.0
20601    3.0
        ... 
26308    0.0
8780     2.0
9081     2.0
1246     0.0
13759    1.0
Name: score1, Length: 9714, dtype: float64

In [133]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

Create the random forest classifier and fit it to our training sets

In [134]:
clf = RandomForestClassifier()

clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Create a prediction for all the values in our test data as to what the score they got for the game was.

In [148]:
my_prediction = clf.predict(X_test)
my_prediction

array([4., 5., 0., ..., 2., 0., 3.])

Ok. Somehow I got 99%.... A part of me doesn't want to believe that I somehow got this score, but I can't see any problems in my implementation. The testing set even had 2429 data points and we were accurately able to predict most. Shocker.

In [142]:
accuracy_score(my_prediction, y_test)

0.9913544668587896

In [147]:
y_test

20272    5.0
19353    5.0
3219     0.0
6311     0.0
9894     1.0
        ... 
26296    0.0
11432    1.0
20474    2.0
5772     0.0
2715     3.0
Name: score1, Length: 2429, dtype: float64