In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier

sns.set(color_codes=True)
%matplotlib inline

database = "database.sqlite"
con = sqlite3.connect(database)

In [3]:
#Get main tables from db
countries = pd.read_sql_query("SELECT * from Country", con)
matches = pd.read_sql_query("SELECT * from Match", con)
leagues = pd.read_sql_query("SELECT * from League", con)
teams = pd.read_sql_query("SELECT * from Team", con)
teams_att = pd.read_sql_query("SELECT * FROM Team_Attributes", con)
player_att = pd.read_sql_query("Select * from Player_Attributes", con)

In [4]:
#Cleaning matches
matches_df = pd.read_sql("""SELECT * from MATCH""", con)
teams_df = pd.read_sql("""SELECT * from TEAM""", con)
player_attributes_df = pd.read_sql("""SELECT * from PLAYER_ATTRIBUTES""", con)

matches_df['date'] = pd.to_datetime(matches_df['date'], format='%Y-%m-%d')

#Replace team_id for its name to make it easier to analyse.
team_id_map = teams_df.set_index('team_api_id').to_dict()['team_long_name']
matches_df['home_team_api_id'] = matches_df['home_team_api_id'].map(team_id_map)
matches_df['away_team_api_id'] = matches_df['away_team_api_id'].map(team_id_map)

#Get class variable
matches_df['game_status'] = 0
matches_df['game_status'] = np.where(matches_df['home_team_goal'] > matches_df['away_team_goal'] , '1', matches_df['game_status'])
matches_df['game_status'] = np.where(matches_df['home_team_goal'] < matches_df['away_team_goal'], '-1', matches_df['game_status'])

#Get players info
home_players = ["home_player_" + str(x) for x in range(1, 12)]
away_players = ["away_player_" + str(x) for x in range(1, 12)]

matches_kept_columns = ["id", "league_id", "date", "home_team_api_id", "away_team_api_id", "home_team_goal", "away_team_goal"]
matches_kept_columns = matches_kept_columns + home_players
matches_kept_columns = matches_kept_columns + away_players

matches_df = matches_df[matches_kept_columns]

#Get overall ratings for all players from player_attributes table
for player in home_players:
    matches_df = pd.merge(matches_df, player_attributes_df[["id", "overall_rating"]], left_on=[player], right_on=["id"], suffixes=["", "_" + player])
for player in away_players:
    matches_df = pd.merge(matches_df, player_attributes_df[["id", "overall_rating"]], left_on=[player], right_on=["id"], suffixes=["", "_" + player])
    
matches_df = matches_df.rename(columns={"overall_rating": "overall_rating_home_player_1"})

matches_df = matches_df[ matches_df[['overall_rating_' + p for p in home_players]].isnull().sum(axis = 1) <= 0]
matches_df = matches_df[ matches_df[['overall_rating_' + p for p in away_players]].isnull().sum(axis = 1) <= 0]

matches_df['overall_rating_home'] = matches_df[['overall_rating_' + p for p in home_players]].sum(axis=1)
matches_df['overall_rating_away'] = matches_df[['overall_rating_' + p for p in away_players]].sum(axis=1)
matches_df['overall_rating_difference'] = matches_df['overall_rating_home'] - matches_df['overall_rating_away']

matches_df['mean_overall_rating_home'] = matches_df[['overall_rating_' + p for p in home_players]].mean(axis=1)
matches_df['mean_overall_rating_away'] = matches_df[['overall_rating_' + p for p in away_players]].mean(axis=1)


#Remove all players column because we just need the "global" ones
for c in matches_df.columns:
    if '_player_' in c:
        matches_df = matches_df.drop(c, axis=1)
 
#Get goal difference
matches_df['goal_diff'] = matches_df['home_team_goal'] - matches_df['away_team_goal']

matches_df['Game Result'] = 'Defeat'
matches_df['Game Result'] = np.where(matches_df['goal_diff'] == 0, 'Draw', matches_df['Game Result'])
matches_df['Game Result'] = np.where(matches_df['goal_diff'] > 0, 'Win', matches_df['Game Result'])

matches_df = matches_df.drop(['id', 'date', 'home_team_api_id','away_team_api_id','home_team_goal','away_team_goal','overall_rating_difference', 'goal_diff'], axis=1)
matches_df

Unnamed: 0,league_id,overall_rating_home,overall_rating_away,mean_overall_rating_home,mean_overall_rating_away,Game Result
0,7809,746.0,783.0,67.818182,71.181818,Draw
1,7809,772.0,790.0,70.181818,71.818182,Defeat
4,7809,733.0,770.0,66.636364,70.000000,Win
5,7809,809.0,778.0,73.545455,70.727273,Defeat
6,7809,763.0,791.0,69.363636,71.909091,Win
...,...,...,...,...,...,...
6536,13274,764.0,772.0,69.454545,70.181818,Win
6537,19694,772.0,780.0,70.181818,70.909091,Win
6538,19694,755.0,720.0,68.636364,65.454545,Defeat
6539,17642,752.0,779.0,68.363636,70.818182,Win


In [5]:
y = matches_df['Game Result']
X = matches_df.drop('Game Result', axis=1)
X

Unnamed: 0,league_id,overall_rating_home,overall_rating_away,mean_overall_rating_home,mean_overall_rating_away
0,7809,746.0,783.0,67.818182,71.181818
1,7809,772.0,790.0,70.181818,71.818182
4,7809,733.0,770.0,66.636364,70.000000
5,7809,809.0,778.0,73.545455,70.727273
6,7809,763.0,791.0,69.363636,71.909091
...,...,...,...,...,...
6536,13274,764.0,772.0,69.454545,70.181818
6537,19694,772.0,780.0,70.181818,70.909091
6538,19694,755.0,720.0,68.636364,65.454545
6539,17642,752.0,779.0,68.363636,70.818182


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=42, 
                                                    stratify=y)


In [7]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-1.29270982,  0.42944062,  1.54178267,  0.42944062,  1.54178267],
       [ 1.55241967, -1.8511987 ,  0.31360728, -1.8511987 ,  0.31360728],
       [ 1.03990761, -0.31766536,  0.63055577, -0.31766536,  0.63055577],
       ...,
       [ 1.31123753, -0.94680725,  0.11551448, -0.94680725,  0.11551448],
       [-1.06422148,  2.31686626, -0.24105257,  2.31686626, -0.24105257],
       [ 0.06341083, -0.78952178, -1.07304235, -0.78952178, -1.07304235]])

In [30]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                         hidden_layer_sizes=(15, 10), random_state=1)
clf.fit(X_train_scaled, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20, 15, 10), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [31]:
x=clf.predict(scaler.transform(X_test))
np.set_printoptions(threshold=sys.maxsize)
print(x)

['Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Draw' 'Win' 'Win' 'Win'
 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Draw' 'Win'
 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win'
 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win'
 'Defeat' 'Win' 'Draw' 'Win' 'Win' 'Win' 'Win' 'Win' 'Defeat' 'Win' 'Win'
 'Win' 'Win' 'Win' 'Win' 'Win' 'Draw' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win'
 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Draw' 'Win'
 'Win' 'Draw' 'Win' 'Defeat' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win'
 'Defeat' 'Win' 'Win' 'Draw' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win'
 'Win' 'Draw' 'Win' 'Win' 'Draw' 'Win' 'Draw' 'Win' 'Win' 'Win' 'Win'
 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win'
 'Win' 'Win' 'Win' 'Draw' 'Win' 'Win' 'Win' 'Win' 'Win' 'Draw' 'Draw'
 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Win' 'Draw'
 'Defeat' 'Defeat' 'Win' 'Win' 'Win' 'Win' 'Win' 'Draw' 'Win'

In [32]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, x)

0.444253859348199

In [22]:
clf.predict_proba(X_test)

, 0.00000000e+000],
       [3.83368045e-028, 1.00000000e+000, 8.14012119e-201],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [3.73477536e-007, 9.99999627e-001, 5.76815632e-097],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [3.34642980e-007, 9.99999665e-001, 1.88791069e-096],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [1.38147479e-006, 9.99998619e-001, 4.50454865e-087],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [1.05744834e-011, 1.00000000e+000, 4.87954031e-199],
       [1.63274078e-022, 1.00000000e+000, 1.05311747e-223],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000],
       [2.17542883e-032, 1.00000000e+000, 9.29574667e-212],
       [8.02906274e-