In [167]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import random
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn import svm

In [146]:
df = pd.read_csv('newstats.csv', index_col = 0)

In [154]:
df.groupby(['winner']).mean()[['1 Goals', '2 Goals','1 Possession','2 Possession']]

Unnamed: 0_level_0,1 Goals,2 Goals,1 Possession,2 Possession
winner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.55814,0.589147,0.512357,0.487643
1,1.245353,0.282528,0.613803,0.386197
2,0.232394,1.288732,0.382754,0.617246


In [153]:
df.groupby(['winner']).count()[['1 Goals', '2 Goals','1 Possession','2 Possession', "1 Total xG", "2 Total xG"]]

Unnamed: 0_level_0,1 Goals,2 Goals,1 Possession,2 Possession,1 Total xG,2 Total xG
winner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,129,129,129,129,129,129
1,269,269,269,269,269,269
2,284,284,284,284,284,284


In [152]:
for i in range(3):
    for index, row in df.iterrows():
        if bool(random.getrandbits(1)):
            row1 = df.iloc[index]
            row1[:21], row1[21:-2] = row1[21:-2], row1[:21].copy()
            row1[-2] = 1 if row1[-2] == 2 else 2 if row1[-2] == 1 else 0
            row1[-1] = 1 if row1[-1] == 2 else 2 if row1[-1] == 1 else 0
            df.iloc[index] = row1

In [156]:
features = df.iloc[: , :-2]
target = df.winner

In [157]:
X_train, X_test, y_train, y_test = train_test_split(features, target)

In [168]:
cl_estimators = {
    'k-Nearest Neighbor': KNeighborsClassifier(), 
    'Gaussian Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter = 100000),
    'SVM': svm.SVC()}

In [169]:
def classifiers_models():
    for name, model in cl_estimators.items(): 
        x = model.fit(X=X_train, y = y_train)
        accuracytest = x.score(X_test, y_test)
        accuracytrain = x.score(X_train, y_train)
        print(name + ": \n\t" + "Classification accuracy on the test data:", f"{accuracytest:.2%}")
        print(f"\tClassification accuracy on the training data: {accuracytrain:.2%}\n")

In [170]:
classifiers_models()

k-Nearest Neighbor: 
	Classification accuracy on the test data: 62.57%
	Classification accuracy on the training data: 69.86%

Gaussian Naive Bayes: 
	Classification accuracy on the test data: 67.25%
	Classification accuracy on the training data: 65.56%

Decision Tree: 
	Classification accuracy on the test data: 60.23%
	Classification accuracy on the training data: 100.00%

Logistic Regression: 
	Classification accuracy on the test data: 72.51%
	Classification accuracy on the training data: 74.17%

SVM: 
	Classification accuracy on the test data: 71.35%
	Classification accuracy on the training data: 66.34%



In [171]:
def logistic_tuning():
    param_grid = {"solver":["sag", "saga","liblinear","newton-cg", "lbfgs"]}
    grid_search = GridSearchCV(LogisticRegression(max_iter = 100000), param_grid, cv=10)
    grid_search.fit(X=X_train, y=y_train)
    print("Best parameters: ", grid_search.best_params_)
    print("Training set score with best parameter: ", grid_search.score(X_train, y_train))
    print("Test set score with best parameter: ", grid_search.score(X_test, y_test))

In [172]:
logistic_tuning()

Best parameters:  {'solver': 'liblinear'}
Training set score with best parameter:  0.7279843444227005
Test set score with best parameter:  0.7076023391812866


In [173]:
model = LogisticRegression(max_iter=100000)
model.fit(X=X_train, y = y_train)
predicted = model.predict(X=X_test)
odds = model.predict_proba(X = X_test)


In [174]:
X_test['winner'] = y_test
X_test['predicted'] = predicted

In [176]:
oddf = pd.DataFrame(odds, columns = ['% Tie', '% 1 Win', '% 2 Win'], index = X_test.index)

In [177]:
X_test = pd.concat([X_test, oddf], axis=1)

In [175]:
def hwin(s1, s2):
    if s1 > s2: 
        return 1
    elif s2 > s1:
        return 2
    else:
        return 0
X_test['fh winner'] = X_test.apply(lambda row: hwin(row['1 Goals'], row['2 Goals']), axis = 1)

In [178]:
X_test

Unnamed: 0,1 Accurate Long Balls,1 Corners,1 Dribbles Attempted,1 Dribbles Succeeded,1 Fouls Conceded,1 Goals,1 Interceptions,1 Long Balls,1 Offsides,1 Pass Completion,...,2 Total Passes,2 Total Shots on Target,2 Total xG,2 Yellow Cards,winner,predicted,fh winner,% Tie,% 1 Win,% 2 Win
525,14,1,1,1,9,0,7,33,1,0.679,...,431,1,0.665,0,0,2,0,0.072293,0.084801,0.842906
598,16,4,7,4,14,1,2,45,3,0.583,...,249,4,0.603,1,2,2,0,0.165558,0.184876,0.649566
194,11,0,7,5,7,0,11,30,2,0.619,...,339,1,0.469,1,0,2,0,0.037803,0.030681,0.931515
47,14,2,9,6,7,0,4,29,0,0.881,...,184,3,0.531,0,1,0,0,0.450585,0.372925,0.176489
130,19,2,13,8,9,1,2,27,0,0.918,...,155,1,0.911,1,0,1,0,0.349015,0.614944,0.036041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,16,1,7,2,5,1,6,35,0,0.761,...,328,1,0.208,2,1,2,0,0.305305,0.179604,0.515091
10,13,3,6,4,4,1,5,32,0,0.846,...,225,1,0.410,0,1,1,1,0.220702,0.703993,0.075304
355,14,3,11,8,4,1,7,26,0,0.802,...,469,4,1.530,0,2,2,2,0.271935,0.033510,0.694555
460,15,2,14,10,10,0,5,37,1,0.614,...,341,2,0.624,0,0,2,0,0.291070,0.270402,0.438528


In [180]:
X_test[X_test['winner'] != X_test['predicted']].head(50)

Unnamed: 0,1 Accurate Long Balls,1 Corners,1 Dribbles Attempted,1 Dribbles Succeeded,1 Fouls Conceded,1 Goals,1 Interceptions,1 Long Balls,1 Offsides,1 Pass Completion,...,2 Total Passes,2 Total Shots on Target,2 Total xG,2 Yellow Cards,winner,predicted,fh winner,% Tie,% 1 Win,% 2 Win
525,14,1,1,1,9,0,7,33,1,0.679,...,431,1,0.665,0,0,2,0,0.072293,0.084801,0.842906
194,11,0,7,5,7,0,11,30,2,0.619,...,339,1,0.469,1,0,2,0,0.037803,0.030681,0.931515
47,14,2,9,6,7,0,4,29,0,0.881,...,184,3,0.531,0,1,0,0,0.450585,0.372925,0.176489
130,19,2,13,8,9,1,2,27,0,0.918,...,155,1,0.911,1,0,1,0,0.349015,0.614944,0.036041
517,6,0,3,2,14,1,6,30,0,0.701,...,243,2,1.172,0,2,0,1,0.446423,0.342946,0.210632
183,18,3,14,9,2,0,9,28,1,0.837,...,287,2,0.39,0,1,2,0,0.133197,0.312908,0.553895
473,10,2,12,8,13,1,9,31,0,0.639,...,372,4,1.342,0,0,2,0,0.312484,0.050663,0.636853
267,24,3,16,9,6,0,6,32,2,0.878,...,148,1,0.306,1,2,1,2,0.242284,0.38694,0.370776
208,14,4,20,14,3,0,6,22,1,0.873,...,107,4,0.489,2,1,2,2,0.334629,0.212084,0.453287
502,9,3,25,11,8,2,6,29,1,0.763,...,205,4,0.911,0,2,1,1,0.142358,0.764935,0.092707


In [187]:
len(model.coef_[0])

42

In [242]:
model.n_features_in_

42

In [240]:
impf = {}
for f, c in zip(features.columns, model.coef_[0]):
    impf[f] = c
impf = dict(sorted(impf.items(), key=lambda x: x[0][1:]) )

In [241]:
impf

{'1 Accurate Long Balls': 0.02318709647698052,
 '2 Accurate Long Balls': -0.040530138787669034,
 '1 Corners': -0.030979229227089403,
 '2 Corners': 0.03428199164602894,
 '1 Dribbles Attempted': -0.034806494564596065,
 '2 Dribbles Attempted': -0.01955028888500152,
 '1 Dribbles Succeeded': 0.008556402900207653,
 '2 Dribbles Succeeded': 0.07092912207198492,
 '1 Fouls Conceded': 0.007862144055574236,
 '2 Fouls Conceded': 0.006709500429363881,
 '1 Goals': 0.22812453984247955,
 '2 Goals': -0.0664448312922816,
 '1 Interceptions': -0.021884224477067525,
 '2 Interceptions': -0.008380927855637238,
 '1 Long Balls': -0.023610983461058065,
 '2 Long Balls': 0.00790591214404024,
 '1 Offsides': -0.11339495240569386,
 '2 Offsides': 0.0028688637461839668,
 '1 Pass Completion': 0.10220808711645554,
 '2 Pass Completion': -0.03824918559282281,
 '1 Possession': 0.05965094593907284,
 '2 Possession': -0.16552510138827267,
 '1 Red Cards': -0.11281167694068185,
 '2 Red Cards': -0.2756511711980367,
 '1 Shots': 0.

In [253]:
pd.DataFrame(model.coef_, columns = features.columns).iloc[0]

1 Accurate Long Balls      0.023187
1 Corners                 -0.030979
1 Dribbles Attempted      -0.034806
1 Dribbles Succeeded       0.008556
1 Fouls Conceded           0.007862
1 Goals                    0.228125
1 Interceptions           -0.021884
1 Long Balls              -0.023611
1 Offsides                -0.113395
1 Pass Completion          0.102208
1 Possession               0.059651
1 Red Cards               -0.112812
1 Shots                    0.073320
1 Shots Blocked           -0.107572
1 Shots woodwork          -0.472116
1 Tackles Attempted       -0.027405
1 Throws                   0.022373
1 Total Passes             0.003053
1 Total Shots on Target   -0.129112
1 Total xG                -0.051992
1 Yellow Cards            -0.093145
2 Accurate Long Balls     -0.040530
2 Corners                  0.034282
2 Dribbles Attempted      -0.019550
2 Dribbles Succeeded       0.070929
2 Fouls Conceded           0.006710
2 Goals                   -0.066445
2 Interceptions           -0