## <font color='blue'> Section 5. Data Modelling </font>

Next, we will consider different modelling techniques to see which can best represent our data. In this case, we aim to fit a model capable of Multiclass Classification
in order to predict the outcome of the game (win, draw, loss) given the feature set.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:

data = pd.read_csv('./result3.csv')

In [None]:
data.columns

Index(['Unnamed: 0', 'club_game', 'result', 'game_id', 'competition_code',
       'season', 'round', 'date', 'club_id', 'club_goals', 'club_position',
       'location', 'player_number', 'num_attack', 'num_defender',
       'num_midfield', 'num_goalkeeper', 'attack_ratio', 'defender_ratio',
       'midfield_ratio', 'avg_age_team', 'avg_height_team', 'avg_age_attack',
       'avg_age_defender', 'avg_age_midfield', 'avg_age_goalkeeper',
       'avg_height_attack', 'avg_height_defender', 'avg_height_midfield',
       'avg_height_goalkeeper', 'Europe_num', 'North_America_num',
       'South_America_num', 'Asia_num', 'Oceania_num', 'Africa_num',
       'EU_ratio', 'NA_ratio', 'SA_ratio', 'AS_ratio', 'AF_ratio', 'OC_ratio',
       'left_num', 'right_num', 'both_num', 'left_ratio', 'right_ratio',
       'both_ratio'],
      dtype='object')

In [None]:
# Define the feature set (X) and target variable (y) 
X_all = data.copy()
X = X_all.drop(['result','club_game','game_id','competition_code','season','round','date','club_id','club_goals','club_position','location','left_num','right_num','both_num'], axis=1).copy()
y = X_all['result']

print(y)

0         win
1         win
2        draw
3         win
4         win
5        lose
6        draw
7        draw
8        lose
9        draw
10       lose
11        win
12        win
13       lose
14        win
15       draw
16       lose
17       draw
18       draw
19       lose
20        win
21       lose
22        win
23       lose
24        win
25       lose
26        win
27        win
28       lose
29       lose
         ... 
70479     win
70480    lose
70481     win
70482    lose
70483    lose
70484     win
70485     win
70486     win
70487    lose
70488     win
70489    draw
70490    lose
70491    draw
70492    lose
70493     win
70494    lose
70495     win
70496    lose
70497    draw
70498    lose
70499    lose
70500     win
70501     win
70502    draw
70503    lose
70504     win
70505     win
70506     win
70507    lose
70508    lose
Name: result, Length: 70509, dtype: object


In [None]:
# Standardising the data
from sklearn.preprocessing import scale

#Center to the mean and component wise scale to unit variance.
for col in X.columns:
    X[col] = scale(X[col])



In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Let's say we want to split the data in 70:30 for train:test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2, stratify = y)

### Naive Bayes

In [None]:

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Training the Naive Bayes model on the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
ac = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test, y_pred)

In [None]:
 print(ac, cm)

0.41440930364487305 [[ 698 1332 3018]
 [ 919 2461 4410]
 [ 892 1816 5607]]


QDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score

In [None]:
lda = LinearDiscriminantAnalysis()
model = lda.fit(X_train, y_train)

print(model.priors_)

[0.2386336  0.36826323 0.39310317]




In [None]:
print(model.means_)

[[-0.01673371 -0.13190434 -0.0094462  -0.02321831 -0.06244595  0.00549588
   0.01633542  0.0162248  -0.03589169  0.04440521  0.00936517  0.02724842
   0.03736706  0.03691696  0.00716778  0.00311033 -0.00165823  0.00660217
   0.01942718 -0.05459794 -0.01068951 -0.01293733  0.01697274 -0.00032842
   0.03290407 -0.02850952 -0.00626148 -0.00796831  0.01983901  0.00146835
   0.04088129 -0.02203219  0.01361886  0.01243063]
 [ 0.16016082 -0.00766732  0.09112122 -0.0434156  -0.06188772  0.02071723
   0.09608671 -0.04235115 -0.06157304 -0.01741091 -0.00347096 -0.01307933
  -0.01130587 -0.00328674 -0.01539313  0.01962421  0.00557235 -0.02358445
  -0.01911558  0.00830262  0.01235984 -0.07153099  0.00907188  0.01410793
   0.04598539  0.00885252  0.01253941 -0.07089895  0.01084657  0.01391766
   0.04583636  0.00724743  0.00968464 -0.02916509]
 [-0.13988215  0.08725546 -0.079629    0.05476689  0.09588494 -0.0227444
  -0.09993148  0.02982573  0.07947036 -0.01064549 -0.00243351 -0.00428832
  -0.012092

In [None]:
print(model.coef_)

[[-1.01972073e-02  1.61671108e-02  1.89906223e-01 -1.86951875e-01
   6.49552006e-03 -2.92185494e-01 -2.66372030e+00 -1.84987393e+00
  -2.63064042e+00  1.09600056e-02 -1.18504222e-04  1.32183299e-02
   2.05956816e-02  1.66574153e-02 -9.24328692e-03  4.12670782e-03
   5.23356395e-03  6.00883942e-03  2.42236433e-02  4.57779968e-03
  -1.63031125e-01  2.63662662e-01  1.53052539e-01  1.56413269e-02
  -4.11305940e-01 -1.58641489e-02  1.55641152e-01 -2.69219364e-01
  -1.53386889e-01 -1.69668516e-02  4.45889397e-01 -1.17751759e-02
   2.38959536e-03  1.53044553e-02]
 [ 1.70531415e-01 -1.02355262e-01  5.46814709e-02 -3.43427511e-02
  -1.15898353e-01  1.87292116e-01  1.20555283e+00  9.64852362e-01
   1.32007778e+00 -6.03348233e-03 -3.80340667e-03 -9.85666691e-03
   4.03038038e-03  8.68600513e-03 -8.18666863e-03  2.72256827e-02
   1.20965041e-03 -3.34507808e-02 -3.06744913e-02  2.19020090e-01
   4.97760593e-02 -2.03994013e-01 -5.74577653e-01  1.35360198e-01
   3.58814146e-01 -2.17448307e-01 -5.2085

In [None]:
pred=model.predict(X_test)
print(np.unique(pred, return_counts=True))

(array(['draw', 'lose', 'win'], dtype='<U4'), array([  290,  9289, 11574]))


In [None]:
print(confusion_matrix(pred, y_test))
print(classification_report(y_test, pred, digits=3))

[[  96   91  103]
 [2284 4154 2851]
 [2668 3545 5361]]
              precision    recall  f1-score   support

        draw      0.331     0.019     0.036      5048
        lose      0.447     0.533     0.486      7790
         win      0.463     0.645     0.539      8315

   micro avg      0.454     0.454     0.454     21153
   macro avg      0.414     0.399     0.354     21153
weighted avg      0.426     0.454     0.400     21153



### Quadratic Discriminant Analysis

In [None]:
qda = QuadraticDiscriminantAnalysis()
model2 = qda.fit(X_train, y_train)
print(model2.priors_)
print(model2.means_)

[0.2386336  0.36826323 0.39310317]
[[-0.01673371 -0.13190434 -0.0094462  -0.02321831 -0.06244595  0.00549588
   0.01633542  0.0162248  -0.03589169  0.04440521  0.00936517  0.02724842
   0.03736706  0.03691696  0.00716778  0.00311033 -0.00165823  0.00660217
   0.01942718 -0.05459794 -0.01068951 -0.01293733  0.01697274 -0.00032842
   0.03290407 -0.02850952 -0.00626148 -0.00796831  0.01983901  0.00146835
   0.04088129 -0.02203219  0.01361886  0.01243063]
 [ 0.16016082 -0.00766732  0.09112122 -0.0434156  -0.06188772  0.02071723
   0.09608671 -0.04235115 -0.06157304 -0.01741091 -0.00347096 -0.01307933
  -0.01130587 -0.00328674 -0.01539313  0.01962421  0.00557235 -0.02358445
  -0.01911558  0.00830262  0.01235984 -0.07153099  0.00907188  0.01410793
   0.04598539  0.00885252  0.01253941 -0.07089895  0.01084657  0.01391766
   0.04583636  0.00724743  0.00968464 -0.02916509]
 [-0.13988215  0.08725546 -0.079629    0.05476689  0.09588494 -0.0227444
  -0.09993148  0.02982573  0.07947036 -0.01064549 



In [None]:
pred2=model2.predict(X_test)
print(np.unique(pred2, return_counts=True))
print(confusion_matrix(pred2, y_test))
print(classification_report(y_test, pred2, digits=3))

(array(['draw', 'lose', 'win'], dtype=object), array([8749, 4904, 7500]))
[[2113 3153 3483]
 [1212 1965 1727]
 [1723 2672 3105]]
              precision    recall  f1-score   support

        draw      0.242     0.419     0.306      5048
        lose      0.401     0.252     0.310      7790
         win      0.414     0.373     0.393      8315

   micro avg      0.340     0.340     0.340     21153
   macro avg      0.352     0.348     0.336     21153
weighted avg      0.368     0.340     0.341     21153



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=aceb451a-c323-4d42-a6b2-6c4559d224cd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>