# Expected goals (xG) model

Imports

In [None]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import utils

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

import random
import imblearn
import pickle

## Lectura de los datos.

In [None]:
# Load the full input dataset from CSV (path is relative to the working directory).
dataset = pd.read_csv("data/dataset.csv")

## Modelos

In [None]:
# Player whose rows are kept for training; the name is also used to build the
# file names of the saved model and dataset. Switch players by swapping the
# active assignment with the commented one.
playername = "Lionel Andrés Messi Cuccittini"
#playername = "Luis Alberto Suárez Díaz"

In [None]:
# Keep only the selected player's rows and renumber them from zero.
player_mask = dataset["player"] == playername
dataset = dataset.loc[player_mask].reset_index(drop=True)

In [None]:
# Display (rows, columns) of the player-filtered frame.
dataset.shape

In [None]:
# Preview the first rows of the player-filtered frame.
dataset.head()

In [None]:
# Class balance before any cleaning or resampling.
for label, goal_flag in (("Number No goals", 0), ("Number goals", 1)):
    print(label, len(dataset[dataset["goal"] == goal_flag]))

In [None]:
# Remove duplicate rows, then rows whose angle is above 3.14 or exactly 0,
# or whose distance is above 60.
dataset = dataset.drop_duplicates()
out_of_range = (
    (dataset["angle"] > 3.14)
    | (dataset["distance"] > 60)
    | (dataset["angle"] == 0)
)
dataset = dataset.loc[~out_of_range]

In [None]:
# Target is the `goal` column; features are everything except the identifier
# columns and the target.
y = dataset[["goal"]].copy()
X = dataset.drop(columns=["player", "team", "goal"])

# Alternative resampler that was tried, kept for reference:
#   imblearn.combine.SMOTETomek(sampling_strategy="all", random_state=42)
# NOTE(review): resampling is done on the whole frame, before the train/test
# splits further down, so resampled rows can end up in the test sets —
# confirm this is intended.
SMOTEenn = imblearn.combine.SMOTEENN(sampling_strategy="minority", random_state=42)
X, y = SMOTEenn.fit_resample(X, y)

# Re-assemble features and target into a single frame (target last).
dataset = pd.concat([X, y], axis=1)

In [None]:
# Class balance after resampling.
for label, goal_flag in (("Number No goals", 0), ("Number goals", 1)):
    print(label, len(dataset[dataset["goal"] == goal_flag]))

In [None]:
# Validate the indicator columns after resampling and drop inconsistent rows.
#
# The previous row-by-row loop called `dataset.drop(i)` without assigning the
# result; `DataFrame.drop` returns a new frame, so the loop never removed
# anything. The same rules are applied here with boolean masks:
#   * a row is invalid when every indicator of a group equals 0, and
#   * a row is invalid when more than one indicator of a group equals 1.
play_type_cols = ["Open Play", "Corner", "Free Kick", "Penalty"]
shot_type_cols = ["Backheel", "Diving Header", "Half Volley", "Lob", "Normal",
                  "Overhead Kick", "Volley"]
body_part_cols = ["Head", "Other", "preferred_foot", "not_preferred_foot"]

invalid_rows = pd.Series(False, index=dataset.index)
for group_cols in (play_type_cols, shot_type_cols, body_part_cols):
    group = dataset[group_cols]
    no_flag_set = (group == 0).all(axis=1)
    several_flags_set = (group == 1).sum(axis=1) > 1
    invalid_rows |= no_flag_set | several_flags_set

# Re-apply the angle/distance limits used before resampling, since resampling
# may have produced rows outside them.
out_of_range = (
    (dataset["angle"] > 3.14)
    | (dataset["distance"] > 60)
    | (dataset["angle"] == 0)
)

dataset = dataset.loc[~(invalid_rows | out_of_range)]

In [None]:
# Remove exact duplicate rows left after resampling and filtering.
dataset = dataset.drop_duplicates()

In [None]:
# Class balance of the final, cleaned frame.
for label, goal_flag in (("Number No goals", 0), ("Number goals", 1)):
    print(label, len(dataset[dataset["goal"] == goal_flag]))

In [None]:
# Scatter every row by angle (x) and distance (y), coloured by the `goal` label.
sns.scatterplot(data=dataset, x="angle", y="distance", hue="goal", s=2)

In [None]:
# Accumulates one [model, dataset_num, cv_score] row per call_model() run.
results = []

# Short model code -> zero-argument factory. Lambdas defer construction so an
# estimator is only built when its code is actually requested.
_MODEL_FACTORIES = {
    "LR": lambda: LogisticRegression(max_iter=10000, random_state=42),
    "GBR": lambda: GradientBoostingClassifier(random_state=42),
    "SVM": lambda: SVC(probability=True, random_state=42),
    "RFC": lambda: RandomForestClassifier(random_state=42),
    "MLP": lambda: MLPClassifier(max_iter=500, random_state=42),
}


def call_model(model, dataset_num, X_train, X_test, y_train, y_test):
    """Cross-validate, fit and evaluate one classifier on a train/test split.

    Args:
        model: Model code, one of "LR", "GBR", "SVM", "RFC" or "MLP".
        dataset_num: Label of the feature set, stored alongside the score.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target.

    Returns:
        A list ``[score, score_t, pred, pred_proba, cm]`` where ``score`` is
        the mean 10-fold cross-validated accuracy on the training data,
        ``score_t`` is ``clf.score`` on the test data, ``pred`` and
        ``pred_proba`` are the test predictions and class probabilities, and
        ``cm`` is the test confusion matrix (also printed).

    Raises:
        ValueError: If ``model`` is not a known model code.

    Side effects:
        Appends ``[model, dataset_num, score]`` to the module-level
        ``results`` list.
    """
    # Fail early with a clear message instead of hitting an unbound `clf`.
    try:
        build_classifier = _MODEL_FACTORIES[model]
    except KeyError:
        known = ", ".join(sorted(_MODEL_FACTORIES))
        raise ValueError(
            f"Unknown model code {model!r}; expected one of: {known}"
        ) from None

    clf = build_classifier()

    # Cross-validation uses the training data only; the test split is kept
    # for the final evaluation below.
    cv_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=10)
    score = np.mean(cv_scores)

    clf = clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    pred_proba = clf.predict_proba(X_test)
    score_t = clf.score(X_test, y_test)
    cm = confusion_matrix(y_test, pred)
    print(cm)

    results.append([model, dataset_num, score])

    return [score, score_t, pred, pred_proba, cm]

Posibles datasets

In [None]:
# Column groups used to compose the candidate feature sets. Every feature set
# ends with the `goal` target so the target can be selected as the last column.
geometry_cols = ['distance', 'angle']
context_cols = ['under_pressure', 'follows_dribble', 'one_on_one',
                'first_time', 'defenders_between']
play_type_cols = ['Corner', 'Free Kick', 'Open Play', 'Penalty']
shot_type_cols = ['Backheel', 'Diving Header', 'Half Volley', 'Lob', 'Normal',
                  'Overhead Kick', 'Volley']
body_part_cols = ['Head', 'Other', 'preferred_foot', 'not_preferred_foot']
target_col = ['goal']

# Increasingly rich feature sets, from geometry only to every indicator.
dataset1 = dataset[geometry_cols + target_col].copy()

dataset2 = dataset[geometry_cols + body_part_cols + target_col].copy()

dataset3 = dataset[geometry_cols + play_type_cols + body_part_cols + target_col].copy()

dataset4 = dataset[geometry_cols + context_cols + play_type_cols
                   + body_part_cols + target_col].copy()

dataset5 = dataset[geometry_cols + context_cols + play_type_cols
                   + shot_type_cols + body_part_cols + target_col].copy()

In [None]:
# dataset1: hold out 25% of the rows, then run every model on the same split.
features, target = dataset1.iloc[:, :-1], dataset1.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
for model_code in ("LR", "GBR", "SVM", "RFC", "MLP"):
    res = call_model(model_code, "dataset1", X_train, X_test, y_train, y_test)

In [None]:
# dataset2: hold out 25% of the rows, then run every model on the same split.
features, target = dataset2.iloc[:, :-1], dataset2.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
for model_code in ("LR", "GBR", "SVM", "RFC", "MLP"):
    res = call_model(model_code, "dataset2", X_train, X_test, y_train, y_test)

In [None]:
# dataset3: hold out 25% of the rows, then run every model on the same split.
features, target = dataset3.iloc[:, :-1], dataset3.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
for model_code in ("LR", "GBR", "SVM", "RFC", "MLP"):
    res = call_model(model_code, "dataset3", X_train, X_test, y_train, y_test)

In [None]:
# dataset4: hold out 25% of the rows, then run every model on the same split.
features, target = dataset4.iloc[:, :-1], dataset4.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
for model_code in ("LR", "GBR", "SVM", "RFC", "MLP"):
    res = call_model(model_code, "dataset4", X_train, X_test, y_train, y_test)

In [None]:
# dataset5: hold out 25% of the rows, then run every model on the same split.
features, target = dataset5.iloc[:, :-1], dataset5.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
for model_code in ("LR", "GBR", "SVM", "RFC", "MLP"):
    res = call_model(model_code, "dataset5", X_train, X_test, y_train, y_test)

In [None]:
# Tabulate the mean cross-validated score recorded by every call_model() run.
pd.DataFrame(results, columns=["model","dataset","score"])

Hyperparameter tuning

In [None]:
# Search space for the random-forest grid search.
# NOTE: this grid has 42,120 combinations, each fitted 10 times by the 10-fold
# cross-validation below, so the search is very expensive.
param_grid = {
    # Fixed seed so every candidate is reproducible (was `'random_state'= [42]`,
    # which is a syntax error inside a dict literal).
    'random_state': [42],
    'bootstrap': [True, False],
    'criterion': ["gini","entropy","log_loss"],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, None],
    'max_features': [None,"sqrt","log2"],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    # An integer min_samples_split must be at least 2; the former value 1 is
    # rejected by scikit-learn and made every candidate using it fail to fit.
    'min_samples_split': [2, 4, 6, 8, 10, 12],
    'n_estimators': [20, 50, 100, 200, 300, 1000]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv=10, n_jobs = -1, verbose = 2)

# The search is run on the richest feature set (dataset5); the last column is
# the target.
grid_search.fit(dataset5.iloc[:,0:-1], dataset5.iloc[:,-1])
best_grid = grid_search.best_estimator_

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Estimator refitted with the best parameter combination.
grid_search.best_estimator_

In [None]:
# Mean cross-validated score of the best parameter combination.
grid_search.best_score_

In [None]:
# One row per tested parameter combination, with its mean cross-validated
# test score in the "Accuracy" column.
tested_params = pd.DataFrame(grid_search.cv_results_["params"])
mean_scores = pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["Accuracy"])
resultados_hyptun = pd.concat([tested_params, mean_scores], axis=1)

In [None]:
# Show the 20 parameter combinations with the highest "Accuracy".
resultados_hyptun.sort_values('Accuracy',ascending=False).head(20)

Guardar modelo

In [None]:
# Random-forest settings recorded per player. Previously the active line was
# picked by (un)commenting, and the settings tagged for Suárez were active
# while `playername` selected Messi; looking them up by `playername` keeps the
# model and the player in sync.
rf_params_by_player = {
    "Lionel Andrés Messi Cuccittini": {"n_estimators": 200, "min_samples_split": 2},
    "Luis Alberto Suárez Díaz": {"n_estimators": 100, "min_samples_split": 4},
}
if playername not in rf_params_by_player:
    raise KeyError(f"No random-forest parameters recorded for player {playername!r}")

rf = RandomForestClassifier(bootstrap=False, random_state=42, **rf_params_by_player[playername])

# Fit on the full dataset5 frame; the last column is the target.
rf = rf.fit(dataset5.iloc[:,0:-1], dataset5.iloc[:,-1])

In [None]:
# Persist the fitted model with pickle; the file name is derived from the
# player name with spaces replaced by underscores.
filename = "xG_model_"+ playername.replace(" ", "_")
# Use a context manager so the file handle is flushed and closed even if
# pickling fails (the bare `open(...)` was never closed).
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Write the dataset5 frame to CSV next to the input data, named after the
# player (spaces replaced by underscores), without the index column.
player_slug = playername.replace(" ", "_")
filename = "data/dataset_" + player_slug + ".csv"
dataset5.to_csv(filename, index=False)