In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from statistics import mean
import plotly.graph_objects as go
import pylab
import warnings
from scipy.stats import sem
from datetime import datetime
from termcolor import colored
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, KFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import json
import random

Loading data

In [None]:
# Raise pandas' display limits so long and wide frames are shown in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# Load the accumulated point-by-point data and drop the raw per-point columns
# that are not used downstream.
#
# `player1`, `player2`, `winner_name`, `loser_name`, `year`, `P1NetPoint` and
# `P2NetPoint` are deliberately NOT dropped here: the rank assignment, the
# tiebreak flag and the server/returner column swap further down still read
# them, so removing them at load time breaks a Restart & Run All. None of them
# appears in the model feature list, so keeping them does not change the
# model inputs.
df = pd.read_csv('Akkumulated_tennis_PR.csv')

COLUMNS_TO_DROP = [
    # index artefacts from earlier CSV round-trips
    'Unnamed: 0.1', 'Unnamed: 0',
    # raw per-point indicators
    'P1Ace', 'P2Ace',
    'P1Winner', 'P2Winner',
    'P1DoubleFault', 'P2DoubleFault',
    'P1UnfErr', 'P2UnfErr',
    'P1NetPointWon', 'P2NetPointWon',
    'P1BreakPoint', 'P2BreakPoint',
    'P1BreakPointWon', 'P2BreakPointWon',
    'P1BreakPointMissed', 'P2BreakPointMissed',
    'ServeIndicator', 'ServeNumber',
    'WinnerType', 'WinnerShotType',
    'P1DistanceRun', 'P2DistanceRun',
    'RallyCount',
    # one-hot serve / return placement, player 1
    'P1_ServeWidth_0', 'P1_ServeWidth_B', 'P1_ServeWidth_BC', 'P1_ServeWidth_BW',
    'P1_ServeWidth_C', 'P1_ServeWidth_DoubleFault', 'P1_ServeWidth_W',
    'P1_ServeDepth_0', 'P1_ServeDepth_CTL', 'P1_ServeDepth_DoubleFault',
    'P1_ServeDepth_NCTL',
    'P1_ReturnDepth_0', 'P1_ReturnDepth_D', 'P1_ReturnDepth_DoubleFault',
    'P1_ReturnDepth_ND', 'P1_ReturnDepth_ServeAce', 'P1_ReturnDepth_Service box',
    # one-hot serve / return placement, player 2
    'P2_ServeWidth_0', 'P2_ServeWidth_B', 'P2_ServeWidth_BC', 'P2_ServeWidth_BW',
    'P2_ServeWidth_C', 'P2_ServeWidth_DoubleFault', 'P2_ServeWidth_W',
    'P2_ServeDepth_0', 'P2_ServeDepth_CTL', 'P2_ServeDepth_DoubleFault',
    'P2_ServeDepth_NCTL',
    'P2_ReturnDepth_0', 'P2_ReturnDepth_D', 'P2_ReturnDepth_DoubleFault',
    'P2_ReturnDepth_ND', 'P2_ReturnDepth_ServeAce', 'P2_ReturnDepth_Service box',
    # raw timing / placement / momentum columns
    'ElapsedTime', 'ServeWidth', 'ServeDepth',
    'P1Momentum', 'P2Momentum',
    'ReturnDepth', 'datetime',
    # match metadata that no later cell reads
    'slam', 'match_num',
    'tourney_name', 'tourney_date',
    'winner_hand', 'loser_hand',
]

df = df.drop(columns=COLUMNS_TO_DROP)

Making rank for p1 and p2

In [None]:
# Overwrite the loser's rank with 501 for every row of this one match.
missing_rank_match = df['match_id'] == "2020-usopen-1228"
df.loc[missing_rank_match, 'loser_rank'] = 501

In [None]:
# Give each row the ranking of player 1 and player 2.
# winner_rank / loser_rank are keyed by match outcome, so the player-1 rank is
# the winner's rank when player1 won the match and the loser's rank otherwise.
# Rows where player1 matches neither name get NaN instead of silently
# shortening the result (which made the column assignment fail).
p1_is_winner = df['player1'] == df['winner_name']
p1_is_loser = df['player1'] == df['loser_name']

df['P1Rank'] = np.select(
    [p1_is_winner, p1_is_loser],
    [df['winner_rank'], df['loser_rank']],
    default=np.nan,
)
df['P2Rank'] = np.select(
    [p1_is_winner, p1_is_loser],
    [df['loser_rank'], df['winner_rank']],
    default=np.nan,
)

Making total elapsed time in seconds

In [None]:
# Seconds played in the match *before* each point: the per-match cumulative sum
# of Point_lenght_sec, shifted down by one point so the first point of every
# match starts at 0.
# Computed per match_id and aligned on the row index, so the result stays
# correct even if the rows of a match are not stored contiguously.
elapsed_incl_point = df.groupby('match_id', sort=False)['Point_lenght_sec'].cumsum()
df['Total_time'] = (
    elapsed_incl_point
    .groupby(df['match_id'], sort=False)
    .shift(1, fill_value=0)
    .astype(float)
)

Creating a tiebreak variable

Columns to be shifted

In [None]:
# Pairs of (player-1 column, player-2 column) whose values are exchanged on the
# rows where the swap is applied, so that the "P1" side always describes the
# same role. Every column named here must be present in `df` when the swap runs.
to_be_shifted=[
    ('P1GamesWon','P2GamesWon'),
    ('P1Score','P2Score'),
    ('P1PointsWon','P2PointsWon'),
    ('P1_ServeWidth_B_A','P2_ServeWidth_B_A'),
    ('P1_ServeWidth_BC_A','P2_ServeWidth_BC_A'),
    ('P1_ServeWidth_BW_A','P2_ServeWidth_BW_A'),
    ('P1_ServeWidth_C_A','P2_ServeWidth_C_A'),
    ('P1_ServeWidth_W_A','P2_ServeWidth_W_A'),
    ('P1_ServeDepth_CTL_A','P2_ServeDepth_CTL_A'),
    ('P1_ServeDepth_NCTL_A','P2_ServeDepth_NCTL_A'),
    ('P1_ReturnDepth_D_A','P2_ReturnDepth_D_A'),
    ('P1_ReturnDepth_ND_A','P2_ReturnDepth_ND_A'),
    ('P1AceA','P2AceA'),
    ('P1WinnerA','P2WinnerA'),
    ('P1DoubleFaultA','P2DoubleFaultA'),
    ('P1UnfErrA','P2UnfErrA'),
    ('P1NetPointA','P2NetPointA'),
    ('P1NetPointWonA','P2NetPointWonA'),
    ('P1BreakPointA','P2BreakPointA'),
    ('P1BreakPointWonA','P2BreakPointWonA'),
    ('P1BreakPointMissedA','P2BreakPointMissedA'),
    # Was ('P1DistanceRunA','P1DistanceRunA'): a column paired with itself,
    # which left both distance columns unswapped.
    ('P1DistanceRunA','P2DistanceRunA'),
    ('P1SetsWon','P2SetsWon'),
    ('player1','player2'),
    ('P1NetPoint','P2NetPoint'),
    ('P1Rank','P2Rank')
    ]

Recoding PointServer, PointWinner, GameWinnerA and SetWinnerA to binary: 0 = player 1, 1 = player 2

In [None]:
# Recode the player indicator columns from {1, 2} to {0, 1}.
# A value other than 1 or 2 raises KeyError rather than being passed through.
Server = {1: 0, 2: 1}
for indicator_column in ('PointServer', 'PointWinner', 'GameWinnerA', 'SetWinnerA'):
    df[indicator_column] = [Server[code] for code in df[indicator_column]]

Surface attribute: 0 = US Open (hard court), 1 = Wimbledon (grass)

In [None]:
# The 6th character of match_id is "u" for ids such as "2020-usopen-1228";
# those rows get 0 and every other match gets 1.
surface = [0 if match_code[5] == "u" else 1 for match_code in df.match_id]
df['Surface'] = surface

TieBreak

In [None]:
# Flag the points that belong to a tiebreak game (1) versus a regular game (0).
#
# Rules encoded below, per row:
#   * Surface 0: any game number above 12.
#   * Surface 1: game number 25 when the season is after 2018
#     (presumably the 12-12 final-set tiebreak - confirm), or
#     any game number above 12 in sets 1-4.
#
# The season is read from the `year` column when it exists; otherwise it is
# taken from the first four characters of match_id (ids look like
# "2020-usopen-1228"), so this cell does not depend on `year` surviving the
# column drop at load time.
if 'year' in df.columns:
    season = df['year']
else:
    season = df['match_id'].str[:4].astype(int)

game_no = df['GameNo']
surface_code = df['Surface']

is_tiebreak = (
    ((game_no > 12) & (surface_code == 0))
    | ((game_no == 25) & (surface_code == 1) & (season > 2018))
    | ((game_no > 12) & (surface_code == 1) & (df['SetNo'] < 5))
)
df['Tiebreak'] = is_tiebreak.astype(int)

Editing point number mistakes in data

In [None]:
# Fill a missing PointNumber with (next row's PointNumber - 1).
# Written with a single .loc assignment: the previous chained form
# `df['PointNumber'].iloc[i] = ...` may write to a temporary copy and leave
# `df` unchanged, and it indexed past the end when the last row was missing.
# A missing value on the last row (no successor) simply stays missing.
missing_point_number = df['PointNumber'].isna()
df.loc[missing_point_number, 'PointNumber'] = (
    df['PointNumber'].shift(-1)[missing_point_number] - 1
)

Editing NA value for loser rank in a single match

Making unique time stamps for what point number in the game it is

In [None]:
# Number the points inside each game: 1 for the first point, 2 for the second, ...
# A new game starts whenever match_id, SetNo or GameNo differs from the previous
# row. Including match_id and SetNo means two consecutive games that happen to
# share a GameNo across a set or match boundary are no longer merged, and the
# first row is no longer compared with the last row of the frame.
game_key = df[['match_id', 'SetNo', 'GameNo']]
starts_new_game = game_key.ne(game_key.shift()).any(axis=1)
df['Game_timestamp'] = df.groupby(starts_new_game.cumsum()).cumcount() + 1

Attributes for predictions

In [None]:
# Feature columns fed to the game-winner models (order matters: it is the column
# order of the feature matrices). 'PointServer' and 'Point_lenght_sec' are left out.
Game_test_columns = [
    # match / game state
    'SetNo', 'P1GamesWon', 'P2GamesWon', 'GameNo', 'PointNumber', 'Tiebreak',
    'P1Score', 'P2Score', 'P1PointsWon', 'P2PointsWon',
    # accumulated serve / return placement, player 1
    'P1_ServeWidth_B_A', 'P1_ServeWidth_BC_A', 'P1_ServeWidth_BW_A',
    'P1_ServeWidth_C_A', 'P1_ServeWidth_W_A',
    'P1_ServeDepth_CTL_A', 'P1_ServeDepth_NCTL_A',
    'P1_ReturnDepth_D_A', 'P1_ReturnDepth_ND_A',
    # accumulated serve / return placement, player 2
    'P2_ServeWidth_B_A', 'P2_ServeWidth_BC_A', 'P2_ServeWidth_BW_A',
    'P2_ServeWidth_C_A', 'P2_ServeWidth_W_A',
    'P2_ServeDepth_CTL_A', 'P2_ServeDepth_NCTL_A',
    'P2_ReturnDepth_D_A', 'P2_ReturnDepth_ND_A',
    # accumulated point-outcome counters
    'P1AceA', 'P2AceA',
    'P1WinnerA', 'P2WinnerA',
    'P1DoubleFaultA', 'P2DoubleFaultA',
    'P1UnfErrA', 'P2UnfErrA',
    'P1NetPointA', 'P2NetPointA',
    'P1NetPointWonA', 'P2NetPointWonA',
    'P1BreakPointA', 'P2BreakPointA',
    'P1BreakPointWonA', 'P2BreakPointWonA',
    'P1BreakPointMissedA', 'P2BreakPointMissedA',
    'P1DistanceRunA', 'P2DistanceRunA',
    'RallyCountA',
    # match context
    'P1SetsWon', 'P2SetsWon',
    'Game_timestamp',
    'P1Rank', 'P2Rank',
    'Total_time',
    'Surface',
]

# Data sanity check: print every row position where P2GamesWon decreases on the
# next row without resetting to 0 (a reset to 0 is expected when a new set starts).
# Runs on `df` itself: `df2` is only created further down (as a copy of `df`),
# so referencing it here failed on a fresh kernel.
p2_games = df['P2GamesWon'].to_numpy()
suspect_rows = np.flatnonzero((p2_games[:-1] > p2_games[1:]) & (p2_games[1:] != 0))
k = len(suspect_rows)  # number of suspicious transitions found
for row_position in suspect_rows:
    print(row_position)

In [None]:
#df2[57924:57928]  There is an error in P2GamesWon here

Shifting columns

In [None]:
# Re-express every row from the server's point of view.
# On rows where PointServer == 1 the P1/P2 column pairs listed in
# `to_be_shifted` are exchanged and the binary winner indicators are flipped
# (0 <-> 1). `df2` keeps the pre-swap values so both halves of each exchange
# read the original data.
# Done with boolean-mask assignments instead of a per-row, per-pair scalar loop.
df2 = df.copy()
swap_rows = df['PointServer'] == 1

for first_col, second_col in to_be_shifted:
    df.loc[swap_rows, first_col] = df2.loc[swap_rows, second_col]
    df.loc[swap_rows, second_col] = df2.loc[swap_rows, first_col]

for outcome_col in ('PointWinner', 'GameWinnerA', 'SetWinnerA'):
    # Only 0 and 1 are exchanged; any other value is left untouched.
    df.loc[swap_rows, outcome_col] = df2.loc[swap_rows, outcome_col].replace({0: 1, 1: 0})
            
    

# Functions

In [None]:
def metrics(Y_test,y_pred,x_val,model):
    """Score one set of predictions.

    Parameters
    ----------
    Y_test : true labels.
    y_pred : predicted labels for the same rows.
    x_val : feature matrix for the same rows, passed to ``model.predict_proba``.
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    list
        ``[accuracy, f1, precision, recall, roc_auc]``, each rounded to 4 decimals.
        The ROC-AUC is computed from the probability in column 1 of
        ``predict_proba``.
    """
    accuracy = round(accuracy_score(Y_test, y_pred), 4)
    f_score = round(f1_score(Y_test, y_pred), 4)
    P_score = round(precision_score(Y_test, y_pred), 4)
    recall = round(recall_score(Y_test, y_pred), 4)

    # Probability assigned to the class in column 1 drives the ROC-AUC.
    positive_probs = model.predict_proba(x_val)[:, 1]
    roc_auc = round(roc_auc_score(Y_test, positive_probs), 4)
    return [accuracy, f_score, P_score, recall, roc_auc]
def averages1(name,metrics):
    """Print the column-wise mean (rounded to 3 decimals) of a list of metric rows.

    Each row is expected to be ordered as
    [accuracy, f1, precision, recall, roc_auc]; ``name`` prefixes the line.
    Returns None.
    """
    res = []
    for column in zip(*metrics):
        res.append(round(sum(column) / len(column), 3))
    print(f"{name} Accuracy score: {res[0]*100}, f1 score: {res[1]}, Precision: {res[2]}, Recall: {res[3]}, ROC-AUC: {res[4]}")

def averages(metrics):
    """Return the column-wise mean of a list of equally ordered metric rows.

    Each mean is rounded to 3 decimals. An empty input yields an empty list.
    """
    column_means = []
    for column in zip(*metrics):
        column_means.append(round(sum(column) / len(column), 3))
    return column_means

def data_split(df, seed=None):
    """Split a point-level frame into training and validation rows by match.

    A tenth of the distinct ``match_id`` values (rounded with ``round``) is drawn
    at random; every row of those matches goes to the validation frame and the
    rest to the training frame, so no match is shared between the two.

    Parameters
    ----------
    df : DataFrame with a ``match_id`` column.
    seed : optional int. When given, the draw uses a private
        ``random.Random(seed)`` and is reproducible. When None (default) the
        module-level ``random`` state is used, exactly as before.

    Returns
    -------
    (train_df, validation_df)
    """
    matches = list(df.match_id.unique())
    sampler = random if seed is None else random.Random(seed)
    Val_games = sampler.sample(matches, round(len(matches)/10))
    in_validation = df['match_id'].isin(Val_games)
    return df[~in_validation], df[in_validation]


def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of the features whose importance exceeds 0.01.

    Parameters
    ----------
    importance : sequence of importance values, one per feature.
    names : sequence of feature names in the same order.
    model_type : text placed in front of ' FEATURE IMPORTANCE' in the title.

    A new 8x6 figure is created and left open as the current figure, so the
    caller can still change the title or save it.
    """
    # One row per feature, most important first, near-zero importances removed.
    fi_df = pd.DataFrame({'feature_names': np.array(names), 'feature_importance': importance})
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False)
    fi_df = fi_df[fi_df.feature_importance > 0.01]

    plt.figure(figsize=(8,6))
    plt.tight_layout()
    plt.yticks(fontsize=7)

    ax = sns.barplot(y='feature_names', x='feature_importance', data=fi_df, palette="rocket")
    ax.set_xlabel('feature_importance')

    # Final labels (the x label set here replaces the one set on `ax` above).
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('Feature importance (Gain) > 0')
    plt.ylabel('FEATURE NAMES')


def evaluation_time(y_val,y_pred,model,x_val):
    """Plot and print evaluation results for one fitted classifier.

    Shows a confusion-matrix heatmap, prints accuracy, per-class F1 / recall /
    precision and the ROC-AUC, then plots the ROC curve against a constant
    (all-zero score) baseline.

    Parameters
    ----------
    y_val : true labels.
    y_pred : predicted labels for the same rows. These are now actually used;
        the previous body read a global ``val_pred`` instead.
    model : fitted classifier exposing ``predict_proba``.
    x_val : feature matrix matching ``y_val``.
    """
    plt.style.use('ggplot')
    cm = confusion_matrix(y_val, y_pred)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g', cmap='rocket')  # annot=True writes the counts into the cells

    # labels, title and ticks
    ax.set_xlabel('Predicted Winner')
    ax.set_ylabel('Actual Winner')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['Server', 'Returner'])
    ax.yaxis.set_ticklabels(['Server', 'Returner'], rotation=360)
    plt.show()

    # average=None returns one score per class instead of a single number
    f_score = f1_score(y_val, y_pred, average=None)
    P_score = precision_score(y_val, y_pred, average=None)
    recall = recall_score(y_val, y_pred, average=None)

    ns_probs = [0 for _ in range(len(y_val))]  # constant score = baseline diagonal
    lr_probs = model.predict_proba(x_val)[:, 1]
    lr_auc = roc_auc_score(y_val, lr_probs)

    # summarize scores
    print(f"Test set accuracy score {(np.round(accuracy_score(y_pred,y_val),3)*100)}%")
    print(f"F1 score {np.round(f_score,3)}")
    print(f"Recall score {np.round(recall,3)}")
    print(f"Precision score {np.round(P_score,3)}")
    print('Model: ROC AUC=%.3f' % (lr_auc))

    # ROC curves: baseline vs. model
    ns_fpr, ns_tpr, _ = roc_curve(y_val, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(y_val, lr_probs)
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Base')
    plt.plot(lr_fpr, lr_tpr, marker='.', label='XGBoost')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

def evaluation(y_val,y_pred,model_xg,x_val):
    """Plot and print evaluation results plus the training curves of an XGBoost model.

    Shows a confusion-matrix heatmap, prints accuracy, per-class F1 / recall /
    precision and the ROC-AUC, plots the ROC curve against a constant baseline,
    and finally plots log loss and classification error per boosting round.

    Parameters
    ----------
    y_val : true labels.
    y_pred : predicted labels for the same rows. The confusion matrix, the
        per-class scores and the printed accuracy are now computed from this
        argument; the previous body read the globals ``val_pred`` and
        ``val_scoreXG`` instead.
    model_xg : fitted model exposing ``predict_proba`` and ``evals_result()``.
        ``evals_result()`` must contain 'validation_0' and 'validation_1'
        entries, each with 'error' and 'logloss' series (i.e. the model must
        have been fitted with two evaluation sets and both metrics).
    x_val : feature matrix matching ``y_val``.
    """
    plt.style.use('ggplot')
    cm = confusion_matrix(y_val, y_pred)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g', cmap='rocket')  # annot=True writes the counts into the cells

    ax.set_xlabel('Predicted Winner')
    ax.set_ylabel('Actual Winner')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['Server', 'Returner'])
    ax.yaxis.set_ticklabels(['Server', 'Returner'], rotation=360)
    plt.show()

    # average=None returns one score per class instead of a single number
    f_score = f1_score(y_val, y_pred, average=None)
    P_score = precision_score(y_val, y_pred, average=None)
    recall = recall_score(y_val, y_pred, average=None)
    accuracy = accuracy_score(y_val, y_pred)

    ns_probs = [0 for _ in range(len(y_val))]  # constant score = baseline diagonal
    lr_probs = model_xg.predict_proba(x_val)[:, 1]
    lr_auc = roc_auc_score(y_val, lr_probs)

    # summarize scores
    print(f"Test set accuracy score {np.round(accuracy*100,3)}%")
    print(f"F1 score {np.round(f_score,3)}")
    print(f"Recall score {np.round(recall,3)}")
    print(f"Precision score {np.round(P_score,3)}")
    print('Xgboost: ROC AUC=%.3f' % (lr_auc))

    # ROC curves: baseline vs. model
    ns_fpr, ns_tpr, _ = roc_curve(y_val, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(y_val, lr_probs)
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Base')
    plt.plot(lr_fpr, lr_tpr, marker='.', label='XGBoost')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

    # Per-round training history recorded by the model during fit.
    results = model_xg.evals_result()
    epochs = len(results['validation_0']['error'])
    x_axis = range(0, epochs)

    # plot log loss
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
    ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
    ax.legend()
    plt.ylabel('Log Loss')
    plt.title('XGBoost Log Loss')
    plt.show()

    # plot classification error
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0']['error'], label='Train')
    ax.plot(x_axis, results['validation_1']['error'], label='Test')
    ax.legend()
    plt.ylabel('Classification Error')
    plt.title('XGBoost Classification Error')
    plt.show()

def errors(values):
    """Return the standard error of the mean of each group, multiplied by 100.

    ``values`` is an indexable collection of numeric sequences; the result has
    one entry per group, in the same order.
    """
    return [sem(values[position]) * 100 for position in range(len(values))]


def timer(start_time=None):
    """Two-phase stopwatch.

    Called without an argument it returns the current ``datetime``. Called with
    that value it prints the time elapsed since then as hours, minutes and
    seconds, and returns None.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, remainder = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
def sample(x,y):
    """Oversample with SMOTE when the first class outnumbers the second by more than 4:1.

    The imbalance ratio is count(first sorted label) / count(second sorted label).
    When the ratio is at most 4, or when ``y`` holds fewer than two distinct
    labels, the inputs are returned unchanged. Otherwise SMOTE is applied with
    ``sampling_strategy=0.5``.

    Fixes over the previous body: a ratio of exactly 4 returned None, a
    single-class ``y`` raised IndexError, and ``SMOTE`` was never imported.

    Returns
    -------
    (x, y) : the original or the resampled features and labels.
    """
    _, class_counts = np.unique(y, return_counts=True)
    if len(class_counts) < 2:
        return x, y

    imbalance_ratio = class_counts[0] / class_counts[1]
    if imbalance_ratio <= 4:
        return x, y

    # Imported here because only this branch needs it.
    from imblearn.over_sampling import SMOTE
    oversample = SMOTE(sampling_strategy=0.5)
    x, y = oversample.fit_resample(x, y)
    return x, y

Creating Train and test set data

In [None]:
# Keep a copy of the full prepared frame: later cells call data_split(df_copy)
# to draw fresh random train/validation splits from it.
df_copy=df.copy()
# From here on `df` holds only the training matches and `df_val` the held-out matches.
df,df_val=data_split(df)

# GameWinner

Define timestamps and show data distribution

In [None]:
from collections import Counter

# Sorted distinct point-in-game indices present in the training frame.
time_stamps = np.unique(df['Game_timestamp'])

# A drop in Game_timestamp from one row to the next marks the last point of a
# game, so the value just before the drop is that game's number of points.
point_index = df.Game_timestamp.to_numpy()
l = list(point_index[:-1][point_index[:-1] > point_index[1:]])

c = Counter(l)
plt.bar(c.keys(), c.values())
plt.title('Number of points in games ')
plt.xlabel('Number of points in game')
plt.ylabel('Number of games')
plt.show()

# l2[k] = number of rows with Game_timestamp == k + 1 (the largest timestamp is excluded).
l2 = [len(df[df.Game_timestamp == stamp]) for stamp in range(1, max(df.Game_timestamp))]
print(l2)
plt.plot(l2)
plt.title('Number of rows for each time stamp (Points/Games) ')
plt.xlabel('Points in game')
plt.ylabel('Number of rows')

### Initial GameWinner Model - UNTUNED

In [None]:
# Model for timestamps
#
# For every point-in-game index ("timestamp") with more than 300 rows, compare
# Random Forest, AdaBoost, XGBoost and Logistic Regression with 10-fold CV on
# the training matches, then fit one XGBoost model with fixed hyperparameters
# and score it on the held-out matches. Tiebreak points are excluded.
# Results are collected per timestamp and plotted at the end of the cell.

# Fresh random train/validation split by match.
df,df_val=data_split(df_copy)

warnings.filterwarnings('ignore')
# Per-timestamp averaged CV metrics ([accuracy, f1, precision, recall, roc_auc]) per model.
RFC_features=[]
XGB_features=[]
LR_features=[]
# Held-out accuracy of the final XGBoost model, one entry per timestamp.
val=[]
ADA_features=[]
# Per-timestamp mean CV accuracy in % as [RF, XGB, LR, ADA].
Model_performances=[]
# Feature importances of the final XGBoost model, one array per timestamp.
FI_XGB_final=[]

# NOTE(review): time_stamps[19] is used as the loop bound, i.e. the 20th
# distinct timestamp value - presumably 20; confirm.
for i in range(0,time_stamps[19]):
    
    # NOTE(review): l2 was computed in an earlier cell from a different random
    # split than the one drawn above, so the row counts are only approximate here.
    if l2[i] > 300:
        y=df[(df.Game_timestamp==i+1) & (df.Tiebreak == 0) ]['GameWinnerA'].values
        x=df[(df.Game_timestamp==i+1) & (df.Tiebreak == 0)][Game_test_columns].values
        y_val=df_val[(df_val.Game_timestamp==i+1)&(df_val.Tiebreak == 0)]['GameWinnerA'].values
        x_val=df_val[(df_val.Game_timestamp==i+1)&(df_val.Tiebreak == 0)][Game_test_columns].values
        #x, y = sample(x,y)
        labels=Game_test_columns
        # The scaler is fitted on all training rows of this timestamp, i.e.
        # before the CV folds are formed, so each test fold contributes to the
        # scaling statistics.
        scaler = StandardScaler()
        x=scaler.fit_transform(x)
        x_val=scaler.transform(x_val)

        folds = 10
        kf = KFold(n_splits=folds, random_state=42, shuffle=True)
        # Per-fold accuracies.
        RFCScore = []
        ADAScore = []
        XGBScore = []
        LRScore=[]
        # Per-fold metric rows as returned by metrics() (despite the FI_ prefix
        # these are not feature importances).
        FI_rfc = []
        FI_ada = []
        FI_XGB = []
        FI_LR = []
        for train_index, test_index in kf.split(x):
            # Reset on every fold; only the list from the last fold is filled
            # (after the loop) and appended to Model_performances.
            m=[]

            X_train, X_test = x[train_index], x[test_index]
            Y_train, Y_test = y[train_index], y[test_index]


        #RF

            RFC = RandomForestClassifier(max_depth=6, random_state=0)
            predRFC = RFC.fit(X_train,Y_train)
            y_predRFC = predRFC.predict(X_test)
            scoreRFC = accuracy_score(y_predRFC, Y_test)
            RFCScore.append(scoreRFC)
            FI_rfc.append(metrics(Y_test,y_predRFC,X_test,RFC))

        #ADA

            # NOTE(review): newer scikit-learn releases name this keyword
            # `estimator` instead of `base_estimator` - confirm against the
            # installed version.
            ada = AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=2, random_state=0))
            predADA = ada.fit(X_train, Y_train)
            y_predADA = predADA.predict(X_test)
            scoreADA = accuracy_score(y_predADA, Y_test)
            ADAScore.append(scoreADA)
            FI_ada.append(metrics(Y_test,y_predADA,X_test,ada))


        #XG
            model_xg = XGBClassifier(objective ='binary:logistic',)
            model_xg.fit(X_train, Y_train);
            y_pred_xg = model_xg.predict(X_test);
            scoreXG = accuracy_score(y_pred_xg, Y_test)
            XGBScore.append(scoreXG)
            FI_XGB.append(metrics(Y_test,y_pred_xg,X_test,model_xg))
            
        
        #LR
            logreg = LogisticRegression()
            logreg.fit(X_train, Y_train)
            y_pred_lr=logreg.predict(X_test)
            scoreLR=accuracy_score(y_pred_lr, Y_test)
            LRScore.append(scoreLR)
            FI_LR.append(metrics(Y_test,y_pred_lr,X_test,logreg))
        
        # Final XGBoost model for this timestamp: trained on an 80/20 split of
        # the training matches, then scored on the held-out matches.
        X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.20, random_state=42)
        model_xg = XGBClassifier(objective ='binary:logistic',min_child_weight=5,gamma=0.2,colsample_bytree=0.9,
                                     max_depth=6,learning_rate = 0.01,scale_pos_weight=1.3,verbosity=0,use_label_encoder=False)
        model_xg.fit(X_train, y_train);
        y_pred_xg = model_xg.predict(X_test);
        scoreXG = accuracy_score(y_pred_xg, y_test)
        #print(f"Model training score {round(scoreXG,2)}")
        val_pred=model_xg.predict(x_val)
        val_scoreXG = accuracy_score(val_pred, y_val)
        #print(f"Validation set score {round(val_scoreXG,3)}")
        val.append(val_scoreXG)
        FI_XGB_final.append(model_xg.feature_importances_)
        print(f"Predicted dist: {np.unique(val_pred, return_counts=True)}")
        print(f"Actual Dist: {np.unique(y_val,return_counts=True)}")


    #i+=1
    #print(f"Processing fold {i}")
        # Average the per-fold metric rows into one row per model for this timestamp.
        XGB_features.append(averages(FI_XGB))
        RFC_features.append(averages(FI_rfc))
        ADA_features.append(averages(FI_ada))
        LR_features.append(averages(FI_LR))
        #rf_scores.append(np.mean(RFCScore)*100)
        #lr_scores.append(np.mean(LRScore)*100)
        #xg_scores.append(np.mean(XGBScore)*100)
        # Order matters: the plot below reads index 0=RF, 1=XGB, 2=LR, 3=ADA.
        m.append(np.mean(RFCScore)*100)
        m.append(np.mean(XGBScore)*100)
        m.append(np.mean(LRScore)*100)
        m.append(np.mean(ADAScore)*100)
        
        Model_performances.append(m)
    
        print(f"\nModel Accuracy for timestamp {i+1} ")
        #print(f"Random Forest: {np.round(np.mean(RFCScore)*100,2)}% ---- XGboost: {np.round(np.mean(XGBScore)*100,2)}% .. XG_Test: {np.round(val_scoreXG*100,2)}% ---- Logistic Regression: {np.round(np.mean(LRScore)*100,2)}% ----  ")
        #---- Adaboost: {np.round(np.mean(ADAScore)*100,2)}
        #evaluation_time(y_val,val_pred,model_xg,x_val)
warnings.filterwarnings('default')
# Mean CV accuracy per timestamp for each model, against a fixed baseline.
plt.figure(figsize=(15,8))
plt.plot([item[3] for item in Model_performances],label=('Adaboost'),color="red")
#plt.plot(np.array(val)*100,label=('XG_test'))
plt.plot([item[2] for item in Model_performances],label=('Logistic regression'),color="Purple")
plt.plot([item[1] for item in Model_performances],label=('XGboost'),color="yellow")
plt.plot([item[0] for item in Model_performances],label=('Random Forest'),color="blue")
# NOTE(review): 80.5 is a hard-coded baseline accuracy; its origin is not shown in this notebook.
plt.plot(np.full(len(Model_performances), 80.5),label=("BaseLine"),linestyle=("dashed"),color="black")
plt.title('Model Accuracy for each time stamp')
# NOTE(review): tick positions assume exactly 14 modelled timestamps.
plt.xticks(range(0,14),np.arange(1,14+1))
plt.grid(True)
plt.legend(loc="lower left")
plt.xlabel('Time Stamps (Point number in game)')
plt.ylabel('Accuracy score [%]');
#print(f"\nLogistic regression: {round(np.mean([item[2] for item in Model_performances]),2)}\nXGboost {round(np.mean([item[1] for item in Model_performances]),2)}\nRandom Forest {round(np.mean([item[0] for item in Model_performances]),2)}, \nAdaboost: {round(np.mean([item[3] for item in Model_performances]),2)}")
# Metrics averaged over all modelled timestamps, one line per model.
averages1("XGB",XGB_features)
averages1("RFC",RFC_features)
averages1("LR",LR_features)
averages1("ADA",ADA_features)
print(f"XG_Test {round(np.mean(np.array(val)*100),2)}")
plt.savefig("Gamewinner_time_untuned",bbox_inches='tight',dpi=300)

In [None]:
# Display the per-timestamp averaged XGBoost CV metrics
# (one [accuracy, f1, precision, recall, roc_auc] row per modelled timestamp).
XGB_features

In [None]:
# One feature-importance figure per modelled timestamp, saved to its own file.
for stamp_number, importances in enumerate(FI_XGB_final, start=1):
    plot_feature_importance(importances, labels, "XG")
    plt.title(f"Timestamp {stamp_number}")
    plt.savefig(f"Game_winner_feature{stamp_number}", bbox_inches='tight', dpi=300)

# Hyperparameter tuning

# Randomized hyperparameter search for XGBoost, run separately for each
# modelled timestamp; the best parameter set per timestamp is written to a JSON file.
# NOTE(review): `params` (the search space) is not defined in the visible part
# of this notebook - confirm it is created before this cell runs.
# NOTE(review): unlike the training cells, the rows are not filtered on
# Tiebreak == 0 here - confirm this is intended.
xgboost_hyper_time=[]
for i in range(len(Model_performances)):
    X_train, X_test, y_train, y_test = train_test_split(df[df.Game_timestamp==i+1][Game_test_columns].values,df[df.Game_timestamp==i+1]['GameWinnerA'].values, test_size=0.20, random_state=42)

    # Scaler is fitted on the training part only and applied to both parts.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test) 
    classifier=XGBClassifier(objective='binary:logistic')
    #grid_search=GridSearchCV(classifier,param_grid=params,scoring='accuracy',n_jobs=-1,cv=10)

    #start_time = timer(None)
    #grid_search.fit(X_train,y_train)
    #timer(start_time) # timing ends here for "start_time" variable
    
    # 50 sampled parameter settings, each scored by 10-fold CV accuracy.
    random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=50,scoring='accuracy',n_jobs=-1,cv=10,verbose=3)
    start_time = timer(None)
    random_search.fit(X_train,y_train)
    timer(start_time) # timing ends here for "start_time" variable
    
    
    xgboost_hyper_time.append(random_search.best_params_)
# The file name (spelled 'Gird_...') must match the one read back by the loading cell.
with open('Gird_search_game_time', 'w') as fout:
    json.dump(xgboost_hyper_time, fout)
    

In [None]:
# Read back the per-timestamp best hyperparameters saved by the random search
# (a JSON list with one parameter dict per searched timestamp).
with open('Gird_search_game_time') as json_file:
    params_game = json.load(json_file)

# Draw a fresh random train/validation split by match.
df,df_val=data_split(df_copy)

# GameWinner Timestamp Tuned

In [None]:
# Tuned XGBoost per point-in-game index ("timestamp").
#
# For every timestamp with more than 300 rows, the model is trained and scored
# on 10 independent random match-level splits of `df_copy`. The spread of the
# 10 held-out accuracies feeds the error band in the plot below.
#
# Fixes compared with the previous version of this cell:
#   * the tuned hyperparameters are now really applied: they are unpacked as
#     keyword arguments instead of being passed as a single `params=` argument,
#     which XGBClassifier does not interpret as hyperparameters;
#   * the baseline line is drawn with one value per modelled timestamp
#     (`len(val)`); it used `len(Model_performances)`, which is always 0 here;
#   * the cross-validation loop whose body did nothing was removed.
warnings.filterwarnings('ignore')
RFC_features=[]
XGB_features=[]
LR_features=[]
val=[]                 # held-out accuracy of the last repetition, per timestamp
ADA_features=[]
Model_performances=[]
FI_XGB_final=[]        # feature importances of the last repetition's model, per timestamp
error_xg=[]            # the 10 held-out accuracies, per timestamp

for i in range(0,time_stamps[19]):

    if l2[i] > 300:
        labels=Game_test_columns

        # Fit the scaler once per timestamp on the current training matches;
        # every repetition below reuses it.
        scaler_rows=(df.Game_timestamp==i+1) & (df.Tiebreak == 0)
        scaler = StandardScaler()
        scaler.fit(df[scaler_rows][Game_test_columns].values)

        # NOTE(review): params_game is indexed by i, which assumes the random
        # search covered the same timestamps in the same order - confirm.
        tuned_params={'objective': 'binary:logistic', **params_game[i]}

        error_fill=[]
        for j in range(10):
            df_xg,df_val_xg=data_split(df_copy)
            train_rows=(df_xg.Game_timestamp==i+1)&(df_xg.Tiebreak == 0)
            val_rows=(df_val_xg.Game_timestamp==i+1)&(df_val_xg.Tiebreak == 0)
            y=df_xg[train_rows]['GameWinnerA'].values
            x=df_xg[train_rows][Game_test_columns].values
            y_val=df_val_xg[val_rows]['GameWinnerA'].values
            x_val=df_val_xg[val_rows][Game_test_columns].values
            #x, y = sample(x,y)
            x=scaler.transform(x)
            x_val=scaler.transform(x_val)
            X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.20, random_state=42)
            model_xg = XGBClassifier(**tuned_params)
            model_xg.fit(X_train, y_train);
            val_pred=model_xg.predict(x_val)
            val_scoreXG = accuracy_score(val_pred, y_val)
            error_fill.append(val_scoreXG)

        error_xg.append(error_fill)
        # NOTE(review): the plotted centre line is the LAST repetition's score,
        # while the band is the standard error over all 10 - consider the mean.
        val.append(val_scoreXG)
        FI_XGB_final.append(model_xg.feature_importances_)
        print(f"Predicted dist: {np.unique(val_pred, return_counts=True)}")
        print(f"Actual Dist: {np.unique(y_val,return_counts=True)}")

        evaluation_time(y_val,val_pred,model_xg,x_val)
warnings.filterwarnings('default')

# Held-out accuracy per timestamp with a +/- standard-error band.
plt.figure(figsize=(15,8))
error=errors(error_xg)
plt.plot(np.array(val)*100,label=('XG_test'))
plt.fill_between(np.arange(0,len(error)), np.array(val)*100-error, np.array(val)*100+error,alpha=0.5)
plt.plot(np.full(len(val), 80.5),label=("BaseLine"),linestyle=("dashed"),color="black")
plt.title('Model Accuracy for each Game time_stamp')
plt.xticks(range(len(error)),np.arange(1,len(error)+1,1))
plt.grid(True)
plt.legend(loc="lower left")
plt.xlabel('Time Stamps (Point number in game)')
plt.ylabel('Accuracy score [%]');
print(f"XG_Test {round(np.mean(np.array(val)*100),2)}")
plt.savefig("Gamewinner_time_tuuned_real",bbox_inches='tight',dpi=300)

In [None]:
# Re-draw the tuned-model accuracy curve with its +/- standard-error band.
# The baseline now has one value per plotted timestamp (`len(val)`) instead of
# a hard-coded 14, so it always spans the same x-range as the accuracy curve.
plt.figure(figsize=(15,8))
#plt.plot([item[3] for item in Model_performances],label=('Adaboost'))
error=errors(error_xg)
plt.plot(np.array(val)*100,label=('XG_test'))
plt.fill_between(np.arange(0,len(error)), np.array(val)*100-error, np.array(val)*100+error,alpha=0.5)
plt.plot(np.full(len(val), 80.5),label=("BaseLine"),linestyle=("dashed"),color="black")
plt.title('Model Accuracy for each Game time_stamp')
plt.xticks(range(len(error)),np.arange(1,len(error)+1,1))
plt.grid(True)
plt.legend(loc="lower left")
plt.xlabel('Time Stamps (Point number in game)')
plt.ylabel('Accuracy score [%]');
print(f"XG_Test {round(np.mean(np.array(val)*100),2)}")
#\n---Average Accuracy--- \nAdaboost: {round(np.mean([item[3] for item in Model_performances]),2)}
plt.savefig("Gamewinner_time_tuuned_real",bbox_inches='tight',dpi=300)

In [None]:
# One feature-importance figure per modelled timestamp, saved to its own file.
for stamp_number, importances in enumerate(FI_XGB_final, start=1):
    plot_feature_importance(importances, labels, "XG")
    plt.title(f"Timestamp {stamp_number}")
    plt.savefig(f"Game_winner_feature{stamp_number}", bbox_inches='tight', dpi=300)

# Below: models for timestamp 4 + the overall model

In [None]:
# Training rows for the 4th point of a game.
df_time = df.loc[df['Game_timestamp'].eq(4)]

In [None]:
# Rows at the score 15-40 (P1 score first), for the training and validation matches.
df_1 = df.loc[(df.P1Score == 15) & (df.P2Score == 40)]
df_1_val = df_val.loc[(df_val.P1Score == 15) & (df_val.P2Score == 40)]

In [None]:
# Rows at the score 30-30, for the training and validation matches.
df_2 = df.loc[(df.P1Score == 30) & (df.P2Score == 30)]
df_2_val = df_val.loc[(df_val.P1Score == 30) & (df_val.P2Score == 30)]

In [None]:
# Rows at the score 40-15 (P1 score first), for the training and validation matches.
df_3 = df.loc[(df.P1Score == 40) & (df.P2Score == 15)]
df_3_val = df_val.loc[(df_val.P1Score == 40) & (df_val.P2Score == 15)]

In [None]:
# Parallel lists: training subset, validation subset and display label per score.
dfs = [
    df_1,
    df_2,
    df_3,
]
dfs_val = [
    df_1_val,
    df_2_val,
    df_3_val,
]
scores = [
    "15-40",
    "30-30",
    "40-15",
]

In [None]:
baselines=[]
for i in range(len(dfs)):
    baselines.append(dfs[i]['PointWinner'].value_counts()[0]/len(dfs[i]))

## Accuracy for 15-40, 30-30, 40-15

In [None]:
# Game-winner models at three fixed scores (15-40, 30-30, 40-15).
# For each score subset: fit one XGBoost model and score it on the held-out
# matches, then run 10-fold CV for Random Forest, XGBoost and Logistic
# Regression on the training matches. Accuracies are plotted against the
# per-score baselines at the end of the cell.
warnings.filterwarnings('ignore')
# Mean CV feature importances per score subset (LR_features stays empty).
RFC_features=[]
XGB_features=[]
LR_features=[]
# Held-out accuracy of the single XGBoost model, one entry per score subset.
val=[]
ADA_features=[]
# Per-subset mean CV accuracy in % as [RF, XGB, LR, ADA].
Model_performances=[]


for i in range(len(dfs)):
    y=dfs[i]['GameWinnerA'].values
    x=dfs[i][Game_test_columns].values
    y_val=dfs_val[i]['GameWinnerA'].values
    x_val=dfs_val[i][Game_test_columns].values
    labels=Game_test_columns
    labels=Game_test_columns
    scaler = StandardScaler()
    x=scaler.fit_transform(x)
    x_val=scaler.transform(x_val)
    folds = 10
    # No random_state: fold assignment differs between runs.
    kf = KFold(n_splits=folds, shuffle=True)
    RFCScore = []
    ADAScore = []
    XGBScore = []
    LRScore=[]
    FI_rfc = []
    FI_ada = []
    FI_XGB = []
    FI_LR = []
    
    
    X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.20, random_state=42)

    # NOTE(review): x was already standardized above; X_train/X_test are scaled
    # a second time here with a new scaler, while x_val was only scaled once.
    # The model below is therefore trained and validated on differently scaled
    # features - confirm whether this second scaling is intended.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)  
    model_xg = XGBClassifier(objective ='binary:logistic', colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6, verbosity=0)
    model_xg.fit(X_train, y_train);
    y_pred_xg = model_xg.predict(X_test);
    scoreXG = accuracy_score(y_pred_xg, y_test)
        #print(f"Model training score {round(scoreXG,2)}")
    val_pred=model_xg.predict(x_val)
    val_scoreXG = accuracy_score(val_pred, y_val)
        #print(f"Validation set score {round(val_scoreXG,3)}")
    val.append(val_scoreXG) 
    
    for train_index, test_index in kf.split(x):
        # Reset on every fold; only the list from the last fold is filled
        # (after the loop) and appended to Model_performances.
        m=[]

        X_train, X_test = x[train_index], x[test_index]
        Y_train, Y_test = y[train_index], y[test_index]
        
        RFC = RandomForestClassifier(max_depth=6, random_state=0)
        predRFC = RFC.fit(X_train,Y_train)
        y_predRFC = predRFC.predict(X_test)
        scoreRFC = accuracy_score(y_predRFC, Y_test)
        RFCScore.append(scoreRFC)
        FI_rfc.append(RFC.feature_importances_)
        
        #XG
        model_xg = XGBClassifier(objective ='binary:logistic', colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 6, verbosity=0)
        model_xg.fit(X_train, Y_train);
        y_pred_xg = model_xg.predict(X_test);
        scoreXG = accuracy_score(y_pred_xg, Y_test)
        XGBScore.append(scoreXG)
        FI_XGB.append(model_xg.feature_importances_)       
        
                #LR
        logreg = LogisticRegression()
        logreg.fit(X_train, Y_train)
        y_pred_lr=logreg.predict(X_test)
        scoreLR=accuracy_score(y_pred_lr, Y_test)
        LRScore.append(scoreLR)
            #FI_LR.append(logreg.feature_importances_)
    
    
    # Mean importance per feature across the folds.
    XGB_features.append(list(map(mean, zip(*FI_XGB))))
    RFC_features.append(list(map(mean, zip(*FI_rfc))))
    # FI_ada is never filled in this cell, so this appends an empty list.
    ADA_features.append(list(map(mean, zip(*FI_ada))))
    #LR_features.append(list(map(mean, zip(*FI_lr))))
    #rf_scores.append(np.mean(RFCScore)*100)
    #lr_scores.append(np.mean(LRScore)*100)
    #xg_scores.append(np.mean(XGBScore)*100)
    # Order matters: the plot below reads index 0=RF, 1=XGB, 2=LR.
    m.append(np.mean(RFCScore)*100)
    m.append(np.mean(XGBScore)*100)
    m.append(np.mean(LRScore)*100)
    # ADAScore is never filled in this cell, so this entry is the mean of an empty list.
    m.append(np.mean(ADAScore)*100)
        
    Model_performances.append(m)
    print(f"\nModel Accuracy for timestamp {scores[i]}                    ")
    print(f"Random Forest: {np.round(np.mean(RFCScore)*100,2)}% ---- XGboost: {np.round(np.mean(XGBScore)*100,2)}% ---- Logistic Regression: {np.round(np.mean(LRScore)*100,2)}% ")
    print(f"XG_val: {round(val_scoreXG,2)*100}%")
        #---- XG: {np.round(np.mean(ADAScore)*100,2)}
warnings.filterwarnings('default')
# Accuracy per score subset for each model, with one baseline line per score.
plt.figure(figsize=(10,5))
#plt.plot([item[3] for item in Model_performances],label=('Adaboost'))
plt.plot(np.array(val)*100,label=('XG_val'))
plt.plot([item[2] for item in Model_performances],label=('Logistic regression'))
plt.plot([item[1] for item in Model_performances],label=('XGboost'))
plt.plot([item[0] for item in Model_performances],label=('Random Forest'))
plt.title('Model Accuracy for each time stamp')
# NOTE(review): axhline's xmin/xmax are presumably axes fractions (0..1) -
# confirm that xmax=2 is intended.
plt.axhline(baselines[0]*100,xmin=0,xmax=2,label=("BaseLine 15-40"),linestyle=("dashed"),color="r")
plt.axhline(baselines[1]*100,xmin=0,xmax=2,label=("BaseLine 30-30"),linestyle=("dashed"), color="b")
plt.axhline(baselines[2]*100,xmin=0,xmax=2,label=("BaseLine 40-15"),linestyle=("dashed"),color="black")
loc=range(len(scores))
plt.xticks(loc, scores)
plt.grid(True)
plt.legend(loc="upper left")
plt.xlabel('Accuracy score at certain score of a game')
plt.ylabel('Accuracy score [%]');
print(f"\n---Average Accuracy--- \nLogistic regression: {round(np.mean([item[2] for item in Model_performances]),2)}\nXGboost {round(np.mean([item[1] for item in Model_performances]),2)}\nRandom Forest {round(np.mean([item[0] for item in Model_performances]),2)}")
print(f"XG_val {round(np.mean(np.array(val)*100),2)}")

In [None]:
# Plot the XGBoost feature importances stored for each game score, and title
# each figure with the score it belongs to.
for i, importances in enumerate(XGB_features):
    plot_feature_importance(importances, labels, "XG")
    plt.title(f"score {scores[i]}")

# One model: GameWinner

In [None]:
# Single-model experiment: predict GameWinnerA from Game_test_columns using
# 10-fold cross-validation of Random Forest, XGBoost and logistic regression.
y=df['GameWinnerA'].values
x=df[Game_test_columns].values
labels=Game_test_columns
warnings.filterwarnings('ignore')

folds = 10
# Seed the shuffle so the fold assignment, and therefore every score reported
# below, is the same on each run.
kf = KFold(n_splits=folds, shuffle=True, random_state=42)
RFCScore = []
adaScore = []
XGBScore = []
LRScore = []
FI_rfc = []
FI_ada = []
FI_XGB = []

for i, (train_index, test_index) in enumerate(kf.split(x), start=1):

    # Fit the scaler on the training fold only and reuse it on the held-out
    # fold, so no statistics from the evaluation rows reach the models.
    scaler = StandardScaler().fit(x[train_index])
    X_train = scaler.transform(x[train_index])
    X_test = scaler.transform(x[test_index])
    Y_train, Y_test = y[train_index], y[test_index]

    #RF
    RFC = RandomForestClassifier(max_depth=6, random_state=0)
    predRFC = RFC.fit(X_train,Y_train)
    y_predRFC = predRFC.predict(X_test)
    scoreRFC = accuracy_score(Y_test, y_predRFC)
    RFCScore.append(scoreRFC)
    FI_rfc.append(RFC.feature_importances_)

    #ADA (disabled)
    #ada = AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=2, random_state=0))
    #predADA = ada.fit(X_train, Y_train)
    #y_predADA = predADA.predict(X_test)
    #scoreADA = accuracy_score(y_predADA, Y_test)
    #adaScore.append(scoreADA)
    #FI_ada.append(ada.feature_importances_)

    #XG
    # Track error / logloss / AUC on both the training and the held-out fold.
    # NOTE(review): newer XGBoost releases expect eval_metric on the constructor
    # rather than on fit() -- confirm against the installed version.
    eval_set = [(X_train, Y_train), (X_test, Y_test)]
    model_xg = XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3,max_depth=6,
                                     learning_rate = 0.1,verbosity=0)
    model_xg.fit(X_train, Y_train, eval_metric=["error", "logloss","auc"],eval_set=eval_set )
    y_pred_xg = model_xg.predict(X_test)
    scoreXG = accuracy_score(Y_test, y_pred_xg)
    XGBScore.append(scoreXG)
    FI_XGB.append(model_xg.feature_importances_)

    #LR
    logreg = LogisticRegression()
    logreg.fit(X_train, Y_train)
    y_pred_lr=logreg.predict(X_test)
    scoreLR=accuracy_score(Y_test, y_pred_lr)
    LRScore.append(scoreLR)
    #FI_LR.append(logreg.feature_importances_)

    print(f"Processing fold {i}")
warnings.filterwarnings('default')

# Record the fold-averaged accuracies (in percent) in the running score lists,
# which are expected to exist already from an earlier cell.
rf_scores.append(np.mean(RFCScore)*100)
xg_scores.append(np.mean(XGBScore)*100)
print("\nModel Accuracy")
print(f"Baseline (Serve) 80.5% ") # How often the player who serves also wins the game
print(f"Random Forrest: {np.round(np.mean(RFCScore)*100,2)}%")
print(f"XGboost: {np.round(np.mean(XGBScore)*100,2)}%")
#print(f"ADAboost: {np.round(np.mean(adaScore)*100,2)}%")
print(f"Logistic Regression: {np.round(np.mean(LRScore)*100,2)}%")

# Feature importances averaged element-wise over the folds.
print("\n Model feature impact > 0.01")
plot_feature_importance(list(map(mean, zip(*FI_rfc))),labels,"Random Forrest")
plot_feature_importance(list(map(mean, zip(*FI_XGB))),labels,"XG")
#plot_feature_importance(list(map(mean, zip(*FI_ada))),labels,"Adaboost")

In [None]:
# Randomised hyper-parameter search for the XGBoost game-winner classifier on
# an 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    df[Game_test_columns],
    df['GameWinnerA'].values,
    test_size=0.20,
    random_state=42,
)

# Scale with statistics taken from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classifier = XGBClassifier(objective='binary:logistic')
# random_state fixes which n_iter candidates are drawn from `params`, so the
# search (and best_params_) is reproducible like the seeded split above.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    cv=10,
    verbose=3,
    random_state=42,
)

start_time = timer(None)
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

random_search.best_params_

In [None]:
# Final XGBoost model: train on an 80/20 split of df, then evaluate on the
# separate df_val data set.
y=df['GameWinnerA'].values
x=df[Game_test_columns].values
y_val=df_val['GameWinnerA'].values
x_val=df_val[Game_test_columns].values
labels=Game_test_columns

#x,y=sample(x,y)

X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.20, random_state=42)

# Fit the scaler on the training split only and apply the same transform to
# the test split and to the validation data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
x_val = scaler.transform(x_val)

# `subsample` is the XGBoost name for the per-tree row sampling ratio; the
# previous spelling `sub_sample` is not a recognised parameter, so no row
# subsampling was applied.
model_xg = XGBClassifier(objective ='binary:logistic',
                         min_child_weight=1,
                         gamma=0.1,
                         colsample_bytree=0.9,
                         subsample=0.5,
                         max_depth=8,learning_rate = 0.08,
                         verbosity=0,
                         use_label_encoder=False,
                         scale_pos_weight=1.3)

model_xg.fit(X_train, y_train)

# Accuracy on the 20% split held out from df.
# Scale to percent before rounding: round(score, 2)*100 can print float
# artefacts and drops the decimals.
y_pred_xg = model_xg.predict(X_test)
scoreXG = accuracy_score(y_test, y_pred_xg)
print(f"Model training score {round(scoreXG*100,2)}")

# Accuracy on the df_val data.
val_pred=model_xg.predict(x_val)
val_scoreXG = accuracy_score(y_val, val_pred)
print(f"Test set accuracy score {round(val_scoreXG*100,2)}%")
evaluation_time(y_val,val_pred,model_xg,x_val)
plot_feature_importance(model_xg.feature_importances_,labels,"XG")
