# SetWinner Model

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from statistics import mean
import plotly.graph_objects as go
import pylab
import warnings
from termcolor import colored
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import *
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import random
import json
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [None]:
# Load the dataset that every later cell works on (read from the working directory).
df = pd.read_csv('FinalData.csv')

In [None]:
# Game_timestamp = 1-based position of a row inside its run of consecutive
# rows that share the same GameNo; the counter restarts at 1 every time
# GameNo differs from the previous row.
#
# Vectorised: flag the rows that start a new run, number the runs with a
# cumulative sum, then count rows inside each run. This yields the same
# values as walking the frame row by row with .iloc, without the per-row
# Python overhead.
starts_new_run = df.GameNo.ne(df.GameNo.shift())
run_id = starts_new_run.cumsum().to_numpy()
game_timestamp = df.groupby(run_id).cumcount().add(1).tolist()
df['Game_timestamp']=game_timestamp

Feature importance function

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of the features whose importance exceeds 0.01.

    importance -- per-feature importance values, aligned with ``names``
    names      -- feature names (converted to a NumPy array)
    model_type -- text placed in front of ' FEATURE IMPORTANCE' in the title

    The chart is drawn on a new 8x6 figure; nothing is returned.
    """
    # Pair every name with its score, rank from highest to lowest and drop
    # everything that does not exceed the 0.01 threshold.
    ranked = pd.DataFrame(
        {'feature_names': np.array(names), 'feature_importance': importance}
    ).sort_values(by=['feature_importance'], ascending=False)
    fi_df = ranked[ranked.feature_importance > 0.01]

    # Figure set-up
    plt.figure(figsize=(8,6))
    plt.tight_layout()
    plt.yticks(fontsize=7)

    # Seaborn bar chart: one bar per remaining feature
    ax = sns.barplot(
        y='feature_names',
        x='feature_importance',
        data=fi_df,
        palette="rocket",
    )
    ax.set_xlabel('feature_importance')

    # Title and axis labels (the x label set here replaces the one above)
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE > 0.01')
    plt.ylabel('FEATURE NAMES')


In [None]:
def evaluation_time(y_val,y_pred,model,x_val):
    """Plot and print an evaluation report for a fitted binary classifier.

    y_val  -- true labels
    y_pred -- predicted labels for the same rows
    model  -- fitted classifier exposing ``predict_proba``
    x_val  -- feature matrix the probabilities are computed from

    Shows a confusion-matrix heat map, prints accuracy, per-class and averaged
    F1 / recall / precision and the ROC-AUC, then shows the ROC curve next to
    an all-zero baseline. Nothing is returned.

    The averaged scores index the per-class arrays at [0] and [1], so exactly
    two classes are expected.
    """
    plt.style.use('ggplot')

    # Confusion matrix heat map (annot=True writes the counts into the cells).
    cm = confusion_matrix(y_val, y_pred)
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g', cmap='rocket')
    ax.set_xlabel('Predicted Winner')
    ax.set_ylabel('Actual Winner')
    ax.set_title('Confusion Matrix')
    # NOTE(review): tick labels follow the matrix's class order — presumably
    # the first class is the server and the second the returner; confirm.
    ax.xaxis.set_ticklabels(['Server', 'Returner'])
    ax.yaxis.set_ticklabels(['Server', 'Returner'], rotation=360)
    plt.show()

    # Per-class scores (average=None returns one value per class).
    f_score = f1_score(y_val, y_pred, average=None)
    P_score = precision_score(y_val, y_pred, average=None)
    recall = recall_score(y_val, y_pred, average=None)

    # Probability of the second class drives the ROC computations; the
    # all-zero "no skill" scores give the reference line in the ROC plot.
    ns_probs = [0 for _ in range(len(y_val))]
    lr_probs = model.predict_proba(x_val)[:, 1]
    lr_auc = roc_auc_score(y_val, lr_probs)

    # Scale to percent *before* rounding so the printed accuracy carries no
    # floating-point residue (e.g. 61.2 instead of 61.199999999999996).
    print(f"Test set accuracy score {np.round(accuracy_score(y_val, y_pred)*100, 1)}%")
    print(f"F1 score {np.round(f_score,3)}")
    print(f"Average F1 Score {np.round((f_score[0]+f_score[1])/2,3)}")
    print(f"Recall score {np.round(recall,3)}")
    print(f"Average Recall score {np.round((recall[0]+recall[1])/2,3)}")
    print(f"Precision score {np.round(P_score,3)}")
    print(f"Average precision Score {np.round((P_score[0]+P_score[1])/2,3)}")
    print('Model: ROC AUC=%.3f' % (lr_auc))

    # ROC curves: baseline versus model.
    ns_fpr, ns_tpr, _ = roc_curve(y_val, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(y_val, lr_probs)
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Base')
    plt.plot(lr_fpr, lr_tpr, marker='.', label='XGBoost')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
    
def averages1(name,metrics):
    """Print the column-wise averages of a list of metric rows.

    name    -- label printed in front of the summary
    metrics -- iterable of rows laid out as
               [accuracy, f1, precision, recall, roc_auc]

    Every column is averaged and rounded to 3 decimals; the accuracy is
    printed multiplied by 100. Returns None.
    """
    res = []
    for column in zip(*metrics):
        res.append(round(sum(column) / len(column), 3))
    print(f"{name} Accuracy score: {res[0]*100}, f1 score: {res[1]}, Precision: {res[2]}, Recall: {res[3]}, ROC-AUC: {res[4]}")
    return None

def metrics(Y_test,y_pred,x_val,model):
    """Return [accuracy, F1, precision, recall, ROC-AUC], each rounded to 4 decimals.

    Y_test -- true labels
    y_pred -- predicted labels for the same rows
    x_val  -- feature matrix passed to ``model.predict_proba``
    model  -- fitted classifier exposing ``predict_proba``

    F1, precision and recall use the scorers' default averaging; ROC-AUC is
    computed from the predicted probability in column 1.
    """
    accuracy = round(accuracy_score(Y_test, y_pred), 4)
    f_score = round(f1_score(Y_test, y_pred), 4)
    P_score = round(precision_score(Y_test, y_pred), 4)
    recall = round(recall_score(Y_test, y_pred), 4)

    positive_probs = model.predict_proba(x_val)[:, 1]
    lr_auc = round(roc_auc_score(Y_test, positive_probs), 4)
    return [accuracy, f_score, P_score, recall, lr_auc]

def averages(metrics):
    """Return the column-wise mean of a list of metric rows, rounded to 3 decimals.

    metrics -- iterable of equally laid-out rows; rows are combined with
               ``zip``, so surplus entries of longer rows are ignored.
    """
    res = []
    for column in zip(*metrics):
        res.append(round(sum(column) / len(column), 3))
    return res

Prediction variables

In [None]:
# Feature columns handed to every model. The order matters: the same list is
# used to label the feature-importance charts.
predictionV = [
    # Counters and score columns
    'SetNo', 'P1GamesWon', 'P2GamesWon', 'GameNo', 'PointNumber',
    # 'PointServer' is disabled
    'Tiebreak', 'P1Score', 'P2Score', 'P1PointsWon', 'P2PointsWon',
    # 'Point_lenght_sec' is disabled

    # P1_* serve width / serve depth / return depth columns
    'P1_ServeWidth_B_A', 'P1_ServeWidth_BC_A', 'P1_ServeWidth_BW_A',
    'P1_ServeWidth_C_A', 'P1_ServeWidth_W_A',
    'P1_ServeDepth_CTL_A', 'P1_ServeDepth_NCTL_A',
    'P1_ReturnDepth_D_A', 'P1_ReturnDepth_ND_A',

    # P2_* serve width / serve depth / return depth columns
    'P2_ServeWidth_B_A', 'P2_ServeWidth_BC_A', 'P2_ServeWidth_BW_A',
    'P2_ServeWidth_C_A', 'P2_ServeWidth_W_A',
    'P2_ServeDepth_CTL_A', 'P2_ServeDepth_NCTL_A',
    'P2_ReturnDepth_D_A', 'P2_ReturnDepth_ND_A',

    # Paired P1 / P2 columns
    'P1AceA', 'P2AceA',
    'P1WinnerA', 'P2WinnerA',
    'P1DoubleFaultA', 'P2DoubleFaultA',
    'P1UnfErrA', 'P2UnfErrA',
    'P1NetPointA', 'P2NetPointA',
    'P1NetPointWonA', 'P2NetPointWonA',
    'P1BreakPointA', 'P2BreakPointA',
    'P1BreakPointWonA', 'P2BreakPointWonA',
    'P1BreakPointMissedA', 'P2BreakPointMissedA',
    'P1DistanceRunA', 'P2DistanceRunA',

    # Remaining columns
    'RallyCountA',
    'P1SetsWon', 'P2SetsWon',
    'Game_timestamp',
    'P1Rank', 'P2Rank',
    'Total_time',
    'Surface',
]

Function for creating the test set

In [None]:
def data_split(df, seed=None):
    """Hold out roughly one tenth of the matches as a test set.

    Whole matches are sampled (by ``match_id``), so all rows of a match end
    up on the same side of the split.

    df   -- DataFrame with a ``match_id`` column
    seed -- optional seed for a private random generator, making the split
            reproducible; with the default ``None`` the module-level
            ``random`` state is used, exactly as before

    Returns ``(train_df, test_df)``. The number of held-out matches is
    ``round(len(matches) / 10)``, which can be 0 for very small inputs.
    """
    matches = list(df.match_id.unique())
    sampler = random if seed is None else random.Random(seed)
    Test_games = sampler.sample(matches, round(len(matches) / 10))

    # Compute the membership mask once and use it for both halves.
    in_test = df['match_id'].isin(Test_games)
    return df[~in_test], df[in_test]

In [None]:
# Split once into training rows (kept in df) and held-out test matches (df_test).
# From here on df no longer contains the test matches.
df,df_test=data_split(df)

In [None]:
# l2[i] = number of training rows whose GameNo equals i+1.
# The upper bound is inclusive so the highest game number is counted as well;
# stopping one short leaves l2 an entry too small for later l2[i] look-ups.
# A single value_counts() replaces one full DataFrame filter per game number.
rows_per_game = df.GameNo.value_counts()
highest_game = int(df.GameNo.max())
l2 = [int(rows_per_game.get(game_no, 0)) for game_no in range(1, highest_game + 1)]

# Rows with Game_timestamp == 1 only; this subset feeds all models below.
df_set=df[df['Game_timestamp']==1]
print(l2)
# Plot against the real game numbers (1-based) rather than the list index.
plt.plot(range(1, len(l2) + 1), l2)
plt.title('Number of rows for each time stamp (Games in a set) ')
plt.xlabel('Game in a set')
plt.ylabel('Number of rows')

In [None]:
# Keep only the test rows with Game_timestamp == 1, mirroring how df_set is built from df.
df_test=df_test[df_test['Game_timestamp']==1]

In [None]:
# ---------------------------------------------------------------------------
# Baseline comparison: K-fold cross-validation of Random Forest, AdaBoost,
# XGBoost and Logistic Regression, fitted separately for every game number
# ("timestamp") that has more than MIN_ROWS_PER_GAME rows in l2.
# ---------------------------------------------------------------------------
RFC_features=[]          # mean RF feature importances, one list per evaluated game
XGB_features=[]          # mean XGBoost feature importances, one list per evaluated game
LR_features=[]
ADA_features=[]          # mean AdaBoost feature importances, one list per evaluated game
Model_performances=[]    # per game: [RF, XGB, LR, ADA] mean accuracy in percent
test=[]
FI_XGB_final=[]
RFF1, RFR, RFP, RFAU = [], [], [], []
ADAF1, ADAR, ADAP, ADAAU = [], [], [], []
XGF1, XGR, XGP, XGAU = [], [], [], []
LRF1, LRR, LRP, LRAU = [], [], [], []
evaluated_games=[]       # game numbers that passed the row-count threshold

MIN_ROWS_PER_GAME = 3000
BASELINE_ACCURACY = 61.14   # [%] drawn as the "BaseLine" reference line
folds = 10

# Per-model result lists in the order (f1, recall, precision, roc_auc).
_summary_lists = {
    'RF':  (RFF1, RFR, RFP, RFAU),
    'ADA': (ADAF1, ADAR, ADAP, ADAAU),
    'XGB': (XGF1, XGR, XGP, XGAU),
    'LR':  (LRF1, LRR, LRP, LRAU),
}


def _cv_models():
    """Return fresh, unfitted instances of the four classifiers keyed by short name."""
    return {
        'RF': RandomForestClassifier(max_depth=5, random_state=0),
        # NOTE: scikit-learn 1.2 renamed ``base_estimator`` to ``estimator``;
        # adjust the keyword if the installed version no longer accepts it.
        'ADA': AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=6, random_state=0)),
        'XGB': XGBClassifier(objective ='binary:logistic',min_child_weight=2,gamma=0.2,colsample_bytree=0.3,
                             max_depth=6,learning_rate = 0.05,verbosity=0,use_label_encoder=False),
        'LR': LogisticRegression(),
    }


def _score_fold(model, X_val, Y_val):
    """Score one fitted classifier on one validation fold.

    Returns (accuracy, f1, precision, recall, roc_auc). F1, precision and
    recall are the mean of the two per-class scores rounded to 3 decimals;
    ROC-AUC uses the predicted probability in column 1.
    """
    y_pred = model.predict(X_val)
    per_class_f1 = f1_score(Y_val, y_pred, average=None)
    per_class_precision = precision_score(Y_val, y_pred, average=None)
    per_class_recall = recall_score(Y_val, y_pred, average=None)
    return (
        accuracy_score(Y_val, y_pred),
        np.round((per_class_f1[0] + per_class_f1[1]) / 2, 3),
        np.round((per_class_precision[0] + per_class_precision[1]) / 2, 3),
        np.round((per_class_recall[0] + per_class_recall[1]) / 2, 3),
        roc_auc_score(Y_val, model.predict_proba(X_val)[:, 1]),
    )


# The context manager guarantees the file handle is closed.
with open('RF_Hyper_time') as f:
    RF_hyper_time = json.load(f)

# Silence library warnings only while the models are trained; the previous
# warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # At most 13 games are considered; never index past the end of l2.
    for i in range(min(13, len(l2))):
        if l2[i] <= MIN_ROWS_PER_GAME:
            continue

        game_rows = df_set[df_set.GameNo==i+1]
        y = game_rows['SetWinnerA'].values
        x = game_rows[predictionV].values
        labels = predictionV

        kf = KFold(n_splits=folds, random_state=42, shuffle=True)
        fold_scores = {name: [] for name in _summary_lists}
        fold_importances = {'RF': [], 'ADA': [], 'XGB': []}

        for train_index, val_index in kf.split(x):
            # Fit the scaler on the training fold only, so no statistics of
            # the validation rows leak into training.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(x[train_index])
            X_val = scaler.transform(x[val_index])
            Y_train, Y_val = y[train_index], y[val_index]

            for name, model in _cv_models().items():
                model.fit(X_train, Y_train)
                fold_scores[name].append(_score_fold(model, X_val, Y_val))
                if name in fold_importances:
                    fold_importances[name].append(model.feature_importances_)

        # Average the feature importances over the folds.
        XGB_features.append(np.mean(fold_importances['XGB'], axis=0).tolist())
        RFC_features.append(np.mean(fold_importances['RF'], axis=0).tolist())
        ADA_features.append(np.mean(fold_importances['ADA'], axis=0).tolist())

        # Average every score over the folds: one 5-vector per model.
        mean_scores = {name: np.mean(scores, axis=0) for name, scores in fold_scores.items()}

        # Accuracy in percent, in the column order the plot below expects.
        m = [mean_scores[name][0]*100 for name in ('RF', 'XGB', 'LR', 'ADA')]
        Model_performances.append(m)
        evaluated_games.append(i+1)

        for name, (f1_runs, recall_runs, precision_runs, auc_runs) in _summary_lists.items():
            _, mean_f1, mean_precision, mean_recall, mean_auc = mean_scores[name]
            f1_runs.append(mean_f1)
            recall_runs.append(mean_recall)
            precision_runs.append(mean_precision)
            auc_runs.append(mean_auc)

        print(f"\nModel Accuracy for timestamp {i+1} ")
        print(f"Random Forest: {np.round(m[0],2)}% ---- XGboost: {np.round(m[1],2)}% ---- Logistic Regression: {np.round(m[2],2)}%,---- Adaboost: {np.round(m[3],2)}")

plt.style.use('ggplot')
plt.figure(figsize=(15,8))
plt.plot([item[3] for item in Model_performances],label=('Adaboost'), color = "red")
plt.plot([item[2] for item in Model_performances],label=('Logistic regression'), color = 'Purple')
plt.plot([item[1] for item in Model_performances],label=('XGboost'), color = 'yellow')
plt.plot([item[0] for item in Model_performances],label=('Random Forest'), color = 'blue')
plt.plot(np.full(len(Model_performances), BASELINE_ACCURACY),label=("BaseLine"),linestyle=("dashed"), color = 'black')
plt.title('Model Accuracy for each timestamp validation set')
# Label the ticks with the game numbers that were actually evaluated.
plt.xticks(range(0,len(evaluated_games)),labels=evaluated_games)
plt.grid(True)
plt.legend(loc="lower left")
plt.xlabel('Time Stamps (Game in set)')
plt.ylabel('Accuracy score [%]');
plt.savefig("SetWinnerPlotVal",bbox_inches='tight',dpi=300)

# Summary over all evaluated games. No test-set figure is printed here: no
# test evaluation happens in this cell, so that value could only be nan.
print(f"\n---Average Accuracy --- \n Adaboost {round(np.mean([item[3] for item in Model_performances]),2)}\nLogistic regression: {round(np.mean([item[2] for item in Model_performances]),2)}\nXGboost {round(np.mean([item[1] for item in Model_performances]),2)}\nRandom Forest {round(np.mean([item[0] for item in Model_performances]),2)}")
print(f"\n---Average f1-score ---\n Adaboost {round(np.mean(ADAF1)*100,2)}\nLogistic regression: {round(np.mean(LRF1)*100,2)}\nXGboost {round(np.mean(XGF1)*100,2)}\nRandom Forest {round(np.mean(RFF1)*100,2)}")
print(f"\n---Average Recall score ---\n Adaboost {round(np.mean(ADAR)*100,2)}\nLogistic regression: {round(np.mean(LRR)*100,2)}\nXGboost {round(np.mean(XGR)*100,2)}\nRandom Forest {round(np.mean(RFR)*100,2)}")
print(f"\n---Average Precision score ---\n Adaboost {round(np.mean(ADAP)*100,2)}\nLogistic regression: {round(np.mean(LRP)*100,2)}\nXGboost {round(np.mean(XGP)*100,2)}\nRandom Forest {round(np.mean(RFP)*100,2)}")
print(f"\n---Average ROC-AUC ---\n Adaboost {round(np.mean(ADAAU)*100,2)}\nLogistic regression: {round(np.mean(LRAU)*100,2)}\nXGboost {round(np.mean(XGAU)*100,2)}\nRandom Forest {round(np.mean(RFAU)*100,2)}")

## Hyperparameter tuning

In [None]:
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from datetime import datetime
def timer(start_time=None):
    """Stopwatch helper with two modes.

    Called without a (truthy) ``start_time`` it returns the current time.
    Called with the value returned earlier it prints the elapsed time as
    hours, minutes and seconds and returns None.
    """
    if not start_time:
        # Start mode: hand back the reference point.
        return datetime.now()

    # Stop mode: break the elapsed seconds down into h / min / s.
    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, remainder = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

# Search space sampled by the randomized hyper-parameter search for XGBoost.
params={
 "min_child_weight" : [ 1, 3, 5, 7, 9 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 # colsample_bytree is a fraction of the columns and must lie in (0, 1].
 # Values above 1 are rejected by XGBoost, so candidates drawing them would
 # fail to fit and waste search iterations.
 "colsample_bytree" : [ 0.3, 0.6, 0.9, 1.0 ],
 "subsample": np.arange(0.5, 1.0, 0.11).tolist(),
 "max_depth"        : [ 4, 5, 6, 8, 10, 12],
 "learning_rate"    : [0.01, 0.05, 0.08, 0.1],
 "scale_pos_weight":np.arange(1, 1.5, 0.1).tolist()
}


# Randomized hyper-parameter search for XGBoost, run once per entry in
# Model_performances; the best parameter set of every search is collected
# and finally written to the file 'XG_Hyper_time' as JSON.
# NOTE(review): entry i is tuned on GameNo == i+1, which presumably assumes
# the evaluated games are numbered consecutively from 1 — confirm.
XG_hyper_time=[]
for i in range(len(Model_performances)):
    game_rows = df_set[df_set.GameNo==i+1]
    X_train, X_test, y_train, y_test = train_test_split(
        game_rows[predictionV].values,
        game_rows['SetWinnerA'].values,
        test_size=0.20,
        random_state=42,
    )

    # Standardise with statistics taken from the training part only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    XG = XGBClassifier(objective='binary:logistic')
    random_search = RandomizedSearchCV(
        XG,
        param_distributions=params,
        n_iter=25,
        scoring='accuracy',
        n_jobs=-1,
        cv=10,
        verbose=3,
    )

    # Time the search; the second timer() call prints the elapsed time.
    start_time = timer(None)
    random_search.fit(X_train,y_train)
    timer(start_time)

    XG_hyper_time.append(random_search.best_params_)

with open('XG_Hyper_time', 'w') as fout:
    json.dump(XG_hyper_time, fout)

In [None]:
# ---------------------------------------------------------------------------
# Final evaluation: one tuned XGBoost model per game number ("timestamp"),
# trained on df_set and scored on the held-out matches in df_test.
# ---------------------------------------------------------------------------
warnings.filterwarnings('ignore')
RFC_features=[]
XGB_features=[]          # feature importances of the final model per evaluated game
LR_features=[]
ADA_features=[]
Model_performances=[]
test=[]                  # test-set accuracy (fraction) per evaluated game
FI_XGB_final=[]          # feature importances (array) of the final model per game
error_xg=[]              # spread of the resampled accuracies [%] per evaluated game
evaluated_games=[]       # game numbers that passed the row-count threshold

N_RESAMPLES = 10         # number of random re-splits behind the error band


def _tuned_xgb(hp):
    """Build an XGBClassifier from one tuned hyper-parameter dictionary."""
    return XGBClassifier(
        objective='binary:logistic',
        min_child_weight=hp['min_child_weight'],
        gamma=hp['gamma'],
        colsample_bytree=hp['colsample_bytree'],
        max_depth=hp['max_depth'],
        learning_rate=hp['learning_rate'],
        scale_pos_weight=hp['scale_pos_weight'],
        subsample=hp['subsample'],
        verbosity=0,
        use_label_encoder=False,
    )


# The context manager guarantees the file handle is closed.
with open('XG_Hyper_time') as f:
    XG_Hyper_time = json.load(f)

# At most 13 games are considered; never index past the end of l2.
for i in range(min(13, len(l2))):
    if l2[i] <= 3000:
        continue

    game_no = i + 1
    # Raises IndexError if no parameters were tuned for this game.
    hp = XG_Hyper_time[i]

    train_rows = df_set[df_set.GameNo==game_no]
    test_rows = df_test[df_test.GameNo==game_no]
    y = train_rows['SetWinnerA'].values
    x = train_rows[predictionV].values
    y_test = test_rows['SetWinnerA'].values
    x_test = test_rows[predictionV].values
    labels = predictionV

    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    x_test = scaler.transform(x_test)

    # Accuracy spread: re-split the training matches several times and score
    # a freshly fitted model on each internal hold-out. The rows are selected
    # exactly like the rows of the evaluated model (Game_timestamp == 1 and
    # the current GameNo), so the band describes the same prediction task.
    error_fill = []
    for j in range(N_RESAMPLES):
        df_xg, df_test_xg = data_split(df)
        sub_train = df_xg[(df_xg.Game_timestamp==1) & (df_xg.GameNo==game_no)]
        sub_test = df_test_xg[(df_test_xg.Game_timestamp==1) & (df_test_xg.GameNo==game_no)]
        x2 = scaler.transform(sub_train[predictionV].values)
        y2 = sub_train['SetWinnerA'].values
        x_test2 = scaler.transform(sub_test[predictionV].values)
        y_test2 = sub_test['SetWinnerA'].values

        model_xg2 = _tuned_xgb(hp)
        model_xg2.fit(x2, y2)
        error_fill.append(accuracy_score(y_test2, model_xg2.predict(x_test2)))

    # Final model: all training rows of this game, scored on the test matches.
    model_xg = _tuned_xgb(hp)
    model_xg.fit(x, y)
    test_pred = model_xg.predict(x_test)
    test_scoreXG = accuracy_score(y_test, test_pred)

    # The band half-width is the standard deviation of the resampled
    # accuracies, in percent, i.e. in the same unit as the plotted accuracy.
    error_xg.append(np.std(error_fill)*100)
    test.append(test_scoreXG)
    FI_XGB_final.append(model_xg.feature_importances_)
    XGB_features.append(model_xg.feature_importances_.tolist())
    evaluated_games.append(game_no)

    print(f"Predicted dist: {np.unique(test_pred, return_counts=True)}")
    print(f"Actual Dist: {np.unique(y_test,return_counts=True)}")
    print(f"Timestamp {i+1}")
    evaluation_time(y_test,test_pred,model_xg,x_test)

print(f"--- Average Accuracy--- \nXG_test {round(np.mean(test)*100,2)}")

test_pct = np.array(test)*100
error_band = np.array(error_xg)

plt.figure(figsize=(15,8))
plt.plot(test_pct,label=('XG_test'))
plt.fill_between(np.arange(0,len(error_xg)), test_pct-error_band, test_pct+error_band,alpha=0.5)
plt.plot(np.full(len(test), 61.14),label=("BaseLine"),linestyle=("dashed"))
plt.title('Model Accuracy for each timestamp for XGBoost on test data')
# Label the ticks with the game numbers that were actually evaluated.
plt.xticks(range(0,len(evaluated_games)),labels=evaluated_games)
plt.grid(True)
plt.legend(loc="lower left")
plt.xlabel('Timestamps (Game in set)')
plt.ylabel('Accuracy score [%]');
plt.savefig("SetWinnerPlotTest",bbox_inches='tight',dpi=300)

In [None]:
# Feature-importance chart for the model stored at index 5 of FI_XGB_final;
# the default title is replaced by one naming game 6.
# NOTE(review): the figure is saved as "FI_8" although index and title point
# to game 6 — confirm which of the two is intended.
plot_feature_importance(FI_XGB_final[5],labels,"XG")
plt.title(f"Set_timestamp (GameNo) {6}")
plt.savefig("FI_8",bbox_inches='tight',dpi=300)

In [None]:
# Feature-importance chart for the model stored at index 10 of FI_XGB_final
# (requires at least 11 evaluated games); titled and saved as game 11.
plot_feature_importance(FI_XGB_final[10],labels,"XG")
plt.title(f"Set_timestamp (GameNo) {11}")
plt.savefig("FI_11",bbox_inches='tight',dpi=300)

In [None]:
# One feature-importance chart per stored model; the default title is
# replaced by the 1-based position of the model in FI_XGB_final.
for i, importances in enumerate(FI_XGB_final):
    plot_feature_importance(importances,labels,"XG")
    plt.title(f"Set_timestamp (GameNo) {i+1}")