# Create the dataset for the model and calculate statistics for each player

### Import Libraries

In [1]:
import pandas as pd
from tqdm import tqdm
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
# Read the cleaned match dataset from disk; the path is relative to the
# notebook's working directory.
cleaned_csv_path = "../data/cleanedDataset.csv"
clean_data = pd.read_csv(cleaned_csv_path)

# Leave the frame as the last expression so the notebook renders it.
clean_data

### Create the dataset and calculate statistics for each player

In [6]:
from utils.updateStats import getStats, updateStats, createStats

def _player_features(row, prefix):
    """Collect one player's attributes from a match row.

    Args:
        row: A row of ``clean_data`` (a pandas Series as yielded by ``iterrows``).
        prefix: Column prefix identifying the player, ``"p1"`` or ``"p2"``.

    Returns:
        dict with the keys ``ID``, ``ATP_POINTS``, ``ATP_RANK``, ``AGE`` and
        ``HEIGHT`` read from the ``<prefix>_id``, ``<prefix>_rank_points``,
        ``<prefix>_rank``, ``<prefix>_age`` and ``<prefix>_ht`` columns.
    """
    return {
        "ID": row[f"{prefix}_id"],
        "ATP_POINTS": row[f"{prefix}_rank_points"],
        "ATP_RANK": row[f"{prefix}_rank"],
        "AGE": row[f"{prefix}_age"],
        "HEIGHT": row[f"{prefix}_ht"],
    }


# Accumulate one feature dict per match in a separate list so that
# `final_dataset` is only ever bound to the finished DataFrame.
match_records = []
prev_stats = createStats()

# Walk the matches in file order, carrying the running statistics along.
# NOTE(review): the features only avoid look-ahead if clean_data is ordered
# chronologically -- confirm against the cleaning notebook.
for _, row in tqdm(clean_data.iterrows(), total=len(clean_data)):
    player1 = _player_features(row, "p1")
    player2 = _player_features(row, "p2")

    match = {
        "BEST_OF": row["best_of"],
        "DRAW_SIZE": row["draw_size"],
        "SURFACE": row["surface"],
    }

    # Compute the features from the statistics as they stood before this row.
    output = getStats(player1, player2, match, prev_stats)

    # Sort the feature names alphabetically, then append the label last.
    match_data = dict(sorted(output.items()))
    match_data["RESULT"] = row["RESULT"]
    match_records.append(match_data)

    # Only now fold this match into the running statistics.
    prev_stats = updateStats(row, prev_stats)


# Build the final DataFrame once from the accumulated records.
final_dataset = pd.DataFrame(match_records)

100%|██████████| 95375/95375 [01:19<00:00, 1193.00it/s]


In [7]:
# Inspect a positional window of rows from the later part of the dataset.
preview_window = slice(80000, 90000)
final_dataset.iloc[preview_window]

Unnamed: 0,AGE_DIFF,ATP_POINTS_DIFF,ATP_RANK_DIFF,BEST_OF,DRAW_SIZE,ELO_DIFF,ELO_GRAD_LAST_100_DIFF,ELO_GRAD_LAST_10_DIFF,ELO_GRAD_LAST_200_DIFF,ELO_GRAD_LAST_25_DIFF,ELO_GRAD_LAST_3_DIFF,ELO_GRAD_LAST_50_DIFF,ELO_GRAD_LAST_5_DIFF,ELO_SURFACE_DIFF,H2H_DIFF,H2H_SURFACE_DIFF,HEIGHT_DIFF,N_GAMES_DIFF,P_1ST_IN_LAST_100_DIFF,P_1ST_IN_LAST_10_DIFF,P_1ST_IN_LAST_200_DIFF,P_1ST_IN_LAST_25_DIFF,P_1ST_IN_LAST_3_DIFF,P_1ST_IN_LAST_50_DIFF,P_1ST_IN_LAST_5_DIFF,P_1ST_WON_LAST_100_DIFF,P_1ST_WON_LAST_10_DIFF,P_1ST_WON_LAST_200_DIFF,P_1ST_WON_LAST_25_DIFF,P_1ST_WON_LAST_3_DIFF,P_1ST_WON_LAST_50_DIFF,P_1ST_WON_LAST_5_DIFF,P_2ND_WON_LAST_100_DIFF,P_2ND_WON_LAST_10_DIFF,P_2ND_WON_LAST_200_DIFF,P_2ND_WON_LAST_25_DIFF,P_2ND_WON_LAST_3_DIFF,P_2ND_WON_LAST_50_DIFF,P_2ND_WON_LAST_5_DIFF,P_ACE_LAST_100_DIFF,P_ACE_LAST_10_DIFF,P_ACE_LAST_200_DIFF,P_ACE_LAST_25_DIFF,P_ACE_LAST_3_DIFF,P_ACE_LAST_50_DIFF,P_ACE_LAST_5_DIFF,P_BP_SAVED_LAST_100_DIFF,P_BP_SAVED_LAST_10_DIFF,P_BP_SAVED_LAST_200_DIFF,P_BP_SAVED_LAST_25_DIFF,P_BP_SAVED_LAST_3_DIFF,P_BP_SAVED_LAST_50_DIFF,P_BP_SAVED_LAST_5_DIFF,P_DF_LAST_100_DIFF,P_DF_LAST_10_DIFF,P_DF_LAST_200_DIFF,P_DF_LAST_25_DIFF,P_DF_LAST_3_DIFF,P_DF_LAST_50_DIFF,P_DF_LAST_5_DIFF,WIN_LAST_100_DIFF,WIN_LAST_10_DIFF,WIN_LAST_200_DIFF,WIN_LAST_25_DIFF,WIN_LAST_3_DIFF,WIN_LAST_50_DIFF,WIN_LAST_5_DIFF,RESULT
80000,-9.0,841.0,-32.0,3,64,184.022575,-0.002058,-0.054545,0.000000,-0.004615,0.0,-0.003313,0.0,184.605287,0,0,0.0,44,3.652874,4.886677,1.335620,2.849790,6.121021,1.812696,5.191405,1.978027,-1.394833,1.014764,0.048358,0.300859,0.620852,-2.438532,9.236347,6.877268,6.839575,6.841027,12.777386,7.468817,8.126696,0.494546,-4.103362,-0.381343,-0.217938,-7.909074,-0.008587,-4.510451,7.032683,-1.775719,6.514740,-4.760504,11.489899,-1.269394,-1.439394,-4.224958,-3.562975,-3.205509,-4.444903,0.010071,-3.660038,-1.509037,21,1,0,2,0,7,0,1
80001,-8.4,-4170.0,10.0,3,64,-220.387231,0.002604,0.012121,-0.000458,0.017692,0.0,0.008307,0.0,-257.530978,-3,-3,-7.0,-841,-1.508813,-0.530155,-2.326722,-2.306902,-1.034585,-2.504935,2.624205,-9.850421,-4.354268,-8.900513,-9.896084,-7.817334,-10.699404,-3.164738,-6.912311,-0.677347,-5.624917,-4.467694,8.493338,-6.620545,5.775726,-8.150531,-4.196880,-7.533149,-6.633118,-3.024161,-7.707812,-0.050255,-2.601246,-16.478843,-5.056271,-10.277518,-2.020202,-10.252418,-26.545455,0.435816,-1.505794,0.570519,0.530280,0.165474,0.484872,-1.170738,-22,0,-30,-3,0,-9,0,0
80002,-9.9,-1690.0,2.0,3,64,-202.831154,-0.001788,0.084848,0.001364,-0.003846,0.0,-0.015990,0.1,-310.264281,1,0,10.0,-708,-2.241283,-5.229770,-2.770919,-3.412494,-10.092791,-2.345259,-6.777775,1.305443,-0.687441,-0.046172,0.196950,-4.061257,0.868513,-1.056150,-0.602446,-5.897621,-3.036140,-3.267794,3.343455,0.527568,-3.188732,3.872143,3.994266,3.463544,1.045328,0.490653,2.058985,3.005826,-4.386463,-23.238095,-2.844200,-15.301374,4.444444,-8.772285,-20.666667,0.951454,1.683183,1.449574,1.252871,0.745937,0.422526,1.188797,-6,-4,-28,-6,0,-1,-1,0
80003,15.3,5085.0,-17.0,3,64,311.774474,-0.003696,-0.030303,-0.000945,-0.009231,0.0,-0.004034,-0.1,387.939165,1,2,-3.0,1149,-0.493676,-2.161520,1.221109,1.753900,-1.525408,-0.205418,-5.019174,8.015313,3.506986,9.349808,6.118387,-2.265519,7.636057,1.652659,5.201000,-5.340481,6.395903,1.599781,-24.141975,3.117224,-14.206773,4.466240,4.281243,4.805503,3.028896,-0.883370,3.582026,2.609052,2.348282,8.667027,6.345811,9.595382,-10.656566,8.533880,2.318182,-0.682522,0.336859,-0.707924,-0.475068,-0.024985,-0.587743,-0.303428,30,1,66,4,0,10,0,0
80004,9.5,4630.0,-16.0,3,64,286.088219,-0.000120,-0.078788,-0.001874,-0.004615,0.0,0.011669,0.0,373.836292,2,0,0.0,748,3.542230,2.191832,4.169101,3.464667,3.782163,4.101749,-0.417519,0.839062,4.067984,2.773834,0.498122,11.006049,0.865281,3.497071,-0.792528,-2.171116,3.574301,-1.653272,-10.645558,-1.715422,-13.116162,-2.079830,-2.723638,-1.126541,-2.559244,-5.150770,-1.904746,-4.352713,1.403582,6.886364,3.607818,4.276190,31.565657,3.785750,23.106061,0.037870,-0.060290,-0.058290,0.513097,0.404125,0.325926,0.394998,20,3,65,7,0,7,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,10.5,-205.0,20.0,3,32,-28.638203,0.000000,-0.048485,0.000000,-0.032308,-0.5,0.000000,-0.2,-21.449110,-1,-1,-3.0,495,0.577163,1.701094,0.498901,0.985490,7.448273,2.381172,7.024265,-1.561266,-2.005468,-1.050933,-0.363901,-3.587771,-0.712373,-8.720649,2.958294,-5.150007,3.096387,2.085800,-2.628554,5.880634,-8.159004,-3.122463,-3.363286,-2.784501,-2.847717,-0.994622,-2.598815,-3.687398,-4.286663,-23.780303,-3.872768,-17.152547,-43.939394,-5.393132,-26.808081,-1.130996,-1.172924,-1.137574,-1.094476,-1.911447,-1.584160,-1.698056,0,-2,0,-2,-1,0,-1,1
89996,1.2,-961.0,60.0,3,32,-267.789672,0.000000,0.006061,0.000000,-0.003077,0.5,0.000000,0.0,-263.316003,0,0,3.0,-263,5.068595,9.940061,5.800233,6.563325,5.719594,4.501639,5.345234,-10.213597,-10.388711,-10.405238,-13.248321,-10.374478,-10.058693,-9.964337,-5.848546,-2.096420,-5.674781,-6.888036,-2.870474,-7.167300,-1.290512,-4.025591,-4.855571,-4.516814,-5.022042,-2.525531,-4.081269,-3.022852,-9.735391,-9.277778,-9.486322,-6.973738,-23.015873,-7.579941,-17.642857,-2.179892,-3.881796,-1.784083,-2.998821,-4.958403,-1.799119,-4.691858,0,-3,0,-9,-1,0,-1,1
89997,-0.5,437.0,-22.0,3,32,63.913081,0.002046,0.054545,0.000000,0.004615,-0.5,0.002401,-0.4,59.118578,3,1,-8.0,52,1.955220,3.964935,1.691596,2.257123,1.123567,1.816616,1.675151,-6.556725,-2.977687,-4.972702,-5.594135,-4.291857,-4.902396,0.017959,3.229114,8.020792,3.177559,5.890831,5.832359,5.745995,10.868591,-4.981453,-7.786055,-4.154681,-6.587777,-4.562113,-4.824453,-5.461231,-2.233274,15.642677,-4.417828,5.299839,-16.515152,-3.504601,-5.964646,-0.654308,-0.624952,-0.827190,-0.611505,1.396891,-0.922294,1.887679,-1,1,0,2,0,4,2,0
89998,-4.1,772.0,-729.0,3,32,131.691460,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,113.334918,0,0,-5.0,124,60.306679,68.457527,60.454183,62.623955,67.283849,61.285286,66.458188,70.189044,69.000416,69.737500,69.746765,76.270833,69.600138,69.780654,50.213758,59.535571,49.776150,54.378595,61.000305,52.729161,57.353900,5.941876,5.906462,5.794820,6.388738,7.222374,6.016272,5.786916,56.722479,60.543956,57.940694,58.622344,82.833333,55.538620,68.730769,1.599766,0.606609,1.732851,0.917217,1.287341,1.269249,0.799677,0,0,0,0,0,0,0,1


### Save the final dataset to a CSV file

In [8]:
# Write the feature table to disk without the DataFrame index column.
final_csv_path = "../data/finalDataset.csv"
final_dataset.to_csv(final_csv_path, index=False)
