In [2]:

import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [3]:
df = pd.read_csv('/Users/joaosequeira/udacity_joao/atp_analysis/atp_tennis.csv')


In [4]:
df.head(10)

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,score
0,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Mayer F.,Giraldo S.,Mayer F.,28,57,1215,778,1.36,3.0,6-4 6-4
1,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Benneteau J.,Nieminen J.,Nieminen J.,35,41,1075,927,2.2,1.61,3-6 6-2 1-6
2,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Nishikori K.,Matosevic M.,Nishikori K.,19,49,1830,845,1.25,3.75,7-5 6-2
3,Brisbane International,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Mitchell B.,Baghdatis M.,Baghdatis M.,326,36,137,1070,9.0,1.07,4-6 4-6
4,Brisbane International,2013-01-01,ATP250,Outdoor,Hard,1st Round,3,Istomin D.,Klizan M.,Istomin D.,43,30,897,1175,1.9,1.8,6-1 6-2
5,Brisbane International,2013-01-01,ATP250,Outdoor,Hard,1st Round,3,Ito T.,Millman J.,Millman J.,79,199,655,239,2.2,1.61,4-6 1-6
6,Brisbane International,2013-01-01,ATP250,Outdoor,Hard,1st Round,3,Falla A.,Levine J.,Falla A.,54,104,809,530,2.2,1.61,6-1 7-6
7,Brisbane International,2013-01-01,ATP250,Outdoor,Hard,1st Round,3,Kudla D.,Melzer J.,Melzer J.,137,29,402,1177,2.62,1.44,6-2 4-6 4-6
8,Brisbane International,2013-01-01,ATP250,Outdoor,Hard,1st Round,3,Robredo T.,Harrison R.,Robredo T.,114,69,495,710,3.0,1.36,6-4 7-6
9,Brisbane International,2013-01-01,ATP250,Outdoor,Hard,1st Round,3,Baker B.,Dimitrov G.,Dimitrov G.,61,48,756,866,3.0,1.36,3-6 6-7


In [5]:


def winner_metrics(col_x, col_y, df, winner_col='Winner', player1_col='Player_1', player2_col='Player_2'):
    """
    Função que extrai valores de colunas de um DataFrame, de acordo com condições de outras colunas.

    Argumentos:
    df -- DataFrame Pandas a ser utilizado.
    winner_col -- Nome da coluna que será comparada com "player1_col" e "player2_col".
    player1_col -- Nome da coluna que será comparada com "winner_col".
    player2_col -- Nome da coluna que será comparada com "winner_col".
    col_x -- Nome da coluna de onde será extraído o valor caso "player1_col=winner_col".
    col_y -- Nome da coluna de onde será extraído o valor caso "player2_col=winner_col".

    Retorna:
    Um DataFrame com as colunas "winner" e "odd".
    """

    # Inicializa dicionários para armazenar os valores extraídos
    result_dict = {"winner": [], "metric": []}

    # Itera sobre as linhas do DataFrame
    for index, row in df.iterrows():

        # Compara os valores das colunas "winner_col" e "player1_col"
        if row[player1_col] == row[winner_col]:
            result_dict["winner"].append(row[player1_col])
            result_dict["metric"].append(row[col_x])

        # Compara os valores das colunas "winner_col" e "player2_col"
        elif row[player2_col] == row[winner_col]:
            result_dict["winner"].append(row[player2_col])
            result_dict["metric"].append(row[col_y])

    # Retorna um DataFrame com as colunas "winner" e "odd"
    return pd.DataFrame(result_dict)


In [6]:
df_winner_odds = winner_metrics('Odd_1','Odd_2',df)

counts = df_winner_odds['winner'].value_counts()

top_counts = counts.sort_values(ascending=False)[:15]

treated_odd = [odd for odd in df_winner_odds['metric'] if odd <= 6]



fig = go.Figure(data=[go.Bar(

            x=top_counts.index, 
            y=top_counts.values

)])

fig.update_layout(title="Quantidade partidas ganhas")

pio.show(fig)


# Cria o histograma com a função "histogram"
fig = go.Figure(data=[go.Histogram(x=treated_odd, nbinsx=100)])

fig.update_layout(title="Distribuição de odds")

# Exibe o gráfico
fig.show()


In [7]:
# Define uma função que retorna o vencedor de uma partida
def get_winner_rank(df):
    if df['Winner'] == df['Player_1']:
        return df['Rank_1']
    else:
        return df['Rank_2']
    
# Define uma função que retorna o perdedor de uma partida
def get_loser_rank(df):
    if df['Winner'] != df['Player_1']:
        return df['Rank_1']
    else:
        return df['Rank_2']
    
# Aplica a função ao DataFrame para criar a coluna "Loser"
df['Rank_loser'] = df.apply(get_loser_rank, axis=1)
df['Rank_winner'] = df.apply(get_winner_rank, axis=1)

# Criando df com os ganhadores tendo rank pior
df_loser_bigger_rank = df[df['Rank_winner'] > df['Rank_loser']]

count_loser_bigger_rank = len(df_loser_bigger_rank)/len(df)
print("Número de eventos onde o rank do vencedor é maior do que o do perdedor: ", count_loser_bigger_rank)



Número de eventos onde o rank do vencedor é maior do que o do perdedor:  0.351391846068922


In [8]:

rank_difference = df_loser_bigger_rank['Rank_winner'] - df_loser_bigger_rank['Rank_loser']

treated_orank_difference = [rank for rank in rank_difference if rank <=400]

# Cria o histograma com a função "histogram"
fig = go.Figure(data=[go.Histogram(x=treated_orank_difference, nbinsx=100)])

fig.update_layout(title="Distribuição de diferença entre o ranking")

# Exibe o gráfico
fig.show()


In [9]:
treated_df = df[['Date','Series','Court','Surface','Round','Best of','Player_1','Rank_1','Player_2','Rank_2','Rank_winner']]

treated_df['Date'] = pd.to_datetime(treated_df['Date'])

#Funcao para definir valores aos vencedores e perdedores
def get_winner_loser_values(row):
    if row['Rank_winner'] == row['Rank_1']:
        return pd.Series({'Winner_Value': 1, 'Loser_Value': -1})
    else:
        return pd.Series({'Winner_Value': -1, 'Loser_Value': 1})

treated_df[['Player_1_value', 'Player_2_value']] = treated_df.apply(get_winner_loser_values, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
'''
This code receives a DataFrame named 'df' and performs the following operations:

Selects 11 columns from 'df' and assigns them to a new DataFrame named 'prep_treated_df'.
Converts the 'Date' column in 'prep_treated_df' to datetime format.
Defines a function named 'get_winner_loser_values' that assigns a value of 1 to the winner and -1 to the loser in each row of 'prep_treated_df', and applies this function to each row of the DataFrame, assigning the results to new columns named 'Player_1_value' and 'Player_2_value'.
Creates a new DataFrame named 'df_momentum' that selects 5 columns from 'prep_treated_df' and renames them to 'Date', 'Player_1', 'Player_2', 'Value_1', and 'Value_2', respectively.
Defines a function named 'stack_columns_and_running_sum' that concatenates the 'Date', 'Player_1', and 'Value_1' columns of 'df_momentum' with the 'Date', 'Player_2', and 'Value_2' columns of 'df_momentum' (renamed to 'Player' and 'Values', respectively), resulting in a new DataFrame named 'df_stacked'. This function also calculates the cumulative sum of the 'Values' column partitioned by 'Player' and ordered by 'Date', and assigns the result to a new column named 'Cumulative_Sum' in 'df_stacked'.
Applies the 'stack_columns_and_running_sum' function to 'df_momentum', assigning the result to 'df_momentum'.
Then, merges trated_df with momentum dataframe.
'''

prep_treated_df = df[['Date','Series','Court','Surface','Round','Best of','Player_1','Rank_1','Player_2','Rank_2','Rank_winner']]

prep_treated_df['Date'] = pd.to_datetime(prep_treated_df['Date'])

#Funcao para definir valores aos vencedores e perdedores
def get_winner_loser_values(row):
    if row['Rank_winner'] == row['Rank_1']:
        return pd.Series({'Winner_Value': 1, 'Loser_Value': -1})
    else:
        return pd.Series({'Winner_Value': -1, 'Loser_Value': 1})

prep_treated_df[['Value_1', 'Value_2']] = prep_treated_df.apply(get_winner_loser_values, axis=1)


df_momentum = prep_treated_df[['Date','Player_1','Player_2','Value_1','Value_2']]

#criando o dataframe com o momento
def stack_columns_and_running_sum(df):
    # Cria um novo DataFrame com as colunas empilhadas
    df_stacked = pd.concat([df[['Date', 'Player_1', 'Value_1']].rename(columns={'Player_1': 'Player', 'Value_1': 'Values'}), 
                            df[['Date', 'Player_2', 'Value_2']].rename(columns={'Player_2': 'Player', 'Value_2': 'Values'})])
    
    # Calcula a soma cumulativa da coluna "Values" particionando por "Player" e ordenando por "Date"
    df_stacked = df_stacked.sort_values(['Player', 'Date'])
    df_stacked['Cumulative_Sum'] = df_stacked.groupby('Player')['Values'].cumsum()
    
    return df_stacked[['Date', 'Player','Cumulative_Sum']]


df_momentum = stack_columns_and_running_sum(df_momentum)

treated_df = pd.merge(prep_treated_df, df_momentum, left_on=['Date', 'Player_1'], right_on=['Date', 'Player'])
treated_df = pd.merge(treated_df, df_momentum, left_on=['Date', 'Player_2'], right_on=['Date', 'Player']).rename(columns={'Cumulative_Sum_x': 'Momentum_1', 'Cumulative_Sum_y': 'Momentum_2'}).drop(columns=['Player_x','Player_y'])







A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
treated_df.head()

Unnamed: 0,Date,Series,Court,Surface,Round,Best of,Player_1,Rank_1,Player_2,Rank_2,Rank_winner,Value_1,Value_2,Momentum_1,Momentum_2
0,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Mayer F.,28,Giraldo S.,57,28,1,-1,0,-2
1,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Benneteau J.,35,Nieminen J.,41,41,-1,1,-2,2
2,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Nishikori K.,19,Matosevic M.,49,19,1,-1,2,-1
3,2012-12-31,ATP250,Outdoor,Hard,1st Round,3,Mitchell B.,326,Baghdatis M.,36,36,-1,1,-1,0
4,2013-01-01,ATP250,Outdoor,Hard,1st Round,3,Istomin D.,43,Klizan M.,30,43,1,-1,1,-1


In [17]:

prep_treated_model_df = treated_df[['Court', 'Surface', 'Rank_1', 'Rank_2', 'Rank_winner','Momentum_1', 'Momentum_2']]

def transform_to_dummies(df):
    """
    Transforma as colunas do tipo objeto em dummies
    
    Parameters:
    df (pandas DataFrame): DataFrame com as colunas a serem transformadas em dummies
    
    Returns:
    df (pandas DataFrame): DataFrame com as colunas do tipo objeto transformadas em dummies
    """
    object_columns = list(df.select_dtypes(include='object').columns)
    for column in object_columns:
        df = pd.concat([df, pd.get_dummies(df[column], prefix=column)], axis=1)
        df = df.drop(column, axis=1)
    return df


treated_model_df = transform_to_dummies(prep_treated_model_df).astype(int)

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calcula a matriz de correlação
correlation_matrix = treated_model_df.corr()

# Exibe a matriz de correlação
print(correlation_matrix)

                 Rank_1    Rank_2  Rank_winner  Momentum_1  Momentum_2   
Rank_1         1.000000  0.071273     0.439233   -0.249757   -0.076918  \
Rank_2         0.071273  1.000000     0.442294   -0.076077   -0.257552   
Rank_winner    0.439233  0.442294     1.000000   -0.219299   -0.217310   
Momentum_1    -0.249757 -0.076077    -0.219299    1.000000    0.128531   
Momentum_2    -0.076918 -0.257552    -0.217310    0.128531    1.000000   
Court_Indoor  -0.014408 -0.001642    -0.002749    0.000397    0.005361   
Court_Outdoor  0.014408  0.001642     0.002749   -0.000397   -0.005361   
Surface_Clay   0.043631  0.038289     0.060610   -0.043446   -0.041096   
Surface_Grass  0.023696  0.011106     0.017885   -0.005240   -0.008471   
Surface_Hard  -0.055785 -0.042810    -0.067960    0.043906    0.043760   

               Court_Indoor  Court_Outdoor  Surface_Clay  Surface_Grass   
Rank_1            -0.014408       0.014408      0.043631       0.023696  \
Rank_2            -0.001642       0

In [19]:
def build_xgb_model(data):

    data.dropna(inplace=True)
    # Separando os recursos e a variável alvo
    X = data.drop('Rank_winner', axis=1)
    y = data['Rank_winner']
    
    # Dividindo os dados em treino e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Definindo os parâmetros para otimização
    param_grid = {'max_depth': [3, 5, 7], 'learning_rate': [0.05, 0.1, 0.15], 'n_estimators': [100, 200, 300]}
    
    # Criando o classificador XGBoost
    clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    
    # Otimizando os parâmetros do classificador usando GridSearchCV
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    
    # Obtendo a melhor combinação de parâmetros
    best_params = grid_search.best_params_
    
    # Treinando o classificador usando a melhor combinação de parâmetros
    clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', max_depth=best_params['max_depth'], learning_rate=best_params['learning_rate'], n_estimators=best_params['n_estimators'])
    clf.fit(X_train, y_train)
    
    # Avaliando a performance do modelo no conjunto de teste
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Armazenando as métricas de desempenho em um dicionário
    results = {'Accuracy': accuracy, 'AUC-ROC': auc_roc, 'F1 Score': f1}
    
    return results


In [22]:
treated_model_df.head()

Unnamed: 0,Rank_1,Rank_2,Rank_winner,Momentum_1,Momentum_2,Court_Indoor,Court_Outdoor,Surface_Clay,Surface_Grass,Surface_Hard
0,28,57,28,0,-2,0,1,0,0,1
1,35,41,41,-2,2,0,1,0,0,1
2,19,49,19,2,-1,0,1,0,0,1
3,326,36,36,-1,0,0,1,0,0,1
4,43,30,43,1,-1,0,1,0,0,1


In [21]:
# Separating the features and target variable
X = treated_model_df.drop('Rank_winner', axis=1)
y = treated_model_df['Rank_winner']

# Creating XGBClassifier
clf = xgb.XGBClassifier()

# Fit the model
clf.fit(X, y)

# Plot the feature importances
xgb.plot_importance(clf)
plt.show()

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
 450 451 452 453 454], got [   1    2    3    4    5    6    7    8    9   10   11   12   13   14
   15   16   17   18   19   20   21   22   23   24   25   26   27   28
   29   30   31   32   33   34   35   36   37   38   39   40   41   42
   43   44   45   46   47   48   49   50   51   52   53   54   55   56
   57   58   59   60   61   62   63   64   65   66   67   68   69   70
   71   72   73   74   75   76   77   78   79   80   81   82   83   84
   85   86   87   88   89   90   91   92   93   94   95   96   97   98
   99  100  101  102  103  104  105  106  107  108  109  110  111  112
  113  114  115  116  117  118  119  120  121  122  123  124  125  126
  127  128  129  130  131  132  133  134  135  136  137  138  139  140
  141  142  143  144  145  146  147  148  149  150  151  152  153  154
  155  156  157  158  159  160  161  162  163  164  165  166  167  168
  169  170  171  172  173  174  175  176  177  178  179  180  181  182
  183  184  185  186  187  188  189  190  191  192  193  194  195  196
  197  198  199  200  201  202  203  204  205  206  207  208  209  210
  211  212  213  214  215  216  217  219  221  222  223  224  225  226
  227  228  229  230  231  232  233  234  235  236  237  238  239  240
  241  242  243  244  245  246  247  248  249  250  251  252  253  254
  255  256  258  259  260  261  262  263  264  265  266  267  268  269
  270  271  272  273  274  275  276  277  278  279  280  281  282  283
  284  285  286  287  288  289  290  291  292  293  294  297  298  299
  300  301  302  303  304  306  308  309  310  311  312  313  314  315
  316  317  319  324  326  328  329  331  333  334  335  337  338  339
  340  344  347  348  349  352  354  355  358  360  361  364  366  369
  370  372  373  374  375  378  380  381  382  385  386  389  392  394
  395  399  401  402  405  406  407  410  412  413  415  417  419  420
  421  422  423  426  429  433  437  438  439  440  442  443  446  449
  453  454  455  457  461  470  474  475  479  487  490  491  495  498
  502  503  510  517  524  526  528  533  542  546  547  549  550  551
  553  564  570  581  583  589  607  617  635  639  641  653  654  655
  658  663  665  667  683  698  716  739  759  761  762  772  784  797
  808  820  824  826  832  837  842  847  861  862  920  946  993 1018
 1020 1035 1042 1045 1165 1184 1821]

In [120]:
treated_model_df.dtypes

Rank_1                 int64
Rank_2                 int64
Rank_winner            int64
Momentum_1             int64
Momentum_2             int64
Series_ATP250           bool
Series_ATP500           bool
Series_Grand Slam       bool
Series_Masters 1000     bool
Series_Masters Cup      bool
Court_Indoor            bool
Court_Outdoor           bool
Surface_Clay            bool
Surface_Grass           bool
Surface_Hard            bool
Round_1st Round         bool
Round_2nd Round         bool
Round_3rd Round         bool
Round_4th Round         bool
Round_Quarterfinals     bool
Round_Round Robin       bool
Round_Semifinals        bool
Round_The Final         bool
dtype: object