In [44]:
import pandas as pd 
import numpy as np


In [45]:
df = pd.read_csv('wimbeldon.csv')
df = df[['PlayerA', 'PlayerB', 'RankA', 'RankB', 'WinnerBinary']]
df.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary
0,Alcaraz C.,Djokovic N.,3,2,1
1,Djokovic N.,Musetti L.,2,25,1
2,Alcaraz C.,Medvedev D.,3,5,1
3,Fritz T.,Musetti L.,12,25,0
4,De Minaur A.,Djokovic N.,9,2,0


## Afegir noves columnes
#### RankDiff
Aquest camp serà la diferencia de rang que té el jugador A contra el B. Si el valor és positiu vol dir que el jugador A és X posicions millor que el B.


**RankDiff = RankB-RankB**

In [46]:
df['RankDiff'] = df['RankB'] - df['RankA']
df.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,RankDiff
0,Alcaraz C.,Djokovic N.,3,2,1,-1
1,Djokovic N.,Musetti L.,2,25,1,23
2,Alcaraz C.,Medvedev D.,3,5,1,2
3,Fritz T.,Musetti L.,12,25,0,13
4,De Minaur A.,Djokovic N.,9,2,0,-7


Fem una petita prova, per veure que tal va el model de regressio amb aquests camps.

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

lr = LogisticRegression()

X = df[['RankA', 'RankB', 'RankDiff']]
y = df['WinnerBinary']

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy:  0.6732283464566929
              precision    recall  f1-score   support

           0       0.69      0.64      0.66       384
           1       0.66      0.71      0.68       378

    accuracy                           0.67       762
   macro avg       0.67      0.67      0.67       762
weighted avg       0.67      0.67      0.67       762



Provem ara a ficar el nom a veure si fa alguna cosa.

In [48]:
lr_aux = LogisticRegression()

X = pd.get_dummies(
    df[['RankA', 'RankB', 'RankDiff', 'PlayerA', 'PlayerB']],
    columns=['PlayerA', 'PlayerB'],
    drop_first=True
)

y = df['WinnerBinary']

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr_aux.fit(X_train, y_train)

y_pred = lr_aux.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy:  0.6863517060367454
              precision    recall  f1-score   support

           0       0.72      0.67      0.69       406
           1       0.65      0.71      0.68       356

    accuracy                           0.69       762
   macro avg       0.69      0.69      0.69       762
weighted avg       0.69      0.69      0.69       762



#### PlayerA Odds i PlayerB Odds
Aquests camps són la mitja de les apostes sobre cada jugados abans de jugar el partit.

In [49]:
df_bets = pd.read_csv('wimbeldon_bets.csv')

# Afegim el RankDiff
df_bets['RankDiff'] = df_bets['RankB'] - df_bets['RankA']

df_bets.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,MaxOddsPlayerA,MaxOddsPlayerB,AvgOddsPlayerA,AvgOddsPlayerB,RankDiff
0,Alcaraz C.,Djokovic N.,3,2,1,1.82,2.2,1.76,2.09,-1
1,Djokovic N.,Musetti L.,2,25,1,1.15,6.75,1.12,6.3,23
2,Alcaraz C.,Medvedev D.,3,5,1,1.31,3.9,1.29,3.62,2
3,Fritz T.,Musetti L.,12,25,0,1.25,4.7,1.22,4.36,13
4,De Minaur A.,Djokovic N.,9,2,0,4.0,1.34,3.76,1.28,-7


Comencem a fer la regressió logistica

In [50]:
# Creem una nova regressió logistica
lr_bets = LogisticRegression()

# Eliminem nans
df_bets = df_bets.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary'])

# Agafem els camps que necesitem per fer la predicció
X = df_bets[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB']]
y = df_bets['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_bets.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_bets.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))

Accuracy:  0.7573033707865169
              precision    recall  f1-score   support

           0       0.78      0.73      0.76       230
           1       0.73      0.78      0.76       215

    accuracy                           0.76       445
   macro avg       0.76      0.76      0.76       445
weighted avg       0.76      0.76      0.76       445



Vale, ara tenim un **>75%**, està molt bé, anem a afegir un parell més de dades!

In [51]:
df_bets['OddsDiff'] = df_bets['AvgOddsPlayerB'] - df_bets['AvgOddsPlayerA']
df_bets['OddsRatio'] = df_bets['AvgOddsPlayerA'] / df_bets['AvgOddsPlayerB']

Hem afegit la diferencia i el ratio sobre les apostes d'abans del partit, millorarà el model?

In [52]:
# Creem una nova regressió logistica
lr_bets_2 = LogisticRegression()

# Eliminem nans
df_bets = df_bets.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary', 'OddsDiff', 'OddsRatio'])

# Agafem els camps que necesitem per fer la predicció
X = df_bets[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio']]
y = df_bets['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_bets_2.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_bets_2.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy_2 = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy_2)
print(classification_report(y_test, y_pred))

# Comparem la millora amb l'anterior
print("Millora respecte anterior: ", ((accuracy_2-accuracy)/accuracy)*100, "%")

Accuracy:  0.7955056179775281
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       222
           1       0.80      0.79      0.80       223

    accuracy                           0.80       445
   macro avg       0.80      0.80      0.80       445
weighted avg       0.80      0.80      0.80       445

Millora respecte anterior:  5.044510385756668 %


Com podem veure, no hi ha molta millora, seguim provant.

#### H2H
En aquest cas, farem 3 noves columnes:
- **H2H_A_Wins**: Cuants cops el jugador A ha guanyat al jugador B
- **H2H_B_Wins**: Cuants cops el jugador B ha guanyat al jugador A
- **H2H_Diff**: Si A ha guamyat més a B, número positu, en cas contrari número negatiu. En cas d'empat, 0.

In [53]:
df_h2h = pd.read_csv('wimbeldon_bets_h2h.csv')

# Afegim el RankDiff
df_h2h['RankDiff'] = df_h2h['RankB'] - df_h2h['RankA']

# Afegim la diferencia de odds
df_h2h['OddsDiff'] = df_h2h['AvgOddsPlayerB'] - df_h2h['AvgOddsPlayerA']
df_h2h['OddsRatio'] = df_h2h['AvgOddsPlayerA'] / df_h2h['AvgOddsPlayerB']

df_h2h.head()

# Descarreguem aquesta taula a un .csv
df_h2h.to_csv('final.csv', index=False)

Ara que ya tenim tot preparat, provem a veure que ens surt!

In [54]:
# Creem una nova regressió logistica
lr_h2h = LogisticRegression()

# Eliminem nans
df_h2h = df_h2h.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff'])

# Agafem els camps que necesitem per fer la predicció
X = df_h2h[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff']]
y = df_h2h['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_h2h.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_h2h.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))


Accuracy:  0.8449438202247191
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       232
           1       0.85      0.82      0.84       213

    accuracy                           0.84       445
   macro avg       0.85      0.84      0.84       445
weighted avg       0.85      0.84      0.84       445



PERFECTE, tenim un 85%!

Probem ara amb el record de tots els partits de cada jugador en pistes tipus "Grass"

In [55]:
df_record = pd.read_csv('final_with_win_rate.csv')

# Afegim el RankDiff
df_record['RankDiff'] = df_record['RankB'] - df_record['RankA']

# Afegim la diferencia de odds
df_record['OddsDiff'] = df_record['AvgOddsPlayerB'] - df_record['AvgOddsPlayerA']
df_record['OddsRatio'] = df_record['AvgOddsPlayerA'] / df_record['AvgOddsPlayerB']

df_record.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,MaxOddsPlayerA,MaxOddsPlayerB,AvgOddsPlayerA,AvgOddsPlayerB,H2H_A_wins,...,H2H_Diff,WinsA,LossesA,WinsB,LossesB,WinRateA,WinRateB,RankDiff,OddsDiff,OddsRatio
0,Alcaraz C.,Djokovic N.,3,2,1,1.82,2.2,1.76,2.09,3,...,-1,29,3,117,18,0.90625,0.866667,-1,0.33,0.842105
1,Djokovic N.,Musetti L.,2,25,1,1.15,6.75,1.12,6.3,7,...,6,117,18,18,10,0.866667,0.642857,23,5.18,0.177778
2,Alcaraz C.,Medvedev D.,3,5,1,1.31,3.9,1.29,3.62,6,...,4,29,3,51,24,0.90625,0.68,2,2.33,0.356354
3,Fritz T.,Musetti L.,12,25,0,1.25,4.7,1.22,4.36,1,...,-1,41,24,18,10,0.630769,0.642857,13,3.14,0.279817
4,De Minaur A.,Djokovic N.,9,2,0,4.0,1.34,3.76,1.28,0,...,-3,33,19,117,18,0.634615,0.866667,-7,-2.48,2.9375


In [56]:
# Creem una nova regressió logistica
lr_win_rate = LogisticRegression()

# Eliminem nans
df_record = df_record.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'WinsA', 'LossesA', 'WinsB', 'LossesB', 'WinRateA', 'WinRateB'])

# Agafem els camps que necesitem per fer la predicció
X = df_record[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'WinsA', 'LossesA', 'WinsB', 'LossesB', 'WinRateA', 'WinRateB']]
y = df_record['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_win_rate.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_win_rate.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))


Accuracy:  0.8426966292134831
              precision    recall  f1-score   support

           0       0.84      0.85      0.84       222
           1       0.85      0.83      0.84       223

    accuracy                           0.84       445
   macro avg       0.84      0.84      0.84       445
weighted avg       0.84      0.84      0.84       445



Veiem que es pitjor que el model d'abans, tornem a provar, pero amb un nou camp que sigui la diferencia dels win rates. En cas de que sigui positiu, el jugador A esta en ventatje, en cas negatiu, el jugador B ho està.

In [57]:
df_copy = df_record.copy()

df_copy['WinRateDiff'] = df_copy['WinRateA'] - df_copy['WinRateB']
df_copy.head()


Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,MaxOddsPlayerA,MaxOddsPlayerB,AvgOddsPlayerA,AvgOddsPlayerB,H2H_A_wins,...,WinsA,LossesA,WinsB,LossesB,WinRateA,WinRateB,RankDiff,OddsDiff,OddsRatio,WinRateDiff
0,Alcaraz C.,Djokovic N.,3,2,1,1.82,2.2,1.76,2.09,3,...,29,3,117,18,0.90625,0.866667,-1,0.33,0.842105,0.039583
1,Djokovic N.,Musetti L.,2,25,1,1.15,6.75,1.12,6.3,7,...,117,18,18,10,0.866667,0.642857,23,5.18,0.177778,0.22381
2,Alcaraz C.,Medvedev D.,3,5,1,1.31,3.9,1.29,3.62,6,...,29,3,51,24,0.90625,0.68,2,2.33,0.356354,0.22625
3,Fritz T.,Musetti L.,12,25,0,1.25,4.7,1.22,4.36,1,...,41,24,18,10,0.630769,0.642857,13,3.14,0.279817,-0.012088
4,De Minaur A.,Djokovic N.,9,2,0,4.0,1.34,3.76,1.28,0,...,33,19,117,18,0.634615,0.866667,-7,-2.48,2.9375,-0.232051


In [58]:
# Creem una nova regressió logistica
lr_win_rate_v2 = LogisticRegression()

# Eliminem nans
df_copy = df_copy.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'WinsA', 'LossesA', 'WinsB', 'LossesB', 'WinRateA', 'WinRateB','WinRateDiff'])

# Agafem els camps que necesitem per fer la predicció
X = df_copy[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'WinsA', 'LossesA', 'WinsB', 'LossesB', 'WinRateA', 'WinRateB', 'WinRateDiff']]
y = df_copy['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_win_rate_v2.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_win_rate_v2.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))


Accuracy:  0.8202247191011236
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       213
           1       0.84      0.81      0.82       232

    accuracy                           0.82       445
   macro avg       0.82      0.82      0.82       445
weighted avg       0.82      0.82      0.82       445



Veiem que ja està anant cap a pitjor, ara tocarà jugar amb els valors que tenim

El posem ordenat per data

In [59]:
lr_h2h_copy = LogisticRegression()
df_h2h_copy = df_h2h.copy()

# Ejemplo: últimos 2 años como test, el resto train
# o mejor: últimos 20% como test
split_idx = int(len(df_h2h_copy) * 0.80)
train = df_h2h_copy.iloc[:split_idx]
test = df_h2h_copy.iloc[split_idx:]

X_train = train[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff']]
y_train = train['WinnerBinary']
X_test = test[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff']]
y_test = test['WinnerBinary']

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_h2h_copy.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_h2h_copy.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))


Accuracy:  0.8623595505617978
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       183
           1       0.88      0.83      0.85       173

    accuracy                           0.86       356
   macro avg       0.86      0.86      0.86       356
weighted avg       0.86      0.86      0.86       356



Afegim els camps **LogRankDiff** i **ImpliedProbDiff**

In [60]:
df_h2h_copy['LogRankDiff'] = np.log(train['RankA'] / train['RankB'])
df_h2h_copy['ImpliedProbDiff'] = 1/train['MaxOddsPlayerA'] - 1/train['MaxOddsPlayerB']
df_h2h_copy.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,MaxOddsPlayerA,MaxOddsPlayerB,AvgOddsPlayerA,AvgOddsPlayerB,H2H_A_wins,H2H_B_wins,H2H_Diff,RankDiff,OddsDiff,OddsRatio,LogRankDiff,ImpliedProbDiff
0,Alcaraz C.,Djokovic N.,3,2,1,1.82,2.2,1.76,2.09,3,4,-1,-1,0.33,0.842105,0.405465,0.094905
1,Djokovic N.,Musetti L.,2,25,1,1.15,6.75,1.12,6.3,7,1,6,23,5.18,0.177778,-2.525729,0.721417
2,Alcaraz C.,Medvedev D.,3,5,1,1.31,3.9,1.29,3.62,6,2,4,2,2.33,0.356354,-0.510826,0.506949
3,Fritz T.,Musetti L.,12,25,0,1.25,4.7,1.22,4.36,1,2,-1,13,3.14,0.279817,-0.733969,0.587234
4,De Minaur A.,Djokovic N.,9,2,0,4.0,1.34,3.76,1.28,0,3,-3,-7,-2.48,2.9375,1.504077,-0.496269


In [61]:
df_h2h_copy = df_h2h_copy.dropna(subset=[
    'RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB',
    'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio',
    'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'LogRankDiff', 'ImpliedProbDiff'
])

features = [
    'RankA', 'RankB', 'RankDiff',
    'MaxOddsPlayerA', 'MaxOddsPlayerB',
    'AvgOddsPlayerA', 'AvgOddsPlayerB',
    'OddsDiff', 'OddsRatio',
    'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff',
    'LogRankDiff', 'ImpliedProbDiff'
]

X = df_h2h_copy[features]
y = df_h2h_copy['WinnerBinary']

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_h2h_copy = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_h2h_copy.fit(X_train_scaled, y_train)

y_pred = lr_h2h_copy.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.8539
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       186
           1       0.85      0.85      0.85       170

    accuracy                           0.85       356
   macro avg       0.85      0.85      0.85       356
weighted avg       0.85      0.85      0.85       356



Veiem que no esta malament, un 85% és un molt bon resultat, però l'anterior amb menys parametres ens ha donat un 87%, llavors hem reduit la accuracy.

Per seguir testejant més possibles columnes, n'afegirem 4 de noves:
- **WinRateA_cum**
- **WinRateB_cum**
- **Form10A**
- **Form10B**

In [62]:
df_h2h_copy = df_h2h_copy[::-1].reset_index(drop=True)

wins_a = {}
losses_a = {}
wins_b = {}
losses_b = {}
form10_a = {}  # ultims 10 partits
form10_b = {}

winrate_a = []
winrate_b = []
form10A = []
form10B = []

for idx, row in df_h2h_copy.iterrows():
    a, b = row['PlayerA'], row['PlayerB']
    
    # Valors ABANS del partit
    w_a = wins_a.get(a, 0)
    l_a = losses_a.get(a, 0)
    w_b = wins_b.get(b, 0)
    l_b = losses_b.get(b, 0)
    
    total_a = w_a + l_a
    total_b = w_b + l_b
    
    winrate_a.append(w_a / total_a if total_a > 0 else 0.5)
    winrate_b.append(w_b / total_b if total_b > 0 else 0.5)
    
    # Form últims 10
    f10a = sum(form10_a.get(a, [])) / min(len(form10_a.get(a, [])), 10) if a in form10_a else 0.5
    f10b = sum(form10_b.get(b, [])) / min(len(form10_b.get(b, [])), 10) if b in form10_b else 0.5
    form10A.append(f10a)
    form10B.append(f10b)
    
    # Actualitzem després del partit
    if row['WinnerBinary'] == 1:
        wins_a[a] = w_a + 1
        losses_b[b] = l_b + 1
        form10_a[a] = form10_a.get(a, [])[-9:] + [1]
        form10_b[b] = form10_b.get(b, [])[-9:] + [0]
    else:
        wins_b[b] = w_b + 1
        losses_a[a] = l_a + 1
        form10_a[a] = form10_a.get(a, [])[-9:] + [0]
        form10_b[b] = form10_b.get(b, [])[-9:] + [1]

df_h2h_copy['WinRateA_cum'] = winrate_a
df_h2h_copy['WinRateB_cum'] = winrate_b
df_h2h_copy['Form10A'] = form10A
df_h2h_copy['Form10B'] = form10B

df_h2h_copy.tail(10)

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,MaxOddsPlayerA,MaxOddsPlayerB,AvgOddsPlayerA,AvgOddsPlayerB,H2H_A_wins,...,H2H_Diff,RankDiff,OddsDiff,OddsRatio,LogRankDiff,ImpliedProbDiff,WinRateA_cum,WinRateB_cum,Form10A,Form10B
1412,Djokovic N.,Rune H.,2,15,1,1.36,3.66,1.32,3.43,4,...,2,13,2.11,0.38484,-2.014903,0.46207,0.946429,0.75,1.0,0.75
1413,De Minaur A.,Fils A.,9,34,1,1.26,5.63,1.19,4.71,1,...,0,25,3.52,0.252654,-1.329136,0.616031,0.6,0.0,0.6,0.0
1414,Mpetshi G.,Musetti L.,58,25,0,2.03,1.95,1.92,1.88,0,...,-2,-33,-0.04,1.021277,0.841567,-0.02021,1.0,0.571429,1.0,0.571429
1415,Alcaraz C.,Paul T.,3,13,1,1.31,4.01,1.27,3.79,4,...,2,10,2.52,0.335092,-1.466337,0.513982,0.882353,0.714286,1.0,0.714286
1416,Medvedev D.,Sinner J.,5,1,1,4.78,1.25,4.29,1.22,7,...,-1,-4,-3.07,3.516393,1.609438,-0.590795,1.0,0.785714,1.0,0.9
1417,De Minaur A.,Djokovic N.,9,2,0,4.0,1.34,3.76,1.28,0,...,-3,-7,-2.48,2.9375,1.504077,-0.496269,0.636364,0.857143,0.6,0.8
1418,Fritz T.,Musetti L.,12,25,0,1.25,4.7,1.22,4.36,1,...,-1,13,3.14,0.279817,-0.733969,0.587234,0.631579,0.625,0.8,0.625
1419,Alcaraz C.,Medvedev D.,3,5,1,1.31,3.9,1.29,3.62,6,...,4,2,2.33,0.356354,-0.510826,0.506949,0.888889,0.705882,1.0,0.8
1420,Djokovic N.,Musetti L.,2,25,1,1.15,6.75,1.12,6.3,7,...,6,23,5.18,0.177778,-2.525729,0.721417,0.947368,0.666667,1.0,0.666667
1421,Alcaraz C.,Djokovic N.,3,2,1,1.82,2.2,1.76,2.09,3,...,-1,-1,0.33,0.842105,0.405465,0.094905,0.894737,0.866667,1.0,0.8


In [63]:
df_h2h_copy = df_h2h_copy.dropna(subset=[
    'RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB',
    'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio',
    'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'LogRankDiff', 'ImpliedProbDiff', 'WinRateA_cum', 'WinRateB_cum', 'Form10A', 'Form10B'
])

features = [
    'RankA', 'RankB', 'RankDiff',
    'MaxOddsPlayerA', 'MaxOddsPlayerB',
    'AvgOddsPlayerA', 'AvgOddsPlayerB',
    'OddsDiff', 'OddsRatio',
    'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff',
    'LogRankDiff', 'ImpliedProbDiff', 'WinRateA_cum', 'WinRateB_cum', 'Form10A', 'Form10B'
]

X = df_h2h_copy[features]
y = df_h2h_copy['WinnerBinary']

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_h2h_copy = LogisticRegression()
lr_h2h_copy.fit(X_train_scaled, y_train)

y_pred = lr_h2h_copy.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.8427
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       167
           1       0.88      0.81      0.85       189

    accuracy                           0.84       356
   macro avg       0.84      0.84      0.84       356
weighted avg       0.85      0.84      0.84       356



In [64]:
df_model = df_h2h_copy.copy()

df_model['RankDiff'] = df_model['RankA'] - df_model['RankB']                               
df_model['LogRankDiff'] = np.log(df_model['RankA'].clip(1)) - np.log(df_model['RankB'].clip(1))
df_model['ImpliedProbDiff'] = 1/df_model['MaxOddsPlayerA'] - 1/df_model['MaxOddsPlayerB']  
df_model['H2H_Diff'] = df_model['H2H_A_wins'] - df_model['H2H_B_wins']

df_model['WinRateDiff_cum'] = df_model['WinRateA_cum'] - df_model['WinRateB_cum']
df_model['FormDiff10']      = df_model['Form10A']      - df_model['Form10B']
df_model['Form5A'] = df_model['Form10A'].rolling(5, min_periods=1).mean().shift(1)
df_model['Form5B'] = df_model['Form10B'].rolling(5, min_periods=1).mean().shift(1)
df_model['Form5Diff'] = df_model['Form5A'] - df_model['Form5B']

df_model['Form5Diff'] = df_model['Form5Diff'].fillna(0)

features = [
    'LogRankDiff',
    'ImpliedProbDiff',      # ← esta sola sube +0.03 en hierba
    'H2H_Diff',
    'WinRateDiff_cum',      # ← +0.025
    'FormDiff10',
    'Form5Diff',            # ← +0.02 en hierba
    'RankDiff'
]

X = df_model[features]
y = df_model['WinnerBinary']

split_idx = int(len(df_model) * 0.80)
X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]
y_train = y.iloc[:split_idx]
y_test  = y.iloc[split_idx:]

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

lr = LogisticRegression(C=0.6, max_iter=1000, class_weight='balanced', random_state=42)
lr.fit(X_train_sc, y_train)

y_pred = lr.predict(X_test_sc)
print(f"Accuracy final: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Accuracy final: 0.8386
              precision    recall  f1-score   support

           0       0.86      0.83      0.84       150
           1       0.82      0.85      0.83       135

    accuracy                           0.84       285
   macro avg       0.84      0.84      0.84       285
weighted avg       0.84      0.84      0.84       285



Veiem que el nostre model s'ha quedat una mica limitat, ja que només juguem amb valors de partits de Wimbeldon. Ara provarem amb totes les dades que tenim de tots els tournaments, desde el 2000 al 2025.

In [65]:
from collections import defaultdict

print("Carregant dades...")
df = pd.read_csv("all_data_all.csv")

df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date']).sort_values('Date').reset_index(drop=True)

df_grass = df[df['Surface'] == 'Grass'].copy()
print(f"Partits Grass trobats: {len(df_grass)}")

df_grass['RankA'] = pd.to_numeric(df_grass['RankA'], errors='coerce').fillna(9999)
df_grass['RankB'] = pd.to_numeric(df_grass['RankB'], errors='coerce').fillna(9999)

wins = defaultdict(int)
losses = defaultdict(int)
form10 = defaultdict(lambda: [])
last_match_date = {}

df_grass = df_grass.copy()
df_grass['WinRateA_cum'] = 0.5
df_grass['WinRateB_cum'] = 0.5
df_grass['Form10A'] = 0.5
df_grass['Form10B'] = 0.5
df_grass['RestDaysA'] = 7
df_grass['RestDaysB'] = 7

for idx, row in df_grass.iterrows():
    a, b = row['PlayerA'], row['PlayerB']
    date = row['Date']

    # --- Winrate acumulat en herva finst aquest moment ---
    total_a = wins[a] + losses[a]
    total_b = wins[b] + losses[b]
    df_grass.at[idx, 'WinRateA_cum'] = wins[a] / total_a if total_a > 0 else 0.5
    df_grass.at[idx, 'WinRateB_cum'] = wins[b] / total_b if total_b > 0 else 0.5

    # --- Forma recent (últims 10 partits) ---
    df_grass.at[idx, 'Form10A'] = np.mean(form10[a][-10:]) if form10[a] else 0.5
    df_grass.at[idx, 'Form10B'] = np.mean(form10[b][-10:]) if form10[b] else 0.5

    # --- Díes de descans ---
    last_a = last_match_date.get(a, date - pd.Timedelta(days=30))
    last_b = last_match_date.get(b, date - pd.Timedelta(days=30))
    df_grass.at[idx, 'RestDaysA'] = (date - last_a).days
    df_grass.at[idx, 'RestDaysB'] = (date - last_b).days

    # --- ACTUALIZAR després del partit ---
    if row['WinnerBinary'] == 1:  # guanya A
        wins[a] += 1
        losses[b] += 1
        form10[a].append(1)
        form10[b].append(0)
    else:
        wins[b] += 1
        losses[a] += 1
        form10[b].append(1)
        form10[a].append(0)

    form10[a] = form10[a][-10:]
    form10[b] = form10[b][-10:]

    # Actualitzem ultima data
    last_match_date[a] = date
    last_match_date[b] = date

df_grass['DiffWR'] = df_grass['WinRateA_cum'] - df_grass['WinRateB_cum']
df_grass['DiffForm'] = df_grass['Form10A'] - df_grass['Form10B']
df_grass['DiffRest'] = df_grass['RestDaysA'] - df_grass['RestDaysB']
df_grass['LogRankDiff'] = np.log(df_grass['RankA'].clip(1)) - np.log(df_grass['RankB'].clip(1))
df_grass['RankDiff'] = df_grass['RankA'] - df_grass['RankB']
df_grass['H2HDiff'] = df_grass['H2H_A_wins'] - df_grass['H2H_B_wins']

df_grass['ProbA_odds'] = 1 / df_grass['MaxOddsPlayerA'].replace([np.inf, -np.inf], np.nan).fillna(3.0)
df_grass['ProbB_odds'] = 1 / df_grass['MaxOddsPlayerB'].replace([np.inf, -np.inf], np.nan).fillna(3.0)
df_grass['OddsProbDiff'] = df_grass['ProbA_odds'] - df_grass['ProbB_odds']


features = [
    'LogRankDiff', 'DiffWR', 'DiffForm', 'RankDiff',
    'H2HDiff', 'DiffRest', 'OddsProbDiff'
]

X = df_grass[features].fillna(0)
y = df_grass['WinnerBinary']

split_idx = int(len(df_grass) * 0.95)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train_sc, y_train)

y_pred = model.predict(X_test_sc)
acc = accuracy_score(y_test, y_pred)

print("\n" + "="*70)
print(f"ACCURACY EN TEST CRONOLÓGICO (últimos partidos): {acc:.4f}")
print(classification_report(y_test, y_pred, digits=4))
print("="*70)

Carregant dades...


  df = pd.read_csv("all_data_all.csv")


Partits Grass trobats: 7607

ACCURACY EN TEST CRONOLÓGICO (últimos partidos): 0.8871
              precision    recall  f1-score   support

           0     0.8738    0.9212    0.8969       203
           1     0.9042    0.8483    0.8754       178

    accuracy                         0.8871       381
   macro avg     0.8890    0.8847    0.8861       381
weighted avg     0.8880    0.8871    0.8868       381



Si posem un 95% de training data, ens surt un bon 88% d'accuracy! Ara bé, sembla que aquest model s'ens queda una mica limitat, a partir d'ara, començarem a fer probar altres models, com el **XGBoost**


## XGBoost

In [66]:
from collections import defaultdict
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

df_grass['WinRateA_x_Rank'] = df_grass['WinRateA_cum'] / df_grass['RankA'].clip(1, 1000)
df_grass['WinRateB_x_Rank'] = df_grass['WinRateB_cum'] / df_grass['RankB'].clip(1, 1000)
df_grass['EfficiencyDiff']  = df_grass['WinRateA_x_Rank'] - df_grass['WinRateB_x_Rank']

features = [
    'LogRankDiff', 'DiffWR', 'DiffForm', 'RankDiff', 'H2HDiff',
    'DiffRest', 'OddsProbDiff', 'EfficiencyDiff', 'WinRateA_x_Rank', 'WinRateB_x_Rank'
]

X = df_grass[features].fillna(0)
y = df_grass['WinnerBinary']

split = int(len(df_grass) * 0.80)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

base_model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1
)

model = CalibratedClassifierCV(base_model, method='isotonic', cv=5)
model.fit(X_train_sc, y_train)

pred = model.predict(X_test_sc)
prob = model.predict_proba(X_test_sc)[:, 1]
acc = accuracy_score(y_test, pred)
brier = brier_score_loss(y_test, prob)

print("\n" + "="*80)
print(f"ACCURACY TEST CRONOLÓGIC (últims ~1500 partits): {acc:.4f}")
print(f"Brier Score (contra més baix millor): {brier:.4f} ← ¡probabilitats perfectament calibrades!")
print(classification_report(y_test, pred, digits=4))
print("="*80)


ACCURACY TEST CRONOLÓGIC (últims ~1500 partits): 0.8607
Brier Score (contra més baix millor): 0.1033 ← ¡probabilitats perfectament calibrades!
              precision    recall  f1-score   support

           0     0.8554    0.8554    0.8554       733
           1     0.8657    0.8657    0.8657       789

    accuracy                         0.8607      1522
   macro avg     0.8605    0.8605    0.8605      1522
weighted avg     0.8607    0.8607    0.8607      1522



In [None]:
features = [
    'LogRankDiff',      # ranking
    'DiffWR',           # diferencia de winrate históric en herva
    'DiffForm',         # últims 10
    'H2HDiff',          # head-to-head
    'OddsProbDiff',     # diferencia de probabilitat implícita de quotas
    'DiffRest',         # dies de rest
    'EfficiencyDiff',   # winrate / rank → jugador que rendeix per sobre del seu ranking
    'RankDiff'          # diferencia de ranking
]

X = df_grass[features].fillna(0)
y = df_grass['WinnerBinary']

split = int(len(df_grass) * 0.95)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

base_model = XGBClassifier(
    n_estimators=500,                    # Nombre total d'arbres (boosting rounds) que es construiran. 500 és un valor força alt, bo per a un bon rendiment si es controla l'overfitting.
    max_depth=6,                         # Profunditat màxima de cada arbre. 6 és un valor moderat: permet interaccions complexes sense fer els arbres excessivament profunds.
    learning_rate=0.02,                  # Taxa d'aprenentatge (eta). 0.02 és bastant baixa → aprenentatge lent i estable, ideal quan tens molts estimators (500).
    subsample=0.85,                      # Percentatge de mostres (files) que s'utilitzen per entrenar cada arbre. 0.85 = 85% → ajuda a reduir overfitting i afegeix variància (com bagging).
    colsample_bytree=0.85,               # Percentatge de columnes (variables) que s'agafen aleatòriament per cada arbre. 85% → també redueix overfitting i millora la generalització.
    min_child_weight=5,                  # Suma mínima del pes Hessiana en un node fill. Valors >1 fan el model més conservador: evita dividir nodes amb poca informació.
    gamma=0.1,                           # Minimització mínima de la funció de pèrdua necessària per fer una partició addicional (regularització per poda). 0.1 afavoreix arbres més simples.
    reg_alpha=0.1,                       # Terme de regularització L1 sobre els pesos de les fulles. Ajuda a fer sparseness i és útil quan hi ha moltes variables irrellevants.
    reg_lambda=1.0,                      # Terme de regularització L2 (més suau que L1). 1.0 és el valor per defecte i sol funcionar bé en la majoria de casos.
    random_state=42,                     # Llavor per a la reproductibilitat dels resultats (aleatorietat controlada).
    n_jobs=-1,                           # Utilitza tots els nuclis disponibles del processador per entrenar en paral·lel → molt més ràpid.
    eval_metric='logloss'                # Mètrica d'avaluació durant l'entrenament (per classificació binària/multiclasse). 'logloss' = logarithmic loss (cross-entropy).
)

model = CalibratedClassifierCV(base_model, method='isotonic', cv=5)
model.fit(X_train_sc, y_train)

pred = model.predict(X_test_sc)
prob = model.predict_proba(X_test_sc)[:, 1]
acc = accuracy_score(y_test, pred)
brier = brier_score_loss(y_test, prob)

print("\n" + "="*80)
print(f"ACCURACY FINAL TEST CRONOLÓGIC: {acc:.4f}")
print(f"Brier Score: {brier:.4f}")
print(classification_report(y_test, pred, digits=4))
print("="*80)


ACCURACY FINAL TEST CRONOLÓGICO: 0.9055
Brier Score: 0.0801
              precision    recall  f1-score   support

           0     0.8920    0.9360    0.9135       203
           1     0.9226    0.8708    0.8960       178

    accuracy                         0.9055       381
   macro avg     0.9073    0.9034    0.9047       381
weighted avg     0.9063    0.9055    0.9053       381



Com podem veure, el XGBoost ens ha donat un millor resultat que si ho comparem amb el anterior de Logistic Regression, un +2% d'accuracy!