In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [43]:
df = pd.read_csv('wimbeldon.csv')
df.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,RankDiff,WinnerBinary
0,Alcaraz C.,Djokovic N.,3,2,1.0,1
1,Djokovic N.,Musetti L.,2,25,-23.0,1
2,Alcaraz C.,Medvedev D.,3,5,-2.0,1
3,Fritz T.,Musetti L.,12,25,-13.0,0
4,De Minaur A.,Djokovic N.,9,2,7.0,0


In [44]:
# Eliminem PlayerA i PlayerB perque només podem treballar amb valors numerics
X = df[['RankA', 'RankB', 'RankDiff']]
y = df['WinnerBinary']

X_train, X_test, y_train, y_test = train_test_split(X, y)

regr = LinearRegression()

regr.fit(X_train, y_train)
print("R²:", regr.score(X_test, y_test))

R²: 0.0610913590870239


# LOGISTIC REGRESSION AMB NOMÉS PARTITS WIMBELDON
Hem vist que amb regressió linial no funciona gaire bé, aixi que ara provarem amb una regressió logistica.

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.7007874015748031
              precision    recall  f1-score   support

           0       0.70      0.65      0.68       365
           1       0.70      0.75      0.72       397

    accuracy                           0.70       762
   macro avg       0.70      0.70      0.70       762
weighted avg       0.70      0.70      0.70       762



# RANDOM FOREST AMB NOMÉS PARTITS WIMBELDON
Probem amb Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

print("RF Accuracy:", rf.score(X_test, y_test))

RF Accuracy: 0.6404199475065617


# RANDOM FOREST AMB TOTES LES DADES
Ara provarem amb totes les dades, pot ser que amb dades més generals, sigui capaç de predir millor.

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# 1. Cargar
df_all = pd.read_csv('all_data.csv')

# 2. Limpiar NR
df_all['RankA'] = df_all['RankA'].replace('NR', 2000)
df_all['RankB'] = df_all['RankB'].replace('NR', 2000)
df_all['RankA'] = pd.to_numeric(df_all['RankA'], errors='coerce')
df_all['RankB'] = pd.to_numeric(df_all['RankB'], errors='coerce')

# 3. RANKDIFF CORRECTO: B - A (positivo si A es mejor)
df_all['RankDiff'] = df_all['RankB'] - df_all['RankA']

# 4. Verificar (¡MUY IMPORTANTE!)
print("Ejemplo de RankDiff:")
print(df_all[['RankA', 'RankB', 'RankDiff', 'WinnerBinary']].head(10))

# 5. Features
X = pd.get_dummies(
    df_all[['RankA', 'RankB', 'RankDiff', 'Court', 'Surface']],
    columns=['Court', 'Surface'],
    drop_first=True
)

# 6. Target
y = df_all['WinnerBinary'].astype(int)

# 7. Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Modelo
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    class_weight='balanced',
    random_state=42
)
rf.fit(X_train, y_train)

# 9. Evaluar
print("\nRF Accuracy:", rf.score(X_test, y_test))

Ejemplo de RankDiff:
   RankA  RankB  RankDiff  WinnerBinary
0  149.0    5.0    -144.0             0
1   34.0   83.0      49.0             1
2   33.0   83.0      50.0             0
3   28.0    5.0     -23.0             0
4   27.0   34.0       7.0             0
5  149.0   20.0    -129.0             1
6   43.0   33.0     -10.0             0
7    5.0   46.0      41.0             1
8   28.0   17.0     -11.0             1
9   83.0   67.0     -16.0             1

RF Accuracy: 0.6511474448273333


## Mejoras - Columnas a añadir
- **AgeDiff** (AgeA - AgeB)
- **H2H** "Número de veces que A ganó a B" (H2H_A_wins, H2H_B_wins, H2H_Diff_wins)
- Poner los **AvgWinners** i **AvgLoser**
- Provar amb **LogisticRegression** i **RandomForest**
- Més endavant provar amb **XGBoosting**

# REGRESIÓ LOGISTICA AMB TOTES LES DADES

In [68]:
lr = LogisticRegression()
lr.fit(X_train_clean, y_train_clean)

y_pred = lr.predict(X_test_clean)
print("Accuracy: ", accuracy_score(y_test_clean, y_pred))
print(classification_report(y_test_clean, y_pred))

Accuracy:  0.6488217664688253
              precision    recall  f1-score   support

           0       0.65      0.62      0.64      8449
           1       0.64      0.67      0.66      8568

    accuracy                           0.65     17017
   macro avg       0.65      0.65      0.65     17017
weighted avg       0.65      0.65      0.65     17017

