In [1]:
import pandas as pd 
import numpy as np


In [2]:
df = pd.read_csv('wimbeldon.csv')
df = df[['PlayerA', 'PlayerB', 'RankA', 'RankB', 'WinnerBinary']]
df.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary
0,Alcaraz C.,Djokovic N.,3,2,1
1,Djokovic N.,Musetti L.,2,25,1
2,Alcaraz C.,Medvedev D.,3,5,1
3,Fritz T.,Musetti L.,12,25,0
4,De Minaur A.,Djokovic N.,9,2,0


## Afegir noves columnes
#### RankDiff
Aquest camp serà la diferencia de rang que té el jugador A contra el B. Si el valor és positiu vol dir que el jugador A és X posicions millor que el B.


**RankDiff = RankB-RankB**

In [3]:
df['RankDiff'] = df['RankB'] - df['RankA']
df.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,RankDiff
0,Alcaraz C.,Djokovic N.,3,2,1,-1
1,Djokovic N.,Musetti L.,2,25,1,23
2,Alcaraz C.,Medvedev D.,3,5,1,2
3,Fritz T.,Musetti L.,12,25,0,13
4,De Minaur A.,Djokovic N.,9,2,0,-7


Fem una petita prova, per veure que tal va el model de regressio amb aquests camps.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

lr = LogisticRegression()

X = df[['RankA', 'RankB', 'RankDiff']]
y = df['WinnerBinary']

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy:  0.6968503937007874
              precision    recall  f1-score   support

           0       0.68      0.67      0.68       363
           1       0.71      0.72      0.71       399

    accuracy                           0.70       762
   macro avg       0.70      0.70      0.70       762
weighted avg       0.70      0.70      0.70       762



Provem ara a ficar el nom a veure si fa alguna cosa.

In [5]:
lr_aux = LogisticRegression()

X = pd.get_dummies(
    df[['RankA', 'RankB', 'RankDiff', 'PlayerA', 'PlayerB']],
    columns=['PlayerA', 'PlayerB'],
    drop_first=True
)

y = df['WinnerBinary']

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr_aux.fit(X_train, y_train)

y_pred = lr_aux.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy:  0.652230971128609
              precision    recall  f1-score   support

           0       0.64      0.65      0.64       368
           1       0.67      0.65      0.66       394

    accuracy                           0.65       762
   macro avg       0.65      0.65      0.65       762
weighted avg       0.65      0.65      0.65       762



#### PlayerA Odds i PlayerB Odds
Aquests camps són la mitja de les apostes sobre cada jugados abans de jugar el partit.

In [6]:
df_bets = pd.read_csv('wimbeldon_bets.csv')

# Afegim el RankDiff
df_bets['RankDiff'] = df_bets['RankB'] - df_bets['RankA']

df_bets.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,MaxOddsPlayerA,MaxOddsPlayerB,AvgOddsPlayerA,AvgOddsPlayerB,RankDiff
0,Alcaraz C.,Djokovic N.,3,2,1,1.82,2.2,1.76,2.09,-1
1,Djokovic N.,Musetti L.,2,25,1,1.15,6.75,1.12,6.3,23
2,Alcaraz C.,Medvedev D.,3,5,1,1.31,3.9,1.29,3.62,2
3,Fritz T.,Musetti L.,12,25,0,1.25,4.7,1.22,4.36,13
4,De Minaur A.,Djokovic N.,9,2,0,4.0,1.34,3.76,1.28,-7


Comencem a fer la regressió logistica

In [7]:
# Creem una nova regressió logistica
lr_bets = LogisticRegression()

# Eliminem nans
df_bets = df_bets.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary'])

# Agafem els camps que necesitem per fer la predicció
X = df_bets[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB']]
y = df_bets['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_bets.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_bets.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))

Accuracy:  0.7820224719101123
              precision    recall  f1-score   support

           0       0.78      0.80      0.79       232
           1       0.78      0.76      0.77       213

    accuracy                           0.78       445
   macro avg       0.78      0.78      0.78       445
weighted avg       0.78      0.78      0.78       445



Vale, ara tenim un **>75%**, està molt bé, anem a afegir un parell més de dades!

In [8]:
df_bets['OddsDiff'] = df_bets['AvgOddsPlayerB'] - df_bets['AvgOddsPlayerA']
df_bets['OddsRatio'] = df_bets['AvgOddsPlayerA'] / df_bets['AvgOddsPlayerB']

Hem afegit la diferencia i el ratio sobre les apostes d'abans del partit, millorarà el model?

In [9]:
# Creem una nova regressió logistica
lr_bets_2 = LogisticRegression()

# Eliminem nans
df_bets = df_bets.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary', 'OddsDiff', 'OddsRatio'])

# Agafem els camps que necesitem per fer la predicció
X = df_bets[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio']]
y = df_bets['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_bets_2.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_bets_2.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy_2 = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy_2)
print(classification_report(y_test, y_pred))

# Comparem la millora amb l'anterior
print("Millora respecte anterior: ", ((accuracy_2-accuracy)/accuracy)*100, "%")

Accuracy:  0.7573033707865169
              precision    recall  f1-score   support

           0       0.78      0.73      0.75       227
           1       0.74      0.79      0.76       218

    accuracy                           0.76       445
   macro avg       0.76      0.76      0.76       445
weighted avg       0.76      0.76      0.76       445

Millora respecte anterior:  -3.1609195402298758 %


Com podem veure, no hi ha molta millora, seguim provant.

#### H2H
En aquest cas, farem 3 noves columnes:
- **H2H_A_Wins**: Cuants cops el jugador A ha guanyat al jugador B
- **H2H_B_Wins**: Cuants cops el jugador B ha guanyat al jugador A
- **H2H_Diff**: Si A ha guamyat més a B, número positu, en cas contrari número negatiu. En cas d'empat, 0.

In [15]:
df_h2h = pd.read_csv('wimbeldon_bets_h2h.csv')

# Afegim el RankDiff
df_h2h['RankDiff'] = df_h2h['RankB'] - df_h2h['RankA']

# Afegim la diferencia de odds
df_h2h['OddsDiff'] = df_h2h['AvgOddsPlayerB'] - df_h2h['AvgOddsPlayerA']
df_h2h['OddsRatio'] = df_h2h['AvgOddsPlayerA'] / df_h2h['AvgOddsPlayerB']

df_h2h.head()

# Descarreguem aquesta taula a un .csv
df_h2h.to_csv('final.csv', index=False)

Ara que ya tenim tot preparat, provem a veure que ens surt!

In [19]:
# Creem una nova regressió logistica
lr_h2h = LogisticRegression()

# Eliminem nans
df_h2h = df_h2h.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff'])

# Agafem els camps que necesitem per fer la predicció
X = df_h2h[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff']]
y = df_h2h['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_h2h.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_h2h.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))


Accuracy:  0.8674157303370786
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       224
           1       0.89      0.84      0.86       221

    accuracy                           0.87       445
   macro avg       0.87      0.87      0.87       445
weighted avg       0.87      0.87      0.87       445



PERFECTE, tenim un 85%!

Probem ara amb el record de tots els partits de cada jugador en pistes tipus "Grass"

In [20]:
df_record = pd.read_csv('final_with_win_rate.csv')

# Afegim el RankDiff
df_record['RankDiff'] = df_record['RankB'] - df_record['RankA']

# Afegim la diferencia de odds
df_record['OddsDiff'] = df_record['AvgOddsPlayerB'] - df_record['AvgOddsPlayerA']
df_record['OddsRatio'] = df_record['AvgOddsPlayerA'] / df_record['AvgOddsPlayerB']

df_record.head()

Unnamed: 0,PlayerA,PlayerB,RankA,RankB,WinnerBinary,MaxOddsPlayerA,MaxOddsPlayerB,AvgOddsPlayerA,AvgOddsPlayerB,H2H_A_wins,...,H2H_Diff,WinsA,LossesA,WinsB,LossesB,WinRateA,WinRateB,RankDiff,OddsDiff,OddsRatio
0,Alcaraz C.,Djokovic N.,3,2,1,1.82,2.2,1.76,2.09,3,...,-1,29,3,117,18,0.90625,0.866667,-1,0.33,0.842105
1,Djokovic N.,Musetti L.,2,25,1,1.15,6.75,1.12,6.3,7,...,6,117,18,18,10,0.866667,0.642857,23,5.18,0.177778
2,Alcaraz C.,Medvedev D.,3,5,1,1.31,3.9,1.29,3.62,6,...,4,29,3,51,24,0.90625,0.68,2,2.33,0.356354
3,Fritz T.,Musetti L.,12,25,0,1.25,4.7,1.22,4.36,1,...,-1,41,24,18,10,0.630769,0.642857,13,3.14,0.279817
4,De Minaur A.,Djokovic N.,9,2,0,4.0,1.34,3.76,1.28,0,...,-3,33,19,117,18,0.634615,0.866667,-7,-2.48,2.9375


In [21]:
# Creem una nova regressió logistica
lr_win_rate = LogisticRegression()

# Eliminem nans
df_record = df_record.dropna(subset=['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'WinnerBinary', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'WinsA', 'LossesA', 'WinsB', 'LossesB', 'WinRateA', 'WinRateB'])

# Agafem els camps que necesitem per fer la predicció
X = df_record[['RankA', 'RankB', 'RankDiff', 'MaxOddsPlayerA', 'MaxOddsPlayerB', 'AvgOddsPlayerA', 'AvgOddsPlayerB', 'OddsDiff', 'OddsRatio', 'H2H_A_wins', 'H2H_B_wins', 'H2H_Diff', 'WinsA', 'LossesA', 'WinsB', 'LossesB', 'WinRateA', 'WinRateB']]
y = df_record['WinnerBinary']

# Dividim en test i train
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Escalem les X
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fem fit al model
lr_win_rate.fit(X_train, y_train)

# Extreiem la y de la predicció
y_pred = lr_win_rate.predict(X_test)

# Comparem la y de la predicció amb la test
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))


Accuracy:  0.8292134831460675
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       222
           1       0.84      0.82      0.83       223

    accuracy                           0.83       445
   macro avg       0.83      0.83      0.83       445
weighted avg       0.83      0.83      0.83       445

