In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the data
df = pd.read_csv('atp_matches_2023_2024.csv')

# Preprocess the data
le_surface = LabelEncoder()
le_player = LabelEncoder()

df['surface'] = le_surface.fit_transform(df['surface'])
players = pd.concat([df['player1'], df['player2']]).unique()
le_player.fit(players)
df['player1'] = le_player.transform(df['player1'])
df['player2'] = le_player.transform(df['player2'])
df['winner'] = (df['winner'] == df['player1']).astype(int)

# Create features and target
X = df[['surface', 'player1', 'player2']]
y = df['winner']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f"Train accuracy: {train_accuracy:.2f}")
print(f"Test accuracy: {test_accuracy:.2f}")

# Function to convert probability to decimal odds
def prob_to_odds(prob):
    # Ensure probability is not exactly 0 or 1
    prob = max(min(prob, 0.99), 0.01)
    return 1 / prob

# Function to predict match outcome
def predict_match(surface, player1, player2):
    try:
        # Encode input data
        surface_encoded = le_surface.transform([surface])[0]
    except ValueError:
        print(f"Unknown surface: {surface}")
        print(f"Known surfaces: {', '.join(le_surface.classes_)}")
        return None

    try:
        player1_encoded = le_player.transform([player1])[0]
    except ValueError:
        print(f"Unknown player: {player1}")
        return None

    try:
        player2_encoded = le_player.transform([player2])[0]
    except ValueError:
        print(f"Unknown player: {player2}")
        return None
    
    # Create a DataFrame for prediction to maintain consistency with training data
    X_pred = pd.DataFrame([[surface_encoded, player1_encoded, player2_encoded]], 
                          columns=['surface', 'player1', 'player2'])
    
    # Make prediction and get probabilities
    prediction = model.predict(X_pred)
    probabilities = model.predict_proba(X_pred)[0]
    
    # Debug information
    print(f"Raw prediction: {prediction}")
    print(f"Raw probabilities: {probabilities}")
    
    # Handle single probability output
    if len(probabilities) == 1:
        player1_prob = probabilities[0] if prediction[0] == 1 else 1 - probabilities[0]
        player2_prob = 1 - player1_prob
    else:
        player1_prob = probabilities[1]
        player2_prob = probabilities[0]
    
    # Calculate decimal odds
    player1_odds = prob_to_odds(player1_prob)
    player2_odds = prob_to_odds(player2_prob)
    
    winner = player1 if player1_prob > player2_prob else player2
    
    return {
        'winner': winner,
        'player1_prob': player1_prob,
        'player2_prob': player2_prob,
        'player1_odds': player1_odds,
        'player2_odds': player2_odds
    }

# Print known surfaces and a few sample players
print("Known surfaces:", ', '.join(le_surface.classes_))
print("Sample players:", ', '.join(le_player.classes_[:5]))  # Printing first 5 players as an example

# Example usage with Draper J. vs Paul T. on grass
result = predict_match("Grass", "Draper J.", "Paul T.")
if result:
    print(f"Predicted winner: {result['winner']}")
    print(f"Draper J. win probability: {result['player1_prob']:.4f}")
    print(f"Paul T. win probability: {result['player2_prob']:.4f}")
    print(f"Draper J. odds: {result['player1_odds']:.2f}")
    print(f"Paul T. odds: {result['player2_odds']:.2f}")
else:
    print("Prediction failed. One or more inputs may be unknown to the model.")

Train accuracy: 1.00
Test accuracy: 1.00
Known surfaces: Clay, Grass, Hard
Sample players: Ajdukovic D., Albot R., Alcaraz C., Altmaier D., Alves M.
Raw prediction: [0]
Raw probabilities: [1.]
Predicted winner: Paul T.
Draper J. win probability: 0.0000
Paul T. win probability: 1.0000
Draper J. odds: 100.00
Paul T. odds: 1.01
