## Notebook 5: Australian Open 2026 Simulation (Day 0)

## 1. Setup and Modeling

In [4]:
import pandas as pd
import numpy as np
import joblib
import re

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("--- Notebook 5: AO 2026 Simulation (Day 0) ---")

# 1. Load the Saved Model
model_filename = 'tennis_rf_model.pkl'
try:
    rf_model = joblib.load(model_filename)
    print(f"SUCCESS: Loaded model '{model_filename}'")
except FileNotFoundError:
    print(f"ERROR: Could not find '{model_filename}'. Please run Notebook 4 first.")

# 2. Load Historical Data (for player baselines)
try:
    df_final = pd.read_csv('master_data_final.csv')
    df_final['tourney_date'] = pd.to_datetime(df_final['tourney_date'])
    print("Historical data loaded successfully.")
except FileNotFoundError:
    print("ERROR: Could not find 'master_data_final.csv'.")

--- Notebook 5: AO 2026 Simulation (Day 0) ---
SUCCESS: Loaded model 'tennis_rf_model.pkl'


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Historical data loaded successfully.


## 2. Player Stats Engine
We use the MANUAL_UPDATES dictionary to ensure rising stars (who might have low stats in the historical data) are correctly rated for January 2026.

In [5]:
# 2026 "Live" Stats for Rising Stars & Top Seeds
# Derived from current trajectories for Jan 2026
MANUAL_UPDATES = {
    'Learner Tien': {'rank': 29, 'elo': 1950, 'age': 20},
    'Joao Fonseca': {'rank': 32, 'elo': 1960, 'age': 19},
    'Adam Walton': {'rank': 81, 'elo': 1600, 'age': 26},
    'Jacob Fearnley': {'rank': 77, 'elo': 1690, 'age': 24},
    'Michael Zheng': {'rank': 174, 'elo': 1700, 'age': 22},
    'Gabriel Diallo': {'rank': 90, 'elo': 1650, 'age': 24},
    'Rei Sakamoto': {'rank': 150, 'elo': 1550, 'age': 19},
    'Rafael Jodar': {'rank': 160, 'elo': 1500, 'age': 19},
    'Joel Schwaerzler': {'rank': 180, 'elo': 1500, 'age': 19},
    'Nicolai Budkov Kjaer': {'rank': 200, 'elo': 1480, 'age': 19}
}

def get_player_profile(name, df_history):
    """
    Returns features for a player. Prioritizes Manual Updates, then History.
    """
    profile = {}
    
    # 1. Manual Override
    if name in MANUAL_UPDATES:
        update = MANUAL_UPDATES[name]
        profile['rank'] = update.get('rank')
        profile['elo'] = update.get('elo')
        profile['age'] = update.get('age')
        # Defaults
        profile['surface_elo'] = update.get('elo') 
        profile['ht'] = 185
        profile['rolling_ace_pct'] = 0.08
        profile['rolling_sv_win_pct'] = 0.65
        profile['rolling_bp_save_pct'] = 0.60
        profile['form_win_pct'] = 0.60
        profile['surface_win_pct'] = 0.60
        return profile

    # 2. History Lookup
    mask = (df_history['p1_name'] == name) | (df_history['p2_name'] == name)
    player_history = df_history[mask].sort_values('tourney_date', ascending=False)
    
    if not player_history.empty:
        last_match = player_history.iloc[0]
        prefix = 'p1' if last_match['p1_name'] == name else 'p2'
        
        profile['rank'] = last_match[f'{prefix}_rank']
        profile['elo'] = last_match[f'{prefix}_elo']
        profile['surface_elo'] = last_match[f'{prefix}_surface_elo']
        profile['age'] = last_match[f'{prefix}_age'] + 1
        profile['ht'] = last_match[f'{prefix}_ht']
        
        if prefix == 'p1':
            profile['rolling_ace_pct'] = last_match['rolling_p1_ace_pct']
            profile['rolling_sv_win_pct'] = last_match['rolling_p1_sv_win_pct']
            profile['rolling_bp_save_pct'] = last_match['rolling_p1_bp_save_pct']
            profile['form_win_pct'] = last_match['p1_form_win_pct']
            profile['surface_win_pct'] = last_match['p1_surface_win_pct']
        else:
            profile['rolling_ace_pct'] = 0.06
            profile['rolling_sv_win_pct'] = 0.64
            profile['rolling_bp_save_pct'] = 0.58
            profile['form_win_pct'] = 0.50
            profile['surface_win_pct'] = 0.50
            
        return profile

    # 3. Unknown Player Fallback
    return {
        'rank': 150, 'elo': 1450, 'surface_elo': 1450, 'age': 22, 'ht': 183,
        'rolling_ace_pct': 0.05, 'rolling_sv_win_pct': 0.60,
        'rolling_bp_save_pct': 0.55, 'form_win_pct': 0.50, 'surface_win_pct': 0.50
    }

## 3. Draw Parser

In [6]:

raw_draw_text = """
1. ALCARAZ, Carlos (ESP) [1]
2. WALTON, Adam (AUS)
3. HANFMANN, Yannick (GER)
4. SVAJDA, Zachary (USA) -Q
5. ZHENG, Michael (USA) -Q
6. KORDA, Sebastian (USA)
7. SCHOOLKATE, Tristan (AUS)
8. MOUTET, Corentin (FRA) [32]
9. PAUL, Tommy (USA) [19]
10. KOVACEVIC, Aleksandar (USA)
11. TIRANTE, Thiago Agustin (ARG)
12. VUKIC, Aleksandar (AUS)
13. BUDKOV KJAER, Nicolai (NOR) -Q
14. OPELKA, Reilly (USA)
15. MISOLIC, Filip (AUT)
16. DAVIDOVICH FOKINA, Alejandro (ESP) [14]
17. BUBLIK, Alexander (KAZ) [10]
18. BROOKSBY, Jenson (USA)
19. UGO CARABELLI, Camilo (ARG)
20. FUCSOVICS, Marton (HUN)
21. KECMANOVIC, Miomir (SRB)
22. ETCHEVERRY, Tomas Martin (ARG)
23. FERY, Arthur (GBR) -Q
24. COBOLLI, Flavio (ITA) [20]
25. TIAFOE, Frances (USA) [29]
26. KUBLER, Jason (AUS) -Q
27. KYPSON, Patrick (USA) -WC
28. COMESANA, Francisco (ARG)
29. NAVONE, Mariano (ARG)
30. MEDJEDOVIC, Hamad (SRB)
31. BERRETTINI, Matteo (ITA)
32. DE MINAUR, Alex (AUS) [6]
33. ZVEREV, Alexander (GER) [3]
34. DIALLO, Gabriel (CAN)
35. POPYRIN, Alexei (AUS)
36. MULLER, Alexandre (FRA)
37. NAVA, Emilio (USA)
38. JACQUET, Kyrian (FRA) -WC
39. BONZI, Benjamin (FRA)
40. NORRIE, Cameron (GBR) [26]
41. CERUNDOLO, Francisco (ARG) [18]
42. ZHANG, Zhizhen (CHN)
43. DRAXL, Liam (CAN) -Q
44. DZUMHUR, Damir (BIH)
45. CAZAUX, Arthur (FRA)
46. FARIA, Jaime (POR) -Q
47. ARNALDI, Matteo (ITA)
48. RUBLEV, Andrey [13]
49. MEDVEDEV, Daniil [11]
50. DE JONG, Jesper (NED)
51. HALYS, Quentin (FRA)
52. TABILO, Alejandro (CHI)
53. MAJCHRZAK, Kamil (POL)
54. FEARNLEY, Jacob (GBR)
55. MAROZSAN, Fabian (HUN)
56. RINDERKNECH, Arthur (FRA) [24]
57. TIEN, Learner (USA) [25]
58. GIRON, Marcos (USA)
59. YMER, Elias (SWE) -Q
60. SHEVCHENKO, Alexander (KAZ)
61. CERUNDOLO, Juan Manuel (ARG)
62. THOMPSON, Jordan (AUS)-WC
63. BORGES, Nuno (POR)
64. AUGER-ALIASSIME, Felix (CAN) [7]
65. MUSETTI, Lorenzo (ITA) [5]
66. COLLIGNON, Raphael (BEL)
67. SONEGO, Lorenzo (ITA)
68. TABERNER, Carlos (ESP)
69. DIMITROV, Grigor (BUL)
70. MACHAC, Tomas (CZE)
71. MOCHIZUKI, Shintaro (JPN)
72. TSITSIPAS, Stefanos (GRE) [31]
73. LEHECKA, Jiri (CZE) [17]
74. GEA, Arthur (FRA)-Q
75. DJERE, Laslo (SRB)
76. WAWRINKA, Stan (SUI) -WC
77. KOPRIVA, Vit (CZE)
78. STRUFF, Jan-Lennard (GER)
79. ROYER, Valentin (FRA)
80. FRITZ, Taylor (USA) [9]
81. MENSIK, Jakub (CZE) [16]
82. CARRENO BUSTA, Pablo (ESP)
83. SAKAMOTO, Rei (JPN) -Q
84. JODAR, Rafael (ESP)-Q
85. HURKACZ, Hubert (POL)
86. BERGS, Zizou (BEL)
87. QUINN, Ethan (USA)
88. GRIEKSPOOR, Tallon (NED) [23]
89. NAKASHIMA, Brandon (USA) [27]
90. VAN DE ZANDSCHULP, Botic (NED)
91. SHANG, Juncheng (CHN)
92. BAUTISTA AGUT, Roberto (ESP)
93. ATMANE, Terence (FRA)
94. MAESTRELLI, Francesco (ITA) -Q
95. MARTINEZ, Pedro (ESP)
96. DJOKOVIC, Novak (SRB) [4]
97. SHELTON, Ben (USA) [8]
98. HUMBERT, Ugo (FRA)
99. SWEENY, Dane (AUS) -Q
100. MONFILS, Gael (FRA)
101. MANNARINO, Adrian (FRA)
102. HIJIKATA, Rinky (AUS) -WC
103. DAMM, Martin (USA) -Q
104. VACHEROT, Valentin (MON) [30]
105. SHAPOVALOV, Denis (CAN) [21]
106. BU, Yunchaokete (CHN) -WC
107. ALTMAIER, Daniel (GER)
108. CILIC, Marin (CRO)
109. MUNAR, Jaume (ESP)
110. SVRCINA, Dalibor (CZE)
111. BELLUCCI, Mattia (ITA)
112. RUUD, Casper (NOR) [12]
113. KHACHANOV, Karen [15]
114. MICHELSEN, Alex (USA)
115. O'CONNELL, Christopher (AUS)-WC
116. BASAVAREDDY, Nishesh (USA) -Q
117. MPETSHI PERRICARD, Giovanni (FRA)
118. BAEZ, Sebastian (ARG)
119. GARIN, Cristian (CHI)
120. DARDERI, Luciano (ITA) [22]
121. FONSECA, Joao (BRA) [28]
122. SPIZZIRRI, Eliot (USA)
123. NARDI, Luca (ITA)
124. WU, Yibing (CHN) -Q
125. DUCKWORTH, James (AUS) -WC
126. PRIZMIC, Dino (CRO) -LL
127. GASTON, Hugo (FRA)
128. SINNER, Jannik (ITA) [2]
"""

def parse_draw(text):
    """
    Parses the raw text to extract 128 player names.
    Format: "1. ALCARAZ, Carlos (ESP) [1]" -> "Carlos Alcaraz"
    """
    lines = text.strip().split('\n')
    players = []
    
    # Regex to capture: Number. Surname, Firstname (Everything else)
    # Group 1 = Surname, Group 2 = Firstname
    pattern = r"^\d+\.\s+([^,]+),\s+([^(]+)"
    
    for line in lines:
        match = re.search(pattern, line)
        if match:
            surname = match.group(1).strip().title()
            firstname = match.group(2).strip()
            
            # Formatting Name: "Carlos Alcaraz"
            full_name = f"{firstname} {surname}"
            players.append(full_name)
            
    return players

def predict_winner(p1_name, p2_name):
    """Runs the RF Model on two players."""
    # 1. Get Stats
    p1 = get_player_profile(p1_name, df_final)
    p2 = get_player_profile(p2_name, df_final)
    
    # 2. Build Feature Vector (Must match Model Training Order)
    features = pd.DataFrame([{
        'surface_code': 0, # Hard
        'draw_size': 128,
        'tourney_level_code': 0, # Grand Slam
        'p1_rank': p1['rank'], 'p2_rank': p2['rank'],
        'p1_elo': p1['elo'], 'p2_elo': p2['elo'],
        'p1_surface_elo': p1['surface_elo'], 'p2_surface_elo': p2['surface_elo'],
        'p1_age': p1['age'], 'p2_age': p2['age'],
        'p1_ht': p1['ht'], 'p2_ht': p2['ht'],
        'p1_h2h_wins': 0, 'p1_h2h_losses': 0,
        
        'rolling_p1_ace_pct': p1['rolling_ace_pct'],
        'rolling_p1_sv_win_pct': p1['rolling_sv_win_pct'],
        'rolling_p1_bp_save_pct': p1['rolling_bp_save_pct'],
        'p1_surface_win_pct': p1['surface_win_pct'],
        'p1_form_win_pct': p1['form_win_pct'],
        
        # Fillers for non-critical features
        'rolling_p1_df_pct': 0.04, 
        'rolling_p1_1st_in_pct': 0.62,
        'rolling_p1_1st_win_pct': 0.73, 
        'rolling_p1_2nd_win_pct': 0.50,
        'rolling_p1_bp_convert_pct': 0.40
    }])
    
    # 3. Predict
    prob = rf_model.predict_proba(features)[0][1]
    
    if prob >= 0.50:
        return p1_name, prob
    else:
        return p2_name, 1-prob

print("Draw Parser & Prediction Engine Ready.")

Draw Parser & Prediction Engine Ready.


## 4. Execute Simulation

In [7]:
# 1. Parse Draw
players = parse_draw(raw_draw_text)
if len(players) != 128:
    print(f"WARNING: Expected 128 players, found {len(players)}. Check parser.")
else:
    print(f"Draw Loaded: {len(players)} players ready.")

# 2. Setup Round 1 Matches (1 vs 2, 3 vs 4...)
current_round = []
for i in range(0, len(players), 2):
    current_round.append((players[i], players[i+1]))

rounds = ["Round 1", "Round 2", "Round 3", "Round 4", "Quarterfinals", "Semifinals", "Final"]

# 3. Play Tournament
for r_name in rounds:
    print(f"\n=== {r_name} ===")
    next_round = []
    
    for p1, p2 in current_round:
        winner, prob = predict_winner(p1, p2)
        next_round.append(winner)
        
        # Print significant matches
        # Show all matches for QF, SF, F. Show fewer for early rounds.
        if len(current_round) <= 8 or prob < 0.55 or p1 == "Carlos Alcaraz" or p2 == "Novak Djokovic" or p2 == "Jannik Sinner":
             print(f"  {winner} def. {p1 if winner!=p1 else p2} ({prob:.1%})")
            
    # Create pairings for next round
    new_matches = []
    for i in range(0, len(next_round), 2):
        new_matches.append((next_round[i], next_round[i+1]))
        
    current_round = new_matches
    
    if len(next_round) == 1:
        print(f"\nAustralian Open 2026 Champion: {next_round[0]}")

Draw Loaded: 128 players ready.

=== Round 1 ===


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
