In [50]:
import pandas as pd
import numpy as np

In [51]:
#number of unique players
n_players = 20
print(f"generating a player roster with {n_players} unique players")

generating a player roster with 20 unique players


In [52]:
#assign win rate based on skill (higher skill -> higher win rate)
#added noise to make it more realistic
base_win_rates = (1 - (np.random.randint(1, 200, n_players) / 200)) * 0.6 + 0.2 + np.random.uniform(-0.05, 0.05, n_players)

In [53]:
#assemble the player roster DF and sort by skill
players_df = pd.DataFrame({
    'player_id': [f'P{1001+i}' for i in range(n_players)],
    'base_skill': np.random.randint(1, 200, n_players),
    'win_rate': np.clip(base_win_rates, 0.1, 0.9).round(2)
}).sort_values('base_skill').reset_index(drop=True)

In [54]:
print("/n--- Full Player Roster ---")
print(players_df)

/n--- Full Player Roster ---
   player_id  base_skill  win_rate
0      P1013          19      0.22
1      P1006          38      0.58
2      P1005          58      0.48
3      P1014          59      0.74
4      P1007          67      0.49
5      P1017          80      0.34
6      P1011          89      0.70
7      P1002          90      0.73
8      P1019         106      0.36
9      P1020         110      0.61
10     P1010         116      0.66
11     P1009         123      0.19
12     P1003         128      0.67
13     P1015         131      0.43
14     P1008         139      0.38
15     P1001         160      0.74
16     P1018         172      0.78
17     P1016         175      0.40
18     P1004         184      0.30
19     P1012         185      0.23


# Simulate Matches


In [55]:
#define the number of matches to simulate
n_matches = 2000

In [56]:
# Randomly select all Player 1s and Player 2s at once
p1_indices = np.random.randint(0, n_players, n_matches)
p2_indices = np.random.randint(0, n_players, n_matches)

In [57]:
#ensure player 1 and player 2 are never the same in a match
mask = p1_indices == p2_indices
while np.any(mask):
  p2_indices[mask] = np.random.randint(0, n_players, np.sum(mask))
  mask = p1_indices == p2_indices

In [58]:
#create match DF by merging player data
matches_df = players_df.iloc[p1_indices].reset_index(drop=True).add_prefix('p1_')
p2_data = players_df.iloc[p2_indices].reset_index(drop=True).add_prefix('p2_')
matches_df = pd.concat([matches_df, p2_data], axis=1)

# Calculate Match Details and Winner

In [59]:
#calculate match ranks with daily variation (vectorized)
matches_df['p1_rank'] = (matches_df['p1_base_skill'] + np.random.randint(-10, 10)).clip(lower=1)
matches_df['p2_rank'] = (matches_df['p2_base_skill'] + np.random.randint(-10, 10)).clip(lower=1)

In [60]:
#calculate win probability for P1 (vectorized)
prob_p1_wins = 0.5 + \
              (matches_df['p2_rank'] - matches_df['p1_rank']) * 0.003 + \
              (matches_df['p1_win_rate'] - matches_df['p2_win_rate']) * 0.4
prob_p1_wins = prob_p1_wins.clip(0, 1)

In [61]:
#determine the winner (vectorized)
matches_df['winner'] = np.where(np.random.rand(n_matches) < prob_p1_wins, 'Player 1', 'Player 2')

# **Add Enviromental Features**

In [62]:
matches_df['court_type'] = np.random.choice(['Indoor', 'Outdoor'], n_matches, p=[0.4, 0.6])
matches_df['weather'] = np.where(matches_df['court_type'] == 'Indoor', 'N/A', np.random.choice(['Sunny', 'Cloudy', 'Windy'], n_matches))

rank_diff = (matches_df['p1_rank'] - matches_df['p2_rank']).abs()
matches_df['match_duration_minutes'] = (60 - (rank_diff / 5) + np.random.randint(-10, 15, n_matches)).clip(25, 120).astype(int)

# **Finalize and Display**

In [67]:
#define and reorder final columns
cols_to_keep = [
    'p1_rank', 'p2_rank', 'p1_win_rate', 'p2_win_rate',
    'court_type', 'weather', 'match_duration_minutes', 'winner',
    'p1_player_id', 'p2_player_id'
]
final_df = matches_df[cols_to_keep]

#renamed the columns
final_df = final_df.rename(columns={
    'p1_player_id': 'p1_id',
    'p2_player_id': 'p2_id'
})

print(final_df.head())

   p1_rank  p2_rank  p1_win_rate  p2_win_rate court_type weather  \
0      194      133         0.23         0.43     Indoor     N/A   
1       99      112         0.73         0.61    Outdoor   Windy   
2      119      174         0.61         0.78     Indoor     N/A   
3      132      177         0.19         0.40    Outdoor   Windy   
4       67       92         0.48         0.73    Outdoor   Sunny   

   match_duration_minutes    winner  p1_id  p2_id  
0                      45  Player 2  P1012  P1015  
1                      52  Player 1  P1002  P1020  
2                      63  Player 1  P1020  P1018  
3                      54  Player 2  P1009  P1016  
4                      53  Player 2  P1005  P1002  


In [68]:
#saving dataset
final_df.to_csv('pickleball_matches.csv', index=False)
