In [675]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

# Choose where the data files live: the '/workspace/data' directory when it
# exists, otherwise the '../data/' folder relative to this notebook.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [676]:
def get_table_names(conn):
    """Return the names of all tables in the SQLite database behind ``conn``.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        list[str]: The ``name`` column of ``sqlite_master`` for rows of type 'table'.
    """
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()


def _quote_identifier(name):
    """Quote ``name`` as an SQLite identifier, doubling any embedded double quotes."""
    return '"' + name.replace('"', '""') + '"'


def load_tables_to_dfs(conn):
    """Load every table of the database into a pandas DataFrame.

    A column is JSON-decoded only when *every* value in it is a string and
    *every* one of those strings parses with ``json.loads``; otherwise the
    column is left exactly as read. Note that ``json.loads`` also accepts
    scalars, so a column made up solely of strings such as ``"12"`` or
    ``"true"`` is converted as well, not just lists and dictionaries.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        dict[str, pd.DataFrame]: One entry per table, keyed by the table name
        with ``_df`` appended.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the table name so reserved words (e.g. "order") or names
        # containing spaces still produce valid SQL.
        df = pd.read_sql(f"SELECT * FROM {_quote_identifier(table)}", conn)

        for col in df.columns:
            # Only all-string columns are candidates for JSON decoding.
            if not df[col].apply(lambda x: isinstance(x, str)).all():
                continue
            try:
                # The assignment happens only if the whole column parses, so a
                # column is never left half-converted.
                df[col] = df[col].apply(json.loads)
            except (json.JSONDecodeError, TypeError):
                # Not a JSON column; keep the original strings.
                pass

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Location of the cached dictionary of DataFrames.
dfs_pickle_path = data_path + 'dfs_dict.pkl'

if os.path.exists(dfs_pickle_path):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle.
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles
    # that were produced by this project.
    with open(dfs_pickle_path, 'rb') as f:
        dfs = pickle.load(f)

# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = data_path + "melee_player_database.db"

    # sqlite3.connect() would silently create an empty database for a missing
    # file, so fail early with a clear message instead.
    if not os.path.exists(path):
        raise FileNotFoundError(f"SQLite database not found: {path}")

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Close the connection even if loading fails
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # 'start' and 'end' are read as numbers and interpreted here as seconds since the Unix epoch.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [677]:
# Missing 'best_of' values become 0 so the column can be stored as integers.
best_of_filled = dfs['sets_df']['best_of'].fillna(0)
dfs['sets_df']['best_of'] = best_of_filled.astype(int)

In [678]:
# # Save the dictionary of DataFrames as a pickle
# with open(data_path + 'dfs_dict.pkl', 'wb') as f:
#     pickle.dump(dfs, f)

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played as each character. (We verify this for Zain below.)

In [None]:
# Bind the 'players_df' entry of the loaded dictionary and preview its first rows.
players_df = dfs['players_df']
players_df.head()


In [None]:
# Bind the 'ranking_df' entry of the loaded dictionary and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head()

In [None]:
# Bind the 'ranking_seasons_df' entry of the loaded dictionary and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

In [None]:
sets_df = dfs['sets_df']

# Flag the sets whose 'game_data' entry is non-empty, then report their share.
# The '%' format spec already prints the percent sign, so the message no
# longer repeats the word "percent".
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)
print(f"{has_game_data.sum() / sets_df.shape[0]:0.01%} of sets have some game data")
sets_df.shape



In [None]:
# Bind the 'tournament_info_df' entry of the loaded dictionary and preview its first rows.
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head()


## Games played by character ##
Count the games played by each character and plot.

In [None]:
from collections import Counter

# Filter out rows where 'characters' is not a dictionary
valid_rows = players_df[players_df['characters'].apply(lambda x: isinstance(x, dict))]

# Accumulate every player's per-character counts into one Counter.
# Updating in place also works when no row qualifies; summing the Series of
# Counters would return 0 in that case and crash the following update().
total_character_counts = Counter()
for character_games in valid_rows['characters']:
    total_character_counts.update(character_games)

# Keep strictly positive totals only (Counter addition, used previously,
# drops zero and negative counts), then sort by number of games played.
total_character_counts = {char: count for char, count in total_character_counts.items() if count > 0}
sorted_characters = sorted(total_character_counts.items(), key=lambda x: x[1], reverse=True)


def _character_label(key):
    """Return the second '/'-separated segment of ``key``, or ``key`` itself when it has no '/'."""
    parts = key.split('/')
    return parts[1] if len(parts) > 1 else key


# NOTE(review): keys presumably look like '<game>/<character>' — confirm against the data.
sorted_characters = {_character_label(char): count for char, count in sorted_characters}

# Plot the data
plt.figure(figsize=(10,6))
plot = sns.barplot(sorted_characters, orient='h', alpha = .8, capsize=2)
plot.set_title('Games played by character')
plot.set_xlabel('Games played')
plt.show()

## Investigate data for Zain
We choose Zain because he is the best player appearing in the head of `players_df`.

In [None]:
# Row at position 2 of players_df, taken to be Zain's record.
# (A single row is a Series, despite the `_df` suffix.)
zain_df = players_df.iloc[2]
print('Zain has played in', len(zain_df['placings']), 'tournaments.')

# Tournament info restricted to the tournaments listed in Zain's placings.
zain_tournament_keys = [placing['key'] for placing in zain_df['placings']]
played_by_zain = tournament_info_df['key'].isin(zain_tournament_keys)
zain_tournament_info_df = tournament_info_df[played_by_zain]

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
zain_tournament_info_df.info()
zain_tournament_info_df.head()

Here we make a dataframe containing all the sets that zain has played in and add some columns so that we can identify him more easily.

In [None]:
# Zain's player ID, used throughout to tell his side of a set from the opponent's.
zain_id = zain_df['player_id']

# All sets in which Zain is either player; copied so columns can be added safely.
in_p1_slot = sets_df['p1_id'] == zain_id
in_p2_slot = sets_df['p2_id'] == zain_id
zain_sets_df = sets_df[in_p1_slot | in_p2_slot].copy()
print('Zain has played', zain_sets_df.shape[0], 'sets.')

# Masks and score columns of the filtered frame, reused for the derived columns.
zain_is_p1 = zain_sets_df['p1_id'] == zain_id
zain_is_p2 = zain_sets_df['p2_id'] == zain_id
p1_score = zain_sets_df['p1_score']
p2_score = zain_sets_df['p2_score']

# 'zain_win': Zain's slot has the strictly higher score.
zain_sets_df['zain_win'] = (zain_is_p1 & (p1_score > p2_score)) | (zain_is_p2 & (p2_score > p1_score))

# 'opponent': the ID in the other slot (p1 when Zain is p2, otherwise p2).
zain_sets_df['opponent'] = zain_sets_df['p1_id'].where(zain_is_p2, zain_sets_df['p2_id'])

# 'zain_score' / 'opponent_score': the scores re-oriented around Zain.
zain_sets_df['zain_score'] = p1_score.where(zain_is_p1, p2_score)
zain_sets_df['opponent_score'] = p2_score.where(zain_is_p1, p1_score)

# Keep only the sets whose 'game_data' is not an empty list
has_game_data = zain_sets_df['game_data'].apply(lambda x: len(x) > 0)
game_data_zain_sets_df = zain_sets_df[has_game_data]

# Display the result
print('Zain has played', game_data_zain_sets_df.shape[0], 'sets with game_data.')
print(game_data_zain_sets_df.iloc[1]['game_data'])


Here we check to see that zain_df['characters'] counts the number of games that zain has played that character.

In [None]:
# Characters used in each game of one sample set (position 1 among Zain's sets with game data).
sample_games = game_data_zain_sets_df.iloc[1]['game_data']
loser_char = [game['loser_char'] for game in sample_games]
winner_char = [game['winner_char'] for game in sample_games]
print(loser_char)
print(winner_char)

In [None]:
# IDs are compared as strings so they match the values stored in game_data.
zain_id = str(zain_id)

# Tally the character Zain used in every game of every set with game data.
zain_characters_count = {}
for games_in_set in game_data_zain_sets_df['game_data']:
    for game in games_in_set:
        # Work out which side of the game Zain was on; skip games without him.
        if str(game['winner_id']) == zain_id:
            side = 'winner_char'
        elif str(game['loser_id']) == zain_id:
            side = 'loser_char'
        else:
            continue
        character = game[side]
        zain_characters_count[character] = zain_characters_count.get(character, 0) + 1

# The per-character counts recorded on Zain's player row.
zain_characters_actual = zain_df['characters']

# Only characters present in both dictionaries are compared.
common_keys = set(zain_characters_count) & set(zain_characters_actual)

for character in common_keys:
    counted = zain_characters_count[character]
    recorded = zain_characters_actual[character]
    if counted == recorded:
        print(f"{character}: Match - {counted} games")
    else:
        print(f"{character}: Mismatch - counted {counted} games, actual {recorded} games")

## Overall Glicko-2 Exploration ##


Here we see what happens if we only have one update to a player's glicko rating.  It can blow up. We simulate what happens when a player's first update contains 3 wins against players with glicko-2 rating i and  rd value of 100.

In [None]:
from glicko2 import Player

# Rating deviation shared by the three simulated opponents.
OPPONENT_RD = 100

for opponent_rating in range(0, 3200, 200):
    # A fresh, never-updated player for every scenario (the extra instance
    # that used to be created before the loop was never used).
    player = Player()
    player.update_player(
        [opponent_rating] * 3,  # the three opponents' ratings
        [OPPONENT_RD] * 3,      # the three opponents' rating deviations
        [1, 1, 1],              # outcomes: three wins
    )
    print(f'{opponent_rating} : {int(player.getRating())}')

## Highest Glicko every 6 months ##
Here we investigate who has the highest Glicko-2 rating every 6 months of the dataset.

In [691]:
glicko2_df = pd.read_pickle(data_path + 'overall_players_ranking.pkl')
melee_release_date = datetime.datetime(2001, 11, 21)

# Prepend a starting entry to each player's history arrays: a rating of 1500
# and an RD of 350, dated at melee_release_date.
initial_entries = {
    'dates': melee_release_date,
    'rating_history': 1500,
    'rd_history': 350,
}
for history_column, initial_value in initial_entries.items():
    glicko2_df[history_column] = glicko2_df[history_column].apply(
        # Bind the value as a default so each lambda keeps its own entry.
        lambda history, first=initial_value: np.insert(history, 0, first)
    )

In [None]:
# Show a single row to inspect the structure of the history columns.
glicko2_df.head(1)

If we just look for the highest rating of the dataset, we get some nonsense (players we don't expect to see appear in the list).

In [693]:
# def get_index_of_date(dates, target_date):
#     return np.searchsorted(dates, target_date) - 1

# # filtered_glicko = glicko2_df.copy()


# for year in range(2016, 2025):
#     for month in [1, 6]:
#         filtered_glicko = glicko2_df.copy()
        
#         target_date = datetime.datetime(year, month, 1)

#         indices = filtered_glicko['dates'].apply(lambda x: get_index_of_date(x, target_date))
        
#         # Filter out the players that have not entered a tournament yet.
#         # filtered_glicko = filtered_glicko[indices > 0]
        
#         # Extract ratings as a Series, ensuring correct data type
#         ratings_on_date = filtered_glicko.apply(
#             lambda row: row['rating_history'][indices[row.name]], axis=1
#         )

#         rd_on_date = filtered_glicko.apply(
#             lambda row: row['rd_history'][indices[row.name]], axis=1
#         )

#         # Ensure ratings_on_date is a Series and sort it
#         top_5 = ratings_on_date.sort_values(ascending=False)[:10]

#         # Retrieve the player tags along with their ratings
#         top_5_df = players_df[players_df['player_id'].isin(top_5.index)]
#         top_5_df = top_5_df.set_index('player_id').loc[top_5.index]
#         top_5_df['rating'] = top_5.values.astype(int)

#         # Display the top 5 players sorted by rating
#         print(f"Date: {target_date.strftime('%Y-%m-%d')}")
#         print(top_5_df[['tag', 'rating']].to_string(index=False))
#         print()


Knowing that the rating of a player with only one update can be very high (see above), we filter out players with fewer than 10 entries in their rating history (the code below keeps players whose `dates` array has more than 9 entries).

In [694]:
# def get_index_of_date(dates, target_date):
#     return np.searchsorted(dates, target_date) - 1


# for year in range(2016, 2025):
#     for month in [1, 6]:
#         # First filter out the players who have only a few updates (played in 10 tournaments)
#         filtered_glicko = glicko2_df[glicko2_df['dates'].apply(len) > 9]
        
#         target_date = datetime.datetime(year, month, 1)

#         indices = filtered_glicko['dates'].apply(lambda x: get_index_of_date(x, target_date))
        
#         # Filter out the players that have not entered a tournament yet.
#         # filtered_glicko = filtered_glicko[indices > 0]

#         # Extract ratings as a Series, ensuring correct data type
#         ratings_on_date = filtered_glicko.apply(
#             lambda row: row['rating_history'][indices[row.name]], axis=1
#         )

#         rd_on_date = filtered_glicko.apply(
#             lambda row: row['rd_history'][indices[row.name]], axis=1
#         )

#         # Ensure ratings_on_date is a Series and sort it
#         top_5 = ratings_on_date.sort_values(ascending=False)[:20]

#         # Retrieve the player tags along with their ratings
#         top_5_df = players_df[players_df['player_id'].isin(top_5.index)]
#         top_5_df = top_5_df.set_index('player_id').loc[top_5.index]
#         top_5_df['rating'] = top_5.values.astype(int)

#         # Display the top 5 players sorted by rating
#         print(f"Date: {target_date.strftime('%Y-%m-%d')}")
#         print(top_5_df[['tag', 'rating']].to_string(index=False))
#         print()

## Baseline Glicko-2 Prediction ##
We see how often the player with the higher Glicko-2 rating wins. The baseline of 72.3% does not initially seem too bad.

In [695]:
# random_set_sample = sets_df 

# # Merge with 'tournament_info_df' to get 'start' date
# random_set_sample = random_set_sample.merge(
#     tournament_info_df[['key', 'start']],
#     left_on='tournament_key',
#     right_on='key',
#     how='left'
# )

# # Ensure date columns are datetime
# random_set_sample['start'] = pd.to_datetime(random_set_sample['start'])

# # Flatten 'glicko2_df'
# glicko2_long = glicko2_df[['dates', 'rating_history']].reset_index()

# glicko2_long = glicko2_long.explode(['dates', 'rating_history'])

# glicko2_long.rename(columns={'dates': 'date', 'rating_history': 'rating'}, inplace=True)
# glicko2_long['date'] = pd.to_datetime(glicko2_long['date'])

# # For Player 1
# p1_data = random_set_sample[['p1_id', 'start']].rename(columns={'p1_id': 'player_id'})
# p1_data['key'] = p1_data.index

# p1_ratings = pd.merge_asof(
#     p1_data.sort_values('start'),
#     glicko2_long.sort_values('date'),
#     by='player_id',
#     left_on='start',
#     right_on='date',
#     direction='backward'
# ).set_index('key')

# # For Player 2
# p2_data = random_set_sample[['p2_id', 'start']].rename(columns={'p2_id': 'player_id'})
# p2_data['key'] = p2_data.index

# p2_ratings = pd.merge_asof(
#     p2_data.sort_values('start'),
#     glicko2_long.sort_values('date'),
#     by='player_id',
#     left_on='start',
#     right_on='date',
#     direction='backward'
# ).set_index('key')

# # Combine ratings
# combined_ratings = pd.DataFrame({
#     'p1_rating': p1_ratings['rating'],
#     'p2_rating': p2_ratings['rating'],
#     'winner_id': random_set_sample['winner_id'],
#     'p1_id': random_set_sample['p1_id'],
#     'p2_id': random_set_sample['p2_id']
# })

# # Drop missing ratings
# combined_ratings.dropna(subset=['p1_rating', 'p2_rating'], inplace=True)

# # Update number of sets
# num_sets = combined_ratings.shape[0]

# # Determine if the higher-rated player won
# higher_p1_wins = (
#     (combined_ratings['p1_rating'] > combined_ratings['p2_rating']) &
#     (combined_ratings['winner_id'] == combined_ratings['p1_id'])
# )
# higher_p2_wins = (
#     (combined_ratings['p2_rating'] > combined_ratings['p1_rating']) &
#     (combined_ratings['winner_id'] == combined_ratings['p2_id'])
# )
# correct_predictions = (higher_p1_wins | higher_p2_wins).sum()

# print(f'The higher Glicko-2 rating wins {correct_predictions / num_sets:0.1%} of the time')
# print(f'There were {correct_predictions} correct predictions out of {num_sets} sets.')

I don't think that a single number tells the whole story. We make a bar chart that shows the accuracy of the prediction for different gaps in rating. We see that the predictive accuracy of the Glicko-2 rating is barely better than a guess when the difference in rating is less than 100.

In [None]:
# NOTE: this cell needs `combined_ratings`, which is built in the
# baseline-prediction cell above (its code is currently commented out).

# Absolute rating gap between the two players; used only for binning.
combined_ratings['rating_diff'] = abs(combined_ratings['p1_rating'] - combined_ratings['p2_rating'])

# Determine if the higher-rated player won.
# 'rating_diff' is an absolute value and can never be negative, so the
# direction of the gap must come from comparing the two ratings directly.
# Sets with equal ratings count as "not won by the higher-rated player".
p1_is_higher = combined_ratings['p1_rating'] > combined_ratings['p2_rating']
p2_is_higher = combined_ratings['p1_rating'] < combined_ratings['p2_rating']
combined_ratings['higher_rated_won'] = (
    (p1_is_higher & (combined_ratings['winner_id'] == combined_ratings['p1_id'])) |
    (p2_is_higher & (combined_ratings['winner_id'] == combined_ratings['p2_id']))
)

# Bin the rating differences with bin sizes of 100
bin_size = 100
min_rating_diff = combined_ratings['rating_diff'].min()
max_rating_diff = combined_ratings['rating_diff'].max()
bins = np.arange(
    np.floor(min_rating_diff / bin_size) * bin_size,
    np.ceil(max_rating_diff / bin_size) * bin_size + bin_size,
    bin_size, dtype=int
)

# Assign bins without specifying labels (default labels are intervals).
# Intervals are open on the left, so a gap equal to the lowest edge gets no bin.
combined_ratings['rating_diff_bin'] = pd.cut(combined_ratings['rating_diff'], bins)

# Group by bins and compute the percentage of higher-rated player wins
result = combined_ratings.groupby('rating_diff_bin', observed=False)['higher_rated_won'].agg(['mean', 'count'])
result['mean'] = result['mean'] * 100  # Convert to percentage

# Reset index to turn 'rating_diff_bin' into a column
result = result.reset_index()

# Convert interval labels to strings for x-axis labels
result['rating_diff_bin_str'] = result['rating_diff_bin'].astype(str)

# Plot the bar chart
plt.figure(figsize=(12, 6))
ax = result['mean'].plot(kind='bar', color='skyblue', edgecolor='black')

# Set x-axis labels to be the interval strings
ax.set_xticklabels(result['rating_diff_bin_str'])

plt.title('Percentage of Times the Higher Glicko-2 Rated Player Wins vs. Rating Difference')
plt.xlabel('Rating Difference Interval')
plt.ylabel('Percentage of Wins by Higher-Rated Player (%)')
plt.xticks(rotation=45, ha='right')

# Add percentage labels on top of each bar
for p, value in zip(ax.patches, result['mean']):
    # Empty bins have a NaN mean: there is no bar to label.
    if np.isnan(value):
        continue
    height = p.get_height()
    ax.annotate(f'{value:.1f}%',
                (p.get_x() + p.get_width() / 2, height),
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

To see how much of a problem this is, we count how many sets are played between players at each rating gap. As we see, most sets are played between players whose ratings are close. This will likely be a problem for us.

In [None]:
# NOTE: this cell needs `combined_ratings`, which is built in the
# baseline-prediction cell above (its code is currently commented out).

# Absolute rating gap between the two players of each set.
combined_ratings['rating_diff'] = (combined_ratings['p1_rating'] - combined_ratings['p2_rating']).abs()

# Bin edges every 100 points, spanning the observed gaps.
bin_size = 100
min_rating_diff = combined_ratings['rating_diff'].min()
max_rating_diff = combined_ratings['rating_diff'].max()
lowest_edge = np.floor(min_rating_diff / bin_size) * bin_size
upper_limit = np.ceil(max_rating_diff / bin_size) * bin_size + bin_size
bins = np.arange(lowest_edge, upper_limit, bin_size, dtype=int)

# Label every set with its interval (default interval labels).
combined_ratings['rating_diff_bin'] = pd.cut(combined_ratings['rating_diff'], bins)

# Number of sets falling into each interval.
result = (
    combined_ratings
    .groupby('rating_diff_bin', observed=False)
    .size()
    .reset_index(name='count')
)

# Interval labels as strings for the x-axis.
result['rating_diff_bin_str'] = result['rating_diff_bin'].astype(str)

# Bar chart of the counts.
plt.figure(figsize=(12, 6))
ax = result['count'].plot(kind='bar', color='skyblue', edgecolor='black')
ax.set_xticklabels(result['rating_diff_bin_str'])

plt.title('Number of Games Played vs. Rating Difference')
plt.xlabel('Rating Difference Interval')
plt.ylabel('Number of Games Played')
plt.xticks(rotation=45, ha='right')

# Write each count above its bar.
for bar, games in zip(ax.patches, result['count']):
    label_position = (bar.get_x() + bar.get_width() / 2, bar.get_height())
    ax.annotate(f'{games}', label_position, ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()


In [None]:
# NOTE: this cell needs `combined_ratings`, which is built in the
# baseline-prediction cell above (its code is currently commented out).
# numpy, pandas and matplotlib are already imported in the first cell.

# Compute rating difference
combined_ratings['rating_diff'] = abs(combined_ratings['p1_rating'] - combined_ratings['p2_rating'])

# Bin the rating differences with bin sizes of 100
bin_size = 100
min_rating_diff = combined_ratings['rating_diff'].min()
max_rating_diff = min(1200, combined_ratings['rating_diff'].max())  # Limit to 1200

bins = np.arange(
    np.floor(min_rating_diff / bin_size) * bin_size,
    np.ceil(max_rating_diff / bin_size) * bin_size + bin_size,
    bin_size, dtype=int
)

# Assign bins without labels
combined_ratings['rating_diff_bin'] = pd.cut(combined_ratings['rating_diff'], bins)

# Keep only sets with a gap of at most 1200; copy so a column can be added.
filtered_data = combined_ratings[combined_ratings['rating_diff'] <= 1200].copy()

# Determine if the higher-rated player won (equal ratings count as False).
filtered_data['higher_rated_won'] = (
    ((filtered_data['p1_rating'] > filtered_data['p2_rating']) & (filtered_data['winner_id'] == filtered_data['p1_id'])) |
    ((filtered_data['p1_rating'] < filtered_data['p2_rating']) & (filtered_data['winner_id'] == filtered_data['p2_id']))
)

# Group by bins and compute count and win stats
game_counts = filtered_data.groupby('rating_diff_bin', observed=False).size().reset_index(name='count')
win_stats = filtered_data.groupby('rating_diff_bin', observed=False)['higher_rated_won'].agg(['mean', 'count'])
win_stats['mean'] = win_stats['mean'] * 100  # Convert to percentage

# Merge both dataframes; the two 'count' columns become 'count_x' / 'count_y'.
result = pd.merge(game_counts, win_stats, on='rating_diff_bin')

# Convert interval labels to strings for x-axis
result['rating_diff_bin_str'] = result['rating_diff_bin'].astype(str)

# Plot the data
fig, ax1 = plt.subplots(figsize=(14, 7))

# X-axis positions for bars
bar_width = 0.35  # Width of each bar
x = np.arange(len(result))

# Plot the number of sets (left axis)
bars1 = ax1.bar(x - bar_width / 2, result['count_x'], bar_width,
                label='Number of Games', color='skyblue', edgecolor='black')
ax1.set_xlabel('Rating Difference Interval')
ax1.set_ylabel('Number of Sets Played', color='skyblue')
ax1.tick_params(axis='y', labelcolor='skyblue')

# Create a twin y-axis for the win percentage (right axis)
ax2 = ax1.twinx()
bars2 = ax2.bar(x + bar_width / 2, result['mean'], bar_width,
                label='Win Percentage (Higher Rated)', color='lightgreen', edgecolor='black')
ax2.set_ylabel('Win Percentage (%)', color='lightgreen')
ax2.tick_params(axis='y', labelcolor='lightgreen')

# Set x-axis labels and ticks
ax1.set_xticks(x)
ax1.set_xticklabels(result['rating_diff_bin_str'], rotation=45, ha='right')

# Add labels for win percentage bars
for bar, value in zip(bars2, result['mean']):
    # An empty bin has a NaN mean; round(NaN) raises ValueError, so skip it.
    if np.isnan(value):
        continue
    height = bar.get_height()
    ax2.annotate(f'{round(value)}%',
                 (bar.get_x() + bar.get_width() / 2, height),
                 ha='center', va='bottom', fontsize=9)

# Add a title (set explicitly on the primary axes) and ensure layout is tight
ax1.set_title('Comparison of Number of Sets Played and Win Percentage vs. Rating Difference')
fig.tight_layout()

plt.show()


Import weekly updated Glicko-2 rating.

In [None]:
# Load the weekly ratings table from its pickle, then show its shape and first rows.
# NOTE(review): get_ratings_and_updates below looks rows up by date and columns
# up by player id — presumably that is this frame's layout; confirm.
player_ratings_df = pd.read_pickle(data_path + 'overall_players_ranking_new_weekly.pkl')
print(player_ratings_df.shape)
player_ratings_df.head()

## Number of Glicko-2 updates
Running total of the number of updates to each player's Glicko-2 rating. We use numba's `njit` and `prange` to speed up the loops in the function. We save the results so that we only need to run the calculation once.

In [None]:
from numba import njit, prange

@njit(parallel=True)
def previous_updates(array):
    """Count, for each cell, the distinct values above it in its column, minus one.

    For entry (i, j) the result is ``len(np.unique(array[:i, j])) - 1``; row 0
    is left at 0 because nothing lies above it. This equals the number of times
    the column changed before row i only if a value never reappears after
    changing — NOTE(review): confirm that holds for the rating data.

    Args:
        array (np.ndarray): 2-D array whose columns are scanned.

    Returns:
        np.ndarray: int32 array shaped like ``array`` holding the counts.
    """
    # NOTE: this local has the same name as the function itself.
    previous_updates = np.zeros_like(array, dtype=np.int32)
    
    for i in range(1, array.shape[0]): # row i
        for j in prange(array.shape[1]): # col j
            # All rows strictly above row i in column j.
            col_above_i_j = array[:i,j]
            values  = np.unique(col_above_i_j)
            previous_updates[i,j] += int(values.shape[0] - 1)
            # print(f"row {i}, col {j}, {col_above_i_j}, num_updates {values.shape[0]-1}")

    return previous_updates

## Testing array
# array = np.array([
#     [1, 1, 1],
#     [1, 1, 2],
#     [1, 2, 3],
#     [1, 3, 4]])

# print(array)
# previous_updates(array)
# print(num_previous_updates(array))

# Do the calculation once.
# player_ratings_np = player_ratings_df.to_numpy()
# number_of_rating_updates_df = pd.DataFrame(columns=player_ratings_df.columns, index=player_ratings_df.index, data=previous_updates(player_ratings_np))
# number_of_rating_updates_df.head()

# # Save the results
# number_of_rating_updates_df.to_pickle(data_path + 'number_of_rating_updates_df.pkl')

## Load the results
# Read the table saved by the (commented-out) calculation above instead of recomputing it.
number_of_rating_updates_df = pd.read_pickle(data_path + 'number_of_rating_updates_df.pkl')
number_of_rating_updates_df.head()

## Add some columns to sets_df
We add the start of the tournament, the player ratings at the start of the tournament, and the number of times the player's rating has been updated before the start of the tournament.

In [701]:
# Attach each tournament's 'start' date to its sets. The left join keeps every
# row of sets_df and matches sets_df.tournament_key to tournament_info_df.key.
tournament_starts = tournament_info_df[['key', 'start']]
merged_df = pd.merge(
    sets_df,
    tournament_starts,
    how='left',
    left_on='tournament_key',
    right_on='key',
)


In [None]:
tqdm.pandas()
# Function to get both Player 1 and Player 2 ratings and the number of rating updates
def get_ratings_and_updates(row, player_ratings_df, number_of_rating_updates_df):
    """Look up both players' rating and update count as of a set's start.

    The lookup row is the largest index value of ``player_ratings_df`` that is
    ``<= row['start']``; the same index value is then used in
    ``number_of_rating_updates_df``.

    Args:
        row: Record providing 'start', 'p1_id' and 'p2_id'.
        player_ratings_df: Ratings table with one column per player id.
        number_of_rating_updates_df: Update counts with one column per player id.

    Returns:
        pd.Series: Entries 'p1_rating', 'p2_rating', 'p1_updates', 'p2_updates'.
        All four are None when no index value is <= row['start']; a single
        entry is None when that player id is not a column of the table.
    """
    labels = ['p1_rating', 'p2_rating', 'p1_updates', 'p2_updates']

    # Latest index value that is not after the set's start
    rating_dates = player_ratings_df.index
    closest_date = rating_dates[rating_dates <= row['start']].max()

    # Nothing on or before the start: there is nothing to look up
    if pd.isnull(closest_date):
        return pd.Series([None, None, None, None], index=labels)

    # Ratings first, then update counts; player 1 before player 2 in each table
    values = []
    for table in (player_ratings_df, number_of_rating_updates_df):
        for id_key in ('p1_id', 'p2_id'):
            player_id = row[id_key]
            if player_id in table.columns:
                values.append(table.loc[closest_date, player_id])
            else:
                values.append(None)

    return pd.Series(values, index=labels)

## Apply the function to each row in merged_df
# (Kept commented out: the result is saved under "Save" and re-read under "Load".)
# merged_df[['p1_rating', 'p2_rating', 'p1_updates', 'p2_updates']] = merged_df.progress_apply(
#     get_ratings_and_updates, axis=1, 
#     player_ratings_df=player_ratings_df, 
#     number_of_rating_updates_df=number_of_rating_updates_df
# )

## Save
# merged_df.to_pickle(data_path + 'augmented_sets_df.pkl')

## Load
# Reads the pickle that the commented-out "Save" step writes to data_path.
augmented_sets_df = pd.read_pickle(data_path + 'augmented_sets_df.pkl')
augmented_sets_df.head()

## Top 8 Locations
Here we determine, by inspection, which labels in the `location_names` column identify the sets that belong to a tournament's top 8.

In [703]:
# print(sets_df['location_names'].value_counts().to_string())

Get all top 8 sets.

In [None]:
# Label triples that the vast majority of top 8 games carry in "location_names",
# one triple per round.
top_8_locations = [
    ['WSF', 'Winners Semis', 'Winners Semi-Final'],
    ['WF', 'Winners Final', 'Winners Final'],
    ['LSF', 'Losers Semis', 'Losers Semi-Final'],
    ['LF', 'Losers Final', 'Losers Final'],
    ['GF', 'Grand Final', 'Grand Final'],
    ['GFR', 'GF Reset', 'Grand Final Reset'],
]

# Sets whose location_names value is one of the triples above
has_top_8_location = augmented_sets_df["location_names"].isin(top_8_locations)
top_8_sets_df = augmented_sets_df.loc[has_top_8_location]

# Tournaments that contain at least one such set
top_8_tournament_keys = top_8_sets_df['tournament_key'].unique()
print(f"There are {len(top_8_tournament_keys)} tournaments with double elimination finals.")
top_8_sets_df.head()

In [None]:
# All sets of the tournaments listed in top_8_tournament_keys.
# .copy() makes the result an independent frame: columns are added to it in
# later cells, and writing into a filtered slice of augmented_sets_df raises
# SettingWithCopyWarning.
in_top_8_tournament = augmented_sets_df['tournament_key'].isin(top_8_tournament_keys)
tournament_sets_with_top_8_df = augmented_sets_df[in_top_8_tournament].copy()

# The sets of those tournaments whose location_names is not a top-8 label triple
matches_top_8_location = tournament_sets_with_top_8_df['location_names'].isin(top_8_locations)
non_top_8_sets_df = tournament_sets_with_top_8_df[~matches_top_8_location].copy()
non_top_8_sets_df.head()

In [None]:
# Flatten the top_8_locations to a single list of all location name variations
top_8_flat_list = [location for sublist in top_8_locations for location in sublist]

# Add a 'top_8' column: True when any entry of 'location_names' is in top_8_flat_list.
# .assign builds a new frame instead of writing into the filtered slice of
# augmented_sets_df, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
tournament_sets_with_top_8_df = tournament_sets_with_top_8_df.assign(
    top_8=tournament_sets_with_top_8_df['location_names'].apply(
        lambda locations: any(location in top_8_flat_list for location in locations)
    )
)

tournament_sets_with_top_8_df.head()

In [707]:
# tournament_sets_with_top_8_df.groupby('top_8')['p1_rating'].describe()

In [None]:
# Summary statistics of 'p2_rating', split by the 'top_8' flag.
tournament_sets_with_top_8_df.groupby('top_8')['p2_rating'].describe()



In [None]:
# Summary statistics of 'p1_updates', split by the 'top_8' flag.
tournament_sets_with_top_8_df.groupby('top_8')['p1_updates'].describe()

In [None]:
# Summary statistics of 'p2_updates', split by the 'top_8' flag.
tournament_sets_with_top_8_df.groupby('top_8')['p2_updates'].describe()

In [None]:
# Absolute gap between the two players' ratings.
# .assign returns a new frame, so the column is not written into a filtered
# slice of another frame (which raises SettingWithCopyWarning).
tournament_sets_with_top_8_df = tournament_sets_with_top_8_df.assign(
    rating_difference=np.abs(
        tournament_sets_with_top_8_df['p1_rating'] - tournament_sets_with_top_8_df['p2_rating']
    )
)
tournament_sets_with_top_8_df.head()


In [None]:
# Violin plots of the rating gap, one panel per 'top_8' value; sets whose
# rating_difference is not positive are left out.
sets_with_rating_gap = tournament_sets_with_top_8_df[
    tournament_sets_with_top_8_df['rating_difference'] > 0
]
sns.catplot(data=sets_with_rating_gap, x='rating_difference', col='top_8', kind='violin')

In [None]:
# Keep only the sets with a positive rating_difference (rows where it is
# missing compare False and are dropped as well).
# The copy is taken AFTER filtering: boolean indexing returns a new object, so
# copying before the filter leaves the frame assigned to below a slice and the
# SettingWithCopyWarning is not avoided.
has_rating_gap = tournament_sets_with_top_8_df['rating_difference'] > 0
tournament_sets_with_top_8_df = tournament_sets_with_top_8_df[has_rating_gap].copy()

# 'higher_rated_won' is True when the winner is the player with the larger rating
p1_won = tournament_sets_with_top_8_df['winner_id'] == tournament_sets_with_top_8_df['p1_id']
p2_won = tournament_sets_with_top_8_df['winner_id'] == tournament_sets_with_top_8_df['p2_id']
p1_higher_rated = tournament_sets_with_top_8_df['p1_rating'] > tournament_sets_with_top_8_df['p2_rating']
p2_higher_rated = tournament_sets_with_top_8_df['p2_rating'] > tournament_sets_with_top_8_df['p1_rating']

tournament_sets_with_top_8_df['higher_rated_won'] = (
    (p1_won & p1_higher_rated) | (p2_won & p2_higher_rated)
)
print(tournament_sets_with_top_8_df.shape)
# tournament_sets_with_top_8_df.info()

In [None]:
# Share of sets won by the higher-rated player, for top-8 and non-top-8 sets.
top_8_flag = tournament_sets_with_top_8_df['top_8']

top_8_df = tournament_sets_with_top_8_df[top_8_flag == True]
top_8_rating_baseline = top_8_df['higher_rated_won'].sum() / len(top_8_df)
print(f"Top 8 rating baseline: {top_8_rating_baseline:.0%}")

non_top_8_df = tournament_sets_with_top_8_df[top_8_flag == False]
non_top_8_rating_baseline = non_top_8_df['higher_rated_won'].sum() / len(non_top_8_df)
print(f"Non top 8 rating baseline: {non_top_8_rating_baseline:.0%}")


In [734]:
# Create the 'more_updates_won' column: True when the winner is the player
# with the larger update count. Sets where both counts are equal get False.
p1_is_winner = tournament_sets_with_top_8_df['winner_id'] == tournament_sets_with_top_8_df['p1_id']
p2_is_winner = tournament_sets_with_top_8_df['winner_id'] == tournament_sets_with_top_8_df['p2_id']
p1_has_more_updates = tournament_sets_with_top_8_df['p1_updates'] > tournament_sets_with_top_8_df['p2_updates']
p2_has_more_updates = tournament_sets_with_top_8_df['p2_updates'] > tournament_sets_with_top_8_df['p1_updates']

tournament_sets_with_top_8_df['more_updates_won'] = (
    (p1_is_winner & p1_has_more_updates) | (p2_is_winner & p2_has_more_updates)
)

In [None]:
# Share of sets won by the player with more rating updates, for top-8 and
# non-top-8 sets. Both frames are re-derived so they include 'more_updates_won'.
top_8_flag = tournament_sets_with_top_8_df['top_8']

top_8_df = tournament_sets_with_top_8_df[top_8_flag == True]
top_8_updates_baseline = top_8_df['more_updates_won'].sum() / len(top_8_df)
print(f"Top 8 updates baseline: {top_8_updates_baseline:.0%}")

non_top_8_df = tournament_sets_with_top_8_df[top_8_flag == False]
non_top_8_updates_baseline = non_top_8_df['more_updates_won'].sum() / len(non_top_8_df)
print(f"Non top 8 updates baseline: {non_top_8_updates_baseline:.0%}")

In [None]:
# Share of sets won by the higher-rated player, as a truncated whole percentage,
# for top-8 and non-top-8 sets.
#   rows    (y axis): rating_difference bins [0, 50), [50, 100), ..., [450, 500)
#   columns (x axis): bins of the SMALLER of the two update counts, [0, 5), ..., [45, 50)
update_bin_width = 5
difference_bin_width = 50
update_bin_starts = range(0, 50, update_bin_width)
difference_bin_starts = range(0, 500, difference_bin_width)
grid_shape = (len(difference_bin_starts), len(update_bin_starts))

accuracy_of_top_8 = np.zeros(grid_shape, dtype=np.int32)
accuracy_of_non_top_8 = np.zeros(grid_shape, dtype=np.int32)

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (axs[0], 'Accuracy of Top 8', top_8_df, accuracy_of_top_8),
    (axs[1], 'Accuracy of Non-Top 8', non_top_8_df, accuracy_of_non_top_8),
)
for ax, title, frame, accuracy in panels:
    min_updates = np.minimum(frame['p1_updates'], frame['p2_updates'])
    # A bin without sets has no accuracy: dividing by its zero count fails, so
    # such bins are skipped (left at 0) and labelled 'n/a' instead of '0%'.
    empty_bin = np.zeros(grid_shape, dtype=bool)

    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(min_updates >= updates) & (min_updates < updates + update_bin_width)]
        for j, difference in enumerate(difference_bin_starts):
            in_bin = in_update_bin[
                (in_update_bin['rating_difference'] >= difference)
                & (in_update_bin['rating_difference'] < difference + difference_bin_width)
            ]
            if in_bin.shape[0] == 0:
                empty_bin[j, i] = True
                continue
            accuracy[j, i] = int(in_bin['higher_rated_won'].sum() / in_bin.shape[0] * 100)

    ax.imshow(accuracy, cmap='Blues', aspect='auto')
    ax.set_title(title)

    # Write every cell's value on top of it
    for row in range(grid_shape[0]):
        for col in range(grid_shape[1]):
            label = 'n/a' if empty_bin[row, col] else f'{accuracy[row, col]}%'
            ax.text(col, row, label, ha='center', va='center', color='black')

    # Tick labels are the lower edge of each bin
    ax.set_xlabel('Min Updates to Rating')
    ax.set_ylabel('Rating Difference')
    ax.set_xticks(np.arange(grid_shape[1]))
    ax.set_yticks(np.arange(grid_shape[0]))
    ax.set_xticklabels(list(update_bin_starts))
    ax.set_yticklabels(list(difference_bin_starts))


In [None]:
# Share of sets won by the higher-rated player, as a truncated whole percentage,
# for top-8 and non-top-8 sets, zoomed in on small rating gaps.
#   rows    (y axis): rating_difference bins [0, 10), [10, 20), ..., [90, 100)
#   columns (x axis): bins of the SMALLER of the two update counts, [0, 5), ..., [45, 50)
update_bin_width = 5
difference_bin_width = 10
update_bin_starts = range(0, 50, update_bin_width)
difference_bin_starts = range(0, 100, difference_bin_width)
grid_shape = (len(difference_bin_starts), len(update_bin_starts))

accuracy_of_top_8 = np.zeros(grid_shape, dtype=np.int32)
accuracy_of_non_top_8 = np.zeros(grid_shape, dtype=np.int32)

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (axs[0], 'Accuracy of Top 8', top_8_df, accuracy_of_top_8),
    (axs[1], 'Accuracy of Non-Top 8', non_top_8_df, accuracy_of_non_top_8),
)
for ax, title, frame, accuracy in panels:
    min_updates = np.minimum(frame['p1_updates'], frame['p2_updates'])
    # A bin without sets has no accuracy: dividing by its zero count fails, so
    # such bins are skipped (left at 0) and labelled 'n/a' instead of '0%'.
    empty_bin = np.zeros(grid_shape, dtype=bool)

    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(min_updates >= updates) & (min_updates < updates + update_bin_width)]
        for j, difference in enumerate(difference_bin_starts):
            in_bin = in_update_bin[
                (in_update_bin['rating_difference'] >= difference)
                & (in_update_bin['rating_difference'] < difference + difference_bin_width)
            ]
            if in_bin.shape[0] == 0:
                empty_bin[j, i] = True
                continue
            accuracy[j, i] = int(in_bin['higher_rated_won'].sum() / in_bin.shape[0] * 100)

    ax.imshow(accuracy, cmap='Blues', aspect='auto')
    ax.set_title(title)

    # Write every cell's value on top of it
    for row in range(grid_shape[0]):
        for col in range(grid_shape[1]):
            label = 'n/a' if empty_bin[row, col] else f'{accuracy[row, col]}%'
            ax.text(col, row, label, ha='center', va='center', color='black')

    # Tick labels are the lower edge of each bin
    ax.set_xlabel('Min Updates to Rating')
    ax.set_ylabel('Rating Difference')
    ax.set_xticks(np.arange(grid_shape[1]))
    ax.set_yticks(np.arange(grid_shape[0]))
    ax.set_xticklabels(list(update_bin_starts))
    ax.set_yticklabels(list(difference_bin_starts))

In [None]:
# Share of sets won by the higher-rated player, as a truncated whole percentage,
# for top-8 and non-top-8 sets.
#   rows    (y axis): rating_difference bins [0, 10), [10, 20), ..., [90, 100)
#   columns (x axis): bins of the LARGER of the two update counts, [0, 5), ..., [45, 50)
update_bin_width = 5
difference_bin_width = 10
update_bin_starts = range(0, 50, update_bin_width)
difference_bin_starts = range(0, 100, difference_bin_width)
grid_shape = (len(difference_bin_starts), len(update_bin_starts))

accuracy_of_top_8 = np.zeros(grid_shape, dtype=np.int32)
accuracy_of_non_top_8 = np.zeros(grid_shape, dtype=np.int32)

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (axs[0], 'Accuracy of Top 8', top_8_df, accuracy_of_top_8),
    (axs[1], 'Accuracy of Non-Top 8', non_top_8_df, accuracy_of_non_top_8),
)
for ax, title, frame, accuracy in panels:
    max_updates = np.maximum(frame['p1_updates'], frame['p2_updates'])
    # A bin without sets has no accuracy: dividing by its zero count fails, so
    # such bins are skipped (left at 0) and labelled 'n/a' instead of '0%'.
    empty_bin = np.zeros(grid_shape, dtype=bool)

    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(max_updates >= updates) & (max_updates < updates + update_bin_width)]
        for j, difference in enumerate(difference_bin_starts):
            in_bin = in_update_bin[
                (in_update_bin['rating_difference'] >= difference)
                & (in_update_bin['rating_difference'] < difference + difference_bin_width)
            ]
            if in_bin.shape[0] == 0:
                empty_bin[j, i] = True
                continue
            accuracy[j, i] = int(in_bin['higher_rated_won'].sum() / in_bin.shape[0] * 100)

    ax.imshow(accuracy, cmap='Blues', aspect='auto')
    ax.set_title(title)

    # Write every cell's value on top of it
    for row in range(grid_shape[0]):
        for col in range(grid_shape[1]):
            label = 'n/a' if empty_bin[row, col] else f'{accuracy[row, col]}%'
            ax.text(col, row, label, ha='center', va='center', color='black')

    # Tick labels are the lower edge of each bin
    ax.set_xlabel('Max Updates to Rating')
    ax.set_ylabel('Rating Difference')
    ax.set_xticks(np.arange(grid_shape[1]))
    ax.set_yticks(np.arange(grid_shape[0]))
    ax.set_xticklabels(list(update_bin_starts))
    ax.set_yticklabels(list(difference_bin_starts))

In [None]:
# Number of sets per bin, for top-8 and non-top-8 sets.
#   rows    (y axis): rating_difference bins [0, 50), ..., [450, 500)
#   columns (x axis): bins of the SMALLER of the two update counts, [0, 5), ..., [45, 50)
# The arrays keep the accuracy_of_* names but hold set counts in this cell.
update_bin_starts = range(0, 50, 5)
difference_bin_starts = range(0, 500, 50)

accuracy_of_top_8 = np.zeros((10, 10), dtype=np.int32)
accuracy_of_non_top_8 = np.zeros((10, 10), dtype=np.int32)

for frame, set_counts in ((top_8_df, accuracy_of_top_8), (non_top_8_df, accuracy_of_non_top_8)):
    min_updates = np.minimum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(min_updates >= updates) & (min_updates < updates + 5)]
        rating_gap = in_update_bin['rating_difference']
        for j, difference in enumerate(difference_bin_starts):
            in_difference_bin = (rating_gap >= difference) & (rating_gap < difference + 50)
            set_counts[j, i] = int(in_difference_bin.sum())

fig, axs = plt.subplots(1, 2, figsize=(20, 10))

images = []
titles = ('Set Count of Top 8', 'Set Count of Non-Top 8')
for ax, title, set_counts in zip(axs, titles, (accuracy_of_top_8, accuracy_of_non_top_8)):
    images.append(ax.imshow(set_counts, cmap='Blues', aspect='auto'))
    ax.set_title(title)

    # Write every cell's count on top of it
    for row in range(set_counts.shape[0]):
        for col in range(set_counts.shape[1]):
            ax.text(col, row, f'{set_counts[row, col]}', ha='center', va='center', color='black')

    # Tick labels are the lower edge of each bin
    ax.set_xlabel('Min Updates to Rating')
    ax.set_ylabel('Rating Difference')
    ax.set_xticks(np.arange(0, 10))
    ax.set_yticks(np.arange(0, 10))
    ax.set_xticklabels(np.arange(0, 50, 5))
    ax.set_yticklabels(np.arange(0, 500, 50))

cax1, cax2 = images


In [None]:
# Number of sets per bin, for top-8 and non-top-8 sets.
#   rows    (y axis): rating_difference bins [0, 50), ..., [450, 500)
#   columns (x axis): bins of the LARGER of the two update counts, [0, 5), ..., [45, 50)
# The arrays keep the accuracy_of_* names but hold set counts in this cell.
update_bin_starts = range(0, 50, 5)
difference_bin_starts = range(0, 500, 50)

accuracy_of_top_8 = np.zeros((10, 10), dtype=np.int32)
accuracy_of_non_top_8 = np.zeros((10, 10), dtype=np.int32)

for frame, set_counts in ((top_8_df, accuracy_of_top_8), (non_top_8_df, accuracy_of_non_top_8)):
    max_updates = np.maximum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(max_updates >= updates) & (max_updates < updates + 5)]
        rating_gap = in_update_bin['rating_difference']
        for j, difference in enumerate(difference_bin_starts):
            in_difference_bin = (rating_gap >= difference) & (rating_gap < difference + 50)
            set_counts[j, i] = int(in_difference_bin.sum())

fig, axs = plt.subplots(1, 2, figsize=(20, 10))

images = []
titles = ('Set Count of Top 8', 'Set Count of Non-Top 8')
for ax, title, set_counts in zip(axs, titles, (accuracy_of_top_8, accuracy_of_non_top_8)):
    images.append(ax.imshow(set_counts, cmap='Blues', aspect='auto'))
    ax.set_title(title)

    # Write every cell's count on top of it
    for row in range(set_counts.shape[0]):
        for col in range(set_counts.shape[1]):
            ax.text(col, row, f'{set_counts[row, col]}', ha='center', va='center', color='black')

    # Tick labels are the lower edge of each bin
    ax.set_xlabel('Max Updates to Rating')
    ax.set_ylabel('Rating Difference')
    ax.set_xticks(np.arange(0, 10))
    ax.set_yticks(np.arange(0, 10))
    ax.set_xticklabels(np.arange(0, 50, 5))
    ax.set_yticklabels(np.arange(0, 100 * 5, 50))

cax1, cax2 = images

In [None]:
# Number of sets per bin, for top-8 and non-top-8 sets.
#   rows    (y axis): rating_difference bins [0, 10), ..., [90, 100)
#   columns (x axis): bins of the LARGER of the two update counts, [0, 5), ..., [45, 50)
# The arrays keep the accuracy_of_* names but hold set counts in this cell.
update_bin_width = 5
difference_bin_width = 10
update_bin_starts = range(0, 50, update_bin_width)
difference_bin_starts = range(0, 100, difference_bin_width)
grid_shape = (len(difference_bin_starts), len(update_bin_starts))

accuracy_of_top_8 = np.zeros(grid_shape, dtype=np.int32)
accuracy_of_non_top_8 = np.zeros(grid_shape, dtype=np.int32)

for frame, set_counts in ((top_8_df, accuracy_of_top_8), (non_top_8_df, accuracy_of_non_top_8)):
    # Computed once from BOTH players and used for both bin edges. The non-top-8
    # lower bound previously took the maximum of p2_updates with itself, so
    # player 1's count was ignored there.
    max_updates = np.maximum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(max_updates >= updates) & (max_updates < updates + update_bin_width)]
        rating_gap = in_update_bin['rating_difference']
        for j, difference in enumerate(difference_bin_starts):
            in_difference_bin = (rating_gap >= difference) & (rating_gap < difference + difference_bin_width)
            set_counts[j, i] = int(in_difference_bin.sum())

fig, axs = plt.subplots(1, 2, figsize=(20, 10))

titles = ('Set Count of Top 8', 'Set Count of Non-Top 8')
for ax, title, set_counts in zip(axs, titles, (accuracy_of_top_8, accuracy_of_non_top_8)):
    ax.imshow(set_counts, cmap='Blues', aspect='auto')
    ax.set_title(title)

    # Write every cell's count on top of it
    for row in range(grid_shape[0]):
        for col in range(grid_shape[1]):
            ax.text(col, row, f'{set_counts[row, col]}', ha='center', va='center', color='black')

    # Tick labels are the lower edge of each bin
    ax.set_xlabel('Max Updates to Rating')
    ax.set_ylabel('Rating Difference')
    ax.set_xticks(np.arange(grid_shape[1]))
    ax.set_yticks(np.arange(grid_shape[0]))
    ax.set_xticklabels(list(update_bin_starts))
    ax.set_yticklabels(list(difference_bin_starts))


## Glicko-2 updated weekly

In [None]:
# Load 'overall_players_ranking_new_weekly.pkl' from data_path and show its shape.
# NOTE(review): this rebinds player_ratings_df, a name the earlier rating-update
# cells also use; presumably a different table, so confirm before re-running
# those cells after this one.
player_ratings_df = pd.read_pickle(data_path + 'overall_players_ranking_new_weekly.pkl')
player_ratings_df.shape

In [None]:

# Largest value in each column of player_ratings_df, computed once and reused below
max_rating_per_column = player_ratings_df.max()

# The 20 largest of those maxima, in ascending order
print(max_rating_per_column.sort_values()[-20:])
print(max_rating_per_column.describe())
# Number of columns whose maximum is exactly 1500
print((max_rating_per_column == 1500).sum())
sns.violinplot(max_rating_per_column)


## Matchup specific Glicko-2 ##

In [None]:
# Load 'player_ratings_matchup_df.pkl' from data_path; later cells read its
# 'player_id', 'rating_history_dict', 'dates_dict' and 'game_count_dict' columns.
matchup_glicko = pd.read_pickle(data_path + 'player_ratings_matchup_df.pkl')
matchup_glicko.head()

## aMSa v Cody rating ##
Here we plot aMSa's Yoshi vs Fox rating and Cody's Fox vs Yoshi rating. Cody pretty much always beats aMSa, so I would expect his Fox vs Yoshi matchup rating to be higher than aMSa's Yoshi vs Fox rating. We need to understand why the ratings do not reflect the results.

In [None]:
# Rating history of one matchup per player, drawn on a shared figure:
# player '1021' (labelled aMSa) as melee/yoshi vs melee/fox, and
# player '19554' (labelled Cody) as melee/fox vs melee/yoshi.

# aMSa: melee/yoshi vs melee/fox
is_amsa = matchup_glicko['player_id'] == '1021'
player_data = matchup_glicko.loc[is_amsa, 'rating_history_dict'].values[0]
date_value = matchup_glicko.loc[is_amsa, 'dates_dict'].values[0]
yoshi_vs_fox_ratings = player_data['melee/yoshi']['melee/fox']

plt.figure()
plt.plot(date_value['melee/yoshi']['melee/fox'], yoshi_vs_fox_ratings, label='aMSa')

# Cody: melee/fox vs melee/yoshi
is_cody = matchup_glicko['player_id'] == '19554'
player_data = matchup_glicko.loc[is_cody, 'rating_history_dict'].values[0]
date_value = matchup_glicko.loc[is_cody, 'dates_dict'].values[0]
fox_vs_yoshi_ratings = player_data['melee/fox']['melee/yoshi']

plt.plot(date_value['melee/fox']['melee/yoshi'], fox_vs_yoshi_ratings, label='Cody')

plt.title('aMSa v Cody')
plt.xlabel('Date')
plt.ylabel('Glicko-2 Rating')
plt.legend()
plt.show()

In [None]:
# game_count_dict entry of player '1021' (labelled aMSa in the matchup plot) for melee/yoshi vs melee/fox.
matchup_glicko.loc[matchup_glicko['player_id'] == '1021', 'game_count_dict'].values[0]['melee/yoshi']['melee/fox']

In [None]:
# game_count_dict entry of player '19554' (labelled Cody in the matchup plot) for melee/fox vs melee/yoshi.
matchup_glicko.loc[matchup_glicko['player_id'] == '19554', 'game_count_dict'].values[0]['melee/fox']['melee/yoshi']

In [725]:
# # Define the player and opponent characters
# player_character = 'melee/fox'
# opponent_character = 'melee/falco'



# def get_matchup(matchup_glicko, player_character, opponent_character):
#     # Safely access nested dictionaries with error handling
#     def extract_dates(nested_dict):
#         """Safely extract the dates for a given player and opponent matchup."""
#         try:
#             # Extract the dates if both keys are present
#             return nested_dict.get(player_character, {}).get(opponent_character, [])
#         except AttributeError:
#             # Handle cases where the structure is not as expected
#             return []  # Default fallback
        
#     # Copy the original DataFrame
#     glicko2_matchup = matchup_glicko.copy()
    
#     # Apply the extraction function
#     glicko2_matchup['dates'] = glicko2_matchup['dates_dict'].apply(extract_dates)
#     glicko2_matchup['rating_history'] = glicko2_matchup['rating_history_dict'].apply(extract_dates)
#     glicko2_matchup['rd_history'] = glicko2_matchup['rating_history_dict'].apply(extract_dates)
#     # glicko2_matchup['extracted_dates'] = glicko2_matchup['dates_dict'].apply(extract_dates)

#     # glicko2_matchup.

#     # Check the first few rows to verify the result
#     # print(glicko2_matchup[['player_id','dates_dict', 'extracted_dates','extracted_rating']].head())
#     glicko2_matchup_filtered = glicko2_matchup[glicko2_matchup['dates'].apply(len) > 0][['player_id','dates', 'rating_history','rd_history']]
#     glicko2_matchup_filtered.set_index(glicko2_matchup_filtered['player_id'], inplace=True)

#     return glicko2_matchup_filtered[['dates', 'rating_history','rd_history']]

# def get_index_of_date(dates, target_date):
#     return np.searchsorted(dates, target_date) - 1


# print(get_matchup(matchup_glicko,'melee/fox', 'melee/falco'))
# # print(glicko2_df)

# def print_top_players(glicko2_df, n_players = 5):
#     for year in range(2016, 2025):
#         for month in [1, 6]:
#             # First filter out the players who have only a few updates (played in 5 tournaments)
#             filtered_glicko = glicko2_df[glicko2_df['dates'].apply(len) > 4]
            
#             target_date = datetime.datetime(year, month, 1)

#             indices = filtered_glicko['dates'].apply(lambda x: get_index_of_date(x, target_date))
            
#             # Filter out the players that have not entered a tournament yet.
#             filtered_glicko = filtered_glicko[indices > 0]

#             # Extract ratings as a Series, ensuring correct data type
#             # print(indices)
#             ratings_on_date = filtered_glicko.apply(
#                 lambda row: row['rating_history'][indices[row.name]], axis=1
#             )

#             rd_on_date = filtered_glicko.apply(
#                 lambda row: row['rd_history'][indices[row.name]], axis=1
#             )

#             # Ensure ratings_on_date is a Series and sort it
#             top_5 = ratings_on_date.sort_values(ascending=False)[:n_players]

#             # Retrieve the player tags along with their ratings
#             top_5_df = players_df[players_df['player_id'].isin(top_5.index)]
#             top_5_df = top_5_df.set_index('player_id').loc[top_5.index]

#             top_5_df['rating'] = top_5.values.astype(int)

#             # Display the top 5 players sorted by rating
#             print(f"Date: {target_date.strftime('%Y-%m-%d')}")
#             print(top_5_df[['tag', 'rating']].to_string(index=False))
#             print()


# # Example usage: print top players for Fox vs Falco
# print_top_players(get_matchup(matchup_glicko, 'melee/fox', 'melee/falco'), 10)

# # get_matchup(matchup_glicko,'melee/fox', 'melee/falco')

# print_top_players(get_matchup(matchup_glicko,'melee/fox', 'melee/falco'), 10)


This code will visualize the effect of different values of tau in the calculation.

In [726]:
# matchup_glicko_3 = pd.read_pickle('../player_ratings_matchup_data/player_ratings_matchup_tau_3_df.pkl')
# matchup_glicko_5 = pd.read_pickle('../player_ratings_matchup_data/player_ratings_matchup_tau_5_df.pkl')
# matchup_glicko_7 = pd.read_pickle('../player_ratings_matchup_data/player_ratings_matchup_tau_7_df.pkl')
# matchup_glicko_9 = pd.read_pickle('../player_ratings_matchup_data/player_ratings_matchup_tau_9_df.pkl')
# matchup_glicko_11 = pd.read_pickle('../player_ratings_matchup_data/player_ratings_matchup_tau_11_df.pkl')
# matchup_glicko_13 = pd.read_pickle('../player_ratings_matchup_data/player_ratings_matchup_tau_13_df.pkl')


In [727]:
# # Extract the relevant row from the DataFrame
# glicko_data_list = [matchup_glicko_3, matchup_glicko_5, matchup_glicko_7, matchup_glicko_9, matchup_glicko_11, matchup_glicko_13]
# # glicko_data_list = [matchup_glicko_3, matchup_glicko_5]
# titles  = ['tau = .3', 'tau = .5', 'tau = .7', 'tau = .9', 'tau = 1.1', 'tau = 1.3']

# # Create subplots
# fig, ax = plt.subplots(3, 2, figsize=(16, 18))

# # Add a main title to the figure
# fig.suptitle('aMSa and Cody matchup rating by tau', fontsize=16)


# for i, data in enumerate(glicko_data_list):
#     amsa_matchup = data.loc[data['player_id'] == '1021']
#     cody_matchup = data.loc[data['player_id'] == '19554']
    
#     amsa_dates = amsa_matchup['dates_dict'].values[0]['melee/yoshi']['melee/fox']
#     cody_dates = cody_matchup['dates_dict'].values[0]['melee/fox']['melee/yoshi']
    
#     amsa_ratings = amsa_matchup['rating_history_dict'].values[0]['melee/yoshi']['melee/fox']
#     cody_ratings = cody_matchup['rating_history_dict'].values[0]['melee/fox']['melee/yoshi']
    
#     # Determine the subplot position in the 3x2 grid
#     row, col = divmod(i, 2)
    
#     ax[row, col].plot(amsa_dates, amsa_ratings, label='aMSa')
#     ax[row, col].plot(cody_dates, cody_ratings, label='Cody')
#     ax[row, col].legend()
#     ax[row, col].set_title(titles[i])


