In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

# Use the '/workspace/data/' directory when it exists; otherwise fall back to
# the data folder next to the notebook directory.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - For each column whose values are all strings, we try to parse every value with `json.loads()`; if all of them parse (e.g. as lists or dictionaries), the column is replaced by the resulting Python objects, otherwise it is left unchanged.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [2]:
def get_table_names(conn):
    """Return the names of all tables in the SQLite database behind ``conn``.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        list[str]: The ``name`` column of ``sqlite_master`` for rows of type 'table'.
    """
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()


def _quote_identifier(name):
    """Quote ``name`` as an SQLite identifier, doubling any embedded double quotes."""
    return '"' + name.replace('"', '""') + '"'


def load_tables_to_dfs(conn):
    """Load every table of the database into a pandas DataFrame.

    A column is JSON-decoded only when all of its values are strings and
    every one of them parses with ``json.loads``; otherwise the column is left
    untouched. Note that JSON scalars (e.g. ``"12"`` or ``"true"``) parse as
    well, so an all-numeric text column is converted too.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        dict[str, pd.DataFrame]: Maps ``"<table>_df"`` to that table's DataFrame.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the table name so that names which are SQL keywords or contain
        # spaces/punctuation still form a valid statement.
        df = pd.read_sql(f"SELECT * FROM {_quote_identifier(table)}", conn)

        for col in df.columns:
            # Only columns made up entirely of strings are JSON candidates.
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # Decoding is all-or-nothing: if any value fails to parse,
                    # the exception fires before the column is reassigned.
                    df[col] = df[col].apply(json.loads)
                except (json.JSONDecodeError, TypeError):
                    # Not a JSON column; keep the raw strings.
                    pass

        # Store the DataFrame under the table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Build `dfs` (dict of "<table>_df" -> DataFrame) once per kernel: prefer the
# cached pickle, otherwise read every table from the SQLite database.
pickle_path = data_path + 'dfs_dict.pkl'

if os.path.exists(pickle_path):
    cell_has_run = True
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # dfs_dict.pkl that was produced by this project.
    with open(pickle_path, 'rb') as f:
        dfs = pickle.load(f)
# The flag lives in the global namespace so that re-running this cell does not
# reload the database a second time.
elif 'cell_has_run' not in globals():
    path = data_path + "melee_player_database.db"

    # Connect to the database and make sure the connection is closed even if
    # loading a table fails.
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame, e.g. dfs['players_df'].
        dfs = load_tables_to_dfs(conn)
    finally:
        conn.close()

    # 'start' and 'end' are converted from seconds since the epoch to timestamps.
    for time_column in ('start', 'end'):
        dfs['tournament_info_df'][time_column] = pd.to_datetime(
            dfs['tournament_info_df'][time_column], unit='s'
        )

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [3]:
# Missing 'best_of' values are replaced by 0 so the whole column can be cast to int.
dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int) 

In [4]:
# # Save the dictionary of DataFrames as a pickle
# with open(data_path + 'dfs_dict.pkl', 'wb') as f:
#     pickle.dump(dfs, f)

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played as that character. (We verify this for Zain below.)

In [None]:
# Pull the players table out of the dict of DataFrames and preview its first rows.
players_df = dfs['players_df']
players_df.head()


In [None]:
# Pull the ranking table out of the dict of DataFrames and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head()

In [None]:
# Pull the ranking-seasons table out of the dict of DataFrames and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

In [None]:
sets_df = dfs['sets_df']
# Share of sets whose 'game_data' entry is non-empty. The mean of the boolean
# flags is that share; the '%' format already prints the percent sign, so the
# message no longer repeats the word "percent".
has_game_data = sets_df['game_data'].apply(lambda games: len(games) > 0)
print(f"{has_game_data.mean():.1%} of sets have some game data")
sets_df.shape



In [None]:
# Pull the tournament table out of the dict of DataFrames; show its size and first rows.
tournament_info_df = dfs['tournament_info_df']
print(tournament_info_df.shape)
tournament_info_df.head()


## Overall Glicko-2 Exploration ##


Import the weekly-updated Glicko-2 ratings.

In [None]:
# Load the precomputed weekly ratings table from disk.
# Later cells look values up as .loc[date, player_id], i.e. they treat the index
# as rating dates and the columns as player ids.
player_ratings_df = pd.read_pickle(data_path + 'overall_players_ranking_new_weekly.pkl')
print(player_ratings_df.shape)
player_ratings_df.head()

## Number of Glicko-2 updates
Running total of the number of updates to each player's Glicko-2 rating. The calculation over the full ratings table is expensive (it was written with numba's `njit` and `prange` to speed up the loops), so we save the results and only need to run the calculation once.

In [None]:
from numba import njit, prange
def previous_updates(array):
    """Count, for every entry, how often the value changed in the rows above it.

    Row ``r`` of the result holds, per column, the number of consecutive-row
    changes among rows ``0 .. r-1`` of ``array``. Rows 0 and 1 are therefore
    always zero, and the change between the last two rows is never counted.

    The running total makes every row depend on the row before it. Such a
    loop-carried dependency is not safe inside a parallel ``prange`` loop, so
    the count is computed as a vectorised cumulative sum instead.

    Elements are compared with ``!=``, so two consecutive NaNs count as a change.

    Args:
        array (np.ndarray): 2-D array whose rows are consecutive snapshots.

    Returns:
        np.ndarray: int32 array of the same shape as ``array``.
    """
    update_counts = np.zeros_like(array, dtype=np.int32)

    # changed[k] marks where row k+1 differs from row k, for k = 0 .. n-3.
    changed = array[1:-1] != array[:-2]

    # Row r (r >= 2) is the number of changes among transitions 0 .. r-2.
    update_counts[2:] = np.cumsum(changed, axis=0)

    return update_counts

## Testing array
# array = np.array([
#     [1, 1, 1],
#     [1, 1, 2],
#     [1, 2, 3],
#     [1, 3, 4]])

# print(array)
# previous_updates(array)
# print(previous_updates(array))

# # Do the calculation once.
# player_ratings_np = player_ratings_df.to_numpy()
# start = time.time()
# number_of_rating_updates_df = pd.DataFrame(columns=player_ratings_df.columns, index=player_ratings_df.index, data=previous_updates(player_ratings_np))
# end = time.time()
# print(f'time = {end-start:.2f}')
# number_of_rating_updates_df.head()

# # Save the results
# number_of_rating_updates_df.to_pickle(data_path + 'number_of_rating_updates_df.pkl')

## Load the cached update counts written by the (commented-out) calculation above.
number_of_rating_updates_df = pd.read_pickle(data_path + 'number_of_rating_updates_df.pkl')
number_of_rating_updates_df.head()

## Add some columns to sets_df
We add the start of the tournament, the player ratings at the start of the tournament, and the number of times the player's rating has been updated before the start of the tournament.

In [12]:
# Left-join each set to its tournament's 'start' timestamp
# (sets_df.tournament_key -> tournament_info_df.key). Every set row is kept;
# sets whose tournament key has no match get a null 'start'.
merged_df = sets_df.merge(tournament_info_df[['key', 'start']], left_on='tournament_key', right_on='key', how='left')


In [None]:
tqdm.pandas()
import swifter

def get_ratings_and_updates(row, player_ratings_df, number_of_rating_updates_df):
    """Look up both players' rating and update count as of the set's start date.

    The values are taken from the latest index entry of ``player_ratings_df``
    that is not after ``row['start']``.

    Args:
        row: A row with 'start', 'p1_id' and 'p2_id' entries.
        player_ratings_df: Ratings table, indexed by date with one column per player id.
        number_of_rating_updates_df: Update-count table, looked up with the same date and ids.

    Returns:
        pd.Series: Entries 'p1_rating', 'p2_rating', 'p1_updates', 'p2_updates'.
        All four are None when no index date is on or before the start; an
        individual entry is None when that player id is not a column.
    """
    labels = ['p1_rating', 'p2_rating', 'p1_updates', 'p2_updates']

    # Latest available date on or before the tournament start.
    rating_dates = player_ratings_df.index
    closest_date = rating_dates[rating_dates <= row['start']].max()

    # No usable date: nothing can be looked up.
    if pd.isnull(closest_date):
        return pd.Series([None] * 4, index=labels)

    def value_on_date(table, player_id):
        # Players without a column in the table have no value.
        if player_id in table.columns:
            return table.loc[closest_date, player_id]
        return None

    values = [
        value_on_date(table, row[id_key])
        for table in (player_ratings_df, number_of_rating_updates_df)
        for id_key in ('p1_id', 'p2_id')
    ]
    return pd.Series(values, index=labels)

# Apply the function to each row in merged_df
# merged_df[['p1_rating', 'p2_rating', 'p1_updates', 'p2_updates']] = merged_df.progress_apply(
#     get_ratings_and_updates, axis=1, 
#     player_ratings_df=player_ratings_df, 
#     number_of_rating_updates_df=number_of_rating_updates_df,
# )

# Save
# merged_df.to_pickle(data_path + 'augmented_sets_df.pkl')

# Load the augmented sets table saved by the (commented-out) apply/save steps above.
augmented_sets_df = pd.read_pickle(data_path + 'augmented_sets_df.pkl')
augmented_sets_df.head()


## Top 8 Locations
Here we work out, by inspection, how the sets that make up a tournament's top 8 are labeled in the 'location_names' column.

In [None]:
# Sets whose 'location_names' is exactly the Grand Final label triple.
grand_final_names = ['GF', 'Grand Final', 'Grand Final']
is_grand_final = sets_df['location_names'].apply(lambda names: names == grand_final_names)
gf_sets_df = sets_df[is_grand_final]

# Keys of the tournaments that contain such a Grand Final set.
gf_tournament_keys = list(gf_sets_df['tournament_key'])

# Every set played at one of those tournaments.
in_gf_tournament = sets_df['tournament_key'].isin(gf_tournament_keys)
valid_tournament_sets_df = sets_df[in_gf_tournament]

# Frequency of each location label within those tournaments.
location_counts = valid_tournament_sets_df['location_names'].value_counts()
print(location_counts.to_string())

In [None]:
# Frequency of every 'location_names' value across all sets, printed in full.
print(sets_df['location_names'].value_counts().to_string())

Get all top 8 sets.

In [None]:
# The vast majority of the top 8 games have these "location_names"
# Each entry is one full 'location_names' value: a list of three name variants.
top_8_locations = [
        ['WQF', 'Winners Quarters', 'Winners Quarter-Final'],                                   
        ['WSF', 'Winners Semis', 'Winners Semi-Final'],
        ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
        ['WF', 'Winners Final', 'Winners Final'],
        ['LSF', 'Losers Semis', 'Losers Semi-Final'],
        ['LF', 'Losers Final', 'Losers Final'],
        ['GF', 'Grand Final', 'Grand Final'],
        ['GFR', 'GF Reset', 'Grand Final Reset']
    ] 

# Sets whose 'location_names' equals one of the entries above.
# NOTE(review): isin() is given a list of lists here, so it relies on pandas
# matching list-valued cells — confirm this works on the installed pandas version.
top_8_sets_df = augmented_sets_df[augmented_sets_df["location_names"].isin(top_8_locations)]
# Distinct tournaments that contain at least one such set.
top_8_tournament_keys = top_8_sets_df['tournament_key'].unique()
print(f"There are {len(top_8_tournament_keys)} tournaments with double elimination finals.")
top_8_sets_df.head()

Restrict our data set to tournaments that have the double elimination format.

In [None]:
# All sets from tournaments that contain at least one top-8 labelled set.
# .copy() makes this an independent frame: later cells add columns to it, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
in_top_8_tournament = augmented_sets_df['tournament_key'].isin(top_8_tournament_keys)
tournament_sets_with_top_8_df = augmented_sets_df[in_top_8_tournament].copy()

# The sets of those tournaments that are not themselves top-8 sets.
is_top_8_location = tournament_sets_with_top_8_df['location_names'].isin(top_8_locations)
non_top_8_sets_df = tournament_sets_with_top_8_df[~is_top_8_location]
non_top_8_sets_df.head()

In [None]:
# Every individual name variant from top_8_locations, in one flat list.
top_8_flat_list = sum(top_8_locations, [])


def _has_top_8_name(locations):
    """Return True when at least one of the set's location names is a top-8 name."""
    for location in locations:
        if location in top_8_flat_list:
            return True
    return False


# Flag each set as top 8 (True) or not (False).
tournament_sets_with_top_8_df.loc[:, 'top_8'] = (
    tournament_sets_with_top_8_df['location_names'].apply(_has_top_8_name)
)

tournament_sets_with_top_8_df.head()

In [None]:
# Summary statistics of player 1's rating, split by the 'top_8' flag.
tournament_sets_with_top_8_df.groupby('top_8')['p1_rating'].describe()

In [None]:
# Summary statistics of player 2's rating, split by the 'top_8' flag.
tournament_sets_with_top_8_df.groupby('top_8')['p2_rating'].describe()



In [None]:
# Summary statistics of player 1's rating-update count, split by the 'top_8' flag.
tournament_sets_with_top_8_df.groupby('top_8')['p1_updates'].describe()

In [None]:
# Summary statistics of player 2's rating-update count, split by the 'top_8' flag.
tournament_sets_with_top_8_df.groupby('top_8')['p2_updates'].describe()

Add a column with the absolute rating difference.

In [None]:
# Absolute gap between the two players' ratings for every set.
signed_rating_gap = tournament_sets_with_top_8_df['p1_rating'] - tournament_sets_with_top_8_df['p2_rating']
tournament_sets_with_top_8_df['rating_difference'] = signed_rating_gap.abs()
tournament_sets_with_top_8_df.head()


In [None]:
# Violin plots of the rating gap, one panel per 'top_8' value.
# Sets with a gap of exactly 0 are left out.
has_rating_gap = tournament_sets_with_top_8_df['rating_difference'] > 0
sns.catplot(
    data=tournament_sets_with_top_8_df[has_rating_gap],
    x='rating_difference',
    col='top_8',
    kind='violin',
)
plt.show()

In [None]:
# Keep only sets whose ratings differ by more than 1 point. The copy is taken
# AFTER filtering: the filtered frame is the one that receives the new column,
# so it is the one that must be independent to avoid SettingWithCopyWarning.
tournament_sets_with_top_8_df = tournament_sets_with_top_8_df[
    tournament_sets_with_top_8_df['rating_difference'] > 1
].copy()

# True when the winner is the player with the strictly higher rating.
p1_won_as_higher_rated = (
    (tournament_sets_with_top_8_df['winner_id'] == tournament_sets_with_top_8_df['p1_id'])
    & (tournament_sets_with_top_8_df['p1_rating'] > tournament_sets_with_top_8_df['p2_rating'])
)
p2_won_as_higher_rated = (
    (tournament_sets_with_top_8_df['winner_id'] == tournament_sets_with_top_8_df['p2_id'])
    & (tournament_sets_with_top_8_df['p2_rating'] > tournament_sets_with_top_8_df['p1_rating'])
)
tournament_sets_with_top_8_df['higher_rated_won'] = p1_won_as_higher_rated | p2_won_as_higher_rated
print(tournament_sets_with_top_8_df.shape)
# tournament_sets_with_top_8_df.info()

In [None]:
# Share of sets won by the higher-rated player: overall, then split by 'top_8'.
overall_rating_share = tournament_sets_with_top_8_df['higher_rated_won'].sum() / len(tournament_sets_with_top_8_df)
print(f"Overall rating baseline: {overall_rating_share:.2%}")

top_8_df = tournament_sets_with_top_8_df[tournament_sets_with_top_8_df['top_8'].eq(True)]
top_8_rating_share = top_8_df['higher_rated_won'].sum() / len(top_8_df)
print(f"Top 8 rating baseline: {top_8_rating_share:.2%}")

non_top_8_df = tournament_sets_with_top_8_df[tournament_sets_with_top_8_df['top_8'].eq(False)]
non_top_8_rating_share = non_top_8_df['higher_rated_won'].sum() / len(non_top_8_df)
print(f"Non top 8 rating baseline: {non_top_8_rating_share:.2%}")


In [27]:
# Flag sets won by the player whose rating had been updated more often.
# Sets where both players have the same update count are flagged False.
p1_won_with_more_updates = (
    tournament_sets_with_top_8_df['winner_id'].eq(tournament_sets_with_top_8_df['p1_id'])
    & tournament_sets_with_top_8_df['p1_updates'].gt(tournament_sets_with_top_8_df['p2_updates'])
)
p2_won_with_more_updates = (
    tournament_sets_with_top_8_df['winner_id'].eq(tournament_sets_with_top_8_df['p2_id'])
    & tournament_sets_with_top_8_df['p2_updates'].gt(tournament_sets_with_top_8_df['p1_updates'])
)
tournament_sets_with_top_8_df['more_updates_won'] = p1_won_with_more_updates | p2_won_with_more_updates

In [None]:
# Re-split by 'top_8' so both frames include the 'more_updates_won' column, then
# report how often the player with more rating updates won.
top_8_df = tournament_sets_with_top_8_df[tournament_sets_with_top_8_df['top_8'].eq(True)]
top_8_updates_share = top_8_df['more_updates_won'].sum() / len(top_8_df)
print(f"Top 8 updates baseline: {top_8_updates_share:.0%}")

non_top_8_df = tournament_sets_with_top_8_df[tournament_sets_with_top_8_df['top_8'].eq(False)]
non_top_8_updates_share = non_top_8_df['more_updates_won'].sum() / len(non_top_8_df)
print(f"Non top 8 updates baseline: {non_top_8_updates_share:.0%}")

In [29]:
# True where the recorded winner is player 1.
tournament_sets_with_top_8_df['p1_won'] = tournament_sets_with_top_8_df['winner_id'].eq(
    tournament_sets_with_top_8_df['p1_id']
)



In [None]:
# Share of sets in which the player listed first is the winner.
p1_win_share = tournament_sets_with_top_8_df['p1_won'].sum() / len(tournament_sets_with_top_8_df)
print(f"Player 1 won {p1_win_share:.1%} of the time.")

In [31]:
# tournament_sets_with_top_8_df.to_pickle(data_path + 'tournament_sets_with_top_8_df.pkl')

In [None]:
# Percentage of sets won by the higher-rated player, binned two ways:
#   columns - the smaller of the two players' update counts (bins of 5, 0 to 50)
#   rows    - the absolute rating difference (bins of 50, 0 to 500)
update_bin_starts = range(0, 50, 5)
difference_bin_starts = range(0, 500, 50)
grid_shape = (len(difference_bin_starts), len(update_bin_starts))

accuracy_grids = {}
bin_sizes = {}
for label, frame in (('Top 8', top_8_df), ('Non-Top 8', non_top_8_df)):
    accuracy = np.zeros(grid_shape, dtype=np.int32)
    sizes = np.zeros(grid_shape, dtype=np.int32)
    fewest_updates = np.minimum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(fewest_updates >= updates) & (fewest_updates < updates + 5)]
        gap = in_update_bin['rating_difference']
        for j, difference in enumerate(difference_bin_starts):
            in_cell = in_update_bin[(gap >= difference) & (gap < difference + 50)]
            sizes[j, i] = len(in_cell)
            # An empty bin has no accuracy: dividing by its size gives NaN and
            # int(NaN) raises ValueError, so leave it at 0 and label it 'n/a'.
            if len(in_cell) > 0:
                accuracy[j, i] = int(in_cell['higher_rated_won'].sum() / len(in_cell) * 100)
    accuracy_grids[label] = accuracy
    bin_sizes[label] = sizes

accuracy_of_top_8 = accuracy_grids['Top 8']
accuracy_of_non_top_8 = accuracy_grids['Non-Top 8']

fig, axs = plt.subplots(1, 2, figsize=(12, 6))
images = []
for ax, (label, accuracy) in zip(axs, accuracy_grids.items()):
    images.append(ax.imshow(accuracy, cmap='Blues', aspect='auto'))
    ax.set_title(f'Accuracy of {label}')
    # Write each bin's value into its cell (array row -> y, array column -> x).
    for r in range(grid_shape[0]):
        for c in range(grid_shape[1]):
            text = f'{accuracy[r, c]}%' if bin_sizes[label][r, c] else 'n/a'
            ax.text(c, r, text, ha='center', va='center', color='black')
    ax.set_xlabel('Min Updates to Rating')  # x-axis: update-count bins
    ax.set_ylabel('Rating Difference')      # y-axis: rating-difference bins
    ax.set_xticks(np.arange(grid_shape[1]))
    ax.set_yticks(np.arange(grid_shape[0]))
    ax.set_xticklabels(list(update_bin_starts))      # lower edge of each bin
    ax.set_yticklabels(list(difference_bin_starts))  # lower edge of each bin
cax1, cax2 = images


In [None]:
# Percentage of sets won by the higher-rated player, binned two ways:
#   columns - the smaller of the two players' update counts (bins of 5, 0 to 50)
#   rows    - the absolute rating difference (bins of 10, 0 to 100)
update_bin_starts = range(0, 50, 5)
difference_bin_starts = range(0, 100, 10)
grid_shape = (len(difference_bin_starts), len(update_bin_starts))

accuracy_grids = {}
bin_sizes = {}
for label, frame in (('Top 8', top_8_df), ('Non-Top 8', non_top_8_df)):
    accuracy = np.zeros(grid_shape, dtype=np.int32)
    sizes = np.zeros(grid_shape, dtype=np.int32)
    fewest_updates = np.minimum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(fewest_updates >= updates) & (fewest_updates < updates + 5)]
        gap = in_update_bin['rating_difference']
        for j, difference in enumerate(difference_bin_starts):
            in_cell = in_update_bin[(gap >= difference) & (gap < difference + 10)]
            sizes[j, i] = len(in_cell)
            # An empty bin has no accuracy: dividing by its size gives NaN and
            # int(NaN) raises ValueError, so leave it at 0 and label it 'n/a'.
            if len(in_cell) > 0:
                accuracy[j, i] = int(in_cell['higher_rated_won'].sum() / len(in_cell) * 100)
    accuracy_grids[label] = accuracy
    bin_sizes[label] = sizes

accuracy_of_top_8 = accuracy_grids['Top 8']
accuracy_of_non_top_8 = accuracy_grids['Non-Top 8']

fig, axs = plt.subplots(1, 2, figsize=(12, 6))
images = []
for ax, (label, accuracy) in zip(axs, accuracy_grids.items()):
    images.append(ax.imshow(accuracy, cmap='Blues', aspect='auto'))
    ax.set_title(f'Accuracy of {label}')
    # Write each bin's value into its cell (array row -> y, array column -> x).
    for r in range(grid_shape[0]):
        for c in range(grid_shape[1]):
            text = f'{accuracy[r, c]}%' if bin_sizes[label][r, c] else 'n/a'
            ax.text(c, r, text, ha='center', va='center', color='black')
    ax.set_xlabel('Min Updates to Rating')  # x-axis: update-count bins
    ax.set_ylabel('Rating Difference')      # y-axis: rating-difference bins
    ax.set_xticks(np.arange(grid_shape[1]))
    ax.set_yticks(np.arange(grid_shape[0]))
    ax.set_xticklabels(list(update_bin_starts))      # lower edge of each bin
    ax.set_yticklabels(list(difference_bin_starts))  # lower edge of each bin
cax1, cax2 = images

In [None]:
# Percentage of sets won by the higher-rated player, binned two ways:
#   columns - the larger of the two players' update counts (bins of 5, 0 to 50)
#   rows    - the absolute rating difference (bins of 10, 0 to 100)
update_bin_starts = range(0, 50, 5)
difference_bin_starts = range(0, 100, 10)
grid_shape = (len(difference_bin_starts), len(update_bin_starts))

accuracy_grids = {}
bin_sizes = {}
for label, frame in (('Top 8', top_8_df), ('Non-Top 8', non_top_8_df)):
    accuracy = np.zeros(grid_shape, dtype=np.int32)
    sizes = np.zeros(grid_shape, dtype=np.int32)
    most_updates = np.maximum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(update_bin_starts):
        in_update_bin = frame[(most_updates >= updates) & (most_updates < updates + 5)]
        gap = in_update_bin['rating_difference']
        for j, difference in enumerate(difference_bin_starts):
            in_cell = in_update_bin[(gap >= difference) & (gap < difference + 10)]
            sizes[j, i] = len(in_cell)
            # An empty bin has no accuracy: dividing by its size gives NaN and
            # int(NaN) raises ValueError, so leave it at 0 and label it 'n/a'.
            if len(in_cell) > 0:
                accuracy[j, i] = int(in_cell['higher_rated_won'].sum() / len(in_cell) * 100)
    accuracy_grids[label] = accuracy
    bin_sizes[label] = sizes

accuracy_of_top_8 = accuracy_grids['Top 8']
accuracy_of_non_top_8 = accuracy_grids['Non-Top 8']

fig, axs = plt.subplots(1, 2, figsize=(12, 6))
images = []
for ax, (label, accuracy) in zip(axs, accuracy_grids.items()):
    images.append(ax.imshow(accuracy, cmap='Blues', aspect='auto'))
    ax.set_title(f'Accuracy of {label}')
    # Write each bin's value into its cell (array row -> y, array column -> x).
    for r in range(grid_shape[0]):
        for c in range(grid_shape[1]):
            text = f'{accuracy[r, c]}%' if bin_sizes[label][r, c] else 'n/a'
            ax.text(c, r, text, ha='center', va='center', color='black')
    ax.set_xlabel('Max Updates to Rating')  # x-axis: update-count bins
    ax.set_ylabel('Rating Difference')      # y-axis: rating-difference bins
    ax.set_xticks(np.arange(grid_shape[1]))
    ax.set_yticks(np.arange(grid_shape[0]))
    ax.set_xticklabels(list(update_bin_starts))      # lower edge of each bin
    ax.set_yticklabels(list(difference_bin_starts))  # lower edge of each bin
cax1, cax2 = images

In [None]:
# Number of sets per bin, binned two ways:
#   columns - the smaller of the two players' update counts (bins of 5, 0 to 50)
#   rows    - the absolute rating difference (bins of 50, 0 to 500)
count_grids = {}
for label, frame in (('Top 8', top_8_df), ('Non-Top 8', non_top_8_df)):
    counts = np.zeros((10, 10), dtype=np.int32)
    fewest_updates = np.minimum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(range(0, 50, 5)):
        in_update_bin = frame[(fewest_updates >= updates) & (fewest_updates < updates + 5)]
        gap = in_update_bin['rating_difference']
        for j, difference in enumerate(range(0, 500, 50)):
            counts[j, i] = int(((gap >= difference) & (gap < difference + 50)).sum())
    count_grids[label] = counts

# The names say "accuracy", but in this cell the grids hold set counts.
accuracy_of_top_8 = count_grids['Top 8']
accuracy_of_non_top_8 = count_grids['Non-Top 8']

fig, axs = plt.subplots(1, 2, figsize=(20, 10))
images = []
for ax, (label, counts) in zip(axs, count_grids.items()):
    images.append(ax.imshow(counts, cmap='Blues', aspect='auto'))
    ax.set_title(f'Set Count of {label}')
    # Write each bin's count into its cell (array row -> y, array column -> x).
    for r in range(counts.shape[0]):
        for c in range(counts.shape[1]):
            ax.text(c, r, f'{counts[r, c]}', ha='center', va='center', color='black')
    ax.set_xlabel('Min Updates to Rating')  # x-axis: update-count bins
    ax.set_ylabel('Rating Difference')      # y-axis: rating-difference bins
    ax.set_xticks(np.arange(0, 10))
    ax.set_yticks(np.arange(0, 10))
    ax.set_xticklabels(np.arange(0, 50, 5))    # lower edge of each update bin
    ax.set_yticklabels(np.arange(0, 500, 50))  # lower edge of each difference bin
cax1, cax2 = images


In [None]:
# Number of sets per bin, binned two ways:
#   columns - the larger of the two players' update counts (bins of 5, 0 to 50)
#   rows    - the absolute rating difference (bins of 50, 0 to 500)
count_grids = {}
for label, frame in (('Top 8', top_8_df), ('Non-Top 8', non_top_8_df)):
    counts = np.zeros((10, 10), dtype=np.int32)
    most_updates = np.maximum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(range(0, 50, 5)):
        in_update_bin = frame[(most_updates >= updates) & (most_updates < updates + 5)]
        gap = in_update_bin['rating_difference']
        for j, difference in enumerate(range(0, 500, 50)):
            counts[j, i] = int(((gap >= difference) & (gap < difference + 50)).sum())
    count_grids[label] = counts

# The names say "accuracy", but in this cell the grids hold set counts.
accuracy_of_top_8 = count_grids['Top 8']
accuracy_of_non_top_8 = count_grids['Non-Top 8']

fig, axs = plt.subplots(1, 2, figsize=(20, 10))
images = []
for ax, (label, counts) in zip(axs, count_grids.items()):
    images.append(ax.imshow(counts, cmap='Blues', aspect='auto'))
    ax.set_title(f'Set Count of {label}')
    # Write each bin's count into its cell (array row -> y, array column -> x).
    for r in range(counts.shape[0]):
        for c in range(counts.shape[1]):
            ax.text(c, r, f'{counts[r, c]}', ha='center', va='center', color='black')
    ax.set_xlabel('Max Updates to Rating')  # x-axis: update-count bins
    ax.set_ylabel('Rating Difference')      # y-axis: rating-difference bins
    ax.set_xticks(np.arange(0, 10))
    ax.set_yticks(np.arange(0, 10))
    ax.set_xticklabels(np.arange(0, 50, 5))    # lower edge of each update bin
    ax.set_yticklabels(np.arange(0, 500, 50))  # lower edge of each difference bin
cax1, cax2 = images

In [None]:
# Number of sets per bin, binned two ways:
#   columns - the larger of the two players' update counts (bins of 5, 0 to 50)
#   rows    - the absolute rating difference (bins of 10, 0 to 100)
# Both frames go through the same loop, so the lower bound of the update bin is
# tested against max(p1_updates, p2_updates) for the non-top-8 sets as well
# (it was previously tested against p2_updates alone).
count_grids = {}
for label, frame in (('Top 8', top_8_df), ('Non-Top 8', non_top_8_df)):
    counts = np.zeros((10, 10), dtype=np.int32)
    most_updates = np.maximum(frame['p1_updates'], frame['p2_updates'])
    for i, updates in enumerate(range(0, 50, 5)):
        in_update_bin = frame[(most_updates >= updates) & (most_updates < updates + 5)]
        gap = in_update_bin['rating_difference']
        for j, difference in enumerate(range(0, 100, 10)):
            counts[j, i] = int(((gap >= difference) & (gap < difference + 10)).sum())
    count_grids[label] = counts

# The names say "accuracy", but in this cell the grids hold set counts.
accuracy_of_top_8 = count_grids['Top 8']
accuracy_of_non_top_8 = count_grids['Non-Top 8']

fig, axs = plt.subplots(1, 2, figsize=(20, 10))
images = []
for ax, (label, counts) in zip(axs, count_grids.items()):
    images.append(ax.imshow(counts, cmap='Blues', aspect='auto'))
    ax.set_title(f'Set Count of {label}')
    # Write each bin's count into its cell (array row -> y, array column -> x).
    for r in range(counts.shape[0]):
        for c in range(counts.shape[1]):
            ax.text(c, r, f'{counts[r, c]}', ha='center', va='center', color='black')
    ax.set_xlabel('Max Updates to Rating')  # x-axis: update-count bins
    ax.set_ylabel('Rating Difference')      # y-axis: rating-difference bins
    ax.set_xticks(np.arange(0, 10))
    ax.set_yticks(np.arange(0, 10))
    ax.set_xticklabels(np.arange(0, 50, 5))    # lower edge of each update bin
    ax.set_yticklabels(np.arange(0, 100, 10))  # lower edge of each difference bin
cax1, cax2 = images


## Highest Ranked Player Wins Tournament


In [None]:
# Number of distinct tournaments that contain at least one top-8 labelled set.
len(top_8_tournament_keys)

In [None]:
# Tournament rows whose key is among the tournaments with top-8 labelled sets.
tournaments_with_top_8_df = tournament_info_df[tournament_info_df['key'].isin(top_8_tournament_keys)]
print(tournaments_with_top_8_df.head(3))

In [None]:
# Inspect the Python type of one 'placings' cell (row position 4, chosen as an
# arbitrary sample) and preview the first ten cells.
print(type(tournaments_with_top_8_df['placings'].iloc[4]))
tournaments_with_top_8_df['placings'].head(10)

In [None]:
# Print the full 'placings' value of the same sample row (position 4).
print(tournaments_with_top_8_df['placings'].iloc[4])

In [None]:
import pandas as pd

def extract_top_3(placings):
    """Return the player ids of the three best-ranked entries of ``placings``.

    Args:
        placings: An iterable of ``[player_id, rank]`` lists. Values that cannot
            be iterated (``None``, or the float NaN pandas uses for a missing
            cell) are treated as "no placings".

    Returns:
        pd.Series: Indexed by '1', '2', '3'; the player ids ordered by ascending
        rank, padded with None when fewer than three valid entries exist.
    """
    labels = ['1', '2', '3']

    # Missing cells arrive as None or NaN; calling len() on NaN would raise, so
    # anything that cannot be iterated counts as having no entries.
    try:
        entries = list(placings)
    except TypeError:
        entries = []

    # Keep only well-formed [player_id, rank] pairs with an integer rank.
    valid_placings = [
        entry for entry in entries
        if isinstance(entry, list) and len(entry) == 2 and isinstance(entry[1], int)
    ]

    # Best rank first; the sort is stable, so tied ranks keep their input order.
    ranked = sorted(valid_placings, key=lambda entry: entry[1])
    top_3 = [entry[0] for entry in ranked[:3]]

    # Pad to exactly three slots.
    top_3 += [None] * (3 - len(top_3))

    return pd.Series(top_3, index=labels)

# Build a table with each tournament's key and the ids of its top three players.
placement_columns = ['1', '2', '3']
top_3_df = tournaments_with_top_8_df.loc[:, ['key', 'placings']].copy()
top_3_df[placement_columns] = top_3_df['placings'].apply(extract_top_3)

# 'placings' was only needed to derive the three placement columns.
top_3_df = top_3_df.drop(columns='placings')

# Preview the result.
top_3_df.head()


In [None]:
# Preview the first 16 top-8 sets.
top_8_sets_df.head(16)

In [44]:
# NOTE(review): this reassigns top_8_locations, dropping the
# ['WQF', 'Winners Quarters', 'Winners Quarter-Final'] entry of the earlier
# definition. Re-running earlier cells after this one would silently use this
# shorter list — consider a distinct name.
top_8_locations = [
        ['WSF', 'Winners Semis', 'Winners Semi-Final'],
        ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
        ['LSF', 'Losers Semis', 'Losers Semi-Final'],
        ['WF', 'Winners Final', 'Winners Final'],
        ['LF', 'Losers Final', 'Losers Final'],
        ['GF', 'Grand Final', 'Grand Final'],
        ['GFR', 'GF Reset', 'Grand Final Reset']
    ] 

In [45]:
# All sets of the single tournament with key 's@sh7', used as a worked example.
temp_df = sets_df[sets_df['tournament_key'] == 's@sh7']

In [None]:
# Size and first rows of the example tournament's sets.
print(temp_df.shape)
temp_df.head()

In [47]:
# Sets of the example tournament that involve player id 'lain': first those
# where 'lain' is p1, then those where 'lain' is p2, stacked in that order.
losers_1_player_df = temp_df[temp_df['p1_id'] == 'lain']
losers_2_player_df = temp_df[temp_df['p2_id'] == 'lain']
losers_player_df = pd.concat([losers_1_player_df,losers_2_player_df])

In [None]:
# Show every set of the example tournament that involves 'lain'.
losers_player_df

In [None]:
# Show all sets of the example tournament.
temp_df

In [None]:
# Frequency of each 'location_names' value within the example tournament.
temp_df['location_names'].value_counts()