# Exploration of Top 8
In this notebook we want to:
- Filter out tournaments that do not have the canonical sets_df['location_names']
- Label the top 8 sets of a tournament.
- Determine the bracket, i.e. which of the losers of the winners set plays which of the winners of the losers sets.

We are also interested in:
- How often does a grand finals reset occur?
- How often does the winner of the loser's finals win the tournament?
- How often does a player coming into the top 8 from losers win the tournament.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

if os.path.exists('/workspace/data'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data/'
else:
    data_path = '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']


In [5]:
# Function to get the table names
def get_table_names(conn):
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    table_names = get_table_names(conn)
    dataframes = {}
    
    for table in table_names:
        # Load table into a DataFrame
        df = pd.read_sql(f"SELECT * FROM {table}", conn)
        
        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Check if any entry in the column is a valid JSON (list or dictionary)
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # Try parsing the column as JSON
                    df[col] = df[col].apply(lambda x: json.loads(x) if pd.notnull(x) else x)
                except (json.JSONDecodeError, TypeError):
                    # If it fails, skip the column
                    pass
        
        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df
        
    return dataframes

if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = + data_path + "melee_player_database.db"
    
    # Connect to the database
    conn = sqlite3.connect(path)

    # Convert each table into a DataFrame
    dfs = load_tables_to_dfs(conn)

    # Close the connection
    conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    
    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [6]:
dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int) 

In [7]:
# # Save the dictionary of DataFrames as a pickle
# with open(data_path + 'dfs_dict.pkl', 'wb') as f:
#     pickle.dump(dfs, f)

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played that character. (We verify this for Zain below.)

In [8]:
players_df = dfs['players_df']
players_df.head()


Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [9]:
ranking_df = dfs['ranking_df']
ranking_df.head()

Unnamed: 0,game,ranking_name,priority,region,seasons,tournaments,icon
0,melee,SSBMRank,0,world,"[2015, 2016, 2017, 2018, 2019]",[],miom


In [10]:
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

Unnamed: 0,game,ranking_name,season,start,end,total,by_id,by_placing,final,name
0,melee,SSBMRank,2015,1420070400,1451606399,100,"{'6189': 1, '1004': 2, '4465': 3, '1000': 4, '...","{'1': '6189', '2': '1004', '3': '4465', '4': '...",0,
1,melee,SSBMRank,2016,1451606400,1483228799,100,"{'6189': 1, '1004': 2, '1000': 3, '1003': 4, '...","{'1': '6189', '2': '1004', '3': '1000', '4': '...",0,
2,melee,SSBMRank,2017,1483228800,1514764799,100,"{'1004': 1, '6189': 2, '1000': 3, '1003': 4, '...","{'1': '1004', '2': '6189', '3': '1000', '4': '...",0,
3,melee,SSBMRank,2018,1514793600,1546329600,100,"{'1004': 1, '6189': 2, '4465': 3, '15990': 4, ...","{'1': '1004', '2': '6189', '3': '4465', '4': '...",0,
4,melee,SSBMRank,2019,1546329600,1577836800,100,"{'1004': 1, '4465': 2, '1000': 3, '16342': 4, ...","{'1': '1004', '2': '4465', '3': '1000', '4': '...",0,


In [11]:
sets_df = dfs['sets_df']
print(f"{sets_df[sets_df['game_data'].apply(lambda x: len(x) > 0)].shape[0] / sets_df.shape[0]:0.01%} percent of sets have some game data")
sets_df.head()



32.9% percent of sets have some game data


Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,5620,Chillin,3,1,"[R1, Round 1, Round 1]",,1,A,5,[]
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,2,3,"[R1, Round 1, Round 1]",,1,B,5,[]
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,6126,1097,3,0,"[R1, Round 1, Round 1]",,1,C,5,[]
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,1069,0,3,"[R1, Round 1, Round 1]",,1,D,5,[]
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Rishi,1,3,"[R1, Round 1, Round 1]",,1,E,5,[]


In [9]:
tournament_info_df = dfs['tournament_info_df']
print(tournament_info_df.shape)
tournament_info_df.head()


(39675, 20)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,country,state,city,entrants,placings,losses,bracket_types,online,lat,lng
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,US,VA,Fall's Church,10,"[[Rishi, 1], [15634, 3], [6126, 4], [Chu, 8], ...",{},b'{}',0,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,US,MI,Ann Arbor,92,[],{},b'{}',0,,
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,,,,20,"[[1000, 1], [6126, 2], [4107, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,,,,20,"[[6126, 1], [4107, 2], [1000, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,,,,20,"[[6126, 1], [3359, 2], [19554, 3], [4107, 3], ...",{},b'{}',1,0.0,0.0


## Overall Glicko-2 Exploration ##


Import weekly updated Glicko-2 rating.

In [10]:
player_ratings_df = pd.read_pickle(data_path + 'overall_players_ranking_new_weekly.pkl')
print(player_ratings_df.shape)
player_ratings_df.head()

(515, 96524)


Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2015-01-01,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2015-01-08,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2015-01-15,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2015-01-22,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2015-01-29,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


## Number of Glicko-2 updates
Running total of number of updates to each players glicko-2 rating. We use numba njit and prange to speed up the loops in the function. We save the results so that we only need to run the calculation once.

In [11]:

def previous_updates(array):
    """ This funcion returns an array like array with the number of times the value above i,j entry of array has changed.
    Args:
        array (np): the array

    Returns:
        np: the number of times array has changed above the i,j entry
    """
    previous_updates = np.zeros_like(array, dtype=np.int32)
    
    for i in range(1, array.shape[0]-1): # row i
        previous_row = array[i-1,:]
        # print(previous_row)
        current_row = array[i,:]
        # print(current_row)
        change = (previous_row != current_row).astype(np.int32)
        change

        previous_updates[i+1,:] = previous_updates[i,:] + change

    return previous_updates

## Testing array
# array = np.array([
#     [1, 1, 1],
#     [1, 1, 2],
#     [1, 2, 3],
#     [1, 3, 4]])

# print(array)
# previous_updates(array)
# print(previous_updates(array))

# # Do the calculation once.
# player_ratings_np = player_ratings_df.to_numpy()
# start = time.time()
# number_of_rating_updates_df = pd.DataFrame(columns=player_ratings_df.columns, index=player_ratings_df.index, data=previous_updates(player_ratings_np))
# end = time.time()
# print(f'time = {end-start:.2f}')
# number_of_rating_updates_df.head()

# # Save the results
# number_of_rating_updates_df.to_pickle(data_path + 'number_of_rating_updates_df.pkl')

## Load the results
number_of_rating_updates_df = pd.read_pickle(data_path + 'number_of_rating_updates_df.pkl')
number_of_rating_updates_df.head()

Unnamed: 0,3612758,181699,340266,629311,410076,742562,15095,1105962,2387953,2772668,...,3130307,2702567,2986921,jBomb,Adderall Admiral,1565080,2620119,2155545,2971127,130638
2015-01-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-08,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## Add some columns to sets_df
We add the start of the tournament, the player ratings at the start of the tournament, and the number of times the player's rating has been updated before the start of the tournament.

In [12]:
# Perform a merge on 'key' and 'tournament_key' to bring 'start' dates into sets_df
merged_df = sets_df.merge(tournament_info_df[['key', 'start']], left_on='tournament_key', right_on='key', how='left')


In [13]:
tqdm.pandas()
import swifter

# Function to get both Player 1 and Player 2 ratings and the number of rating updates
def get_ratings_and_updates(row, player_ratings_df, number_of_rating_updates_df):
    # Find the closest date in player_ratings_df that is <= 'start' date
    closest_date = player_ratings_df.index[player_ratings_df.index <= row['start']].max()
    
    # If there's no valid date, return None for ratings and updates
    if pd.isnull(closest_date):
        return pd.Series([None, None, None, None], index=['p1_rating', 'p2_rating', 'p1_updates', 'p2_updates'])
    
    # Fetch Player 1's and Player 2's ratings on the closest date
    p1_rating = player_ratings_df.loc[closest_date, row['p1_id']] if row['p1_id'] in player_ratings_df.columns else None
    p2_rating = player_ratings_df.loc[closest_date, row['p2_id']] if row['p2_id'] in player_ratings_df.columns else None
    
    # Fetch Player 1's and Player 2's number of rating updates on the closest date
    p1_updates = number_of_rating_updates_df.loc[closest_date, row['p1_id']] if row['p1_id'] in number_of_rating_updates_df.columns else None
    p2_updates = number_of_rating_updates_df.loc[closest_date, row['p2_id']] if row['p2_id'] in number_of_rating_updates_df.columns else None
    
    # Return all values as a pandas Series
    return pd.Series([p1_rating, p2_rating, p1_updates, p2_updates], 
                     index=['p1_rating', 'p2_rating', 'p1_updates', 'p2_updates'])

# Apply the function to each row in merged_df
# merged_df[['p1_rating', 'p2_rating', 'p1_updates', 'p2_updates']] = merged_df.progress_apply(
#     get_ratings_and_updates, axis=1, 
#     player_ratings_df=player_ratings_df, 
#     number_of_rating_updates_df=number_of_rating_updates_df,
# )

# Save
# merged_df.to_pickle(data_path + 'augmented_sets_df.pkl')

# # Load
augmented_sets_df = pd.read_pickle(data_path + 'tournament_sets_with_top_8_df.pkl')
augmented_sets_df.head()

Unnamed: 0,key_x,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,...,start,p1_rating,p2_rating,p1_updates,p2_updates,top_8,rating_difference,higher_rated_won,more_updates_won,p1_won
45,90101028,melee,s@sh7,Fija,Fija,Sasha,1,0,"[W1, Winners 1, Winners Round 1]",,...,2017-06-13 10:27:01,1667.529088,1500.0,1.0,0.0,False,167.529088,True,True,True
47,90101030,melee,s@sh7,Bird,Empty Spirits,Bird,0,1,"[W1, Winners 1, Winners Round 1]",,...,2017-06-13 10:27:01,1500.0,1622.33761,0.0,2.0,False,122.33761,True,True,False
48,90101031,melee,s@sh7,Stitchface,3551,Stitchface,0,1,"[W1, Winners 1, Winners Round 1]",,...,2017-06-13 10:27:01,1523.49794,1500.0,3.0,0.0,False,23.49794,False,False,False
50,90101033,melee,s@sh7,rodohk,phlops,rodohk,0,1,"[W1, Winners 1, Winners Round 1]",,...,2017-06-13 10:27:01,1252.681917,1500.0,1.0,0.0,False,247.318083,True,False,False
55,90101038,melee,s@sh7,Sorry,Psythr,Sorry,0,1,"[W1, Winners 1, Winners Round 1]",,...,2017-06-13 10:27:01,1400.124736,1500.0,1.0,0.0,False,99.875264,True,False,False


## Top 8 Locations
We look for tournaments with a sets with 'location_names' = ['GF', 'Grand Final', 'Grand Final'].
The assumption is that tournaments with that location name will have a consistent location name structure.

In [14]:

# # Filter the rows where 'location_names' exactly matches ['GF', 'Grand Final', 'Grand Final']
# gf_sets_df = sets_df[augmented_sets_df['location_names'].apply(lambda x: x == ['GF', 'Grand Final', 'Grand Final'])]

# # Extract the tournament keys for the Grand Finals
# gf_tournament_keys = list(gf_sets_df['tournament_key'])

# Filter the rows where 'location_names' exactly matches ['GF', 'Grand Final', 'Grand Final']
gf_sets_df = augmented_sets_df[augmented_sets_df['location_names'].apply(lambda x: x == ['GF', 'Grand Final', 'Grand Final'])]

# Extract the tournament keys for the Grand Finals
gf_tournament_keys = list(gf_sets_df['tournament_key'])

# Filter the sets_df to include only the sets from tournaments that had Grand Finals
valid_tournament_sets_df = augmented_sets_df[augmented_sets_df['tournament_key'].isin(gf_tournament_keys)]

# Display the result
print(valid_tournament_sets_df.head(3))
print(valid_tournament_sets_df['location_names'].value_counts().to_string())

    key_x   game                                 tournament_key winner_id  \
429        melee  httpsparagonchallongecomla_2015_melee_singles      1000   
430        melee  httpsparagonchallongecomla_2015_melee_singles      1032   
431        melee  httpsparagonchallongecomla_2015_melee_singles      1036   

    p1_id          p2_id  p1_score  p2_score  \
429  1000  Azen Zagenite         2         0   
430  1032           1089         2         1   
431  1036    MIOM | Toph         2         0   

                       location_names bracket_name  ...               start  \
429  [W1, Winners 1, Winners Round 1]               ... 2015-09-06 23:45:46   
430  [W1, Winners 1, Winners Round 1]               ... 2015-09-06 23:45:46   
431  [W1, Winners 1, Winners Round 1]               ... 2015-09-06 23:45:46   

       p1_rating    p2_rating p1_updates p2_updates  top_8  rating_difference  \
429  1950.348732  1500.000000        6.0        0.0  False         450.348732   
430  1769.147167  1

The location names of the top 8 games are the following:
- [f"L{n}", f"Losers {n}", f"Losers Round {n}], # Where n is the maximum n in all such location of the  tournament.  
- ['WSF', 'Winners Semis', 'Winners Semi-Final'],
- ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
- ['WF', 'Winners Final', 'Winners Final'],
- ['LSF', 'Losers Semis', 'Losers Semi-Final'],
- ['LF', 'Losers Final', 'Losers Final'],
- ['GF', 'Grand Final', 'Grand Final'],
- ['GFR', 'GF Reset', 'Grand Final Reset']

In [15]:
# The vast majority of the top 8 games have these "location_names"
top_8_locations = [                                   
        ['WSF', 'Winners Semis', 'Winners Semi-Final'],
        ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
        ['WF', 'Winners Final', 'Winners Final'],
        ['LSF', 'Losers Semis', 'Losers Semi-Final'],
        ['LF', 'Losers Final', 'Losers Final'],
        ['GF', 'Grand Final', 'Grand Final'],
        ['GFR', 'GF Reset', 'Grand Final Reset']
    ] 

In [16]:
def take_first_item(location_list):
    return location_list[0]
valid_tournament_sets_df['location_names'] = valid_tournament_sets_df['location_names'].progress_apply(take_first_item)


  0%|          | 0/1625864 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_tournament_sets_df['location_names'] = valid_tournament_sets_df['location_names'].progress_apply(take_first_item)


In [17]:
# import re



# def max_losers_round_pattern(location_name):
#     losers_round_pattern = r'^L(\d+)$'
#     return re.match(losers_round_pattern, location_name)
        
# def find_last_losers(tournament_key):
#     tournament_sets_df = valid_tournament_sets_df[valid_tournament_sets_df['tournament_key'] == tournament_key]
    
#     mask = tournament_sets_df['location_names'].apply(max_losers_round_pattern)
    
#     losers_rounds = tournament_sets_df[mask]['location_names'][1:].to(int)
#     return max(losers_rounds)



# valid_tournaments_df = tournament_info_df[tournament_info_df['key'].isin(gf_tournament_keys)]

# valid_tournaments_df['last_losers_round'] = valid_tournaments_df['key'].progress_apply(find_last_losers)

    

In [18]:
import re
# import pandas as pd

# Compile the regex pattern once
losers_round_pattern = re.compile(r'^L(\d+)$')

# Function to extract the first item from the list in 'location_names'
def take_first_item(location_list):
    return location_list[0]

# Function to extract the round number 'n' from location names like 'L{n}', returns None if no match
def extract_losers_round(location_name):
    match = losers_round_pattern.match(location_name)
    if match:
        return int(match.group(1))
    return None

# Extract the losers round numbers in one go
valid_tournament_sets_df['losers_round_n'] = valid_tournament_sets_df['location_names'].apply(extract_losers_round)

# Group by 'tournament_key' and find the maximum losers round for each tournament
max_losers_round_by_tournament = valid_tournament_sets_df.groupby('tournament_key')['losers_round_n'].max().reset_index()

# Filter tournaments based on 'gf_tournament_keys'
valid_tournaments_df = tournament_info_df[tournament_info_df['key'].isin(gf_tournament_keys)]

# Merge the max losers round with the tournaments DataFrame
valid_tournaments_df = valid_tournaments_df.merge(max_losers_round_by_tournament, left_on='key', right_on='tournament_key', how='left')

# Rename the column for clarity
valid_tournaments_df.rename(columns={'losers_round_n': 'last_losers_round'}, inplace=True)

# Fill NaN values with a default value (e.g., -1 or 0, depending on your requirements)
valid_tournaments_df['last_losers_round'].fillna(0, inplace=True)

# Convert the column to integer type
valid_tournaments_df['last_losers_round'] = valid_tournaments_df['last_losers_round'].astype(int)

# Display the result
print(valid_tournaments_df['last_losers_round'])


0        7
1        7
2        6
3        7
4        5
        ..
35030    5
35031    0
35032    2
35033    0
35034    2
Name: last_losers_round, Length: 35035, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_tournament_sets_df['losers_round_n'] = valid_tournament_sets_df['location_names'].apply(extract_losers_round)


In [19]:
top_8_locations = ['WSF']

# Create a dictionary to map each tournament_key to its last_losers_round for faster lookups
last_losers_round_dict = valid_tournaments_df.set_index('key')['last_losers_round'].to_dict()

# Vectorized check if 'location_names' is in the top 8 predefined locations
top_8_mask = valid_tournament_sets_df['location_names'].isin(top_8_locations)

# Vectorized check if the 'location_names' is equal to the corresponding 'last_losers_round' for each tournament_key
losers_round_mask = valid_tournament_sets_df.apply(
    lambda row: row['location_names'] == f"L{last_losers_round_dict.get(row['tournament_key'], '')}",
    axis=1
)

# Combine the two conditions using bitwise OR (|) to get the final mask
top_8_combined_mask = top_8_mask | losers_round_mask

# Filter the DataFrame based on the combined mask
valid_top_8_sets_df = valid_tournament_sets_df[top_8_combined_mask]

# Display the result
valid_top_8_sets_df.head()


Unnamed: 0,key_x,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,...,p1_rating,p2_rating,p1_updates,p2_updates,top_8,rating_difference,higher_rated_won,more_updates_won,p1_won,losers_round_n
489,,melee,httpsparagonchallongecomla_2015_melee_singles,1000,1000,16342,3,1,WSF,,...,1950.348732,1888.611974,6.0,6.0,True,61.736758,True,False,True,
490,,melee,httpsparagonchallongecomla_2015_melee_singles,1003,4465,1003,1,3,WSF,,...,1964.348729,1876.719754,11.0,6.0,True,87.628975,False,False,False,
548,,melee,httpsparagonchallongecomla_2015_melee_singles,1008,1023,1008,0,3,L7,,...,1802.177215,1878.237223,4.0,8.0,False,76.060007,True,True,False,7.0
549,,melee,httpsparagonchallongecomla_2015_melee_singles,1004,1004,13932,3,1,L7,,...,2001.144322,1823.077409,7.0,6.0,False,178.066913,True,True,True,7.0
615,,melee,httpsdl4-5challongecomdl45meleeprobracket,6189,6189,6103,3,0,WSF,,...,1983.490955,1500.0,5.0,0.0,True,483.490955,True,True,True,


In [20]:
valid_top_8_sets_df.head(20)

Unnamed: 0,key_x,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,...,p1_rating,p2_rating,p1_updates,p2_updates,top_8,rating_difference,higher_rated_won,more_updates_won,p1_won,losers_round_n
489,,melee,httpsparagonchallongecomla_2015_melee_singles,1000,1000,16342,3,1,WSF,,...,1950.348732,1888.611974,6.0,6.0,True,61.736758,True,False,True,
490,,melee,httpsparagonchallongecomla_2015_melee_singles,1003,4465,1003,1,3,WSF,,...,1964.348729,1876.719754,11.0,6.0,True,87.628975,False,False,False,
548,,melee,httpsparagonchallongecomla_2015_melee_singles,1008,1023,1008,0,3,L7,,...,1802.177215,1878.237223,4.0,8.0,False,76.060007,True,True,False,7.0
549,,melee,httpsparagonchallongecomla_2015_melee_singles,1004,1004,13932,3,1,L7,,...,2001.144322,1823.077409,7.0,6.0,False,178.066913,True,True,True,7.0
615,,melee,httpsdl4-5challongecomdl45meleeprobracket,6189,6189,6103,3,0,WSF,,...,1983.490955,1500.0,5.0,0.0,True,483.490955,True,True,True,
616,,melee,httpsdl4-5challongecomdl45meleeprobracket,4465,4465,12870,3,0,WSF,,...,1865.556343,1626.664862,5.0,1.0,True,238.891482,True,True,True,
675,,melee,httpsdl4-5challongecomdl45meleeprobracket,Thomas,4215,Thomas,1,3,L7,,...,1669.123922,1572.12945,2.0,1.0,False,96.994472,False,False,False,7.0
981,,melee,httpsapex2015meleechallongecomsingles,4465,1000,4465,1,3,WSF,,...,1669.343857,1884.124269,1.0,2.0,True,214.780412,True,True,False,
982,,melee,httpsapex2015meleechallongecomsingles,1002,6189,1002,2,3,WSF,,...,1735.105186,1500.0,2.0,0.0,True,235.105186,False,False,False,
1025,,melee,httpsapex2015meleechallongecomsingles,1004,1004,1013,3,2,L6,,...,1941.455548,1669.343857,1.0,1.0,False,272.11169,True,False,True,6.0


In [21]:
# Step 1: Group by 'tournament_key' and check if any of the players in top 8 have updates == 0
tournaments_with_zero_updates = valid_top_8_sets_df.groupby('tournament_key').filter(
    lambda x: not ((x['p1_updates'] <= 1).any() or (x['p2_updates'] <= 1).any())
)
tournaments_with_zero_updates['higher_rated_won'].sum()/tournaments_with_zero_updates.shape[0]
# Step 2: Display the filtered DataFrame
# tournaments_with_zero_updates.head(20)


0.7711373107256426

In [22]:
tournaments_with_zero_updates['higher_rated_won'].sum()/tournaments_with_zero_updates.shape[0]

0.7711373107256426

In [23]:
tournaments_with_zero_updates.shape[0]

87109

In [None]:
sets_with_top_8 = sets_df.copy()

In [36]:
top_8_col = sets_df.apply(lambda row: row.index.isin(sets_with_top_8.index))

In [42]:
sets_with_top_8['top_8'] = top_8_combined_mask

In [48]:
sets_with_top_8['top_8'].sum() / sets_with_top_8.shape[0]

0.08983444164080369

In [51]:
# Filter the rows where 'location_names' exactly matches ['GF', 'Grand Final', 'Grand Final']
gf_sets_df = sets_df[sets_df['location_names'].apply(lambda x: x == ['GF', 'Grand Final', 'Grand Final'])]

# Extract the tournament keys for the Grand Finals
gf_tournament_keys = list(gf_sets_df['tournament_key'])

# Filter the sets_df to include only the sets from tournaments that had Grand Finals
valid_tournament_sets_df = sets_df[sets_df['tournament_key'].isin(gf_tournament_keys)]

valid_tournament_sets_df['top_8'] = top_8_combined_mask

# Display the result
valid_tournament_sets_df.head(3)
# print(valid_tournament_sets_df['location_names'].value_counts().to_string())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_tournament_sets_df['top_8'] = top_8_combined_mask


Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8
45,90101028,melee,s@sh7,Fija,Fija,Sasha,1,0,"[W1, Winners 1, Winners Round 1]",,1,A,0,[],
46,90101029,melee,s@sh7,Don Juan,Don Juan,absynthe,1,0,"[W1, Winners 1, Winners Round 1]",,1,B,0,[],
47,90101030,melee,s@sh7,Bird,Empty Spirits,Bird,0,1,"[W1, Winners 1, Winners Round 1]",,1,C,0,[],


In [54]:
valid_tournament_sets_df['top_8'].sum()/ valid_tournament_sets_df.shape[0] * 100

9.547014819374082

In [59]:
valid_tournament_sets_df.to_pickle(data_path + 'sets_with_labeled_top_8_df.pkl')

In [60]:
pd.read_pickle(data_path + 'sets_with_labeled_top_8_df.pkl')

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8
45,90101028,melee,s@sh7,Fija,Fija,Sasha,1,0,"[W1, Winners 1, Winners Round 1]",,1,A,0,[],
46,90101029,melee,s@sh7,Don Juan,Don Juan,absynthe,1,0,"[W1, Winners 1, Winners Round 1]",,1,B,0,[],
47,90101030,melee,s@sh7,Bird,Empty Spirits,Bird,0,1,"[W1, Winners 1, Winners Round 1]",,1,C,0,[],
48,90101031,melee,s@sh7,Stitchface,3551,Stitchface,0,1,"[W1, Winners 1, Winners Round 1]",,1,D,0,[],
49,90101032,melee,s@sh7,Pham,Pham,Juicebox,1,0,"[W1, Winners 1, Winners Round 1]",,1,E,0,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795676,gg__76279646,melee,dice-dance-17__1v1-melee,908884,908884,3196854,3,1,"[L2, Losers 2, Losers Round 2]",Bracket final,2,O,3,"[{'winner_id': 3196854, 'loser_id': 908884, 'w...",True
1795677,gg__76279647,melee,dice-dance-17__1v1-melee,2791218,2791218,495503,3,1,"[LQF, Losers Quarters, Losers Quarter-Final]",Bracket final,2,P,3,"[{'winner_id': 2791218, 'loser_id': 495503, 'w...",False
1795678,gg__76279648,melee,dice-dance-17__1v1-melee,908884,2407110,908884,0,3,"[LQF, Losers Quarters, Losers Quarter-Final]",Bracket final,2,Q,3,"[{'winner_id': 908884, 'loser_id': 2407110, 'w...",False
1795679,gg__76279649,melee,dice-dance-17__1v1-melee,2791218,2791218,908884,3,0,"[LSF, Losers Semis, Losers Semi-Final]",Bracket final,2,R,3,"[{'winner_id': 2791218, 'loser_id': 908884, 'w...",False
