# Exploration of Top 8
In this notebook we want to:
- Filter out tournaments that do not have the canonical sets_df['location_names']
- Label the top 8 sets of a tournament.
- Determine the bracket, i.e. which of the losers of the winners set plays which of the winners of the losers sets.

We are also interested in:
- How often does a grand finals reset occur?
- How often does the winner of the loser's finals win the tournament?
- How often does a player coming into the top 8 from losers win the tournament.

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

if os.path.exists('/workspace/data'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data/'
else:
    data_path = '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']


In [89]:
# Function to get the table names
def get_table_names(conn):
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    table_names = get_table_names(conn)
    dataframes = {}
    
    for table in table_names:
        # Load table into a DataFrame
        df = pd.read_sql(f"SELECT * FROM {table}", conn)
        
        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Check if any entry in the column is a valid JSON (list or dictionary)
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # Try parsing the column as JSON
                    df[col] = df[col].apply(lambda x: json.loads(x) if pd.notnull(x) else x)
                except (json.JSONDecodeError, TypeError):
                    # If it fails, skip the column
                    pass
        
        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df
        
    return dataframes

if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = + data_path + "melee_player_database.db"
    
    # Connect to the database
    conn = sqlite3.connect(path)

    # Convert each table into a DataFrame
    dfs = load_tables_to_dfs(conn)

    # Close the connection
    conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    
    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [90]:
dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int) 

In [91]:
# # Save the dictionary of DataFrames as a pickle
# with open(data_path + 'dfs_dict.pkl', 'wb') as f:
#     pickle.dump(dfs, f)

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played that character. (We verify this for Zain below.)

In [92]:
players_df = dfs['players_df']
players_df.head()


Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [93]:
ranking_df = dfs['ranking_df']
ranking_df.head()

Unnamed: 0,game,ranking_name,priority,region,seasons,tournaments,icon
0,melee,SSBMRank,0,world,"[2015, 2016, 2017, 2018, 2019]",[],miom


In [94]:
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

Unnamed: 0,game,ranking_name,season,start,end,total,by_id,by_placing,final,name
0,melee,SSBMRank,2015,1420070400,1451606399,100,"{'6189': 1, '1004': 2, '4465': 3, '1000': 4, '...","{'1': '6189', '2': '1004', '3': '4465', '4': '...",0,
1,melee,SSBMRank,2016,1451606400,1483228799,100,"{'6189': 1, '1004': 2, '1000': 3, '1003': 4, '...","{'1': '6189', '2': '1004', '3': '1000', '4': '...",0,
2,melee,SSBMRank,2017,1483228800,1514764799,100,"{'1004': 1, '6189': 2, '1000': 3, '1003': 4, '...","{'1': '1004', '2': '6189', '3': '1000', '4': '...",0,
3,melee,SSBMRank,2018,1514793600,1546329600,100,"{'1004': 1, '6189': 2, '4465': 3, '15990': 4, ...","{'1': '1004', '2': '6189', '3': '4465', '4': '...",0,
4,melee,SSBMRank,2019,1546329600,1577836800,100,"{'1004': 1, '4465': 2, '1000': 3, '16342': 4, ...","{'1': '1004', '2': '4465', '3': '1000', '4': '...",0,


In [95]:
sets_df = dfs['sets_df']
print(f"{sets_df[sets_df['game_data'].apply(lambda x: len(x) > 0)].shape[0] / sets_df.shape[0]:0.01%} percent of sets have some game data")
sets_df.shape



32.9% percent of sets have some game data


(1795681, 14)

In [96]:
tournament_info_df = dfs['tournament_info_df']
print(tournament_info_df.shape)
tournament_info_df.head()


(39675, 20)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,country,state,city,entrants,placings,losses,bracket_types,online,lat,lng
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,US,VA,Fall's Church,10,"[[Rishi, 1], [15634, 3], [6126, 4], [Chu, 8], ...",{},b'{}',0,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,US,MI,Ann Arbor,92,[],{},b'{}',0,,
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,,,,20,"[[1000, 1], [6126, 2], [4107, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,,,,20,"[[6126, 1], [4107, 2], [1000, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,,,,20,"[[6126, 1], [3359, 2], [19554, 3], [4107, 3], ...",{},b'{}',1,0.0,0.0


## Filter out some touraments
We start by looking for sets_df['location_names'] are the most common.

In [97]:
# We use .to_string() so that we print out all the values.
print(sets_df['location_names'].value_counts().to_string())

location_names
[W1, Winners 1, Winners Round 1]                                                              218928
[L2, Losers 2, Losers Round 2]                                                                178053
[W2, Winners 2, Winners Round 2]                                                              176575
[WQF, Winners Quarters, Winners Quarter-Final]                                                171715
[L1, Losers 1, Losers Round 1]                                                                163507
[L3, Losers 3, Losers Round 3]                                                                111521
[WSF, Winners Semis, Winners Semi-Final]                                                       89587
[LQF, Losers Quarters, Losers Quarter-Final]                                                   83806
[R1, Round 1, Round 1]                                                                         60476
[R2, Round 2, Round 2]                                                      

From the value counts we see that there are several sets_df['location_names'] that correspond to the finals of the tournament:
- ['GF', 'Grand Final',' Grand Final']              35523
- ['F', 'Final', 'Final']                           615
- ['Grand Finals', 'Grand Finals', 'Grand Finals']  7
- [Grand Final, Grand Final, Grand Final]           1

We will filter out the tournaments that do not have a set with ['GF', 'Grand Final',' Grand Final'] in their location names. That way the location names of all the sets in the tournament should be consistent.

In [98]:
# Filter the rows where 'location_names' exactly matches ['GF', 'Grand Final', 'Grand Final']
gf_sets_df = sets_df[sets_df['location_names'].apply(lambda x: x == ['GF', 'Grand Final', 'Grand Final'])]

# Extract the tournament keys for the Grand Finals
gf_tournament_keys = list(gf_sets_df['tournament_key'])

# Filter the sets_df to include only the sets from tournaments that had Grand Finals
valid_tournament_sets_df = sets_df[sets_df['tournament_key'].isin(gf_tournament_keys)]


Here is the structure of a typical top 8 bracket.
![alt text](top_8.png "Top 8 Bracket")
We need to figure out what location names correspond to which positions.



I suspect that the location names of the top 8 games are the following:
- [f"L{n}", f"Losers {n}", f"Losers Round {n}], # Where n is the maximum n in all such location of the  tournament.  
- ['WSF', 'Winners Semis', 'Winners Semi-Final'],
- ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
- ['WF', 'Winners Final', 'Winners Final'],
- ['LSF', 'Losers Semis', 'Losers Semi-Final'],
- ['LF', 'Losers Final', 'Losers Final'],
- ['GF', 'Grand Final', 'Grand Final'],'
- ['GFR', 'GF Reset', 'Grand Final Reset']

We will test that hypothesis.

In [99]:
# For now we ignore the L{n} location name.
top_8_locations = [                                   
        ['WSF', 'Winners Semis', 'Winners Semi-Final'],
        ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
        ['WF', 'Winners Final', 'Winners Final'],
        ['LSF', 'Losers Semis', 'Losers Semi-Final'],
        ['LF', 'Losers Final', 'Losers Final'],
        ['GF', 'Grand Final', 'Grand Final'],
        ['GFR', 'GF Reset', 'Grand Final Reset']
    ] 

valid_tournament_sets_df[valid_tournament_sets_df['location_names'].isin(top_8_locations)]['location_names'].value_counts()

location_names
[WSF, Winners Semis, Winners Semi-Final]        87795
[LQF, Losers Quarters, Losers Quarter-Final]    82289
[WF, Winners Final, Winners Final]              37839
[LSF, Losers Semis, Losers Semi-Final]          37771
[LF, Losers Final, Losers Final]                35660
[GF, Grand Final, Grand Final]                  35523
[GFR, GF Reset, Grand Final Reset]              10817
Name: count, dtype: int64

If our hypothesis was correct, the there should be the same number of sets with location_names WF, LF, and GF, because the grand finals consisit of the winner from the losers final and the winners of the winners final. But the counts of those in our filtered data set do not match.

In [100]:
print('The number of tourmanets in our filtered dataset is', len(gf_tournament_keys))
print()
# Display the value counts of the remaining location names.
print(valid_tournament_sets_df['location_names'].value_counts().to_string())

The number of tourmanets in our filtered dataset is 35523

location_names
[W1, Winners 1, Winners Round 1]                       213439
[L2, Losers 2, Losers Round 2]                         173671
[W2, Winners 2, Winners Round 2]                       172018
[WQF, Winners Quarters, Winners Quarter-Final]         167356
[L1, Losers 1, Losers Round 1]                         159329
[L3, Losers 3, Losers Round 3]                         108786
[WSF, Winners Semis, Winners Semi-Final]                87795
[LQF, Losers Quarters, Losers Quarter-Final]            82289
[L4, Losers 4, Losers Round 4]                          55326
[R1, Round 1, Round 1]                                  47676
[R2, Round 2, Round 2]                                  47401
[R3, Round 3, Round 3]                                  47179
[R4, Round 4, Round 4]                                  38438
[R5, Round 5, Round 5]                                  37896
[WF, Winners Final, Winners Final]                      37

In [101]:
valid_tournament_sets_df['location_names'] = valid_tournament_sets_df['location_names'].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_tournament_sets_df['location_names'] = valid_tournament_sets_df['location_names'].apply(lambda x: x[0])


We need to process the tournaments with empty bracket_name seperately from the tournaments with non-empty bracket names. We start with tournaments with nothing in the bracket_name column.

In [102]:
# Filter the sets from tournaments with empty bracket_name
valid_tournament_sets_no_bracket_df = valid_tournament_sets_df[valid_tournament_sets_df['bracket_name'] == ""]

# These are the top_8 locations, not including L{n}
top_8_locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']                        

# Filter the sets with those location_names
top_8_no_bracket_name_df = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'].isin(top_8_locations)]

# Check that the number of games matches up.
top_8_no_bracket_name_df['location_names'].value_counts()

location_names
WSF    30
LQF    30
GF     19
WF     15
LSF    15
LF     15
Name: count, dtype: int64

We should expect the same number of WF, LSF, and LF as there are GF and there should be double that for the LQF and WSF. I hypothesise that some of these are labelled as W{n} and L{n}. Lets check if any of the tournaments do not have a set labelled WF.

In [103]:
groups = valid_tournament_sets_no_bracket_df.groupby('tournament_key')

all_tournaments_have_wf = True
for key, frame in groups:
    if not frame['location_names'].isin(["WF"]).any():
        print(key, "has no WF sets.")
        all_tournaments_have_wf = False

if all_tournaments_have_wf:
    print("All tournaments have a WF set.")

All tournaments have a WF set.


All tournaments have a WF. Lets check to see how if there are tournaments with two GF. 

In [104]:
tournaments_with_gfr = []

for key, frame in groups:
    if frame['location_names'].isin(["GF"]).sum()==2:
        print(key, "has two gf sets.")
        tournaments_with_gfr.append(key)



dhwsmash has two gf sets.
httpsapex2015meleechallongecomsingles has two gf sets.
httpsteamheirchallongecomheir2singles has two gf sets.
pslmeleetop32 has two gf sets.


So in all these tournaments, a grand finals reset is has location name GF. Lets print out the GF sets in tournaments with a GF reset.

In [105]:
gfr_sets_df = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['tournament_key'].isin(tournaments_with_gfr)]
gfr_sets_df = gfr_sets_df[gfr_sets_df['location_names']=='GF']
gfr_sets_df

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
1030,,melee,httpsapex2015meleechallongecomsingles,6189,1002,6189,2,3,GF,,1,AV,5,[]
1031,,melee,httpsapex2015meleechallongecomsingles,1002,6189,1002,0,3,GF,,1,AW,5,[]
1219,,melee,pslmeleetop32,1000,1010,1000,0,3,GF,,1,AF,5,[]
1220,,melee,pslmeleetop32,1000,1000,1010,3,0,GF,,1,AG,5,[]
1720,,melee,dhwsmash,6189,1004,6189,0,3,GF,,1,DX,5,[]
1721,,melee,dhwsmash,1004,6189,1004,1,3,GF,,1,DY,5,[]
1903,,melee,httpsteamheirchallongecomheir2singles,6189,13932,6189,1,3,GF,,1,CN,5,[]
1904,,melee,httpsteamheirchallongecomheir2singles,6189,6189,13932,3,1,GF,,1,CO,5,[]


<span style="color:red">To Do: We need to figure out a way to determine which is the GF and which is the GF reset.</span>

From here, we need to get the top 8 tournaments sets with L{n} where n is maximal for that tournament.


In [106]:
# Sort the DataFrame by 'location_names' before grouping
losers_sets_df = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'].apply(lambda x: x[0]=='L')]
losers_sets_df = losers_sets_df.sort_values(['tournament_key','location_names'])
groups = losers_sets_df.groupby('tournament_key')

last_l_sets = []

for _, frame in groups:
    last_l_sets.append(frame.iloc[-6:-4]) # I want to append the index that that appears in losers_sets_df

all_top_8_no_bracket_name_df = pd.concat([top_8_no_bracket_name_df, pd.concat(last_l_sets)])
all_top_8_no_bracket_name_df.sort_values(['tournament_key','location_names'], inplace=True)
all_top_8_no_bracket_name_df.head(30)

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
1720,,melee,dhwsmash,6189,1004,6189,0,3,GF,,1,DX,5,[]
1721,,melee,dhwsmash,1004,6189,1004,1,3,GF,,1,DY,5,[]
1714,,melee,dhwsmash,1000,1013,1000,0,3,L9,,1,IP,5,[]
1715,,melee,dhwsmash,1037,1037,1024,3,2,L9,,1,IQ,5,[]
1719,,melee,dhwsmash,6189,6189,15990,3,0,LF,,1,IU,5,[]
1716,,melee,dhwsmash,1008,1008,1000,3,0,LQF,,1,IR,5,[]
1717,,melee,dhwsmash,15990,15990,1037,3,0,LQF,,1,IS,5,[]
1718,,melee,dhwsmash,15990,1008,15990,0,3,LSF,,1,IT,5,[]
1593,,melee,dhwsmash,1004,6189,1004,1,3,WF,,1,DW,5,[]
1591,,melee,dhwsmash,6189,6189,15990,3,0,WSF,,1,DU,5,[]


### ``all_top_8_no_bracket_name_df`` should be all the top 8 sets without a bracket_name. Looping over groups is slow, lets avoid that.

In [107]:
# Filter for tournaments with empty 'bracket_name'
valid_tournament_sets_no_bracket_df = valid_tournament_sets_df[valid_tournament_sets_df['bracket_name'] == ""]

# Define top_8 locations (excluding L{n})
top_8_locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

# Filter sets with those location_names for top_8
top_8_no_bracket_name_df = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'].isin(top_8_locations)]

# Filter for rows where location_names start with 'L'
losers_sets_df = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'].str.startswith('L')]

# Sort by 'tournament_key' and 'location_names'
losers_sets_df = losers_sets_df.sort_values(['tournament_key', 'location_names'])

# Select the 5th and 6th last rows from each group in one go
last_l_sets = losers_sets_df.groupby('tournament_key').nth([-6, -5])

# Combine the filtered top_8 sets with the last sets in losers bracket
all_top_8_no_bracket_name_df_2 = pd.concat([top_8_no_bracket_name_df, last_l_sets]).sort_index()
all_top_8_no_bracket_name_df_2.head(10)


Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
133,90101116,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,WSF,,1,CK,0,[]
134,90101117,melee,s@sh7,Ginger,1008,Ginger,0,1,WSF,,1,CL,0,[]
135,90101118,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),Ginger,1,0,WF,,1,CM,0,[]
220,90101203,melee,s@sh7,tm,Bbbbbbbbbexic,tm,0,1,L8,,1,FV,0,[]
221,90101204,melee,s@sh7,lain,lain,math,1,0,L8,,1,FW,0,[]
222,90101205,melee,s@sh7,1008,1008,tm,1,0,LQF,,1,FX,0,[]
223,90101206,melee,s@sh7,kjh,kjh,lain,1,0,LQF,,1,FY,0,[]
224,90101207,melee,s@sh7,kjh,1008,kjh,0,1,LSF,,1,FZ,0,[]
225,90101208,melee,s@sh7,kjh,Ginger,kjh,0,1,LF,,1,GA,0,[]
226,90101209,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,GF,,1,CN,0,[]


Check that the results are the same.

In [108]:
all_top_8_no_bracket_name_df.sort_index()

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
133,90101116,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,WSF,,1,CK,0,[]
134,90101117,melee,s@sh7,Ginger,1008,Ginger,0,1,WSF,,1,CL,0,[]
135,90101118,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),Ginger,1,0,WF,,1,CM,0,[]
220,90101203,melee,s@sh7,tm,Bbbbbbbbbexic,tm,0,1,L8,,1,FV,0,[]
221,90101204,melee,s@sh7,lain,lain,math,1,0,L8,,1,FW,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1900,,melee,httpsteamheirchallongecomheir2singles,6189,Langley Trifasia,6189,0,3,LQF,,1,FY,5,[]
1901,,melee,httpsteamheirchallongecomheir2singles,6189,12870,6189,0,3,LSF,,1,FZ,5,[]
1902,,melee,httpsteamheirchallongecomheir2singles,6189,5625,6189,0,3,LF,,1,GA,5,[]
1903,,melee,httpsteamheirchallongecomheir2singles,6189,13932,6189,1,3,GF,,1,CN,5,[]


In [109]:
all_top_8_no_bracket_name_df.equals(all_top_8_no_bracket_name_df_2)

False

Lets now handle the tournaments with a bracket name.

In [110]:
# Filter for tournaments with a non-empty 'bracket_name'
valid_tournament_sets_with_bracket_df = valid_tournament_sets_df[valid_tournament_sets_df['bracket_name'] != ""]

gf_rows = valid_tournament_sets_with_bracket_df[valid_tournament_sets_with_bracket_df['location_names'] == 'GF']

gf_bracket_names = gf_rows[['tournament_key', 'bracket_name']]

# Merge to get the final bracket sets
tournament_sets_final_bracket_df = valid_tournament_sets_with_bracket_df.reset_index().merge(
    gf_bracket_names,
    on=['tournament_key', 'bracket_name'],
    how='inner',
    # indicatorbool=True
).set_index('index')

# Define top_8 locations (excluding L{n})
top_8_locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

# Filter sets with those location_names for top_8
top_8_with_bracket_name_df = tournament_sets_final_bracket_df[tournament_sets_final_bracket_df['location_names'].isin(top_8_locations)]

# Filter for rows where location_names start with 'L'
losers_sets_df = tournament_sets_final_bracket_df[tournament_sets_final_bracket_df['location_names'].str.startswith('L')]

# Sort by 'tournament_key' and 'location_names'
losers_sets_df = losers_sets_df.sort_values(['tournament_key', 'location_names'])

# Select the 5th and 6th last rows from each group in one go
last_l_sets = losers_sets_df.groupby('tournament_key').nth([-6, -5])

# Combine the filtered top_8 sets with the last sets in losers bracket
all_top_8_with_bracket_name_df = pd.concat([top_8_with_bracket_name_df, last_l_sets]).sort_index()

all_top_8_with_bracket_name_df.head(10)

Unnamed: 0_level_0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11482,,melee,shine-2018__melee-singles,6126,1004,6126,2,3,WSF,Top 8,4,A,3,"[{'loser_char': 'melee/marth', 'winner_score':..."
11483,,melee,shine-2018__melee-singles,1000,15990,1000,1,3,WSF,Top 8,4,B,3,"[{'loser_char': 'melee/samus', 'winner_score':..."
11484,,melee,shine-2018__melee-singles,6126,6126,1000,3,2,WF,Top 8,4,C,3,"[{'loser_char': 'melee/marth', 'winner_score':..."
11485,,melee,shine-2018__melee-singles,6126,6126,1004,3,1,GF,Top 8,4,0_D,3,"[{'loser_char': 'melee/jigglypuff', 'winner_sc..."
11486,,melee,shine-2018__melee-singles,1017,1017,1013,3,1,L1,Top 8,4,F,3,[]
11487,,melee,shine-2018__melee-singles,1077,1077,1019,3,2,L1,Top 8,4,G,3,"[{'loser_char': 'melee/fox', 'winner_score': 1..."
11488,,melee,shine-2018__melee-singles,15990,15990,1017,3,1,LQF,Top 8,4,H,3,"[{'loser_char': 'melee/captainfalcon', 'winner..."
11489,,melee,shine-2018__melee-singles,1004,1004,1077,3,0,LQF,Top 8,4,I,3,"[{'loser_char': 'melee/captainfalcon', 'winner..."
11490,,melee,shine-2018__melee-singles,1004,15990,1004,1,3,LSF,Top 8,4,J,3,[]
11491,,melee,shine-2018__melee-singles,1004,1000,1004,0,3,LF,Top 8,4,K,3,"[{'loser_char': 'melee/falco', 'winner_score':..."


We expect the WF, LF, and LSF to have the same value counts. We expect LQF and WSF to have the same value count which should be double that of the previous three. We see that this is not the case

In [111]:
all_top_8_with_bracket_name_df['location_names'].value_counts()

location_names
WSF    70070
LQF    68117
GF     35586
WF     35499
LF     35453
LSF    35452
L2     16006
L1     12896
L3     12424
L4     12000
GFR    10820
L5      5042
L6      2780
L7       898
L8       266
L9        32
Name: count, dtype: int64

We check to see if any location names are missing in each tournament. As we see, all looks good here.

In [112]:
locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

for location in locations:
    # Check if each tournament has at least one "WF" in the location_names
    tournaments_with_wf = valid_tournament_sets_no_bracket_df[
        valid_tournament_sets_no_bracket_df['location_names'] == location
    ]['tournament_key'].unique()

    # Check if all tournament_keys are represented in tournaments_with_wf
    all_tournaments_have_wf = set(valid_tournament_sets_no_bracket_df['tournament_key'].unique()) <= set(tournaments_with_wf)

    if all_tournaments_have_wf:
        print(f"All tournaments have at least one {location} set.")
    else:
        print(f"At least one tournaments is missing a {location} set.")


All tournaments have at least one WSF set.
All tournaments have at least one LQF set.
All tournaments have at least one WF set.
All tournaments have at least one LSF set.
All tournaments have at least one LF set.
All tournaments have at least one GF set.
At least one tournaments is missing a GFR set.


In [113]:
locations = ['WF', 'LSF', 'LF', 'GF']
for location in locations:
    # Count the occurrences of "GF" in location_names for each tournament
    gf_counts = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'] == "GF"].groupby('tournament_key').size()

    # Identify tournaments with exactly two "GF" sets
    tournaments_with_gfr = gf_counts[gf_counts > 1].index.tolist()

    print(f'There are {len(tournaments_with_gfr)} tournaments with more than one {location} sets.')
    
print()

locations = ['WSF']
for location in locations:
    # Count the occurrences of "GF" in location_names for each tournament
    gf_counts = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'] == "GF"].groupby('tournament_key').size()

    # Identify tournaments with exactly two "GF" sets
    tournaments_with_gfr = gf_counts[gf_counts != 2].index.tolist()

    print(f'There are {len(tournaments_with_gfr)} tournaments without two {location} sets.')

There are 4 tournaments with more than one WF sets.
There are 4 tournaments with more than one LSF sets.
There are 4 tournaments with more than one LF sets.
There are 4 tournaments with more than one GF sets.

There are 11 tournaments without two WSF sets.


### To Do: Figure out why we don't always get the right number of location_names in a tournament.

<span style="color:red">To Do: For some reason, at least one tournament in the dataframe only has one Ln game. I don't know what that is about and need to investigate it. I lost which tournament it was.</span>

Put both dataframes together.

In [114]:
top_8_sets = pd.concat([all_top_8_with_bracket_name_df, all_top_8_no_bracket_name_df])
top_8_sets.sort_index(inplace=True)
top_8_sets.head(20)
# These are all the top 8 sets, including the ones without the bracket structure we are looking for
top_8_sets.to_pickle(data_path + 'top_8_sets.pkl')

## Remove bad tournaments
Check for tournaments without the right number of sets in each location.

In [115]:
bad_tournament_keys = []

# These location_names should only occur once in the top 8 of each tournament
locations = ['WF', 'LSF', 'LF', 'GF']

for location in locations:
    # Count the occurrences of location in location_names for each tournament
    location_counts = top_8_sets[top_8_sets['location_names'] == location].groupby('tournament_key').size()

    # Identify tournaments without exactly two location sets
    bad_tournaments_wrt_location = location_counts[location_counts != 1].index.tolist()
    
    print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly one {location} sets.')
    print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')
    
    bad_tournament_keys.extend(bad_tournaments_wrt_location)
print()


# This location_names should only occur exactly twice in the top 8 of each tournament
locations = ['WSF', 'LQF']
for location in locations:
    # Count the occurrences of location in location_names for each tournament
    location_counts = top_8_sets[top_8_sets['location_names'] == location].groupby('tournament_key').size()

    # Identify tournaments without exactly two location sets
    bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()

    print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly two {location} sets.')
    print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')
    
    bad_tournament_keys.extend(bad_tournaments_wrt_location)
print()

# There should be exactly two sets with different location_name than these
locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

for location in locations:
    # Count the occurrences of location in location_names for each tournament
    location_counts = top_8_sets[~ top_8_sets['location_names'].isin(locations)].groupby('tournament_key').size()

    # Identify tournaments without exactly two location sets
    bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()

    print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly two Ln sets.')
    print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')
    
    bad_tournament_keys.extend(bad_tournaments_wrt_location)

# Delete duplicates
bad_tournament_keys = list(dict.fromkeys(bad_tournament_keys))
print(bad_tournament_keys)

print(f"There are {top_8_sets['tournament_key'].unique().shape[0]} - {len(bad_tournament_keys)} = {top_8_sets['tournament_key'].unique().shape[0] - len(bad_tournament_keys)} tournaments remaininng.")      

There are 83 tournaments without exactly one WF sets.
The bad tournaments are ['b-town-beatdown-89__melee-singles', 'b-town-beatdown-90__melee-singles', 'battlegateway-39__dx-melee-singles']
There are 81 tournaments without exactly one LSF sets.
The bad tournaments are ['b-town-beatdown-89__melee-singles', 'b-town-beatdown-90__melee-singles', 'battlegateway-39__dx-melee-singles']
There are 83 tournaments without exactly one LF sets.
The bad tournaments are ['b-town-beatdown-89__melee-singles', 'b-town-beatdown-90__melee-singles', 'battlegateway-39__dx-melee-singles']
There are 87 tournaments without exactly one GF sets.
The bad tournaments are ['b-town-beatdown-89__melee-singles', 'b-town-beatdown-90__melee-singles', 'battlegateway-39__dx-melee-singles']

There are 115 tournaments without exactly two WSF sets.
The bad tournaments are ['b-town-beatdown-89__melee-singles', 'b-town-beatdown-90__melee-singles', 'battlegateway-39__dx-melee-singles']
There are 795 tournaments without exactly

Check what is going on with one of the bad tournaments. https://www.start.gg/tournament/-1340/events

As we can see, the tournament structure is not what we are looking for.

![alt text](top_8_1340.png "Top 8 Bracket of 1340")


Remove the tournaments from top_8_sets.

In [116]:
# Make a list of tournament keys we are going to keep.
good_tournament_keys = [key for key in top_8_sets['tournament_key'].unique() if key not in bad_tournament_keys]
print(f"There are {len(good_tournament_keys)} good tournament keys and {len(bad_tournament_keys)} bad tournament keys.")

good_top_8_sets = top_8_sets[top_8_sets['tournament_key'].isin(good_tournament_keys)].copy()
print(f"We had {top_8_sets.shape[0]} top 8 sets and are left with {good_top_8_sets.shape[0]} good top 8 sets.")
print()

print(good_top_8_sets['location_names'].value_counts())

There are 33127 good tournament keys and 2277 bad tournament keys.
We had 353495 top 8 sets and are left with 332090 good top 8 sets.

location_names
WSF    65092
LQF    63864
GF     33127
WF     32982
LF     32981
LSF    32937
L2     15998
L3     12428
L4     12000
L1     11454
GFR    10191
L5      5048
L6      2780
L7       906
L8       270
L9        32
Name: count, dtype: int64


In [117]:
bad_tournament_keys = []

# Get all tournament keys
tournament_keys = top_8_sets['tournament_key'].unique()

# These location_names should occur exactly once in the top 8 of each tournament
locations_single = ['WF', 'LSF', 'LF', 'GF']

for location in locations_single:
    # Count the occurrences of location in location_names for each tournament
    location_counts = top_8_sets[top_8_sets['location_names'] == location].groupby('tournament_key').size()

    # Reindex to include all tournaments, filling missing counts with zero
    location_counts = location_counts.reindex(tournament_keys, fill_value=0)
    
    # Identify tournaments without exactly one occurrence
    bad_tournaments_wrt_location = location_counts[location_counts != 1].index.tolist()
    
    # print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly one {location} set.')
    # print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')
    
    bad_tournament_keys.extend(bad_tournaments_wrt_location)

# For 'GFR', the total per tournament should be either 0 or 1
gfr_counts = top_8_sets[top_8_sets['location_names'].isin(['GFR'])].groupby('tournament_key').size()

# Reindex and fill missing counts with zero
gfr_counts = gfr_counts.reindex(tournament_keys, fill_value=0)

# Identify tournaments where total GF + GFR is not 1 or 2
bad_tournaments_wrt_gfr = gfr_counts[~gfr_counts.isin([0, 1])].index.tolist()

# print(f'There are {len(bad_tournaments_wrt_gf)} tournaments without exactly 1 or 2 GF/GFR sets.')
# print(f'The bad tournaments are {bad_tournaments_wrt_gf[:3]}')

bad_tournament_keys.extend(bad_tournaments_wrt_gfr)

# These location_names should only occur exactly twice in the top 8 of each tournament
locations_double = ['WSF', 'LQF']

for location in locations_double:
    location_counts = top_8_sets[top_8_sets['location_names'] == location].groupby('tournament_key').size()
    location_counts = location_counts.reindex(tournament_keys, fill_value=0)
    bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()
    # print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly two {location} sets.')
    # print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')
    bad_tournament_keys.extend(bad_tournaments_wrt_location)

# For the remaining location names, there should be exactly two such sets per tournament
locations_exclude = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

location_counts = top_8_sets[~top_8_sets['location_names'].isin(locations_exclude)].groupby('tournament_key').size()
location_counts = location_counts.reindex(tournament_keys, fill_value=0)
bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()

# print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly two Ln sets.')
# print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')

bad_tournament_keys.extend(bad_tournaments_wrt_location)

# Remove duplicates
bad_tournament_keys = list(set(bad_tournament_keys))

print(f"There are {len(bad_tournament_keys)} bad tournament keys.")

# Filter out bad tournaments
good_tournament_keys = [key for key in tournament_keys if key not in bad_tournament_keys]
good_top_8_sets = top_8_sets[top_8_sets['tournament_key'].isin(good_tournament_keys)].copy()

print(f"We had {top_8_sets.shape[0]} top 8 sets and are left with {good_top_8_sets.shape[0]} good top 8 sets.")
print()
print(good_top_8_sets['location_names'].value_counts())
ln_set_count = (~good_top_8_sets['location_names'].isin(['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR'])).sum()
print(f"Ln   {ln_set_count}")

There are 4950 bad tournament keys.
We had 353495 top 8 sets and are left with 314100 good top 8 sets.

location_names
WSF    60908
LQF    60908
WF     30454
LSF    30454
LF     30454
GF     30454
L2     15998
L3     12428
L4     12000
L1     11446
GFR     9560
L5      5048
L6      2780
L7       906
L8       270
L9        32
Name: count, dtype: int64
Ln   60908


In [118]:
# These are all the top 8 sets from tournaments with the correct bracket stricture
# good_top_8_sets.to_pickle(data_path + 'good_top_8_sets.pkl')

## Label the top 8 sets we will use for training.
This is how you would use the saved data frames to add the column to the data. You cannot have reset the original index of sets_df.

In [119]:
good_top_8_sets = pd.read_pickle(data_path + 'good_top_8_sets.pkl')

# Get the indices as a list
indices = list(good_top_8_sets.index)

# Copy the dataframe you wish to label
labelled_sets_df = sets_df.copy()

# Initialize the 'top_8' column with False
labelled_sets_df['good_top_8'] = False

# Set 'top_8' to True at the specified indices
labelled_sets_df.loc[indices, 'good_top_8'] = True


In [120]:
good_top_8_sets.shape

(314100, 14)

## Label the bracket
We modify the location names of the top 8 so be WSF_A, WSF_B, LN_A, LN_B, LQF_A, and LQF_B where WSF_A is the first WSF set as it appears in good_top_8_sets, and the loser of WSF_A plays the winner of LN_A in LQF_A. We will put those in a separate column named top_8_location_names.

In [123]:
# def assign_top8_labels(df):
#     # Ensure the 'top_8_location_names' and 'loser_id' columns exist in the original DataFrame
#     if 'top_8_location_names' not in df.columns:
#         df['top_8_location_names'] = ''
    
#     # Create a 'loser_id' column directly in the original DataFrame
#     df.loc[:, 'loser_id'] = df.apply(lambda row: row['p2_id'] if row['winner_id'] == row['p1_id'] else row['p1_id'], axis=1)
    
#     # Process each tournament individually
#     tournament_keys = df['tournament_key'].unique()[:10]  # Limiting to 4 tournaments for testing purposes
    
#     for tournament_key in tqdm(tournament_keys):
#         # Create a mask to select only the rows for the current tournament
#         tournament_mask = (df['tournament_key'] == tournament_key)
    
#         # Identify WSF matches and label them WSF_A and WSF_B
#         wsf_matches = df[tournament_mask & (df['location_names'] == 'WSF')].sort_index()
#         wsf_indices = wsf_matches.index.tolist()
#         wsf_labels = ['WSF_A', 'WSF_B']
#         df.loc[wsf_indices, 'top_8_location_names'] = wsf_labels[:len(wsf_indices)]

#         # Identify LQF matches and their indices
#         lqf_matches = df[tournament_mask & (df['location_names'] == 'LQF')]
#         lqf_indices = lqf_matches.index.tolist()

#         # Get the loser_id of 'WSF_A'
#         wsf_a_loser = df.loc[wsf_indices[0], 'loser_id']

#         # Get players in the first LQF match
#         lqf_first_set_players = df.loc[lqf_indices[0], ['p1_id', 'p2_id']].values

#         # Check if the loser of 'WSF_A' is in the first set of LQF and assign labels
#         if wsf_a_loser in lqf_first_set_players:
#             df.loc[lqf_indices[0], 'top_8_location_names'] = 'LQF_A'
#             df.loc[lqf_indices[1], 'top_8_location_names'] = 'LQF_B'
#         else:
#             df.loc[lqf_indices[1], 'top_8_location_names'] = 'LQF_A'
#             df.loc[lqf_indices[0], 'top_8_location_names'] = 'LQF_B'

#         # Identify LN matches and their indices
#         locations_exclude = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']
#         ln_matches = df[tournament_mask & ~df['location_names'].isin(locations_exclude)]
#         ln_indices = ln_matches.index.tolist()

#         # Ensure there are LN matches
#         if len(ln_indices) >= 2:
#             # Get the winner_id of the first LN match
#             ln_first_index_winner = df.loc[ln_indices[0], 'winner_id']
            
#             # Get players in the LQF_A match (after it has been assigned)
#             lqf_a_players = df.loc[tournament_mask & (df['top_8_location_names'] == 'LQF_A'), ['p1_id', 'p2_id']].values.flatten()
            
#             # Check if the winner of the first LN match is in the LQF_A set players
#             if ln_first_index_winner in lqf_a_players:
#                 df.loc[ln_indices[0], 'top_8_location_names'] = 'LN_A'
#                 df.loc[ln_indices[1], 'top_8_location_names'] = 'LN_B'
#             else:
#                 df.loc[ln_indices[1], 'top_8_location_names'] = 'LN_A'
#                 df.loc[ln_indices[0], 'top_8_location_names'] = 'LN_B'
                
#     df.drop(columns='loser_id', inplace=True)

#     return df

# # Apply the function to your DataFrame
# good_top_8_sets = assign_top8_labels(good_top_8_sets)

# good_top_8_sets_bracket.head(30)


  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8_location_names
133,90101116.0,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,WSF,,1,CK,0,[],WSF_A
134,90101117.0,melee,s@sh7,Ginger,1008,Ginger,0,1,WSF,,1,CL,0,[],WSF_B
135,90101118.0,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),Ginger,1,0,WF,,1,CM,0,[],
220,90101203.0,melee,s@sh7,tm,Bbbbbbbbbexic,tm,0,1,L8,,1,FV,0,[],LN_B
221,90101204.0,melee,s@sh7,lain,lain,math,1,0,L8,,1,FW,0,[],LN_A
222,90101205.0,melee,s@sh7,1008,1008,tm,1,0,LQF,,1,FX,0,[],LQF_B
223,90101206.0,melee,s@sh7,kjh,kjh,lain,1,0,LQF,,1,FY,0,[],LQF_A
224,90101207.0,melee,s@sh7,kjh,1008,kjh,0,1,LSF,,1,FZ,0,[],
225,90101208.0,melee,s@sh7,kjh,Ginger,kjh,0,1,LF,,1,GA,0,[],
226,90101209.0,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,GF,,1,CN,0,[],


In [124]:
# import numpy as np
# from tqdm import tqdm

# def assign_top8_labels(df):
#     # Ensure the 'top_8_location_names' column exists
#     # if 'top_8_location_names' not in df.columns:
#     df['top_8_location_names'] = df['location_names']
    
#     # Vectorize 'loser_id' calculation
#     df['loser_id'] = np.where(df['winner_id'] == df['p1_id'], df['p2_id'], df['p1_id'])
    
#     # Process each tournament
#     tournament_keys = df['tournament_key'].unique()[:10]  # Limiting to 1000 tournaments for performance testing
    
#     # Using masks and updating in bulk for each tournament
#     for tournament_key in tqdm(tournament_keys):
#         tournament_mask = df['tournament_key'] == tournament_key
        
#         # Assign WSF labels
#         wsf_mask = tournament_mask & (df['location_names'] == 'WSF')
#         wsf_indices = df[wsf_mask].index[:2]  # First two WSF matches
#         df.loc[wsf_indices, 'top_8_location_names'] = ['WSF_A', 'WSF_B']

#         # Assign LQF labels based on loser of WSF_A
#         lqf_mask = tournament_mask & (df['location_names'] == 'LQF')
#         lqf_indices = df[lqf_mask].index[:2]
        
#         # Get loser of WSF_A
#         wsf_a_loser = df.loc[wsf_indices[0], 'loser_id']
#         lqf_first_set_players = df.loc[lqf_indices[0], ['p1_id', 'p2_id']].values
        
#         if wsf_a_loser in lqf_first_set_players:
#             df.loc[lqf_indices, 'top_8_location_names'] = ['LQF_A', 'LQF_B']
#         else:
#             df.loc[lqf_indices, 'top_8_location_names'] = ['LQF_B', 'LQF_A']

#         # Assign LN labels based on LQF_A players
#         ln_mask = tournament_mask & ~df['location_names'].isin(['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR'])
#         ln_indices = df[ln_mask].index[:2]

#         if len(ln_indices) == 2:
#             ln_first_index_winner = df.loc[ln_indices[0], 'winner_id']
#             lqf_a_players = df.loc[lqf_indices[0] if wsf_a_loser in lqf_first_set_players else lqf_indices[1], ['p1_id', 'p2_id']].values

#             if ln_first_index_winner in lqf_a_players:
#                 df.loc[ln_indices, 'top_8_location_names'] = ['LN_A', 'LN_B']
#             else:
#                 df.loc[ln_indices, 'top_8_location_names'] = ['LN_B', 'LN_A']
        
        

#     # Clean up
#     df.drop(columns='loser_id', inplace=True)

#     return df
# # Apply the function to your DataFrame
# good_top_8_sets = assign_top8_labels(good_top_8_sets)

# good_top_8_sets_bracket.head(30)


In [126]:
def assign_top8_labels(df):
    # Ensure 'top_8_location_names' exists and initialize it with 'location_names' for default values
    df['top_8_location_names'] = df['location_names']
    
    # Vectorize 'loser_id' calculation using np.where
    df['loser_id'] = np.where(df['winner_id'] == df['p1_id'], df['p2_id'], df['p1_id'])
    
    # Process each tournament
    tournament_keys = df['tournament_key'].unique()

    # Precompute masks for each round type to avoid repeated filtering
    masks = {
        'WSF': df['location_names'] == 'WSF',
        'LQF': df['location_names'] == 'LQF',
        'LN': ~df['location_names'].isin(['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR'])
    }

    # Using masks and updating in bulk for each tournament
    for tournament_key in tqdm(tournament_keys):
        tournament_mask = df['tournament_key'] == tournament_key
        
        # Assign WSF labels
        wsf_indices = df[tournament_mask & masks['WSF']].index[:2]  # First two WSF matches
        df.loc[wsf_indices, 'top_8_location_names'] = ['WSF_A', 'WSF_B']

        # Assign LQF labels based on loser of WSF_A
        lqf_indices = df[tournament_mask & masks['LQF']].index[:2]
        
        # Get loser of WSF_A in bulk to avoid row-by-row lookups
        wsf_a_loser = df.loc[wsf_indices[0], 'loser_id']
        lqf_first_set_players = df.loc[lqf_indices[0], ['p1_id', 'p2_id']].values
        
        # Vectorized label assignment for LQF based on whether loser of WSF_A is in first LQF set
        if wsf_a_loser in lqf_first_set_players:
            df.loc[lqf_indices, 'top_8_location_names'] = ['LQF_A', 'LQF_B']
        else:
            df.loc[lqf_indices, 'top_8_location_names'] = ['LQF_B', 'LQF_A']

        # Assign LN labels based on LQF_A players
        ln_indices = df[tournament_mask & masks['LN']].index[:2]

        ln_first_index_winner = df.loc[ln_indices[0], 'winner_id']
        lqf_a_index = lqf_indices[0] if wsf_a_loser in lqf_first_set_players else lqf_indices[1]
        lqf_a_players = df.loc[lqf_a_index, ['p1_id', 'p2_id']].values

        # Vectorized label assignment for LN based on whether winner of first LN match is in LQF_A set
        if ln_first_index_winner in lqf_a_players:
            df.loc[ln_indices, 'top_8_location_names'] = ['LN_A', 'LN_B']
        else:
            df.loc[ln_indices, 'top_8_location_names'] = ['LN_B', 'LN_A']

    # Clean up
    df.drop(columns='loser_id', inplace=True)

    return df

# Apply the function to your DataFrame
good_top_8_sets_labels = assign_top8_labels(good_top_8_sets)

good_top_8_sets_labels.head(30)


  0%|          | 0/30454 [00:00<?, ?it/s]

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8_location_names
133,90101116.0,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,WSF,,1,CK,0,[],WSF_A
134,90101117.0,melee,s@sh7,Ginger,1008,Ginger,0,1,WSF,,1,CL,0,[],WSF_B
135,90101118.0,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),Ginger,1,0,WF,,1,CM,0,[],WF
220,90101203.0,melee,s@sh7,tm,Bbbbbbbbbexic,tm,0,1,L8,,1,FV,0,[],LN_B
221,90101204.0,melee,s@sh7,lain,lain,math,1,0,L8,,1,FW,0,[],LN_A
222,90101205.0,melee,s@sh7,1008,1008,tm,1,0,LQF,,1,FX,0,[],LQF_B
223,90101206.0,melee,s@sh7,kjh,kjh,lain,1,0,LQF,,1,FY,0,[],LQF_A
224,90101207.0,melee,s@sh7,kjh,1008,kjh,0,1,LSF,,1,FZ,0,[],LSF
225,90101208.0,melee,s@sh7,kjh,Ginger,kjh,0,1,LF,,1,GA,0,[],LF
226,90101209.0,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,GF,,1,CN,0,[],GF


In [143]:
# good_top_8_sets_labels.to_pickle(data_path + 'good_top_8_sets_bracket.pkl')
good_top_8_sets_labels = pd.read_pickle(data_path + 'good_top_8_sets_bracket.pkl')

In [138]:
top_8_sets = pd.read_pickle(data_path + 'top_8_sets.pkl')

top_8_sets['top_8_location_names'] = top_8_sets['location_names']

excluded_locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']
top_8_sets.loc[~top_8_sets['location_names'].isin(excluded_locations),'top_8_location_names'] = 'LN'
top_8_sets.head()


Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8_location_names
133,90101116,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),kjh,1,0,WSF,,1,CK,0,[],WSF
134,90101117,melee,s@sh7,Ginger,1008,Ginger,0,1,WSF,,1,CL,0,[],WSF
135,90101118,melee,s@sh7,Mew2king (unpaid),Mew2king (unpaid),Ginger,1,0,WF,,1,CM,0,[],WF
220,90101203,melee,s@sh7,tm,Bbbbbbbbbexic,tm,0,1,L8,,1,FV,0,[],LN
221,90101204,melee,s@sh7,lain,lain,math,1,0,L8,,1,FW,0,[],LN


In [154]:
major_tournament_info_df = pd.read_pickle(data_path + 'major_tournament_info_df.pkl')

major_tournament_info_df.head(3)

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,city,entrants,placings,losses,bracket_types,online,lat,lng,cleaned_name_cleaned,major
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,...,,20,"[[1000, 1], [6126, 2], [4107, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0,slippi champions league week 1,True
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,...,,20,"[[6126, 1], [4107, 2], [1000, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0,slippi champions league week 2,True
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,...,,20,"[[6126, 1], [3359, 2], [19554, 3], [4107, 3], ...",{},b'{}',1,0.0,0.0,slippi champions league week 3,True


In [158]:
labelled_sets_df = sets_df.copy()

# Label all the top 8 sets
index_list = list(top_8_sets.index)
labelled_sets_df['top_8'] = False
labelled_sets_df.loc[index_list, 'top_8'] = True
labelled_sets_df['top_8_location_names'] = ''
labelled_sets_df.loc[index_list, 'top_8_location_names'] = top_8_sets['top_8_location_names']


# Label all the top 8 sets with in a valid bracket
index_list = list(good_top_8_sets_labels.index)
labelled_sets_df['valid_top_8_bracket'] = False
labelled_sets_df.loc[index_list, 'valid_top_8_bracket'] = True
labelled_sets_df['top_8_bracket_location_names'] = ''
labelled_sets_df.loc[index_list, 'top_8_bracket_location_names'] = good_top_8_sets_labels['top_8_location_names']

# Label sets played in majors
major_tournament_keys = list(major_tournament_info_df['key'])
labelled_sets_df['major'] = False
labelled_sets_df.loc[labelled_sets_df['tournament_key'].isin(major_tournament_keys), 'major'] = True

print(labelled_sets_df[labelled_sets_df['top_8']==True].shape)
labelled_sets_df[labelled_sets_df['valid_top_8_bracket']==True].shape


(352483, 19)


(314100, 19)

In [159]:
labelled_sets_df.to_pickle(data_path + 'sets_top_8_labeled_df.pkl')