# Journey to top 8
Here we add columns to `tournament_info_df` that record the top 8 players, their location in the top 8 bracket, and the indices of the earlier sets they played in the tournament.

### Load packages


In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from tqdm import tqdm
import tqdm
import os

# Choose the data directory: use the workspace volume when it exists,
# otherwise fall back to the repository-relative data folder.
data_path = '/workspace/data_2/' if os.path.exists('/workspace/data_2') else '../data/'

### Load Data

In [34]:
# Load the tournament metadata table from `data_path` and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by a trusted pipeline.
tournament_info_df = pd.read_pickle(data_path + 'tournament_info_df.pkl')
tournament_info_df.head(5)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,country,state,city,entrants,placings,losses,bracket_types,online,lat,lng
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,US,VA,Fall's Church,10,"[[Rishi, 1], [15634, 3], [6126, 4], [Chu, 8], ...",{},b'{}',0,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,US,MI,Ann Arbor,92,[],{},b'{}',0,,
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,,,,20,"[[1000, 1], [6126, 2], [4107, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,,,,20,"[[6126, 1], [4107, 2], [1000, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,,,,20,"[[6126, 1], [3359, 2], [19554, 3], [4107, 3], ...",{},b'{}',1,0.0,0.0


In [3]:
# Load the table of sets that already carries the top 8 labels
# (`top_8`, `valid_top_8_bracket`, `top_8_bracket_location_names`) and preview one row.
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by a trusted pipeline.
sets_df = pd.read_pickle(data_path + 'sets_top_8_labeled_df.pkl')
sets_df.head(1)

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,5620,1,3,"[R1, Round 1, Round 1]",,1,A,5,[],False,,False,,False


### Filter tournaments
We keep only the tournaments that have a valid top 8 bracket.

In [4]:
# Keys of the tournaments that own at least one set flagged as part of a valid top 8 bracket.
valid_bracket_keys = list(
    sets_df.loc[sets_df['valid_top_8_bracket'].eq(True), 'tournament_key'].unique()
)
print(f"{len(valid_bracket_keys) / len(tournament_info_df):.1%} of tournaments have a valid top 8 bracket.")

# Restrict both tables to those tournaments; copies keep the loaded frames untouched.
valid_sets_df = sets_df.loc[sets_df['tournament_key'].isin(valid_bracket_keys)].copy()
valid_tournament_info_df = tournament_info_df.loc[tournament_info_df['key'].isin(valid_bracket_keys)].copy()

76.8% of tournaments have a valid top 8 bracket.


We add columns to ``tournament_info_df`` that contain the players in the top 8 and what their position in the bracket was.

In [31]:
def add_top_8_players(valid_sets_df=valid_sets_df, valid_tournament_info_df=valid_tournament_info_df):
    """Attach the players found at four top 8 bracket locations to each tournament row.

    For every tournament, the sets located at ``WSF_A``, ``WSF_B``, ``LN_A`` and
    ``LN_B`` inside a valid top 8 bracket are looked up, and their ``p1_id`` /
    ``p2_id`` values are written to the columns ``<location>_p1`` and
    ``<location>_p2``. If a tournament has several sets at the same location,
    the first one (in row order) is used.

    Note: the default arguments are bound when this cell is executed, so re-run
    the cell after rebuilding the input frames.

    Parameters
    ----------
    valid_sets_df : pandas.DataFrame
        Sets table with the columns ``tournament_key``, ``valid_top_8_bracket``,
        ``top_8_bracket_location_names``, ``p1_id`` and ``p2_id``.
    valid_tournament_info_df : pandas.DataFrame
        Tournament table with a ``key`` column matching ``tournament_key``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``valid_tournament_info_df`` with the player columns added
        (NaN where no matching set exists), carrying the same index as the input.
    """
    # Bracket locations whose two participants are recorded.
    loc_names = ['WSF_A', 'WSF_B', 'LN_A', 'LN_B']

    # Filter using only the frame that was passed in. The previous extra check
    # against the notebook-level `valid_bracket_keys` was redundant with the
    # `valid_top_8_bracket` flag and tied the function to global state.
    in_valid_bracket = valid_sets_df['valid_top_8_bracket'] == True
    at_wanted_location = valid_sets_df['top_8_bracket_location_names'].isin(loc_names)

    # Keep just the columns that are needed and expose the tournament key as
    # `key` so it lines up with the tournament table.
    entry_sets = valid_sets_df.loc[
        in_valid_bracket & at_wanted_location,
        ['tournament_key', 'top_8_bracket_location_names', 'p1_id', 'p2_id'],
    ].rename(columns={'tournament_key': 'key'})

    # Long format: one row per (set, player slot).
    melted_df = entry_sets.melt(
        id_vars=['key', 'top_8_bracket_location_names'],
        value_vars=['p1_id', 'p2_id'],
        var_name='pn_id',
        value_name='player_id',
    )

    # 'p1_id' -> 'p1', 'p2_id' -> 'p2', giving column names such as 'WSF_A_p1'.
    slot = melted_df['pn_id'].str.split('_').str[0]
    melted_df['loc_id'] = melted_df['top_8_bracket_location_names'] + '_' + slot

    # Wide format: one row per tournament, one column per location/slot.
    pivot_df = melted_df.pivot_table(
        index='key',
        columns='loc_id',
        values='player_id',
        aggfunc='first',
    ).reset_index()

    # `pivot_df` has one row per key, so the left merge keeps the tournament
    # rows one-to-one and in order; merge resets the index, so restore it.
    df = valid_tournament_info_df.merge(pivot_df, on='key', how='left')
    df.index = valid_tournament_info_df.index

    return df

# Build the tournament table with the top 8 player columns (using the function's
# default inputs) and preview the first two rows.
top_8_tournament_df = add_top_8_players()
top_8_tournament_df.head(2)
    

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,lat,lng,LN_A_p1,LN_A_p2,LN_B_p1,LN_B_p2,WSF_A_p1,WSF_A_p2,WSF_B_p1,WSF_B_p2
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,,,math,lain,tm,Bbbbbbbbbexic,kjh,Mew2king (unpaid),Ginger,1008
11,melee,httpsparagonchallongecomla_2015_melee_singles,Paragon Los Angeles 2015,challonge,https://paragon.challonge.com/la_2015_melee_si...,,15,,2015-09-06 23:45:46,2015-09-07 20:33:07,...,,,1008,1023,13932,1004,1000,16342,1003,4465


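Now we add columns to ``tournament_info_df`` that contain, for each top 8 player, the list of sets they played before the top 8, stored as ``(set index, won)`` pairs.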

In [40]:

def add_top_8_previous_sets(valid_sets_df=valid_sets_df, top_8_tournament_df=top_8_tournament_df):
    """List, for each top 8 player column, the sets that player played outside the top 8.

    For each of the eight player columns (``LN_A_p1`` ... ``WSF_B_p2``) a column
    named ``<player_col>_non_top_8_sets`` is added. Each cell holds a list of
    ``(set_index, won)`` tuples: ``set_index`` is the row label of the set in
    ``valid_sets_df`` and ``won`` is True when the player is the set's
    ``winner_id``. Only sets whose ``valid_top_8_bracket`` flag is False are
    listed. Tournaments without any such set for that player get NaN.

    Note: the default arguments are bound when this cell is executed, so re-run
    the cell after rebuilding the input frames.

    Parameters
    ----------
    valid_sets_df : pandas.DataFrame
        Sets table with the columns ``tournament_key``, ``winner_id``,
        ``p1_id``, ``p2_id`` and ``valid_top_8_bracket``. It is not modified.
    top_8_tournament_df : pandas.DataFrame
        Tournament table with a unique ``key`` column and the eight player columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``top_8_tournament_df`` with the eight list columns appended,
        carrying the same index as the input.
    """
    # Columns holding the top 8 players.
    player_cols = ['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']

    # Everything below is independent of the player column, so compute it once
    # rather than once per loop iteration.
    p1_won = valid_sets_df['winner_id'] == valid_sets_df['p1_id']
    # The loser is p2 when p1 won, and p1 otherwise.
    loser_id = valid_sets_df['p2_id'].where(p1_won, valid_sets_df['p1_id'])
    outside_top_8 = valid_sets_df['valid_top_8_bracket'] == False
    tournament_key = valid_sets_df['tournament_key']
    players_by_tournament = top_8_tournament_df.set_index('key')[player_cols]

    valid_tourn_df = top_8_tournament_df.copy()

    # `tqdm` is the module here (the notebook's last tqdm import is `import tqdm`).
    for player_col in tqdm.tqdm(player_cols):
        # For every set, look up the player in this column for the set's tournament.
        # Series.map keeps the row labels of `valid_sets_df`, so the set indices
        # recorded below are correct by construction. There is no merge whose
        # row order and row count have to be trusted. Sets from tournaments
        # missing in `top_8_tournament_df` map to NaN and never match.
        slot_player = tournament_key.map(players_by_tournament[player_col])

        player_won = valid_sets_df['winner_id'] == slot_player
        player_lost = loser_id == slot_player
        keep = (player_won | player_lost) & outside_top_8

        # Group the win flags of the kept sets by tournament (sorted by key) and
        # pair each flag with the label of its set.
        wins = player_won[keep]
        records = [
            [key, list(zip(group.index, group))]
            for key, group in wins.groupby(tournament_key[keep])
        ]
        per_tournament = pd.DataFrame(records, columns=['key', player_col + '_non_top_8_sets'])

        valid_tourn_df = valid_tourn_df.merge(per_tournament, on='key', how='left')

    # Each left merge resets the index; restore the original one.
    valid_tourn_df.index = top_8_tournament_df.index

    return valid_tourn_df


# Build the table with the per-player lists of sets played outside the top 8
# (using the function's default inputs) and preview the first three rows.
top_8_tournament_with_sets_df = add_top_8_previous_sets()
top_8_tournament_with_sets_df.iloc[:3].head()

100%|██████████| 8/8 [00:15<00:00,  1.94s/it]


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p1,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,Ginger,1008,"[(84, True), (110, True), (123, False), (210, ...","[(77, True), (107, True), (122, False), (209, ...","[(85, True), (111, True), (124, True), (130, F...","[(92, True), (114, True), (125, False), (204, ...","[(81, True), (109, True), (123, True), (130, T...","[(73, True), (105, True), (121, True), (129, T...","[(97, True), (117, True), (127, True), (132, T...","[(89, True), (113, True), (125, True), (131, T..."
11,melee,httpsparagonchallongecomla_2015_melee_singles,Paragon Los Angeles 2015,challonge,https://paragon.challonge.com/la_2015_melee_si...,,15,,2015-09-06 23:45:46,2015-09-07 20:33:07,...,1003,4465,"[(457, True), (475, True), (484, True), (488, ...","[(440, True), (466, True), (479, False), (533,...","[(435, True), (464, False), (520, True), (530,...","[(437, True), (465, False), (519, True), (529,...","[(429, True), (461, True), (477, True), (485, ...","[(441, True), (467, True), (480, True), (486, ...","[(453, True), (473, True), (483, True), (488, ...","[(445, True), (469, True), (481, True), (487, ..."
12,melee,httpsdl4-5challongecomdl45meleeprobracket,DrommeLAN4.5,challonge,https://dl4-5.challonge.com/DL45meleeProBracket,,15,,2015-05-02 23:55:20,2015-05-03 04:14:15,...,4465,12870,"[(575, True), (597, True), (608, True), (613, ...","[(557, True), (588, True), (603, False), (661,...","[(561, True), (590, True), (604, True), (611, ...","[(567, True), (593, True), (606, True), (612, ...","[(563, True), (591, True), (605, True), (612, ...","[(555, True), (587, True), (603, True), (611, ...","[(571, True), (595, True), (607, True), (613, ...","[(579, True), (599, True), (609, True), (614, ..."


In [41]:
# Show the top 8 player columns for the first tournaments.
# NOTE: `.loc[:3]` slices by index *label*, not position, so this returns the rows
# whose label is <= 3 (here only label 1), not the first three rows.
top_8_tournament_with_sets_df.loc[:3,['key', 'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 
                 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']]

Unnamed: 0,key,LN_A_p1,LN_A_p2,LN_B_p1,LN_B_p2,WSF_A_p1,WSF_A_p2,WSF_B_p1,WSF_B_p2
1,s@sh7,math,lain,tm,Bbbbbbbbbexic,kjh,Mew2king (unpaid),Ginger,1008


In [42]:
# Spot check: look up the first set indices listed in `LN_A_p2_non_top_8_sets` for s@sh7
# and confirm that the LN_A_p2 player appears in each of those sets.
valid_sets_df.loc[[77, 107, 122, 209]]

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major
77,90101060,melee,s@sh7,lain,lain,Stitchface,1,0,"[W2, Winners 2, Winners Round 2]",,1,AG,0,[],False,,False,,False
107,90101090,melee,s@sh7,lain,lain,Yung Bitch,1,0,"[W3, Winners 3, Winners Round 3]",,1,BK,0,[],False,,False,,False
122,90101105,melee,s@sh7,i4n,i4n,lain,1,0,"[W4, Winners 4, Winners Round 4]",,1,BZ,0,[],False,,False,,False
209,90101192,melee,s@sh7,lain,lain,Rich,1,0,"[L5, Losers 5, Losers Round 5]",,1,FK,0,[],False,,False,,False


In [43]:

# Columns carried over to the full tournament table: the merge key, the eight
# top 8 player columns, then the matching eight "*_non_top_8_sets" columns.
col_to_merge = ['key'] + [
    f'{bracket}_{slot}{suffix}'
    for suffix in ('', '_non_top_8_sets')
    for bracket in ('LN_A', 'LN_B', 'WSF_A', 'WSF_B')
    for slot in ('p1', 'p2')
]

In [44]:
# Left-join the new columns onto the full tournament table; tournaments without
# a valid top 8 bracket keep their row and get NaN in the added columns.
result = tournament_info_df.merge(
    top_8_tournament_with_sets_df[col_to_merge],
    on='key',
    how='left',
)
result.head()

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p1,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,...,,,,,,,,,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,Ginger,1008.0,"[(84, True), (110, True), (123, False), (210, ...","[(77, True), (107, True), (122, False), (209, ...","[(85, True), (111, True), (124, True), (130, F...","[(92, True), (114, True), (125, False), (204, ...","[(81, True), (109, True), (123, True), (130, T...","[(73, True), (105, True), (121, True), (129, T...","[(97, True), (117, True), (127, True), (132, T...","[(89, True), (113, True), (125, True), (131, T..."
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,...,,,,,,,,,,
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,...,,,,,,,,,,
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,...,,,,,,,,,,


In [45]:
# Sanity check: the s@sh7 row of the merged table should show the same top 8
# players as the pre-merge table displayed earlier.
temp = result[result['key'] == 's@sh7'][['key', 'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 
                 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']]
print(temp)

     key LN_A_p1 LN_A_p2 LN_B_p1        LN_B_p2 WSF_A_p1           WSF_A_p2  \
1  s@sh7    math    lain      tm  Bbbbbbbbbexic      kjh  Mew2king (unpaid)   

  WSF_B_p1 WSF_B_p2  
1   Ginger     1008  


In [46]:
# Persist the merged table next to the input data.
# NOTE(review): the file name has no `.pkl` extension, unlike the pickles read at the
# top of this notebook; confirm what downstream readers expect before renaming.
result.to_pickle(data_path + 'top_8_tournament_previous_sets_and_results_df')

In [47]:
# List the columns of the saved table: the original tournament columns followed by the added ones.
result.columns

Index(['game', 'key', 'cleaned_name', 'source', 'tournament_name',
       'tournament_event', 'season', 'rank', 'start', 'end', 'country',
       'state', 'city', 'entrants', 'placings', 'losses', 'bracket_types',
       'online', 'lat', 'lng', 'LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2',
       'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2',
       'LN_A_p1_non_top_8_sets', 'LN_A_p2_non_top_8_sets',
       'LN_B_p1_non_top_8_sets', 'LN_B_p2_non_top_8_sets',
       'WSF_A_p1_non_top_8_sets', 'WSF_A_p2_non_top_8_sets',
       'WSF_B_p1_non_top_8_sets', 'WSF_B_p2_non_top_8_sets'],
      dtype='object')

In [33]:
# Display the complete merged row for s@sh7 (this rebinds `temp` from the earlier check).
temp = result[(result['key']=='s@sh7')]
temp

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p1,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,Ginger,1008,"[(84, True), (110, True), (123, False), (210, ...","[(77, True), (107, True), (122, False), (209, ...","[(85, True), (111, True), (124, True), (130, F...","[(92, True), (114, True), (125, False), (204, ...","[(81, True), (109, True), (123, True), (130, T...","[(73, True), (105, True), (121, True), (129, T...","[(97, True), (117, True), (127, True), (132, T...","[(89, True), (113, True), (125, True), (131, T..."
