# Journey to top 8
Here we add columns to `tournament_info_df` that include the top 8 players, their location in the top 8 bracket, and the indices of the earlier sets they played in the tournament.

### Load packages


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from tqdm import tqdm
import tqdm
import os

# Choose the data directory: use the workspace copy when that folder exists,
# otherwise fall back to the data folder relative to this notebook.
data_path = '/workspace/data_2/' if os.path.exists('/workspace/data_2') else '../data/'

### Load Data

In [None]:
# One row per tournament; later cells identify a tournament by its 'key' column.
tournament_info_path = data_path + 'tournament_info_df.pkl'
tournament_info_df = pd.read_pickle(tournament_info_path)
tournament_info_df.head(1)


In [None]:
# Sets table with the top 8 labels; later cells read its 'tournament_key',
# 'valid_top_8_bracket' and 'top_8_bracket_location_names' columns.
sets_path = data_path + 'sets_top_8_labeled_df.pkl'
sets_df = pd.read_pickle(sets_path)
sets_df.head(1)

### Filter tournaments
We keep only the tournaments that have a valid top 8 bracket.

In [None]:
# Keys of tournaments that have at least one set flagged valid_top_8_bracket.
is_valid_top_8_set = sets_df['valid_top_8_bracket'] == True
valid_bracket_keys = list(sets_df.loc[is_valid_top_8_set, 'tournament_key'].unique())
print(f"{len(valid_bracket_keys)/tournament_info_df.shape[0]:.1%} of tournaments have a valid top 8 bracket.")

# Restrict both tables to those tournaments; copies keep the loaded frames untouched.
set_in_valid_tournament = sets_df['tournament_key'].isin(valid_bracket_keys)
tournament_is_valid = tournament_info_df['key'].isin(valid_bracket_keys)
valid_sets_df = sets_df.loc[set_in_valid_tournament].copy()
valid_tournament_info_df = tournament_info_df.loc[tournament_is_valid].copy()

We add columns to ``tournament_info_df`` that contain the players in the top 8 and what their position in the bracket was.

In [None]:
def add_top_8_players(valid_sets_df=None, valid_tournament_info_df=None, valid_keys=None):
    """Add one column per top 8 bracket slot holding the player id in that slot.

    Sets labelled ``WSF_A``, ``WSF_B``, ``LN_A`` or ``LN_B`` in
    ``top_8_bracket_location_names`` are reshaped so that each tournament gets
    the columns ``<location>_p1`` and ``<location>_p2`` (eight in total),
    filled from the ``p1_id`` / ``p2_id`` of the matching set.

    Parameters
    ----------
    valid_sets_df : pandas.DataFrame, optional
        Sets table with columns ``tournament_key``, ``valid_top_8_bracket``,
        ``top_8_bracket_location_names``, ``p1_id`` and ``p2_id``.
        Defaults to the notebook-level ``valid_sets_df`` at call time.
    valid_tournament_info_df : pandas.DataFrame, optional
        Tournament table with a ``key`` column.
        Defaults to the notebook-level ``valid_tournament_info_df`` at call time.
    valid_keys : list-like, optional
        Tournament keys to keep.
        Defaults to the notebook-level ``valid_bracket_keys`` at call time.

    Returns
    -------
    pandas.DataFrame
        ``valid_tournament_info_df`` left-merged with the eight player columns.
        Tournaments without a matching set get NaN. The merge is on ``key``,
        so the result has a fresh 0..n-1 index.
    """
    # Resolve omitted inputs when the function is CALLED. Definition-time defaults
    # would silently keep using old frames after an upstream cell is re-run.
    notebook_vars = globals()
    if valid_sets_df is None:
        valid_sets_df = notebook_vars['valid_sets_df']
    if valid_tournament_info_df is None:
        valid_tournament_info_df = notebook_vars['valid_tournament_info_df']
    if valid_keys is None:
        valid_keys = notebook_vars['valid_bracket_keys']

    loc_names = ['WSF_A', 'WSF_B', 'LN_A', 'LN_B']
    player_slots = ['p1', 'p2']
    # Sorted to match the alphabetical column order pivot_table produces.
    player_cols = sorted(f'{loc}_{slot}' for loc in loc_names for slot in player_slots)

    # Keep only the labelled top 8 sets of the requested tournaments.
    wanted = (
        (valid_sets_df['valid_top_8_bracket'] == True)
        & valid_sets_df['tournament_key'].isin(valid_keys)
        & valid_sets_df['top_8_bracket_location_names'].isin(loc_names)
    )
    labelled_sets = (
        valid_sets_df
        .loc[wanted, ['tournament_key', 'top_8_bracket_location_names', 'p1_id', 'p2_id']]
        .rename(columns={'tournament_key': 'key'})
    )

    # Long format: one row per (tournament, location, player slot).
    long_df = labelled_sets.melt(
        id_vars=['key', 'top_8_bracket_location_names'],
        value_vars=['p1_id', 'p2_id'],
        var_name='pn_id',
        value_name='player_id',
    )
    # 'p1_id' -> 'p1', so loc_id reads e.g. 'WSF_A_p1'.
    slot = long_df['pn_id'].str.split('_').str[0]
    long_df['loc_id'] = long_df['top_8_bracket_location_names'] + '_' + slot

    # Wide format: one row per tournament. aggfunc='first' keeps the first player id
    # if a tournament has several sets with the same label. The reindex guarantees
    # all eight columns exist even when a location never occurs in the data.
    pivot_df = (
        long_df
        .pivot_table(index='key', columns='loc_id', values='player_id', aggfunc='first')
        .reindex(columns=player_cols)
        .reset_index()
    )

    return valid_tournament_info_df.merge(pivot_df, on='key', how='left')

# Pass the filtered frames explicitly so this cell always works on the current
# valid_sets_df / valid_tournament_info_df and its data lineage is visible.
top_8_tournament_df = add_top_8_players(valid_sets_df, valid_tournament_info_df)
top_8_tournament_df.head(2)
    

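Now we add columns to ``tournament_info_df`` that contain, for each top 8 player, the sets they played outside the top 8 bracket: each entry is a list of `(set index, won)` pairs.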

In [None]:
def add_top_8_previous_sets(valid_sets_df=None, top_8_tournament_df=None):
    """Add, per top 8 player slot, the sets that player played outside the top 8.

    For each of the eight player columns (``LN_A_p1`` ... ``WSF_B_p2``) a column
    ``<player column>_non_top_8_sets`` is added. Each cell holds a list of
    ``(set_index, won)`` tuples, where ``set_index`` is the row label of the set
    in ``valid_sets_df`` and ``won`` says whether that player was the winner.
    Only sets with ``valid_top_8_bracket == False`` are considered.

    Parameters
    ----------
    valid_sets_df : pandas.DataFrame, optional
        Sets table with columns ``tournament_key``, ``p1_id``, ``p2_id``,
        ``winner_id`` and ``valid_top_8_bracket``.
        Defaults to the notebook-level ``valid_sets_df`` at call time.
    top_8_tournament_df : pandas.DataFrame, optional
        One row per tournament with a ``key`` column and the eight player columns.
        Defaults to the notebook-level ``top_8_tournament_df`` at call time.

    Returns
    -------
    pandas.DataFrame
        Copy of ``top_8_tournament_df`` with the eight ``*_non_top_8_sets``
        columns appended (NaN where no such set was found).

    Raises
    ------
    ValueError
        If ``top_8_tournament_df`` has duplicate ``key`` values.
    """
    # Local import: the notebook's import cell rebinds the name `tqdm` several times,
    # so do not depend on which binding happens to win. Progress display is optional.
    try:
        from tqdm import tqdm as progress_bar
    except ImportError:
        def progress_bar(iterable, **_kwargs):
            return iterable

    # Resolve omitted inputs at call time rather than at definition time.
    notebook_vars = globals()
    if valid_sets_df is None:
        valid_sets_df = notebook_vars['valid_sets_df']
    if top_8_tournament_df is None:
        top_8_tournament_df = notebook_vars['top_8_tournament_df']

    if top_8_tournament_df['key'].duplicated().any():
        raise ValueError("top_8_tournament_df must contain each tournament 'key' only once.")

    player_cols = ['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']

    # Loop-invariant preparation: the sets outside the top 8, with the loser of each.
    # The original row labels are kept because they are what gets stored in the output.
    outside_top_8 = valid_sets_df['valid_top_8_bracket'] == False
    earlier_sets = valid_sets_df.loc[
        outside_top_8, ['tournament_key', 'p1_id', 'p2_id', 'winner_id']
    ].copy()
    p1_won = earlier_sets['winner_id'] == earlier_sets['p1_id']
    earlier_sets['loser_id'] = earlier_sets['p2_id'].where(p1_won, earlier_sets['p1_id'])

    tournaments_by_key = top_8_tournament_df.set_index('key')
    valid_tourn_df = top_8_tournament_df.copy()

    for player_col in progress_bar(player_cols):
        # Look the player up per set through the tournament key. Unlike a merge, map()
        # keeps the row labels of the sets, and a tournament missing from
        # top_8_tournament_df yields NaN (no match) instead of dropping rows.
        player_id = earlier_sets['tournament_key'].map(tournaments_by_key[player_col])

        won = earlier_sets['winner_id'] == player_id
        played = won | (earlier_sets['loser_id'] == player_id)

        player_sets = earlier_sets.loc[played, ['tournament_key']].copy()
        player_sets['top_8_player_matches'] = list(zip(player_sets.index, won[played]))

        # One row per tournament: [key, [(set_index, won), ...]].
        rows = [
            [key, list(group['top_8_player_matches'])]
            for key, group in player_sets.groupby('tournament_key')
        ]
        per_tournament = pd.DataFrame(rows, columns=['key', (player_col + '_non_top_8_sets')])

        valid_tourn_df = pd.merge(valid_tourn_df, per_tournament, on='key', how='left')

    return valid_tourn_df


# Pass the inputs explicitly so this cell always works on the current frames.
top_8_tournament_with_sets_df = add_top_8_previous_sets(valid_sets_df, top_8_tournament_df)
# Preview the 16 right-most columns for the first three tournaments
# (iloc already limits this to three rows, so no extra .head() is needed).
top_8_tournament_with_sets_df.iloc[:3, -16:]

In [None]:
# Spot-check a few sets by row label.
# NOTE(review): these labels are hard-coded (presumably copied from a previous
# preview); .loc raises KeyError if any of them is absent from valid_sets_df.
spot_check_labels = [77, 107, 122, 209]
valid_sets_df.loc[spot_check_labels]

In [None]:

# The eight top 8 player columns, followed by their matching
# '<player column>_non_top_8_sets' columns in the same order.
top_8_player_cols = [
    f'{location}_{slot}'
    for location in ('LN_A', 'LN_B', 'WSF_A', 'WSF_B')
    for slot in ('p1', 'p2')
]
col_to_merge = top_8_player_cols + [f'{col}_non_top_8_sets' for col in top_8_player_cols]

In [None]:
# Join on the tournament key, not on the index: top_8_tournament_with_sets_df was
# produced by column merges, so its index is a fresh 0..n-1 range that does not line
# up with tournament_info_df's index. An index-on-index merge would attach the top 8
# data to the wrong tournaments whenever some tournaments were filtered out.
top_8_by_key = top_8_tournament_with_sets_df.set_index('key')[col_to_merge]
# left_on + right_index keeps every row (and the index) of tournament_info_df;
# tournaments without a valid top 8 bracket get NaN in the added columns.
result = pd.merge(tournament_info_df, top_8_by_key, how='left', left_on='key', right_index=True)


In [None]:
# Persist the enriched tournament table in the data directory.
# NOTE(review): unlike the input pickles, this file name has no '.pkl' extension;
# confirm what downstream readers expect before renaming it.
result_path = data_path + 'top_8_tournament_previous_sets_and_results_df'
result.to_pickle(result_path)

In [None]:
# Show the column labels of the merged table as a final check.
result.columns