# Journey to top 8
Here we build a dataset that records, for each player who reached the top 8 of a tournament, the sets they played earlier in that tournament to get there.

### Load packages


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

# Use the mounted workspace data directory when it exists; otherwise fall
# back to the repository-relative data folder.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'

### Load Data

In [None]:
# Load the pickled dictionary of DataFrames once per kernel session. The
# 'cell_has_run' flag lets this cell be re-run without re-reading the file.
if 'cell_has_run' not in globals():
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
    # Missing 'best_of' values become 0 so the column can be stored as int.
    dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int)
    # Set the flag only after everything above succeeded, so a failed load is
    # retried on the next run instead of being silently skipped.
    cell_has_run = True

In [None]:
# Pull the players table out of the loaded dictionary and preview its first row.
players_df = dfs['players_df']
players_df.head(1)


In [None]:
# Pull the tournament info table out of the loaded dictionary and preview its first row.
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head(1)


In [None]:
# Read the sets table from its own pickle file (not from `dfs`) and preview its first row.
# The cells below rely on its 'valid_top_8_bracket' and 'tournament_key' columns.
sets_df = pd.read_pickle(data_path + 'sets_top_8_labeled_df.pkl')
sets_df.head(1)

### Filter tournaments
We keep only the tournaments that have a valid top 8 bracket.

In [None]:
# Keys of every tournament that owns at least one set flagged as belonging to
# a valid top 8 bracket.
valid_bracket_keys = list(
    sets_df.loc[sets_df['valid_top_8_bracket'].eq(True), 'tournament_key'].unique()
)
print(f"{len(valid_bracket_keys)/len(tournament_info_df):.1%} of tournaments have a valid top 8 bracket.")

# Restrict both tables to those tournaments; the copies keep later edits from
# touching the original frames.
valid_tournament_info_df = tournament_info_df.loc[
    tournament_info_df['key'].isin(valid_bracket_keys)
].copy()
valid_sets_df = sets_df.loc[
    sets_df['tournament_key'].isin(valid_bracket_keys)
].copy()

In [None]:
def add_top_8_players(valid_sets_df=valid_sets_df, valid_tournament_info_df=valid_tournament_info_df):
    """Attach the player ids found at four top 8 bracket locations to each tournament.

    Sets whose 'top_8_bracket_location_names' is one of 'WSF_A', 'WSF_B',
    'LN_A' or 'LN_B' are selected, and their 'p1_id' / 'p2_id' values are
    spread into one column per (location, slot), e.g. 'WSF_A_p1'.

    Parameters
    ----------
    valid_sets_df : DataFrame
        Sets table with at least the columns 'valid_top_8_bracket',
        'tournament_key', 'top_8_bracket_location_names', 'p1_id', 'p2_id'.
    valid_tournament_info_df : DataFrame
        Tournament table with a 'key' column matching 'tournament_key'.

    Returns
    -------
    DataFrame
        `valid_tournament_info_df` left-merged with the eight player columns.
        A tournament with no entry for a slot gets NaN in that column.

    Notes
    -----
    The default arguments are bound to the notebook globals when this cell
    is executed; re-run the cell after rebuilding those frames.
    """
    loc_names = ['WSF_A', 'WSF_B', 'LN_A', 'LN_B']
    # Expected output columns, in the sorted order pivot_table emits them.
    player_cols = [f'{loc}_{slot}' for loc in sorted(loc_names) for slot in ('p1', 'p2')]

    # Filter on the tournaments that were actually passed in rather than on a
    # notebook global, so the function only depends on its own arguments.
    # Tournaments absent from valid_tournament_info_df would be dropped by
    # the left merge below anyway.
    masked_df = valid_sets_df[
        (valid_sets_df['valid_top_8_bracket'] == True) &
        (valid_sets_df['tournament_key'].isin(valid_tournament_info_df['key'])) &
        (valid_sets_df['top_8_bracket_location_names'].isin(loc_names))
    ].copy()

    # Use the tournament key as 'key' so the pivot result merges directly
    # with the tournament table.
    masked_df['key'] = masked_df['tournament_key']

    # Long format: one row per (set, player slot).
    melted_df = masked_df.melt(
        id_vars=['key', 'top_8_bracket_location_names'],
        value_vars=['p1_id', 'p2_id'],
        var_name='pn_id',
        value_name='player_id'
    )
    # 'p1_id' -> 'p1', 'p2_id' -> 'p2', then prefix with the bracket location.
    slot = melted_df['pn_id'].str.split('_').str[0]
    melted_df['loc_id'] = melted_df['top_8_bracket_location_names'] + '_' + slot

    # Wide format: one row per tournament, one column per location/slot.
    # If a slot appears more than once in a tournament, the first value is kept.
    pivot_df = melted_df.pivot_table(
        index='key',
        columns='loc_id',
        values='player_id',
        aggfunc='first'
    )
    # Guarantee all eight columns exist even if a slot never occurs in the data.
    pivot_df = pivot_df.reindex(columns=player_cols).reset_index()

    return valid_tournament_info_df.merge(pivot_df, on='key', how='left')

# Build the tournament table with the top 8 player columns and preview two rows.
top_8_tournament_df = add_top_8_players()
top_8_tournament_df.head(2)
    

In [None]:
def add_top_8_previous_sets(valid_sets_df=valid_sets_df, top_8_tournament_df=top_8_tournament_df):
    """List, for each top 8 player slot, the sets that player played outside the top 8.

    Parameters
    ----------
    valid_sets_df : DataFrame
        Sets table with at least 'tournament_key', 'p1_id', 'p2_id',
        'winner_id' and a boolean 'top_8' column.
    top_8_tournament_df : DataFrame
        Tournament table with a 'key' column and the eight player columns
        ('LN_A_p1' ... 'WSF_B_p2').

    Returns
    -------
    DataFrame
        A copy of `top_8_tournament_df` with one extra column per player
        slot, named '<slot>_non_top_8_sets'. Each cell is a list of
        `(set_index, won)` tuples, where `set_index` is the set's index
        label in `valid_sets_df` and `won` is True when that player won the
        set. Tournaments with no such sets for a slot get NaN.

    Notes
    -----
    The default arguments are bound to the notebook globals when this cell
    is executed; re-run the cell after rebuilding those frames.
    """
    player_cols = ['LN_A_p1', 'LN_A_p2', 'LN_B_p1', 'LN_B_p2', 'WSF_A_p1', 'WSF_A_p2', 'WSF_B_p1', 'WSF_B_p2']
    set_index_col = '_source_set_index'

    valid_df = valid_sets_df.copy()
    # The loser is whichever of p1 / p2 is not the winner.
    valid_df['loser_id'] = valid_df['p1_id']
    p1_won = valid_df['winner_id'] == valid_df['p1_id']
    valid_df.loc[p1_won, 'loser_id'] = valid_df['p2_id']
    # A column-on-column merge discards the left frame's index, so keep the
    # original labels in a column. Otherwise the stored "set index" would be
    # a row position in the temporary merged frame, not a label that can be
    # looked up in valid_sets_df.
    valid_df[set_index_col] = valid_df.index

    # Attach the eight top 8 player ids of the tournament to every set (single merge).
    merge_df = pd.merge(
        valid_df,
        top_8_tournament_df[['key'] + player_cols],  # keep only relevant columns
        left_on='tournament_key',
        right_on='key'
    )

    # Loop-invariant: sets that are not part of the top 8 bracket.
    outside_top_8 = ~merge_df['top_8']

    valid_tourn_df = top_8_tournament_df.copy()
    for player_col in tqdm(player_cols):
        # Sets in which this slot's player took part, as winner or as loser.
        played = (merge_df['winner_id'] == merge_df[player_col]) | (merge_df['loser_id'] == merge_df[player_col])

        player_sets = merge_df.loc[
            played & outside_top_8,
            ['tournament_key', set_index_col, 'winner_id', player_col]
        ].copy()

        # Pair each set's original index label with whether the player won it.
        player_sets['top_8_player_wins'] = player_sets['winner_id'] == player_sets[player_col]
        player_sets['top_8_player_matches'] = list(
            zip(player_sets[set_index_col], player_sets['top_8_player_wins'])
        )

        # One list of (set_index, won) tuples per tournament.
        df = player_sets.groupby('tournament_key')['top_8_player_matches'].agg(list).reset_index()
        df.columns = ['key', f'{player_col}_non_top_8_sets']

        valid_tourn_df = valid_tourn_df.merge(df, on='key', how='left')

    return valid_tourn_df

# Build the per-player pre-top-8 set lists, then preview the last 16 columns
# of the first three rows.
top_8_tournament_with_sets_df = add_top_8_previous_sets()
top_8_tournament_with_sets_df.iloc[:3, -16:].head()


In [None]:
# top_8_tournament_with_sets_df.to_pickle(data_path + 'top_8_tournament_previous_sets_and_results_df')