# Exploration of Top 8
In this notebook we want to:
- Filter out tournaments that do not have the canonical sets_df['location_names']
- Label the top 8 sets of a tournament.
- Determine the bracket, i.e. which of the losers of the winners set plays which of the winners of the losers sets.

We are also interested in:
- How often does a grand finals reset occur?
- How often does the winner of the loser's finals win the tournament?
- How often does a player coming into the top 8 from losers win the tournament.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

# Use the mounted workspace data directory when it exists; otherwise fall back
# to the data directory next to the notebook folder.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [None]:
def get_table_names(conn):
    """Return the names of all tables in the SQLite database behind ``conn``.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        A list with the ``name`` of every ``type='table'`` row in
        ``sqlite_master``.
    """
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()


def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQLite identifier."""
    # Doubling embedded quotes is how SQLite escapes them inside "...".
    return '"' + name.replace('"', '""') + '"'


def load_tables_to_dfs(conn):
    """Load every table of the database into a pandas DataFrame.

    Columns in which every value is a string and every value parses as JSON
    are replaced by the parsed Python objects; any other column is left
    untouched.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        A dict mapping ``"<table name>_df"`` to that table's DataFrame.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the table name so names containing spaces, punctuation or SQL
        # keywords do not break (or alter) the query.
        df = pd.read_sql(f"SELECT * FROM {_quote_identifier(table)}", conn)

        for col in df.columns:
            # Only columns made up entirely of strings can hold JSON text.
            if not df[col].apply(lambda x: isinstance(x, str)).all():
                continue
            try:
                # The assignment only happens if every value parses, so a
                # column is either converted completely or not at all.
                df[col] = df[col].apply(json.loads)
            except (json.JSONDecodeError, TypeError):
                # Not a JSON column; keep the original strings.
                continue

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load a dfs_dict.pkl that was produced by this project.
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)

# Only read the SQLite database when no pickle was loaded above and the flag
# is not already present in the global scope, so this code does not run twice.
if 'cell_has_run' not in globals():
    path = data_path + "melee_player_database.db"

    # sqlite3.connect would silently create an empty database at a missing
    # path; fail with a clear error instead.
    if not os.path.exists(path):
        raise FileNotFoundError(f"SQLite database not found: {path}")

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Close the connection even if loading a table fails
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # 'start' and 'end' are converted from seconds-since-epoch to datetimes.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [None]:
# Missing best_of values become 0 so the column can be cast to int.
best_of_filled = dfs['sets_df']['best_of'].fillna(0)
dfs['sets_df']['best_of'] = best_of_filled.astype(int)

In [None]:
# # Save the dictionary of DataFrames as a pickle
# with open(data_path + 'dfs_dict.pkl', 'wb') as f:
#     pickle.dump(dfs, f)

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played that character. (We verify this for Zain below.)

In [None]:
# Pull the players table out of the loaded dictionary and preview its first rows.
players_df = dfs['players_df']
players_df.head(n=5)


In [None]:
# Pull the ranking table out of the loaded dictionary and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head(n=5)

In [None]:
# Pull the ranking seasons table out of the loaded dictionary and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head(n=5)

In [None]:
sets_df = dfs['sets_df']

# A set "has game data" when its game_data entry is non-empty.
has_game_data = sets_df['game_data'].apply(lambda games: len(games) > 0)
game_data_fraction = sets_df[has_game_data].shape[0] / sets_df.shape[0]
print(f"{game_data_fraction:0.01%} percent of sets have some game data")
sets_df.shape



In [None]:
# Pull the tournament info table out of the loaded dictionary, then show its
# dimensions and first rows.
tournament_info_df = dfs['tournament_info_df']
tournament_info_shape = tournament_info_df.shape
print(tournament_info_shape)
tournament_info_df.head(n=5)


## Filter out some tournaments
We start by looking at which sets_df['location_names'] values are the most common.

In [None]:
# Count how often each location_names value occurs; .to_string() renders the
# full listing so that no rows are elided from the printout.
location_name_counts = sets_df['location_names'].value_counts()
print(location_name_counts.to_string())

From the value counts we see that there are several sets_df['location_names'] values that correspond to the finals of a tournament:
- ['GF', 'Grand Final', 'Grand Final']              35523
- ['F', 'Final', 'Final']                           615
- ['Grand Finals', 'Grand Finals', 'Grand Finals']  7
- [Grand Final, Grand Final, Grand Final]           1

We will filter out the tournaments that do not have a set with ['GF', 'Grand Final', 'Grand Final'] as its location names. That way the location names of all the sets in the remaining tournaments should be consistent.

In [None]:
# The canonical location_names triple of a Grand Final set.
canonical_gf_names = ['GF', 'Grand Final', 'Grand Final']

# Sets whose location_names equal the canonical triple exactly
is_canonical_gf = sets_df['location_names'].apply(lambda names: names == canonical_gf_names)
gf_sets_df = sets_df.loc[is_canonical_gf]

# Tournament key of every such set (one entry per set, so a key can repeat)
gf_tournament_keys = list(gf_sets_df['tournament_key'])

# Keep every set that belongs to a tournament with a canonical Grand Final
in_gf_tournament = sets_df['tournament_key'].isin(gf_tournament_keys)
valid_tournament_sets_df = sets_df.loc[in_gf_tournament]


Here is the structure of a typical top 8 bracket.
![alt text](top_8.png "Top 8 Bracket")
We need to figure out what location names correspond to which positions.



I suspect that the location names of the top 8 games are the following:
- [f"L{n}", f"Losers {n}", f"Losers Round {n}"], # Where n is the largest n among all such locations in the tournament.
- ['WSF', 'Winners Semis', 'Winners Semi-Final'],
- ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
- ['WF', 'Winners Final', 'Winners Final'],
- ['LSF', 'Losers Semis', 'Losers Semi-Final'],
- ['LF', 'Losers Final', 'Losers Final'],
- ['GF', 'Grand Final', 'Grand Final'],
- ['GFR', 'GF Reset', 'Grand Final Reset']

We will test that hypothesis.

In [None]:
# Hypothesised location_names triples of the top 8 sets.  The L{n} round is
# left out for now.
top_8_locations = [
    ['WSF', 'Winners Semis', 'Winners Semi-Final'],
    ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
    ['WF', 'Winners Final', 'Winners Final'],
    ['LSF', 'Losers Semis', 'Losers Semi-Final'],
    ['LF', 'Losers Final', 'Losers Final'],
    ['GF', 'Grand Final', 'Grand Final'],
    ['GFR', 'GF Reset', 'Grand Final Reset'],
]

# How many sets carry each of the hypothesised triples
has_top_8_location = valid_tournament_sets_df['location_names'].isin(top_8_locations)
valid_tournament_sets_df[has_top_8_location]['location_names'].value_counts()

If our hypothesis were correct, then there should be the same number of sets with location_names WF, LF, and GF, because the grand finals consist of the winner of the winners final and the winner of the losers final. But the counts of those in our filtered data set do not match.

In [None]:
# gf_tournament_keys holds one entry per Grand Final set, so a tournament with
# more than one 'GF' set appears more than once; count distinct keys to get
# the number of tournaments.
print('The number of tourmanets in our filtered dataset is', len(set(gf_tournament_keys)))
print()
# Display the value counts of the remaining location names.
print(valid_tournament_sets_df['location_names'].value_counts().to_string())

In [None]:
# Collapse each location_names list (e.g. ['GF', 'Grand Final', 'Grand Final'])
# to its first element, the short code.  Values that are no longer lists are
# left alone so re-running this cell does not truncate 'GF' to 'G', and
# assign() builds a new frame instead of writing into a slice of sets_df.
valid_tournament_sets_df = valid_tournament_sets_df.assign(
    location_names=valid_tournament_sets_df['location_names'].apply(
        lambda x: x[0] if isinstance(x, (list, tuple)) else x
    )
)

We need to process the tournaments with an empty bracket_name separately from the tournaments with a non-empty bracket_name. We start with the tournaments that have nothing in the bracket_name column.

In [None]:
# Sets whose bracket_name is the empty string
has_no_bracket_name = valid_tournament_sets_df['bracket_name'] == ""
valid_tournament_sets_no_bracket_df = valid_tournament_sets_df.loc[has_no_bracket_name]

# Short codes of the top 8 locations, not including L{n}
top_8_locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

# Sets played at one of those locations
at_top_8_location = valid_tournament_sets_no_bracket_df['location_names'].isin(top_8_locations)
top_8_no_bracket_name_df = valid_tournament_sets_no_bracket_df.loc[at_top_8_location]

# Show how many sets there are per location so the totals can be compared.
top_8_no_bracket_name_df['location_names'].value_counts()

We should expect the same number of WF, LSF, and LF sets as there are GF sets, and double that number for each of LQF and WSF. I hypothesise that some of these sets are labelled as W{n} and L{n} instead. Let's check whether any of the tournaments do not have a set labelled WF.

In [None]:
# Group the no-bracket-name sets by tournament so each one can be inspected.
groups = valid_tournament_sets_no_bracket_df.groupby('tournament_key')

# Flipped to False as soon as one tournament has no 'WF' set
all_tournaments_have_wf = True
for key, frame in groups:
    if (frame['location_names'] == "WF").any():
        continue
    print(key, "has no WF sets.")
    all_tournaments_have_wf = False

if all_tournaments_have_wf:
    print("All tournaments have a WF set.")

All tournaments have a WF. Let's check whether there are tournaments with two GF sets.

In [None]:
# Keys of the tournaments that contain exactly two sets labelled 'GF'
tournaments_with_gfr = []

for key, frame in groups:
    gf_set_count = (frame['location_names'] == "GF").sum()
    if gf_set_count != 2:
        continue
    print(key, "has two gf sets.")
    tournaments_with_gfr.append(key)



So in all these tournaments, the grand finals reset has location name GF. Let's print out the GF sets of the tournaments with a GF reset.

In [None]:
# 'GF' sets that belong to one of the tournaments collected in tournaments_with_gfr
in_reset_tournament = valid_tournament_sets_no_bracket_df['tournament_key'].isin(tournaments_with_gfr)
is_grand_final = valid_tournament_sets_no_bracket_df['location_names'] == 'GF'
gfr_sets_df = valid_tournament_sets_no_bracket_df[in_reset_tournament & is_grand_final]
gfr_sets_df

<span style="color:red">To Do: We need to figure out a way to determine which is the GF and which is the GF reset.</span>

From here, we need to get the top 8 tournaments sets with L{n} where n is maximal for that tournament.


In [None]:
# Keep the losers-bracket sets: those whose location name starts with 'L'.
# str.startswith also copes with empty strings, and na=False treats missing
# names as "not a losers set".
losers_sets_df = valid_tournament_sets_no_bracket_df[
    valid_tournament_sets_no_bracket_df['location_names'].str.startswith('L', na=False)
]

# Sort by tournament, then by round.  A plain string sort would place 'L10'
# before 'L2', so the round number of every 'L<n>' name is zero-padded in a
# temporary sort key.  Digits still sort before letters, so the L<n> rounds
# keep coming before 'LF', 'LQF' and 'LSF'.
round_sort_key = losers_sets_df['location_names'].str.replace(
    r'^L(\d+)$', lambda m: f"L{int(m.group(1)):06d}", regex=True
)
losers_sets_df = (
    losers_sets_df.assign(_round_sort_key=round_sort_key)
    .sort_values(['tournament_key', '_round_sort_key'])
    .drop(columns='_round_sort_key')
)
groups = losers_sets_df.groupby('tournament_key')

# In a tournament with one LF, two LQF and one LSF set, the last four sorted
# rows are those sets, so rows -6 and -5 are the two sets of the highest L<n>
# round.  iloc keeps the index labels the rows have in losers_sets_df.
last_l_sets = [frame.iloc[-6:-4] for _, frame in groups]

# Unpacking the per-tournament slices into a single concat also works when
# there are no losers sets at all.
all_top_8_no_bracket_name_df = pd.concat([top_8_no_bracket_name_df, *last_l_sets])
all_top_8_no_bracket_name_df.sort_values(['tournament_key','location_names'], inplace=True)
all_top_8_no_bracket_name_df.head(30)

### ``all_top_8_no_bracket_name_df`` should be all the top 8 sets without a bracket_name. Looping over groups is slow, lets avoid that.

In [None]:
# Filter for tournaments with empty 'bracket_name'
valid_tournament_sets_no_bracket_df = valid_tournament_sets_df[valid_tournament_sets_df['bracket_name'] == ""]

# Define top_8 locations (excluding L{n})
top_8_locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

# Filter sets with those location_names for top_8
top_8_no_bracket_name_df = valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'].isin(top_8_locations)]

# Filter for rows where location_names start with 'L' (missing names count as no match)
losers_sets_df = valid_tournament_sets_no_bracket_df[
    valid_tournament_sets_no_bracket_df['location_names'].str.startswith('L', na=False)
]

# Sort by tournament, then by round.  A plain string sort would place 'L10'
# before 'L2', so the round number of every 'L<n>' name is zero-padded in a
# temporary sort key.  Digits still sort before letters, so the L<n> rounds
# keep coming before 'LF', 'LQF' and 'LSF'.
round_sort_key = losers_sets_df['location_names'].str.replace(
    r'^L(\d+)$', lambda m: f"L{int(m.group(1)):06d}", regex=True
)
losers_sets_df = (
    losers_sets_df.assign(_round_sort_key=round_sort_key)
    .sort_values(['tournament_key', '_round_sort_key'])
    .drop(columns='_round_sort_key')
)

# Select the 6th and 5th last rows from each group in one go
last_l_sets = losers_sets_df.groupby('tournament_key').nth([-6, -5])

# Combine the filtered top_8 sets with the last sets in losers bracket
all_top_8_no_bracket_name_df_2 = pd.concat([top_8_no_bracket_name_df, last_l_sets]).sort_index()
all_top_8_no_bracket_name_df_2.head(10)


Check that the results are the same.

In [None]:
# Display the loop-built frame ordered by index; the frame itself is not modified.
no_bracket_top_8_by_index = all_top_8_no_bracket_name_df.sort_index()
no_bracket_top_8_by_index

In [None]:
# DataFrame.equals compares rows position by position.  The loop-built frame is
# ordered by tournament_key/location_names while the vectorised one is ordered
# by index, so bring the former into index order before comparing.
all_top_8_no_bracket_name_df.sort_index().equals(all_top_8_no_bracket_name_df_2)

Lets now handle the tournaments with a bracket name.

In [None]:
# Filter for tournaments with a non-empty 'bracket_name'
valid_tournament_sets_with_bracket_df = valid_tournament_sets_df[valid_tournament_sets_df['bracket_name'] != ""]

gf_rows = valid_tournament_sets_with_bracket_df[valid_tournament_sets_with_bracket_df['location_names'] == 'GF']

# One row per (tournament, bracket) that contains a 'GF' set.  Duplicates must
# be dropped: a bracket with two 'GF' sets would otherwise match every one of
# its sets twice in the merge below and duplicate them.
gf_bracket_names = gf_rows[['tournament_key', 'bracket_name']].drop_duplicates()

# Merge to get the final bracket sets.  The original index is carried through
# the merge as the 'index' column and restored afterwards.
tournament_sets_final_bracket_df = valid_tournament_sets_with_bracket_df.reset_index().merge(
    gf_bracket_names,
    on=['tournament_key', 'bracket_name'],
    how='inner',
).set_index('index')

# Define top_8 locations (excluding L{n})
top_8_locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

# Filter sets with those location_names for top_8
top_8_with_bracket_name_df = tournament_sets_final_bracket_df[tournament_sets_final_bracket_df['location_names'].isin(top_8_locations)]

# Filter for rows where location_names start with 'L' (missing names count as no match)
losers_sets_df = tournament_sets_final_bracket_df[
    tournament_sets_final_bracket_df['location_names'].str.startswith('L', na=False)
]

# Sort by tournament, then by round.  A plain string sort would place 'L10'
# before 'L2', so the round number of every 'L<n>' name is zero-padded in a
# temporary sort key.  Digits still sort before letters, so the L<n> rounds
# keep coming before 'LF', 'LQF' and 'LSF'.
round_sort_key = losers_sets_df['location_names'].str.replace(
    r'^L(\d+)$', lambda m: f"L{int(m.group(1)):06d}", regex=True
)
losers_sets_df = (
    losers_sets_df.assign(_round_sort_key=round_sort_key)
    .sort_values(['tournament_key', '_round_sort_key'])
    .drop(columns='_round_sort_key')
)

# Select the 6th and 5th last rows from each group in one go
last_l_sets = losers_sets_df.groupby('tournament_key').nth([-6, -5])

# Combine the filtered top_8 sets with the last sets in losers bracket
all_top_8_with_bracket_name_df = pd.concat([top_8_with_bracket_name_df, last_l_sets]).sort_index()

all_top_8_with_bracket_name_df.head(10)

We expect the WF, LF, and LSF to have the same value counts. We expect LQF and WSF to have the same value count which should be double that of the previous three. We see that this is not the case

In [None]:
# Number of top 8 sets per location name in the with-bracket-name data
with_bracket_location_counts = all_top_8_with_bracket_name_df['location_names'].value_counts()
with_bracket_location_counts

We check to see if any location names are missing in each tournament. As we see, all looks good here.

In [None]:
locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

for location in locations:
    # Tournaments (without a bracket name) that have at least one set at this location
    is_location = valid_tournament_sets_no_bracket_df['location_names'] == location
    tournaments_with_wf = valid_tournament_sets_no_bracket_df[is_location]['tournament_key'].unique()

    # True when every tournament key also appears among those tournaments
    all_tournaments_have_wf = set(valid_tournament_sets_no_bracket_df['tournament_key'].unique()).issubset(set(tournaments_with_wf))

    if not all_tournaments_have_wf:
        print(f"At least one tournaments is missing a {location} set.")
    else:
        print(f"All tournaments have at least one {location} set.")


In [None]:
# Every tournament in the no-bracket-name data.  Counts are reindexed against
# these keys so a tournament with zero sets at a location is counted as 0
# instead of being absent from the groupby result.
all_no_bracket_keys = valid_tournament_sets_no_bracket_df['tournament_key'].unique()

# Locations that are expected to occur once per tournament
locations = ['WF', 'LSF', 'LF', 'GF']
for location in locations:
    # Count the sets at *this* location (not always "GF") for each tournament
    location_counts = (
        valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'] == location]
        .groupby('tournament_key')
        .size()
        .reindex(all_no_bracket_keys, fill_value=0)
    )

    # Tournaments with more than one set at this location
    tournaments_with_extra_sets = location_counts[location_counts > 1].index.tolist()

    print(f'There are {len(tournaments_with_extra_sets)} tournaments with more than one {location} sets.')

print()

# Locations that are expected to occur twice per tournament
locations = ['WSF']
for location in locations:
    # Count the sets at this location for each tournament
    location_counts = (
        valid_tournament_sets_no_bracket_df[valid_tournament_sets_no_bracket_df['location_names'] == location]
        .groupby('tournament_key')
        .size()
        .reindex(all_no_bracket_keys, fill_value=0)
    )

    # Tournaments that do not have exactly two sets at this location
    tournaments_without_two_sets = location_counts[location_counts != 2].index.tolist()

    print(f'There are {len(tournaments_without_two_sets)} tournaments without two {location} sets.')

### To Do: Figure out why we don't always get the right number of location_names in a tournament.

<span style="color:red">To Do: For some reason, at least one tournament in the dataframe only has one Ln game. I don't know what that is about and need to investigate it. I lost which tournament it was.</span>

Put both dataframes together.

In [None]:
# Stack the with-bracket-name and no-bracket-name top 8 sets and order them by index.
top_8_sets = pd.concat([all_top_8_with_bracket_name_df, all_top_8_no_bracket_name_df]).sort_index()
top_8_sets.head(20)
# These are all the top 8 sets, including the ones from tournaments without
# the bracket structure we are looking for.
top_8_pickle_path = data_path + 'top_8_sets.pkl'
top_8_sets.to_pickle(top_8_pickle_path)

## Remove bad tournaments
Check for tournaments without the right number of sets in each location.

In [None]:
bad_tournament_keys = []

# Every tournament that has at least one top 8 set.  The per-location counts
# below are reindexed against these keys so that a tournament with zero sets
# at a location is reported as 0 rather than being absent from the counts.
tournament_keys = top_8_sets['tournament_key'].unique()

# These location_names should occur exactly once in the top 8 of each tournament
locations = ['WF', 'LSF', 'LF', 'GF']

for location in locations:
    # Count the occurrences of location in location_names for each tournament
    location_counts = (
        top_8_sets[top_8_sets['location_names'] == location]
        .groupby('tournament_key')
        .size()
        .reindex(tournament_keys, fill_value=0)
    )

    # Identify tournaments without exactly one set at this location
    bad_tournaments_wrt_location = location_counts[location_counts != 1].index.tolist()

    print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly one {location} sets.')
    print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')

    bad_tournament_keys.extend(bad_tournaments_wrt_location)
print()


# These location_names should occur exactly twice in the top 8 of each tournament
locations = ['WSF', 'LQF']
for location in locations:
    # Count the occurrences of location in location_names for each tournament
    location_counts = (
        top_8_sets[top_8_sets['location_names'] == location]
        .groupby('tournament_key')
        .size()
        .reindex(tournament_keys, fill_value=0)
    )

    # Identify tournaments without exactly two sets at this location
    bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()

    print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly two {location} sets.')
    print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')

    bad_tournament_keys.extend(bad_tournaments_wrt_location)
print()

# There should be exactly two sets with a different location_name than these
# (the L{n} sets).  This does not depend on any single location, so it is
# computed once rather than once per entry of the list.
locations = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

location_counts = (
    top_8_sets[~top_8_sets['location_names'].isin(locations)]
    .groupby('tournament_key')
    .size()
    .reindex(tournament_keys, fill_value=0)
)

# Identify tournaments without exactly two such sets
bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()

print(f'There are {len(bad_tournaments_wrt_location)} tournaments without exactly two Ln sets.')
print(f'The bad tournaments are {bad_tournaments_wrt_location[:3]}')

bad_tournament_keys.extend(bad_tournaments_wrt_location)

# Delete duplicates (keeping first-seen order)
bad_tournament_keys = list(dict.fromkeys(bad_tournament_keys))
print(bad_tournament_keys)

print(f"There are {top_8_sets['tournament_key'].unique().shape[0]} - {len(bad_tournament_keys)} = {top_8_sets['tournament_key'].unique().shape[0] - len(bad_tournament_keys)} tournaments remaininng.")

Check what is going on with one of the bad tournaments. https://www.start.gg/tournament/-1340/events

As we can see, the tournament structure is not what we are looking for.

![alt text](top_8_1340.png "Top 8 Bracket of 1340")


Remove the tournaments from top_8_sets.

In [None]:
# Make a list of tournament keys we are going to keep.  Membership is tested
# against a set so the filter stays linear in the number of tournaments
# instead of scanning the bad-key list once per tournament.
bad_tournament_key_set = set(bad_tournament_keys)
good_tournament_keys = [key for key in top_8_sets['tournament_key'].unique() if key not in bad_tournament_key_set]
print(f"There are {len(good_tournament_keys)} good tournament keys and {len(bad_tournament_keys)} bad tournament keys.")

# Copy so that later edits to good_top_8_sets do not touch top_8_sets
good_top_8_sets = top_8_sets[top_8_sets['tournament_key'].isin(good_tournament_keys)].copy()
print(f"We had {top_8_sets.shape[0]} top 8 sets and are left with {good_top_8_sets.shape[0]} good top 8 sets.")
print()

print(good_top_8_sets['location_names'].value_counts())

In [None]:
bad_tournament_keys = []

# Get all tournament keys.  Every count below is reindexed against them so
# that tournaments with no set at a location get a count of zero.
tournament_keys = top_8_sets['tournament_key'].unique()

# These location_names should occur exactly once in the top 8 of each tournament
locations_single = ['WF', 'LSF', 'LF', 'GF']

for location in locations_single:
    # Count the occurrences of location in location_names for each tournament
    location_counts = top_8_sets[top_8_sets['location_names'] == location].groupby('tournament_key').size()

    # Reindex to include all tournaments, filling missing counts with zero
    location_counts = location_counts.reindex(tournament_keys, fill_value=0)

    # Identify tournaments without exactly one occurrence
    bad_tournaments_wrt_location = location_counts[location_counts != 1].index.tolist()

    bad_tournament_keys.extend(bad_tournaments_wrt_location)

# For 'GFR', the count per tournament should be either 0 or 1
gfr_counts = top_8_sets[top_8_sets['location_names'].isin(['GFR'])].groupby('tournament_key').size()

# Reindex and fill missing counts with zero
gfr_counts = gfr_counts.reindex(tournament_keys, fill_value=0)

# Identify tournaments with more than one GFR set
bad_tournaments_wrt_gfr = gfr_counts[~gfr_counts.isin([0, 1])].index.tolist()

bad_tournament_keys.extend(bad_tournaments_wrt_gfr)

# These location_names should occur exactly twice in the top 8 of each tournament
locations_double = ['WSF', 'LQF']

for location in locations_double:
    location_counts = top_8_sets[top_8_sets['location_names'] == location].groupby('tournament_key').size()
    location_counts = location_counts.reindex(tournament_keys, fill_value=0)
    bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()
    bad_tournament_keys.extend(bad_tournaments_wrt_location)

# For the remaining location names (the L{n} sets), there should be exactly two such sets per tournament
locations_exclude = ['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR']

location_counts = top_8_sets[~top_8_sets['location_names'].isin(locations_exclude)].groupby('tournament_key').size()
location_counts = location_counts.reindex(tournament_keys, fill_value=0)
bad_tournaments_wrt_location = location_counts[location_counts != 2].index.tolist()

bad_tournament_keys.extend(bad_tournaments_wrt_location)

# Remove duplicates.  The set is kept for the membership test below so the
# filter is linear instead of scanning the list once per tournament.
bad_tournament_key_set = set(bad_tournament_keys)
bad_tournament_keys = list(bad_tournament_key_set)

print(f"There are {len(bad_tournament_keys)} bad tournament keys.")

# Filter out bad tournaments
good_tournament_keys = [key for key in tournament_keys if key not in bad_tournament_key_set]
good_top_8_sets = top_8_sets[top_8_sets['tournament_key'].isin(good_tournament_keys)].copy()

print(f"We had {top_8_sets.shape[0]} top 8 sets and are left with {good_top_8_sets.shape[0]} good top 8 sets.")
print()
print(good_top_8_sets['location_names'].value_counts())
# Everything that is not one of the named locations is counted as an Ln set.
ln_set_count = (~good_top_8_sets['location_names'].isin(['WSF', 'LQF', 'WF', 'LSF', 'LF', 'GF', 'GFR'])).sum()
print(f"Ln   {ln_set_count}")

In [None]:
# These are all the top 8 sets from tournaments with the correct bracket structure.
good_top_8_pickle_path = data_path + 'good_top_8_sets.pkl'
good_top_8_sets.to_pickle(good_top_8_pickle_path)

## Label the top 8 sets we will use for training.
This is how you would use the saved data frame to add the label column to the data. This only works if the original index of sets_df has not been reset, because the rows are matched by index label.

In [33]:
# Reload the saved top 8 sets of the tournaments with the expected bracket structure.
good_top_8_sets = pd.read_pickle(data_path + 'good_top_8_sets.pkl')

# Index labels of those sets, as a list
indices = good_top_8_sets.index.tolist()

# Work on a copy of the dataframe to be labelled; every row starts out as
# False in the new 'good_top_8' column.
labelled_sets_df = sets_df.copy()
labelled_sets_df['good_top_8'] = False

# Flip 'good_top_8' to True for the rows whose index label is in indices
labelled_sets_df.loc[indices, 'good_top_8'] = True
