# Exploration of rating


In [301]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

if os.path.exists('/workspace/data'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data/'
else:
    data_path = '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']


In [302]:
# # Function to get the table names
# def get_table_names(conn):
#     query = "SELECT name FROM sqlite_master WHERE type='table';"
#     return pd.read_sql(query, conn)['name'].tolist()

# # Function to load tables into DataFrames
# def load_tables_to_dfs(conn):
#     table_names = get_table_names(conn)
#     dataframes = {}
    
#     for table in table_names:
#         # Load table into a DataFrame
#         df = pd.read_sql(f"SELECT * FROM {table}", conn)
        
#         # Detect and convert JSON formatted columns (if any)
#         for col in df.columns:
#             # Check if any entry in the column is a valid JSON (list or dictionary)
#             if df[col].apply(lambda x: isinstance(x, str)).all():
#                 try:
#                     # Try parsing the column as JSON
#                     df[col] = df[col].apply(lambda x: json.loads(x) if pd.notnull(x) else x)
#                 except (json.JSONDecodeError, TypeError):
#                     # If it fails, skip the column
#                     pass
        
#         # Store the DataFrame with table name + '_df'
#         dataframes[f"{table}_df"] = df
        
#     return dataframes

# if os.path.exists(data_path + 'dfs_dict.pkl'):
#     cell_has_run = True
#     # Load the dictionary of DataFrames from the pickle
#     with open(data_path + 'dfs_dict.pkl', 'rb') as f:
#         dfs = pickle.load(f)
# # Check if the flag variable exists in the global scope so that this code does not run twice
# if 'cell_has_run' not in globals():
#     path = + data_path + "melee_player_database.db"
    
#     # Connect to the database
#     conn = sqlite3.connect(path)

#     # Convert each table into a DataFrame
#     dfs = load_tables_to_dfs(conn)

#     # Close the connection
#     conn.close()

#     # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
#     # For example, to access the DataFrame for a table called 'players':
#     # players_df = dfs['players_df']

#     dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
#     dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    
#     # Set the flag to indicate that the cell has been run
#     cell_has_run = True

### Load Data


In [303]:
players_df = pd.read_pickle(data_path + '/labelled_data/players_df.pkl')
players_df.head()


Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [304]:
sets_df = pd.read_pickle(data_path + '/labelled_data/sets_df.pkl')
print(f"{sets_df[sets_df['game_data'].apply(lambda x: len(x) > 0)].shape[0] / sets_df.shape[0]:0.01%} percent of sets have some game data")
print(sets_df.shape)
sets_df.head()


32.9% percent of sets have some game data
(1795681, 21)


Unnamed: 0,key,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,location_names,...,bracket_order,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,valid_score
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,5620,Chillin,3,1,"[R1, Round 1, Round 1]",...,1,A,5,[],False,,False,,False,True
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,15634,Aglet,2,3,"[R1, Round 1, Round 1]",...,1,B,5,[],False,,False,,False,True
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,6126,1097,3,0,"[R1, Round 1, Round 1]",...,1,C,5,[],False,,False,,False,True
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,Chu,1069,0,3,"[R1, Round 1, Round 1]",...,1,D,5,[],False,,False,,False,True
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,"[R1, Round 1, Round 1]",...,1,E,5,[],False,,False,,False,True


In [305]:
tournament_info_df = pd.read_pickle(data_path + '/labelled_data/tournament_info_df.pkl')
print(tournament_info_df.shape)
tournament_info_df.head()


(39675, 37)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets,major
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,...,,,,,,,,,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,Ginger,"[(32, True), (62, True), (77, False), (164, Tr...","[(39, True), (65, True), (78, False), (165, Tr...","[(47, True), (69, True), (80, False), (159, Tr...","[(40, True), (66, True), (79, True), (85, Fals...","[(28, True), (60, True), (76, True), (84, True)]","[(36, True), (64, True), (78, True), (85, True)]","[(44, True), (68, True), (80, True), (86, True)]","[(52, True), (72, True), (82, True), (87, True)]",
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,...,,,,,,,,,,True
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,...,,,,,,,,,,True
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,...,,,,,,,,,,True


In [306]:
overall_players_ranking_new_weekly_df = pd.read_pickle(data_path + 'overall_players_ranking_new_weekly.pkl')
overall_players_ranking_new_weekly_df.sample(3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2019-04-25,1500.0,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104
2018-04-26,1500.0,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104
2022-07-14,1305.337441,1500.0,1500.0,1500.0,1506.248436,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1264.815941,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1514.677506,1559.078031,1482.878104


In [307]:
overall_players_rds_new_weekly_df = pd.read_pickle(data_path + 'overall_players_rds_new_weekly.pkl')
overall_players_rds_new_weekly_df.sample(3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2017-06-08,350.0,350.0,350.0,350.0,254.735387,233.16009,350.0,350.0,350.0,350.0,...,350.0,350.0,308.110135,250.018187,223.482282,350.0,350.0,350.0,202.22714,161.713129
2018-01-11,350.0,350.0,350.0,350.0,261.264449,240.273332,350.0,350.0,350.0,350.0,...,350.0,350.0,313.527839,256.664383,230.893485,350.0,350.0,350.0,210.387479,171.808279
2022-04-07,259.544882,350.0,350.0,350.0,50.298278,285.900916,350.0,350.0,350.0,350.0,...,91.634259,350.0,349.727355,299.806113,278.062245,350.0,350.0,197.23797,261.279313,231.346432


In [308]:
char_vs_char_player_rankings_weekly_alt2_df = pd.read_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt2.pkl')
char_vs_char_player_rankings_weekly_alt2_df.sample(3)


Unnamed: 0,3688504/sheik/roy,3688504/sheik/samus,3688504/sheik/sheik,3689802/sheik/marth,3689821/falco/falco,3689821/falco/fox,3689821/falco/jigglypuff,3689821/falco/kirby,3689821/falco/marth,3689821/falco/mewtwo,...,368847/fox/pikachu,368847/fox/samus,368847/fox/sheik,3688504/sheik/captainfalcon,3688504/sheik/falco,3688504/sheik/fox,3688504/sheik/jigglypuff,3688504/sheik/luigi,3688504/sheik/marth,3688504/sheik/peach
2021-09-09,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2015-01-22,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2020-01-30,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


In [309]:
char_vs_char_player_rankings_weekly_alt2_rds_df = pd.read_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt2_rds.pkl')
char_vs_char_player_rankings_weekly_alt2_rds_df.sample(3)

Unnamed: 0,1960158/fox/mewtwo,1960158/fox/roy,1960158/jigglypuff/falco,1960158/jigglypuff/fox,1960158/jigglypuff/iceclimbers,1960158/jigglypuff/jigglypuff,1960158/jigglypuff/link,1960158/jigglypuff/luigi,1960158/jigglypuff/marth,1960158/jigglypuff/sheik,...,999886/luigi/falco,999886/marth/captainfalcon,999886/sheik/falco,999888/sheik/captainfalcon,999888/sheik/falco,999888/sheik/fox,999888/sheik/jigglypuff,999888/sheik/marth,999888/sheik/mrgameandwatch,999888/sheik/samus
2015-01-08,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
2022-10-13,350.0,350.0,143.641543,93.907859,304.305274,152.087082,302.774908,251.114745,179.140214,174.14973,...,266.590236,241.167082,280.961989,185.624329,212.016796,216.841502,239.136472,162.168177,247.405143,247.405143
2020-03-26,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0


In [310]:
top_player_id = {
    'aMSa': '1021',
    'Cody': '19554',
    'Mang0': '1000',
    'Zain': '6126',
    'Armada': '6189',
    'Wizzrobe': '1028',
    'Mew2King': '1003',
    'PPMD': '1002',
    'Hungrybox': '1004',
    'Plup': '15990',
    'Axe': '16342'
}

# players_df[players_df['tag']=='Hungrybox']
# players_df.loc[100:120]



I want a function that quickly finds a player's alt_2 and alt_2_rds for all relevant matchups.


In [311]:
# Sort column names and cache them for faster lookup
alt2_char_char_columns = char_vs_char_player_rankings_weekly_alt2_df.columns
alt2_sorted_indices = np.argsort(alt2_char_char_columns)
alt2_sorted_columns = alt2_char_char_columns[alt2_sorted_indices]
alt2_sorted_columns_id = [label.split('/')[0] for label in alt2_char_char_columns[alt2_sorted_indices]]
alt2_sorted_columns_id_char = [label.split('/')[0] + '/' + label.split('/')[1] for label in alt2_char_char_columns[alt2_sorted_indices]]

# Create a mapping for easy access
sorted_alt2_column_mapping = {
    "sorted_indices": alt2_sorted_indices,
    "sorted_columns": alt2_sorted_columns,
    "sorted_columns_id": alt2_sorted_columns_id,
    "sorted_columns_id_char": alt2_sorted_columns_id_char
}

def get_alt2_player_columns(player_id, df=char_vs_char_player_rankings_weekly_alt2_df,sorted_column_mapping=sorted_alt2_column_mapping):
    sorted_indices = sorted_column_mapping["sorted_indices"]
    sorted_columns = sorted_column_mapping["sorted_columns_id"]
    
    # Find the start and end range for player_id
    start_idx = np.searchsorted(sorted_columns, player_id, side='left')
    end_idx = np.searchsorted(sorted_columns, player_id, side='right')
    
    # If no columns are found, return an empty DataFrame
    if start_idx == end_idx:
        return pd.DataFrame()  # or handle the case in another way if preferred
    
    # Map sorted indices back to original column order and subset DataFrame
    player_indices = sorted_indices[start_idx:end_idx]
    return df.iloc[:, player_indices]

def get_alt2_player_character_columns(player_id, player_char, df=char_vs_char_player_rankings_weekly_alt2_df, sorted_column_mapping=sorted_alt2_column_mapping):
    sorted_indices = sorted_column_mapping["sorted_indices"]
    sorted_columns = sorted_column_mapping["sorted_columns_id_char"]
    
    # Format search string as player_id/player_character/opponent_character
    search_pattern = player_id + '/' + player_char
    
    # Find the start and end range for player columns
    start_idx = np.searchsorted(sorted_columns, search_pattern, side='left')
    end_idx = np.searchsorted(sorted_columns, search_pattern, side='right')
    
    if start_idx == end_idx:
        return pd.DataFrame()  # or handle the case in another way if preferred
    
    # Map sorted indices back to original column order and subset DataFrame
    relevant_columns = sorted_indices[start_idx:end_idx]
    return df.iloc[:, relevant_columns]

def get_alt2_column(player_id, player_char, opponent_char, df=char_vs_char_player_rankings_weekly_alt2_df, sorted_column_mapping=sorted_alt2_column_mapping):
    sorted_indices = sorted_column_mapping["sorted_indices"]
    sorted_columns = sorted_column_mapping["sorted_columns"]
    
    # Format search string as player_id/player_character/opponent_character
    search_pattern = player_id + '/' + player_char + '/' + opponent_char
    
    # Find the start and end range for player columns
    idx = np.searchsorted(sorted_columns, search_pattern, side='left')

    return df.iloc[:, sorted_indices[idx]]


In [312]:
# Sort column names and cache them for faster lookup
alt2_rds_char_char_columns = char_vs_char_player_rankings_weekly_alt2_rds_df.columns
alt2_rds_sorted_indices = np.argsort(alt2_rds_char_char_columns)
alt2_rds_sorted_columns = alt2_rds_char_char_columns[alt2_rds_sorted_indices]
alt2_rds_sorted_columns_id = [label.split('/')[0] for label in alt2_rds_char_char_columns[alt2_rds_sorted_indices]]
alt2_rds_sorted_columns_id_char = [label.split('/')[0] + '/' + label.split('/')[1] for label in alt2_rds_char_char_columns[alt2_rds_sorted_indices]]

# Create a mapping for easy access
sorted_alt2_rds_column_mapping = {
    "sorted_indices": alt2_rds_sorted_indices,
    "sorted_columns": alt2_rds_sorted_columns,
    "sorted_columns_id": alt2_rds_sorted_columns_id,
    "sorted_columns_id_char": alt2_rds_sorted_columns_id_char
}

def get_alt2_rds_player_columns(player_id, df=char_vs_char_player_rankings_weekly_alt2_rds_df,sorted_column_mapping=sorted_alt2_rds_column_mapping):
    sorted_indices = sorted_column_mapping["sorted_indices"]
    sorted_columns = sorted_column_mapping["sorted_columns_id"]
    
    # Find the start and end range for player_id
    start_idx = np.searchsorted(sorted_columns, player_id, side='left')
    end_idx = np.searchsorted(sorted_columns, player_id, side='right')
    
    # If no columns are found, return an empty DataFrame
    if start_idx == end_idx:
        return pd.DataFrame()  # or handle the case in another way if preferred
    
    # Map sorted indices back to original column order and subset DataFrame
    player_indices = sorted_indices[start_idx:end_idx]
    return df.iloc[:, player_indices]

def get_alt2_rds_player_character_columns(player_id, player_char, df=char_vs_char_player_rankings_weekly_alt2_rds_df, sorted_column_mapping=sorted_alt2_rds_column_mapping):
    sorted_indices = sorted_column_mapping["sorted_indices"]
    sorted_columns = sorted_column_mapping["sorted_columns_id_char"]
    
    # Format search string as player_id/player_character/opponent_character
    search_pattern = player_id + '/' + player_char
    
    # Find the start and end range for player columns
    start_idx = np.searchsorted(sorted_columns, search_pattern, side='left')
    end_idx = np.searchsorted(sorted_columns, search_pattern, side='right')
    
    if start_idx == end_idx:
        return pd.DataFrame()  # or handle the case in another way if preferred
    
    # Map sorted indices back to original column order and subset DataFrame
    relevant_columns = sorted_indices[start_idx:end_idx]
    return df.iloc[:, relevant_columns]

def get_alt2_rds_column(player_id, player_char, opponent_char, df=char_vs_char_player_rankings_weekly_alt2_rds_df, sorted_column_mapping=sorted_alt2_rds_column_mapping):
    sorted_indices = sorted_column_mapping["sorted_indices"]
    sorted_columns = sorted_column_mapping["sorted_columns"]
    
    # Format search string as player_id/player_character/opponent_character
    search_pattern = player_id + '/' + player_char + '/' + opponent_char
    
    # Find the start and end range for player columns
    idx = np.searchsorted(sorted_columns, search_pattern, side='left')

    return df.iloc[:, sorted_indices[idx]]


In [313]:
sets_df.head()

Unnamed: 0,key,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,location_names,...,bracket_order,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,valid_score
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,5620,Chillin,3,1,"[R1, Round 1, Round 1]",...,1,A,5,[],False,,False,,False,True
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,15634,Aglet,2,3,"[R1, Round 1, Round 1]",...,1,B,5,[],False,,False,,False,True
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,6126,1097,3,0,"[R1, Round 1, Round 1]",...,1,C,5,[],False,,False,,False,True
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,Chu,1069,0,3,"[R1, Round 1, Round 1]",...,1,D,5,[],False,,False,,False,True
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,"[R1, Round 1, Round 1]",...,1,E,5,[],False,,False,,False,True


In [333]:
sets_with_game_data_df = sets_df[sets_df['game_data'].apply(lambda x: x != [])].copy()
sets_with_game_data_df = sets_with_game_data_df[sets_with_game_data_df['valid_score'] == True]
sets_with_game_data_df['length_gamedata'] = sets_with_game_data_df['game_data'].apply(len)
sets_with_game_data_df = sets_with_game_data_df[sets_with_game_data_df['length_gamedata'].isin([2,3,4,5])]

In [334]:
sets_with_game_data_df.head()

Unnamed: 0,key,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,location_names,...,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,valid_score,length_gamedata
19575,,melee,evo-2018__evo-2018-1,6126,1009,1009,6126,0,2,"[W1, Winners 1, Winners Round 1]",...,B,3,"[{'loser_char': 'melee/fox', 'winner_score': 1...",False,,False,,True,True,2
19582,,melee,evo-2018__evo-2018-1,1004,6126,1004,6126,2,0,"[WQF, Winners Quarters, Winners Quarter-Final]",...,I,3,"[{'loser_char': 'melee/marth', 'winner_score':...",False,,False,,True,True,2
19626,,melee,evo-2018__evo-2018-1,1028,1055,1028,1055,2,0,"[L1, Losers 1, Losers Round 1]",...,G,3,"[{'loser_char': 'melee/sheik', 'winner_score':...",True,LN,True,LN_B,True,True,2
19628,,melee,evo-2018__evo-2018-1,15990,1000,15990,1000,2,0,"[WSF, Winners Semis, Winners Semi-Final]",...,B,3,"[{'loser_char': 'melee/falco', 'winner_score':...",True,WSF,True,WSF_A,True,True,2
19629,,melee,evo-2018__evo-2018-1,1004,1028,1004,1028,2,1,"[LQF, Losers Quarters, Losers Quarter-Final]",...,I,3,"[{'loser_char': 'melee/captainfalcon', 'winner...",True,LQF,True,LQF_B,True,True,3


In [335]:
sets_with_game_data_df['game_data'].values[0]

[{'loser_char': 'melee/fox',
  'winner_score': 1,
  'winner_id': 6126,
  'loser_id': 1009,
  'winner_char': 'melee/marth',
  'loser_score': 0,
  'stage': 'Battlefield'},
 {'loser_char': 'melee/fox',
  'winner_score': 1,
  'winner_id': 6126,
  'loser_id': 1009,
  'winner_char': 'melee/marth',
  'loser_score': 0,
  'stage': 'Pokémon Stadium'}]

In [338]:
sets_with_game_data_df['game_data'].values[:3]

array([list([{'loser_char': 'melee/fox', 'winner_score': 1, 'winner_id': 6126, 'loser_id': 1009, 'winner_char': 'melee/marth', 'loser_score': 0, 'stage': 'Battlefield'}, {'loser_char': 'melee/fox', 'winner_score': 1, 'winner_id': 6126, 'loser_id': 1009, 'winner_char': 'melee/marth', 'loser_score': 0, 'stage': 'Pokémon Stadium'}]),
       list([{'loser_char': 'melee/marth', 'winner_score': 1, 'winner_id': 1004, 'loser_id': 6126, 'winner_char': 'melee/jigglypuff', 'loser_score': 0, 'stage': 'Battlefield'}, {'loser_char': 'melee/marth', 'winner_score': 2, 'winner_id': 1004, 'loser_id': 6126, 'winner_char': 'melee/jigglypuff', 'loser_score': 0, 'stage': 'Pokémon Stadium'}]),
       list([{'loser_char': 'melee/sheik', 'winner_score': None, 'winner_id': 1028, 'loser_id': 1055, 'winner_char': 'melee/captainfalcon', 'loser_score': 0, 'stage': 'Battlefield'}, {'loser_char': 'melee/sheik', 'winner_score': None, 'winner_id': 1028, 'loser_id': 1055, 'winner_char': 'melee/captainfalcon', 'loser_sco

In [343]:
sets_with_game_data_df.head(2)

Unnamed: 0,key,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,location_names,...,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,valid_score,length_gamedata,p1_characters,p2_characters
19575,,melee,evo-2018__evo-2018-1,6126,1009,1009,6126,0,2,"[W1, Winners 1, Winners Round 1]",...,"[{'loser_char': 'melee/fox', 'winner_score': 1...",False,,False,,True,True,2,[],[]
19582,,melee,evo-2018__evo-2018-1,1004,6126,1004,6126,2,0,"[WQF, Winners Quarters, Winners Quarter-Final]",...,"[{'loser_char': 'melee/marth', 'winner_score':...",False,,False,,True,True,2,[],[]


In [356]:
tqdm.pandas()

# Function to extract unique characters played by each player in each set
def extract_characters(game_data, p1_id, p2_id):
    p1_characters = set()
    p2_characters = set()
    
    for game in game_data:
        winner_id = str(game['winner_id'])
        loser_id = str(game['loser_id'])
        if winner_id== p1_id:
            p1_characters.add(game['winner_char'].split('/')[1])
        elif loser_id == p1_id:
            p1_characters.add(game['loser_char'].split('/')[1])
       
        if winner_id == p2_id:
            p2_characters.add(game['winner_char'].split('/')[1])
        elif loser_id == p2_id:
            p2_characters.add(game['loser_char'].split('/')[1])
    # print(p1_characters)
    # Convert sets to sorted lists for consistent ordering
    return sorted(p1_characters), sorted(p2_characters)

# Apply the function to each row in the DataFrame
sets_with_game_data_df[['p1_characters', 'p2_characters']] = sets_with_game_data_df.progress_apply(
    lambda row: pd.Series(extract_characters(row['game_data'], row['p1_id'], row['p2_id'])),
    axis=1
)

# Display the first few rows to verify
sets_with_game_data_df[['p1_characters', 'p2_characters']].info()


  0%|          | 0/541085 [00:00<?, ?it/s]

AttributeError: 'NoneType' object has no attribute 'split'

In [331]:
sets_with_game_data_df['length_gamedata'].value_counts()

length_gamedata
2    299266
3    169879
4     42504
5     29436
Name: count, dtype: int64

In [357]:
tqdm.pandas()

# Function to extract unique characters played by each player in each set
def extract_characters(game_data, p1_id, p2_id):
    p1_characters = set()
    p2_characters = set()
    
    for game in game_data:
        # Convert IDs to strings for comparison
        winner_id = str(game['winner_id'])
        loser_id = str(game['loser_id'])
        
        # Check and add character for player 1
        if winner_id == p1_id and game['winner_char'] is not None:
            p1_characters.add(game['winner_char'].split('/')[1])
        elif loser_id == p1_id and game['loser_char'] is not None:
            p1_characters.add(game['loser_char'].split('/')[1])
        
        # Check and add character for player 2
        if winner_id == p2_id and game['winner_char'] is not None:
            p2_characters.add(game['winner_char'].split('/')[1])
        elif loser_id == p2_id and game['loser_char'] is not None:
            p2_characters.add(game['loser_char'].split('/')[1])
    
    # Convert sets to sorted lists for consistent ordering
    return sorted(p1_characters), sorted(p2_characters)

# Apply the function to each row in the DataFrame
sets_with_game_data_df[['p1_characters', 'p2_characters']] = sets_with_game_data_df.progress_apply(
    lambda row: pd.Series(extract_characters(row['game_data'], str(row['p1_id']), str(row['p2_id']))),
    axis=1
)

# Display the first few rows to verify
sets_with_game_data_df[['p1_characters', 'p2_characters']].info()


  0%|          | 0/541085 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 541085 entries, 19575 to 1795642
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   p1_characters  541085 non-null  object
 1   p2_characters  541085 non-null  object
dtypes: object(2)
memory usage: 12.4+ MB


In [358]:
sets_with_game_data_df[['p1_characters', 'p2_characters']].head(10)
        

Unnamed: 0,p1_characters,p2_characters
19575,[fox],[marth]
19582,[jigglypuff],[marth]
19626,[captainfalcon],[sheik]
19628,[sheik],[falco]
19629,[jigglypuff],[captainfalcon]
19630,[jigglypuff],[fox]
19631,[fox],[jigglypuff]
19632,[peach],[captainfalcon]
19633,[falco],[peach]
33211,[fox],[marth]


In [359]:
# Filter sets where players changed characters
sets_with_character_changes = sets_with_game_data_df[
    (sets_with_game_data_df['p1_characters'].apply(len) > 1) | 
    (sets_with_game_data_df['p2_characters'].apply(len) > 1)
]

# Filter sets where players did not change characters
sets_without_character_changes = sets_with_game_data_df[
    (sets_with_game_data_df['p1_characters'].apply(len) == 1) & 
    (sets_with_game_data_df['p2_characters'].apply(len) == 1)
]

In [365]:
print(sets_with_character_changes.shape)
sets_with_character_changes[['p1_characters', 'p2_characters']].head()

(95159, 24)


Unnamed: 0,p1_characters,p2_characters
36898,[falco],"[jigglypuff, kirby]"
56867,[iceclimbers],"[falco, fox]"
56868,[jigglypuff],"[fox, sheik]"
56915,[falco],"[marth, sheik]"
56916,"[iceclimbers, jigglypuff]","[marth, peach]"


In [364]:
print(sets_without_character_changes.shape)
sets_without_character_changes[['p1_characters', 'p2_characters']].head()

(438876, 24)


Unnamed: 0,p1_characters,p2_characters
19575,[fox],[marth]
19582,[jigglypuff],[marth]
19626,[captainfalcon],[sheik]
19628,[sheik],[falco]
19629,[jigglypuff],[captainfalcon]


In [366]:
sets_without_character_changes.shape[0] / sets_with_game_data_df.shape[0]

0.8111036158829019

In [368]:
sets_without_character_changes['matchup'] = sets_with_game_data_df.apply(
    lambda row: f"{row['p1_characters'][0]}/{row['p2_characters'][0]}"
    if row['p1_characters'] and row['p2_characters'] else None,
    axis=1
)
# Display the first few rows to verify
sets_without_character_changes[['p1_characters', 'p2_characters', 'matchup']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sets_without_character_changes['matchup'] = sets_with_game_data_df.apply(


Unnamed: 0,p1_characters,p2_characters,matchup
19575,[fox],[marth],fox/marth
19582,[jigglypuff],[marth],jigglypuff/marth
19626,[captainfalcon],[sheik],captainfalcon/sheik
19628,[sheik],[falco],sheik/falco
19629,[jigglypuff],[captainfalcon],jigglypuff/captainfalcon
