# Additionl columns to sets_df
We already have run jaspar_label_majors.ipynb, jaspar_top_8_tournamen_path.ipynb, and jaspar_top_8.ipynb
resulting in 
- (data_path + 'top_8_tournament_previous_sets_and_results_df') 
- (data_path + 'sets_top_8_labeled_df.pkl')

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score
from sklearn.model_selection import train_test_split  # Correct import

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

if os.path.exists('/workspace/data'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data/'
else:
    data_path = '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']


In [2]:
# Function to get the table names
def get_table_names(conn):
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    table_names = get_table_names(conn)
    dataframes = {}
    
    for table in table_names:
        # Load table into a DataFrame
        df = pd.read_sql(f"SELECT * FROM {table}", conn)
        
        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Check if any entry in the column is a valid JSON (list or dictionary)
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # Try parsing the column as JSON
                    df[col] = df[col].apply(lambda x: json.loads(x) if pd.notnull(x) else x)
                except (json.JSONDecodeError, TypeError):
                    # If it fails, skip the column
                    pass
        
        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df
        
    return dataframes

if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = + data_path + "melee_player_database.db"
    
    # Connect to the database
    conn = sqlite3.connect(path)

    # Convert each table into a DataFrame
    dfs = load_tables_to_dfs(conn)

    # Close the connection
    conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    
    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Load Data


In [None]:
# players_df = pd.read_pickle(data_path + '/labelled_data/players_df.pkl')
# players_df.head()


Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [5]:
sets_df = pd.read_pickle(data_path + '/sets_top_8_labeled_df.pkl')
# sets_df = dfs['sets_df']
print(f"{sets_df[sets_df['game_data'].apply(lambda x: len(x) > 0)].shape[0] / sets_df.shape[0]:0.01%} percent of sets have some game data")
print(sets_df.shape)
sets_df.head(3)


32.9% percent of sets have some game data
(1795681, 19)


Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,5620,Chillin,3,1,"[R1, Round 1, Round 1]",,1,A,5,[],False,,False,,False
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,2,3,"[R1, Round 1, Round 1]",,1,B,5,[],False,,False,,False
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,6126,1097,3,0,"[R1, Round 1, Round 1]",,1,C,5,[],False,,False,,False


In [7]:
tournament_info_df = pd.read_pickle(data_path + '/top_8_tournament_previous_sets_and_results_df')
print(tournament_info_df.shape)
tournament_info_df.head(3)


(30454, 36)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p1,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets
0,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,1008,Ginger,"[(32, True), (62, True), (77, False), (164, Tr...","[(39, True), (65, True), (78, False), (165, Tr...","[(47, True), (69, True), (80, False), (159, Tr...","[(40, True), (66, True), (79, True), (85, Fals...","[(28, True), (60, True), (76, True), (84, True)]","[(36, True), (64, True), (78, True), (85, True)]","[(44, True), (68, True), (80, True), (86, True)]","[(52, True), (72, True), (82, True), (87, True)]"
1,melee,httpsparagonchallongecomla_2015_melee_singles,Paragon Los Angeles 2015,challonge,https://paragon.challonge.com/la_2015_melee_si...,,15,,2015-09-06 23:45:46,2015-09-07 20:33:07,...,4465,1003,"[(193, True), (219, True), (232, False), (286,...","[(210, True), (228, True), (237, True), (241, ...","[(190, True), (218, False), (272, True), (282,...","[(188, True), (217, False), (273, True), (283,...","[(182, True), (214, True), (230, True), (238, ...","[(194, True), (220, True), (233, True), (239, ...","[(198, True), (222, True), (234, True), (240, ...","[(206, True), (226, True), (236, True), (241, ..."
2,melee,httpsdl4-5challongecomdl45meleeprobracket,DrommeLAN4.5,challonge,https://dl4-5.challonge.com/DL45meleeProBracket,,15,,2015-05-02 23:55:20,2015-05-03 04:14:15,...,4465,12870,"[(328, True), (350, True), (361, True), (366, ...","[(310, True), (341, True), (356, False), (414,...","[(314, True), (343, True), (357, True), (364, ...","[(320, True), (346, True), (359, True), (365, ...","[(308, True), (340, True), (356, True), (364, ...","[(316, True), (344, True), (358, True), (365, ...","[(324, True), (348, True), (360, True), (366, ...","[(332, True), (352, True), (362, True), (367, ..."


### Add loser_id column

In [18]:
sets_df['loser_id'] = sets_df['p1_id']
p2_lose= (sets_df['winner_id'] == sets_df['p1_id'])
sets_df.loc[p2_lose, 'loser_id'] = sets_df['p2_id']

sets_df = sets_df[['key', 'game', 'tournament_key', 'winner_id', 'loser_id', 'p1_id', 'p2_id',
       'p1_score', 'p2_score', 'location_names', 'bracket_name',
       'bracket_order', 'set_order', 'best_of', 'game_data', 'top_8',
       'top_8_location_names', 'valid_top_8_bracket',
       'top_8_bracket_location_names', 'major']]
sets_df.head()

Unnamed: 0,key,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,5620,Chillin,3,1,"[R1, Round 1, Round 1]",,1,A,5,[],False,,False,,False
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,15634,Aglet,2,3,"[R1, Round 1, Round 1]",,1,B,5,[],False,,False,,False
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,6126,1097,3,0,"[R1, Round 1, Round 1]",,1,C,5,[],False,,False,,False
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,Chu,1069,0,3,"[R1, Round 1, Round 1]",,1,D,5,[],False,,False,,False
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,"[R1, Round 1, Round 1]",,1,E,5,[],False,,False,,False


### Add valid_score column
This column will be true if the match was a best of 3 or best of 5 with one player getting the score needed to win.

In [None]:
best_of_3s = sets_df[sets_df['best_of'] == 3]
best_of_3s_p1_win = best_of_3s[(best_of_3s['p1_score'] == 2) & (best_of_3s['winner_id'] == best_of_3s['p1_id'])]
best_of_3s_valid = best_of_3s_p1_win[best_of_3s_p1_win['p2_score'].isin([0,1])]

valid_score_index = list(best_of_3s_valid.index)

best_of_3s_p2_win = best_of_3s[(best_of_3s['p2_score'] == 2) & (best_of_3s['winner_id'] == best_of_3s['p2_id'])]
best_of_3s_valid = pd.concat([best_of_3s_valid, best_of_3s_p2_win[best_of_3s_p2_win['p1_score'].isin([0,1])]])
valid_score_index = valid_score_index.append(list(best_of_3s_valid.index))


best_of_5s = sets_df[sets_df['best_of'] == 5]
best_of_5s_p1_win = best_of_5s[(best_of_5s['p1_score'] == 3) & (best_of_5s['winner_id'] == best_of_5s['p1_id'])]
best_of_5s_valid = best_of_5s_p1_win[best_of_5s_p1_win['p2_score'].isin([0,1,2])]
valid_score_index = valid_score_index.append(list(best_of_5s_valid.index))
best_of_5s_p2_win = best_of_5s[(best_of_5s['p2_score'] == 3) & (best_of_5s['winner_id'] == best_of_5s['p2_id'])]
best_of_5s_valid = pd.concat([best_of_5s_valid, best_of_5s_p2_win[best_of_5s_p2_win['p1_score'].isin([0,1,2])]])
valid_score_index = valid_score_index.append(list(best_of_5s_valid.index))

valid_score_index

# sets_df['valid_score'] = False
# sets_df.loc[valid_score, 'valid_score'] = True

# sets_df.columns

# filtered_sets_df = filtered_sets_df[filtered_sets_df['p1_score'].isin([0,1,2,3])]
# filtered_sets_df = filtered_sets_df[filtered_sets_df['p2_score'].isin([0,1,2,3])]
# filtered_sets_df = filtered_sets_df[filtered_sets_df['best_of'].isin([3,5])]

# mask = np.max(filtered_sets_df[['p1_score','p2_score']], axis = 1) == (filtered_sets_df['best_of'] // 2 + 1)

# filtered_sets_df = filtered_sets_df[mask]
# print(filtered_sets_df['p1_score'].info())
# filtered_sets_df[['p1_score','p2_score','best_of']].value_counts()

(759753, 21)
759753


TypeError: object of type 'NoneType' has no len()

In [None]:
sets_with_game_data_df = sets_df[sets_df['game_data'].apply(lambda x: x != [])].copy()
sets_with_game_data_df = sets_with_game_data_df[sets_with_game_data_df['valid_score'] == True]
sets_with_game_data_df['length_gamedata'] = sets_with_game_data_df['game_data'].apply(len)
sets_with_game_data_df = sets_with_game_data_df[sets_with_game_data_df['length_gamedata'].isin([2,3,4,5])]

In [14]:
sets_with_game_data_df.head()

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,B,3,"[{'loser_char': 'melee/fox', 'winner_score': 1...",False,,False,,True,187,2
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,I,3,"[{'loser_char': 'melee/marth', 'winner_score':...",False,,False,,True,187,2
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,G,3,"[{'loser_char': 'melee/sheik', 'winner_score':...",True,LN,True,LN_B,True,187,2
19628,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,15990,1000,15990,1000,2,0,...,B,3,"[{'loser_char': 'melee/falco', 'winner_score':...",True,WSF,True,WSF_A,True,187,2
19629,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,1028,1004,1028,2,1,...,I,3,"[{'loser_char': 'melee/captainfalcon', 'winner...",True,LQF,True,LQF_B,True,187,3


In [15]:
sets_with_game_data_df['game_data'].values[0]

[{'loser_char': 'melee/fox',
  'winner_score': 1,
  'winner_id': 6126,
  'loser_id': 1009,
  'winner_char': 'melee/marth',
  'loser_score': 0,
  'stage': 'Battlefield'},
 {'loser_char': 'melee/fox',
  'winner_score': 1,
  'winner_id': 6126,
  'loser_id': 1009,
  'winner_char': 'melee/marth',
  'loser_score': 0,
  'stage': 'Pokémon Stadium'}]

In [16]:
sets_with_game_data_df['game_data'].values[:3]

array([list([{'loser_char': 'melee/fox', 'winner_score': 1, 'winner_id': 6126, 'loser_id': 1009, 'winner_char': 'melee/marth', 'loser_score': 0, 'stage': 'Battlefield'}, {'loser_char': 'melee/fox', 'winner_score': 1, 'winner_id': 6126, 'loser_id': 1009, 'winner_char': 'melee/marth', 'loser_score': 0, 'stage': 'Pokémon Stadium'}]),
       list([{'loser_char': 'melee/marth', 'winner_score': 1, 'winner_id': 1004, 'loser_id': 6126, 'winner_char': 'melee/jigglypuff', 'loser_score': 0, 'stage': 'Battlefield'}, {'loser_char': 'melee/marth', 'winner_score': 2, 'winner_id': 1004, 'loser_id': 6126, 'winner_char': 'melee/jigglypuff', 'loser_score': 0, 'stage': 'Pokémon Stadium'}]),
       list([{'loser_char': 'melee/sheik', 'winner_score': None, 'winner_id': 1028, 'loser_id': 1055, 'winner_char': 'melee/captainfalcon', 'loser_score': 0, 'stage': 'Battlefield'}, {'loser_char': 'melee/sheik', 'winner_score': None, 'winner_id': 1028, 'loser_id': 1055, 'winner_char': 'melee/captainfalcon', 'loser_sco

In [17]:
sets_with_game_data_df.head(2)

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,B,3,"[{'loser_char': 'melee/fox', 'winner_score': 1...",False,,False,,True,187,2
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,I,3,"[{'loser_char': 'melee/marth', 'winner_score':...",False,,False,,True,187,2


In [18]:
# tqdm.pandas()

# # Function to extract unique characters played by each player in each set
# def extract_characters(game_data, p1_id, p2_id):
#     p1_characters = set()
#     p2_characters = set()
    
#     for game in game_data:
#         winner_id = str(game['winner_id'])
#         loser_id = str(game['loser_id'])
#         if winner_id== p1_id:
#             p1_characters.add(game['winner_char'].split('/')[1])
#         elif loser_id == p1_id:
#             p1_characters.add(game['loser_char'].split('/')[1])
       
#         if winner_id == p2_id:
#             p2_characters.add(game['winner_char'].split('/')[1])
#         elif loser_id == p2_id:
#             p2_characters.add(game['loser_char'].split('/')[1])
#     # print(p1_characters)
#     # Convert sets to sorted lists for consistent ordering
#     return sorted(p1_characters), sorted(p2_characters)

# # Apply the function to each row in the DataFrame
# sets_with_game_data_df[['p1_characters', 'p2_characters']] = sets_with_game_data_df.progress_apply(
#     lambda row: pd.Series(extract_characters(row['game_data'], row['p1_id'], row['p2_id'])),
#     axis=1
# )

# # Display the first few rows to verify
# sets_with_game_data_df[['p1_characters', 'p2_characters']].info()


In [19]:
sets_with_game_data_df['length_gamedata'].value_counts()

length_gamedata
2    299266
3    169879
4     42504
5     29436
Name: count, dtype: int64

In [20]:
tqdm.pandas()

# Function to extract unique characters played by each player in each set
def extract_characters(game_data, p1_id, p2_id):
    p1_characters = set()
    p2_characters = set()
    
    for game in game_data:
        # Convert IDs to strings for comparison
        winner_id = str(game['winner_id'])
        loser_id = str(game['loser_id'])
        
        # Check and add character for player 1
        if winner_id == p1_id and game['winner_char'] is not None:
            p1_characters.add(game['winner_char'].split('/')[1])
        elif loser_id == p1_id and game['loser_char'] is not None:
            p1_characters.add(game['loser_char'].split('/')[1])
        
        # Check and add character for player 2
        if winner_id == p2_id and game['winner_char'] is not None:
            p2_characters.add(game['winner_char'].split('/')[1])
        elif loser_id == p2_id and game['loser_char'] is not None:
            p2_characters.add(game['loser_char'].split('/')[1])
    
    # Convert sets to sorted lists for consistent ordering
    return sorted(p1_characters), sorted(p2_characters)

# Apply the function to each row in the DataFrame
sets_with_game_data_df[['p1_characters', 'p2_characters']] = sets_with_game_data_df.progress_apply(
    lambda row: pd.Series(extract_characters(row['game_data'], str(row['p1_id']), str(row['p2_id']))),
    axis=1
)

# Display the first few rows to verify
sets_with_game_data_df[['p1_characters', 'p2_characters']].info()


  0%|          | 0/541085 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 541085 entries, 19575 to 1795642
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   p1_characters  541085 non-null  object
 1   p2_characters  541085 non-null  object
dtypes: object(2)
memory usage: 12.4+ MB


In [21]:
sets_with_game_data_df[['p1_characters', 'p2_characters']].head(10)


Unnamed: 0,p1_characters,p2_characters
19575,[fox],[marth]
19582,[jigglypuff],[marth]
19626,[captainfalcon],[sheik]
19628,[sheik],[falco]
19629,[jigglypuff],[captainfalcon]
19630,[jigglypuff],[fox]
19631,[fox],[jigglypuff]
19632,[peach],[captainfalcon]
19633,[falco],[peach]
33211,[fox],[marth]


In [22]:
# Filter sets where players changed characters
sets_with_character_changes = sets_with_game_data_df[
    (sets_with_game_data_df['p1_characters'].apply(len) > 1) | 
    (sets_with_game_data_df['p2_characters'].apply(len) > 1)
].copy()

# Filter sets where players did not change characters
sets_without_character_changes = sets_with_game_data_df[
    (sets_with_game_data_df['p1_characters'].apply(len) == 1) & 
    (sets_with_game_data_df['p2_characters'].apply(len) == 1)
].copy()

In [23]:
print(sets_with_character_changes.shape)
print(sets_with_character_changes.shape[0] / sets_with_game_data_df.shape[0])
sets_with_character_changes[['p1_characters', 'p2_characters']].head()

(95159, 26)
0.17586700795623608


Unnamed: 0,p1_characters,p2_characters
36898,[falco],"[jigglypuff, kirby]"
56867,[iceclimbers],"[falco, fox]"
56868,[jigglypuff],"[fox, sheik]"
56915,[falco],"[marth, sheik]"
56916,"[iceclimbers, jigglypuff]","[marth, peach]"


In [24]:
print(sets_without_character_changes.shape)
print(sets_without_character_changes.shape[0] / sets_with_game_data_df.shape[0])
sets_without_character_changes[['p1_characters', 'p2_characters']].head()

(438876, 26)
0.8111036158829019


Unnamed: 0,p1_characters,p2_characters
19575,[fox],[marth]
19582,[jigglypuff],[marth]
19626,[captainfalcon],[sheik]
19628,[sheik],[falco]
19629,[jigglypuff],[captainfalcon]


In [25]:
sets_without_character_changes['matchup'] = sets_with_game_data_df.apply(
    lambda row: f"{row['p1_characters'][0]}/{row['p2_characters'][0]}"
    if row['p1_characters'] and row['p2_characters'] else None,
    axis=1
)
# Display the first few rows to verify
sets_without_character_changes[['p1_characters', 'p2_characters', 'matchup']].head()

Unnamed: 0,p1_characters,p2_characters,matchup
19575,[fox],[marth],fox/marth
19582,[jigglypuff],[marth],jigglypuff/marth
19626,[captainfalcon],[sheik],captainfalcon/sheik
19628,[sheik],[falco],sheik/falco
19629,[jigglypuff],[captainfalcon],jigglypuff/captainfalcon


In [26]:
# Step 2: Define a function to find the closest prior date index in char_vs_char_player_rankings_weekly_alt2_df
def get_prior_date_index(start_date, rankings_dates):
    # Find the last date before the tournament start date
    closest_date = rankings_dates[rankings_dates <= start_date].max()
    # Get the index position of this closest date in rankings_dates
    if pd.isnull(closest_date):
        return None  # In case there are no prior dates
    return rankings_dates.get_loc(closest_date)

# Step 3: Apply this function to each row in `sets_df` to get the index position
rankings_dates = char_vs_char_player_rankings_weekly_alt2_df.index
sets_df['ranking_date_index'] = sets_df['tournament_start_date'].progress_apply(lambda x: get_prior_date_index(x, rankings_dates))

# Display the first few rows to verify
sets_df[['tournament_start_date', 'ranking_date_index']].head()

  0%|          | 0/1795681 [00:00<?, ?it/s]

Unnamed: 0,tournament_start_date,ranking_date_index
0,2017-11-26 08:05:11,151.0
1,2017-11-26 08:05:11,151.0
2,2017-11-26 08:05:11,151.0
3,2017-11-26 08:05:11,151.0
4,2017-11-26 08:05:11,151.0


In [28]:
print(sets_without_character_changes.columns)
sets_without_character_changes.head()


Index(['key', 'game', 'tournament_key', 'tournament_start_date', 'winner_id',
       'loser_id', 'p1_id', 'p2_id', 'p1_score', 'p2_score', 'valid_score',
       'location_names', 'bracket_name', 'bracket_order', 'set_order',
       'best_of', 'game_data', 'top_8', 'top_8_location_names',
       'valid_top_8_bracket', 'top_8_bracket_location_names', 'major',
       'ranking_date_index', 'length_gamedata', 'p1_characters',
       'p2_characters', 'matchup'],
      dtype='object')


Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,[fox],[marth],fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,[jigglypuff],[marth],jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,[captainfalcon],[sheik],captainfalcon/sheik
19628,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,15990,1000,15990,1000,2,0,...,True,WSF,True,WSF_A,True,187,2,[sheik],[falco],sheik/falco
19629,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,1028,1004,1028,2,1,...,True,LQF,True,LQF_B,True,187,3,[jigglypuff],[captainfalcon],jigglypuff/captainfalcon


In [None]:
sets_without_character_changes['charcter_data'] = True
sets_with_character_changes['charcter_data'] = True
sets_without_character_changes['charcter_change'] = False
sets_with_character_changes['charcter_change'] = True
char_data_sets = pd.concat([sets_without_character_changes, sets_with_character_changes])

In [37]:
type(sets_without_character_changes.iloc[0]['p1_characters'])

list

In [39]:
sets_df['charcter_data'] = False
sets_df['charcter_change'] = None
sets_df['p1_characters'] = None
sets_df['p2_characters'] = None


In [40]:
index_list = list(char_data_sets.index)
sets_df.loc[index_list,['charcter_data', 'charcter_change', 'p1_characters', 'p2_characters']]

Unnamed: 0,charcter_data,charcter_change,p1_characters,p2_characters
19575,False,,,
19582,False,,,
19626,False,,,
19628,False,,,
19629,False,,,
...,...,...,...,...
1795496,False,,,
1795636,False,,,
1795637,False,,,
1795641,False,,,


In [41]:
sets_df.to_pickle(data_path + '/labelled_data/sets_df_3.pkl')

In [None]:
sets_without_character_changes['p1_characters'] = sets_without_character_changes['p1_characters'].apply(lambda x: x[0])
sets_without_character_changes['p2_characters'] = sets_without_character_changes['p2_characters'].apply(lambda x: x[0])

In [None]:
print(sets_without_character_changes['p1_characters'].value_counts().to_string())

p1_characters
fox               94929
falco             75292
marth             50962
sheik             45527
captainfalcon     39017
jigglypuff        26444
peach             25290
luigi             13324
samus             12254
ganondorf          8644
iceclimbers        7218
drmario            5921
yoshi              5067
pikachu            4783
link               2640
mario              2547
mrgameandwatch     2513
donkeykong         2265
roy                2197
zelda              2062
kirby              1948
ness               1807
younglink          1737
pichu              1213
random             1145
bowser             1120
mewtwo             1010


In [30]:
# sets_without_character_changes.to_pickle(data_path + '/labelled_data/sets_without_character_changes_df.pkl')

In [31]:
# num_pop = 6
# popular_characters =sets_without_character_changes['p1_characters'].value_counts().head(num_pop).index.values
# popular_matchup_sets_df = sets_without_character_changes[sets_without_character_changes['p1_characters'].isin(popular_characters) & sets_without_character_changes['p2_characters'].isin(popular_characters)].copy()
# print(popular_matchup_sets_df['matchup'].value_counts().to_string())
popular_matchup_sets_df = sets_without_character_changes



In [32]:
print(popular_matchup_sets_df.columns)

print(popular_matchup_sets_df['matchup'].value_counts().to_string())

Index(['key', 'game', 'tournament_key', 'tournament_start_date', 'winner_id',
       'loser_id', 'p1_id', 'p2_id', 'p1_score', 'p2_score', 'valid_score',
       'location_names', 'bracket_name', 'bracket_order', 'set_order',
       'best_of', 'game_data', 'top_8', 'top_8_location_names',
       'valid_top_8_bracket', 'top_8_bracket_location_names', 'major',
       'ranking_date_index', 'length_gamedata', 'p1_characters',
       'p2_characters', 'matchup'],
      dtype='object')
matchup
fox/fox                          17883
fox/falco                        17512
falco/fox                        15753
falco/falco                      14010
fox/marth                        12011
marth/fox                        10478
falco/marth                       9911
sheik/fox                         9803
fox/sheik                         9650
marth/falco                       9648
sheik/falco                       8164
fox/captainfalcon                 8090
captainfalcon/fox                 7596
fa

In [33]:
popular_matchup_sets_df.head()

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik
19628,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,15990,1000,15990,1000,2,0,...,True,WSF,True,WSF_A,True,187,2,sheik,falco,sheik/falco
19629,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,1028,1004,1028,2,1,...,True,LQF,True,LQF_B,True,187,3,jigglypuff,captainfalcon,jigglypuff/captainfalcon


In [34]:
popular_matchup_sets_df = popular_matchup_sets_df.dropna(subset=['ranking_date_index'])

popular_matchup_sets_df['ranking_date_index'].info()

<class 'pandas.core.series.Series'>
Index: 438869 entries, 19575 to 1795639
Series name: ranking_date_index
Non-Null Count   Dtype
--------------   -----
438869 non-null  Int64
dtypes: Int64(1)
memory usage: 7.1 MB


In [35]:
print(popular_matchup_sets_df.columns)
add_columns = popular_matchup_sets_df['matchup'].unique()

Index(['key', 'game', 'tournament_key', 'tournament_start_date', 'winner_id',
       'loser_id', 'p1_id', 'p2_id', 'p1_score', 'p2_score', 'valid_score',
       'location_names', 'bracket_name', 'bracket_order', 'set_order',
       'best_of', 'game_data', 'top_8', 'top_8_location_names',
       'valid_top_8_bracket', 'top_8_bracket_location_names', 'major',
       'ranking_date_index', 'length_gamedata', 'p1_characters',
       'p2_characters', 'matchup'],
      dtype='object')


In [39]:
popular_matchup_sets_df=popular_matchup_sets_df[(popular_matchup_sets_df['p1_characters'] != 'random') & (popular_matchup_sets_df['p2_characters'] != 'random')]

In [44]:
# from pandarallel import pandarallel

# Make a copy of the selected columns
p1_alt_2 = popular_matchup_sets_df[['ranking_date_index', 'p1_id']].copy()
# p1_alt_2['row'] = p1_alt_2['ranking_date_index']
# p1_alt_2 = p1_alt_2[['row', 'p1_id']]

# Get unique values from 'matchup' and use them as new column names
add_columns = popular_matchup_sets_df['matchup'].unique()

# Initialize these columns with the value 1500.0
for col in add_columns:
    p1_alt_2[col] = 1500.0

# Create a DataFrame to hold all the values we want to fill in p1_alt_2
matchup_df = p1_alt_2[['ranking_date_index', 'p1_id']].copy()

# Prepare matchup columns as a MultiIndex for efficient lookup in vectorized operations
matchup_columns = [col.split('/') for col in add_columns]
matchup_index = pd.MultiIndex.from_tuples(matchup_columns, names=["player_char", "opponent_char"])
matchup_values = pd.DataFrame(1500.0, index=matchup_df.index, columns=matchup_index)  # Start with 1500.0 as default

# Define a wrapper for get_alt2 that can be used in apply with a single row
def get_matchup_values(row):
    player_id = str(row['p1_id'])
    row_number = row['ranking_date_index']
    
    values = []
    for player_char, opponent_char in matchup_columns:
        value = get_alt2(player_id, player_char, opponent_char, row_number)
        values.append(value)
        
    return values

# # Initialize pandarallel
# pandarallel.initialize(nb_workers=18, progress_bar=True)

# Run apply in parallel
matchup_values = matchup_df.progress_apply(get_matchup_values, axis=1, result_type="expand")
matchup_values.columns = add_columns  # Restore original matchup column names

# Merge these values back into p1_alt_2
p1_alt_2.update(matchup_values)


  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0
  p1_alt_2[col] = 1500.0


  0%|          | 0/437369 [00:00<?, ?it/s]

In [45]:
# Make a copy of the selected columns
p2_alt_2 = popular_matchup_sets_df[['ranking_date_index', 'p2_id']].copy()
# p1_alt_2['row'] = p1_alt_2['ranking_date_index']
# p1_alt_2 = p1_alt_2[['row', 'p1_id']]

# Get unique values from 'matchup' and use them as new column names
add_columns = popular_matchup_sets_df['matchup'].unique()

# Initialize these columns with the value 1500.0
for col in add_columns:
    p2_alt_2[col] = 1500.0

# Create a DataFrame to hold all the values we want to fill in p1_alt_2
matchup_df = p2_alt_2[['ranking_date_index', 'p2_id']].copy()

# Prepare matchup columns as a MultiIndex for efficient lookup in vectorized operations
matchup_columns = [col.split('/') for col in add_columns]
matchup_index = pd.MultiIndex.from_tuples(matchup_columns, names=["player_char", "opponent_char"])
matchup_values = pd.DataFrame(1500.0, index=matchup_df.index, columns=matchup_index)  # Start with 1500.0 as default

# Define a wrapper for get_alt2 that can be used in apply with a single row
def get_matchup_values(row):
    player_id = str(row['p2_id'])
    row_number = row['ranking_date_index']
    
    values = []
    for player_char, opponent_char in matchup_columns:
        value = get_alt2(player_id, player_char, opponent_char, row_number)
        values.append(value)
        
    return values

# Run apply in parallel
matchup_values = matchup_df.progress_apply(get_matchup_values, axis=1, result_type="expand")
matchup_values.columns = add_columns  # Restore original matchup column names

# Merge these values back into p1_alt_2
p2_alt_2.update(matchup_values)

  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0
  p2_alt_2[col] = 1500.0


  0%|          | 0/437369 [00:00<?, ?it/s]

In [46]:
# Make a copy of the selected columns
p1_alt_2_rds = popular_matchup_sets_df[['ranking_date_index', 'p1_id']].copy()
# p1_alt_2['row'] = p1_alt_2['ranking_date_index']
# p1_alt_2 = p1_alt_2[['row', 'p1_id']]

# Get unique values from 'matchup' and use them as new column names
add_columns = popular_matchup_sets_df['matchup'].unique()

# Initialize these columns with the value 1500.0
for col in add_columns:
    p1_alt_2_rds[col] = 350.0

# Create a DataFrame to hold all the values we want to fill in p1_alt_2
matchup_df = p1_alt_2_rds[['ranking_date_index', 'p1_id']].copy()

# Prepare matchup columns as a MultiIndex for efficient lookup in vectorized operations
matchup_columns = [col.split('/') for col in add_columns]
matchup_index = pd.MultiIndex.from_tuples(matchup_columns, names=["player_char", "opponent_char"])
matchup_values = pd.DataFrame(350.0, index=matchup_df.index, columns=matchup_index)  # Start with 1500.0 as default

# Define a wrapper for get_alt2 that can be used in apply with a single row
def get_matchup_values(row):
    player_id = str(row['p1_id'])
    row_number = row['ranking_date_index']
    
    values = []
    for player_char, opponent_char in matchup_columns:
        value = get_alt2_rds(player_id, player_char, opponent_char, row_number)
        values.append(value)
        
    return values

# Run apply in parallel
matchup_values = matchup_df.progress_apply(get_matchup_values, axis=1, result_type="expand")
matchup_values.columns = add_columns  # Restore original matchup column names

# Merge these values back into p1_alt_2
p1_alt_2_rds.update(matchup_values)

  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] = 350.0
  p1_alt_2_rds[col] 

  0%|          | 0/437369 [00:00<?, ?it/s]

In [47]:
# Make a copy of the selected columns
p2_alt_2_rds = popular_matchup_sets_df[['ranking_date_index', 'p2_id']].copy()
# p1_alt_2['row'] = p1_alt_2['ranking_date_index']
# p1_alt_2 = p1_alt_2[['row', 'p1_id']]

# Get unique values from 'matchup' and use them as new column names
add_columns = popular_matchup_sets_df['matchup'].unique()

# Initialize these columns with the value 1500.0
for col in add_columns:
    p2_alt_2_rds[col] = 350.0

# Create a DataFrame to hold all the values we want to fill in p1_alt_2
matchup_df = p2_alt_2_rds[['ranking_date_index', 'p2_id']].copy()

# Prepare matchup columns as a MultiIndex for efficient lookup in vectorized operations
matchup_columns = [col.split('/') for col in add_columns]
matchup_index = pd.MultiIndex.from_tuples(matchup_columns, names=["player_char", "opponent_char"])
matchup_values = pd.DataFrame(350.0, index=matchup_df.index, columns=matchup_index)  # Start with 1500.0 as default

# Define a wrapper for get_alt2 that can be used in apply with a single row
def get_matchup_values(row):
    player_id = str(row['p2_id'])
    row_number = row['ranking_date_index']
    
    values = []
    for player_char, opponent_char in matchup_columns:
        value = get_alt2_rds(player_id, player_char, opponent_char, row_number)
        values.append(value)
        
    return values

# Run apply in parallel
matchup_values = matchup_df.progress_apply(get_matchup_values, axis=1, result_type="expand")
matchup_values.columns = add_columns  # Restore original matchup column names

# Merge these values back into p1_alt_2
p2_alt_2_rds.update(matchup_values)

  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] = 350.0
  p2_alt_2_rds[col] 

  0%|          | 0/437369 [00:00<?, ?it/s]

In [None]:
dataset_mini =  pd.read_pickle(data_path + 'dataset_mini.pkl')

In [50]:
dataset_mini.columns

Index(['p1_elo', 'p2_elo', 'p1_rd', 'p2_rd', 'p1_updates', 'p2_updates',
       'p1_m1_usage', 'p1_m2_usage', 'p2_m1_usage', 'p2_m2_usage',
       'p1/m1/m1_elo', 'p1/m1/m1_rd', 'p1/m1/m1_updates', 'p1/m1/m2_elo',
       'p1/m1/m2_rd', 'p1/m1/m2_updates', 'p1/m2/m1_elo', 'p1/m2/m1_rd',
       'p1/m2/m1_updates', 'p1/m2/m2_elo', 'p1/m2/m2_rd', 'p1/m2/m2_updates',
       'p2/m1/m1_elo', 'p2/m1/m1_rd', 'p2/m1/m1_updates', 'p2/m1/m2_elo',
       'p2/m1/m2_rd', 'p2/m1/m2_updates', 'p2/m2/m1_elo', 'p2/m2/m1_rd',
       'p2/m2/m1_updates', 'p2/m2/m2_elo', 'p2/m2/m2_rd', 'p2/m2/m2_updates',
       'p1/m1/m1_alt_elo', 'p1/m1/m1_alt_updates', 'p1/m1/m2_alt_elo',
       'p1/m1/m2_alt_updates', 'p1/m2/m1_alt_elo', 'p1/m2/m1_alt_updates',
       'p1/m2/m2_alt_elo', 'p1/m2/m2_alt_updates', 'p2/m1/m1_alt_elo',
       'p2/m1/m1_alt_updates', 'p2/m1/m2_alt_elo', 'p2/m1/m2_alt_updates',
       'p2/m2/m1_alt_elo', 'p2/m2/m1_alt_updates', 'p2/m2/m2_alt_elo',
       'p2/m2/m2_alt_updates', 'p2/m1/m1_alt_re

In [51]:
popular_matchup_sets_df.head()

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik
19628,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,15990,1000,15990,1000,2,0,...,True,WSF,True,WSF_A,True,187,2,sheik,falco,sheik/falco
19629,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,1028,1004,1028,2,1,...,True,LQF,True,LQF_B,True,187,3,jigglypuff,captainfalcon,jigglypuff/captainfalcon


In [52]:
# List of indices from popular_matchup_sets_df that are not in dataset_mini
valid_indicies = [index for index in popular_matchup_sets_df.index if index in dataset_mini.index]


In [53]:
popular_characters_overall_elos = dataset_mini.loc[valid_indicies][['p1_elo', 'p2_elo', 'p1_rd', 'p2_rd']]

In [54]:
pop_sets = popular_matchup_sets_df.loc[valid_indicies]
p1_alt_2_valid = p1_alt_2.loc[valid_indicies]
p2_alt_2_valid = p2_alt_2.loc[valid_indicies]
p1_alt_2_rds_valid = p1_alt_2_rds.loc[valid_indicies]
p2_alt_2_rds_valid = p2_alt_2_rds.loc[valid_indicies]

In [55]:
# x = 

In [56]:
path = data_path + 'predict_matchup_dataset/'
popular_characters_overall_elos.to_pickle(path + 'overall_elos.pkl')
pop_sets.to_pickle(path + 'matchup_sets_df.pkl')
p1_alt_2_valid.to_pickle(path + 'p1_alt_2.pkl')
p2_alt_2_valid.to_pickle(path + 'p2_alt_2.pkl')
p1_alt_2_rds_valid.to_pickle(path + 'p1_alt_2_rds.pkl')
p2_alt_2_rds_valid.to_pickle(path + 'p2_alt_2_rds.pkl')

In [None]:
# Initialize the XGBoost classifier
classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_estimators=100, tree_method='hist', max_depth=7)

# Train the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
cohen_kappa= cohen_kappa_score(y_pred, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
print(f"Cohen Kappa Score: {cohen_kappa:.2f}")
# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



KeyboardInterrupt: 

In [None]:


# Convert data to numpy arrays and concatenate
p1_alt_2_np = p1_alt_2_valid.to_numpy()
p2_alt_2_np = p2_alt_2_valid.to_numpy()
alt2_dataset = np.concatenate((p1_alt_2_np, p2_alt_2_np), axis=1)

# Convert labels to dummy variables
labels = pd.get_dummies(pop_sets['matchup'])  # Assuming pop_sets was a typo


# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(alt2_dataset, labels, test_size=0.2, random_state=42)  # 80% train, 20% test


In [None]:
# Initialize the XGBoost classifier
classifier = XGBClassifier(eval_metric='mlogloss', random_state=42, n_estimators=100, tree_method='hist', max_depth=7)

# Train the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
cohen_kappa= cohen_kappa_score(y_pred, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
print(f"Cohen Kappa Score: {cohen_kappa:.2f}")
# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.60
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.59      0.71       683
           1       0.88      0.61      0.72      1389
           2       0.87      0.64      0.74      1496
           3       0.91      0.62      0.74       474
           4       0.89      0.61      0.72       976
           5       0.90      0.65      0.75       803
           6       0.87      0.60      0.71      1367
           7       0.85      0.60      0.70      2689
           8       0.84      0.61      0.71      3090
           9       0.86      0.63      0.73       795
          10       0.88      0.61      0.72      2033
          11       0.88      0.59      0.71      1579
          12       0.88      0.62      0.72      1610
          13       0.85      0.64      0.73      3588
          14       0.85      0.63      0.72      3507
          15       0.86      0.64      0.74      1468
          16       0.84      0.60     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import train_test_split  # Correct import

# Convert data to numpy arrays and concatenate
p1_alt_2_np = p1_alt_2_valid.to_numpy()
p2_alt_2_np = p2_alt_2_valid.to_numpy()
p1_alt_2_rds_np = p1_alt_2_rds_valid.to_numpy()
p2_alt_2_rds_np = p2_alt_2_rds_valid.to_numpy()
alt2_dataset = np.concatenate((p1_alt_2_np, p2_alt_2_np, p1_alt_2_rds_np, p2_alt_2_rds_np), axis=1)

# Convert labels to dummy variables
labels = pd.get_dummies(pop_sets['matchup'])  # Assuming pop_sets was a typo


# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(alt2_dataset, labels, test_size=0.2, random_state=42)  # 80% train, 20% test


In [None]:
# Initialize the XGBoost classifier
classifier = XGBClassifier(eval_metric='mlogloss', random_state=42, n_estimators=100, tree_method='hist', max_depth=7)

# Train the classifier on the training data
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
cohen_kappa= cohen_kappa_score(y_pred, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
print(f"Cohen Kappa Score: {cohen_kappa:.2f}")
# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.64
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.62      0.73       683
           1       0.88      0.65      0.75      1389
           2       0.87      0.68      0.77      1496
           3       0.91      0.67      0.77       474
           4       0.89      0.64      0.74       976
           5       0.89      0.65      0.75       803
           6       0.88      0.65      0.75      1367
           7       0.85      0.64      0.73      2689
           8       0.85      0.66      0.74      3090
           9       0.85      0.68      0.76       795
          10       0.87      0.64      0.74      2033
          11       0.87      0.61      0.72      1579
          12       0.89      0.66      0.75      1610
          13       0.85      0.67      0.75      3588
          14       0.85      0.67      0.75      3507
          15       0.87      0.68      0.77      1468
          16       0.86      0.64     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
