In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict

import sqlite3
import sys
import time

## Loading database

Due to Jaspar's code for loading the database and handling data types being a bit more robust than mine, I have decided to start this second file for data exploration from scratch, copying over the database loading code. Currently, it is used for generating a heatmap of character vs character win percentages, but might be used for more exploration later.

Next couple cells are originally Jaspar's - see his data exploration file for some more comments on it.


In [2]:
# Function to get the table names
def get_table_names(conn):
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    table_names = get_table_names(conn)
    dataframes = {}
    
    for table in table_names:
        # Load table into a DataFrame
        df = pd.read_sql(f"SELECT * FROM {table}", conn)
        
        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Check if any entry in the column is a valid JSON (list or dictionary)
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # Try parsing the column as JSON
                    df[col] = df[col].apply(lambda x: json.loads(x) if pd.notnull(x) else x)
                except (json.JSONDecodeError, TypeError):
                    # If it fails, skip the column
                    pass
        
        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df
        
    return dataframes

# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = "../data/melee_player_database.db"
    
    # Connect to the database
    conn = sqlite3.connect(path)

    # Convert each table into a DataFrame
    dfs = load_tables_to_dfs(conn)

    # Close the connection
    conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    
    # Set the flag to indicate that the cell has been run
    cell_has_run = True

In [3]:
dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int) 

In [4]:
players_df = dfs['players_df']
ranking_df = dfs['ranking_df']
ranking_seasons_df = dfs['ranking_seasons_df']
sets_df = dfs['sets_df']
tournament_info_df = dfs['tournament_info_df']

In [5]:
# Might not be best practice, but seems reasonably fast
game_data = []

for data in sets_df['game_data']:
    game_data += data
    
game_data_df = pd.DataFrame(game_data)

# melee/character -> character
# Dunno why these aren't formatted as strings by default. I'll look into that later.
game_data_df['loser_char'] = game_data_df['loser_char'].apply(lambda x: str(x).replace('melee/', ''))
game_data_df['winner_char'] = game_data_df['winner_char'].apply(lambda x: str(x).replace('melee/', ''))

# Data cleanup. Not 100% sure from where, but 'None' (as a string) shows up as a character sometimes.
# Let's just remove it here.
num_invalid = len(game_data_df[(game_data_df['loser_char'] == 'None') & (game_data_df['winner_char'] == 'None')].index)
num_total = len(game_data_df.index)
print("About {0:.2f}% of the data is invalid. Removing it.".format(100.0 * num_invalid / num_total))

game_data_df = game_data_df[(game_data_df['loser_char'] != 'None') & (game_data_df['winner_char'] != 'None')]

game_data_df

About 1.20% of the data is invalid. Removing it.


Unnamed: 0,loser_char,winner_score,winner_id,loser_id,winner_char,loser_score,stage
0,peach,1.0,1004,1032,jigglypuff,0.0,Battlefield
1,peach,2.0,1004,1032,jigglypuff,0.0,Yoshi's Story
2,peach,2.0,1004,1032,jigglypuff,0.0,Yoshi's Story
3,peach,1.0,1017,1039,captainfalcon,0.0,Dream Land
4,peach,2.0,1017,1039,captainfalcon,0.0,
...,...,...,...,...,...,...,...
1609192,marth,,2791218,908884,sheik,0.0,
1609193,marth,,2791218,908884,sheik,0.0,
1609194,fox,,180567,2791218,falco,0.0,
1609195,sheik,,180567,2791218,fox,0.0,


In [11]:
# Also slightly janky. Compute total character vs character wins.
# Need a dummy column in there initially (winner_id) to get the .count() to work properly
char_vs_char_totals_df = game_data_df[['winner_char', 'loser_char', 'winner_id']].groupby(by=['winner_char', 'loser_char']).count().unstack()

# Remove that leftover winner_id, which is the top level column after the .unstack()
char_vs_char_totals_df.columns = char_vs_char_totals_df.columns.droplevel()

# Rows and columns seem to be ordered alphabetically by default
# We can order by number of wins with that character instead
# (should be roughly equivalent to character popularity)
wins_df = char_vs_char_totals_df.sum(axis=1)
wins_df = wins_df.sort_values(ascending=False)

# Reorder rows and columns, respectively
char_vs_char_totals_df = char_vs_char_totals_df.reindex(wins_df.index)
char_vs_char_totals_df = char_vs_char_totals_df[wins_df.index]

# Compute win percentages
# Convert to float prematurely so we don't get errors about setting incompatible data types
char_vs_char_rates_df = char_vs_char_totals_df.copy().astype(float)
characters = list(char_vs_char_rates_df.index)

for char_1 in characters:
    for char_2 in characters:
        wins   = char_vs_char_totals_df.loc[char_1, char_2]
        losses = char_vs_char_totals_df.loc[char_2, char_1]

        char_vs_char_rates_df.loc[char_1, char_2] = 100 * wins / (wins + losses)

# For display purposes only, remove the absurd amount of decimals
char_vs_char_rates_df.astype(int)

loser_char,bowser,captainfalcon,donkeykong,drmario,falco,fox,ganondorf,iceclimbers,jigglypuff,kirby,...,peach,pichu,pikachu,random,roy,samus,sheik,yoshi,younglink,zelda
winner_char,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fox,65,52,53,57,51,50,53,46,51,67,...,49,61,53,54,54,48,52,56,60,55
falco,60,50,51,55,50,48,50,42,46,56,...,43,55,52,50,53,46,48,55,50,52
marth,58,48,43,50,50,48,43,45,47,52,...,48,53,50,49,53,46,44,44,55,47
sheik,63,51,53,59,51,47,54,46,52,57,...,50,58,53,51,55,54,50,56,56,59
captainfalcon,61,50,51,62,49,47,52,50,50,53,...,50,51,51,50,57,52,48,59,58,59
jigglypuff,55,49,48,51,53,48,49,45,50,55,...,46,53,53,50,57,50,47,53,52,41
peach,64,49,58,58,56,50,50,59,53,71,...,50,68,56,54,65,52,49,63,56,50
luigi,56,45,50,57,49,49,40,51,52,52,...,46,62,54,57,58,45,48,52,52,46
samus,63,47,48,53,53,51,43,54,49,62,...,47,66,54,51,57,50,45,52,50,49
ganondorf,63,47,47,63,49,46,50,56,50,57,...,49,69,52,49,56,56,45,58,56,64
