In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict

import sqlite3
import sys
import time

import seaborn as sns

import re
import hashlib

## Loading database

Jaspar's code for loading the database and handling data types is a bit more robust than mine, so I have started this second data exploration file from scratch and copied his database loading code over. Currently, the notebook is used for generating a heatmap of character vs. character win percentages, but it might be used for more exploration later.

The next couple of cells are mostly Jaspar's original code - see his data exploration file for more comments on it.


In [2]:
# Function to get the table names
def get_table_names(conn):
    """Return the names of all tables listed in the database's sqlite_master catalog.

    conn: an open database connection that pandas.read_sql can query.
    Returns a plain Python list with one table name per entry.
    """
    catalog = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    return catalog['name'].tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    """Load every table of the database into a DataFrame.

    conn: an open database connection that pandas.read_sql can query.

    Returns a dict mapping '<table name>_df' to the DataFrame holding that table.
    Columns whose entries are all strings and all parse as JSON are replaced by
    their decoded values. Note that any valid JSON is accepted, not just lists and
    dictionaries, so a column made up purely of numeric strings would be decoded
    to numbers as well. A column is left untouched if a single entry fails to parse.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the identifier (doubling any embedded double quotes) so that table
        # names containing spaces, punctuation or SQL keywords still load.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        for col in df.columns:
            # Only columns consisting entirely of strings are candidates for JSON decoding
            if not df[col].apply(lambda x: isinstance(x, str)).all():
                continue

            try:
                # The assignment only happens if every entry parsed successfully
                df[col] = df[col].apply(json.loads)
            except (json.JSONDecodeError, TypeError):
                # Not a JSON column after all; keep the raw strings
                pass

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Check if the flag variable exists in the global scope so that this code does not run twice
# (re-running would reload the database and apply the timestamp conversion below again)
if 'cell_has_run' not in globals():
    path = "../data/melee_player_database.db"

    # Connect to the database
    conn = sqlite3.connect(path)

    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Close the connection even if loading one of the tables fails
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # The tournament start/end columns are interpreted as timestamps in seconds (unit='s')
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

In [3]:
# Missing best_of values are replaced by 0 so the column can be cast to plain integers
sets_table = dfs['sets_df']
sets_table['best_of'] = sets_table['best_of'].fillna(0).astype(int)

In [4]:
# Give each loaded table a short module-level name
players_df, ranking_df, ranking_seasons_df, sets_df, tournament_info_df = (
    dfs[table_key]
    for table_key in (
        'players_df',
        'ranking_df',
        'ranking_seasons_df',
        'sets_df',
        'tournament_info_df',
    )
)

## Computing the character win rates

Ideally, for each individual player, we would have statistics on how well they perform with a given character against an opponent playing another given character. However, we might not have enough data for that in general, so it might be necessary to fall back on the global character vs. character win rates. Here, we compute those rates.

In [None]:
# Flatten the per-set lists of games into one long list of game records
game_data = [game for games in sets_df['game_data'] for game in games]

game_data_df = pd.DataFrame(game_data)

# melee/character -> character
# str() is applied first because the values are not guaranteed to be strings.
# Note that str(None) is the literal string 'None' - presumably this is where the
# 'None' characters handled below come from (TODO: confirm against the raw data).
game_data_df['loser_char'] = game_data_df['loser_char'].apply(lambda x: str(x).replace('melee/', ''))
game_data_df['winner_char'] = game_data_df['winner_char'].apply(lambda x: str(x).replace('melee/', ''))

# Data cleanup: a game is invalid if EITHER character is 'None'.
# The same mask is used for the reported percentage and for the removal, so the
# printed number matches what is actually dropped.
invalid_mask = (game_data_df['loser_char'] == 'None') | (game_data_df['winner_char'] == 'None')
num_invalid = int(invalid_mask.sum())
num_total = len(game_data_df.index)
print("About {0:.2f}% of the data is invalid. Removing it.".format(100.0 * num_invalid / num_total))

game_data_df = game_data_df[~invalid_mask]

game_data_df

In [None]:
# Count the games for every (winner, loser) character pair.
# .size() counts rows directly, so no dummy column is needed, and fill_value=0
# records pairings that never occurred as 0 wins instead of NaN.
char_vs_char_totals_df = (
    game_data_df
    .groupby(by=['winner_char', 'loser_char'])
    .size()
    .unstack(fill_value=0)
)

# Make the table square: a character that only ever won (or only ever lost)
# would otherwise be missing from the columns (or rows) and break the lookups below.
all_chars = sorted(set(char_vs_char_totals_df.index) | set(char_vs_char_totals_df.columns))
char_vs_char_totals_df = char_vs_char_totals_df.reindex(index=all_chars, columns=all_chars, fill_value=0)

# Order by number of wins with that character instead of alphabetically
# (should be roughly equivalent to character popularity).
# A stable sort keeps ties in alphabetical order, so the result is reproducible.
wins_df = char_vs_char_totals_df.sum(axis=1)
wins_df = wins_df.sort_values(ascending=False, kind='mergesort')

# Reorder rows and columns identically
characters = list(wins_df.index)
char_vs_char_totals_df = (
    char_vs_char_totals_df
    .reindex(index=characters, columns=characters)
    .rename_axis(index='winner_char', columns='loser_char')
)

# Compute win percentages: entry [a, b] is 100 * (a's wins over b) / (all games between a and b).
# The transpose holds the losses, so the whole table is computed in one array operation.
# Pairings with no games at all stay NaN rather than dividing by zero.
win_counts = char_vs_char_totals_df.to_numpy(dtype=float)
games_played = win_counts + win_counts.T
win_rates = np.divide(
    100 * win_counts,
    games_played,
    out=np.full_like(win_counts, np.nan),
    where=games_played > 0,
)
char_vs_char_rates_df = pd.DataFrame(
    win_rates,
    index=char_vs_char_totals_df.index,
    columns=char_vs_char_totals_df.columns,
)

# For display purposes only, remove the absurd amount of decimals.
# The nullable 'Int64' dtype is used because plain int cannot represent the NaN
# entries of pairings that were never played.
char_vs_char_rates_df.round().astype('Int64')

## Create a heatmap

Here, we create a heatmap instead of just a table of numerical values.

In [None]:
# We can limit ourselves to the n most popular characters, to avoid having it be too massive
max_chars = 30

# Rows and columns were reordered identically when the table was built, so a single
# positional slice selects the top-left max_chars x max_chars corner
limited_df = char_vs_char_rates_df.iloc[:max_chars, :max_chars]

# An explicit, larger figure keeps the tick labels of up to 30 characters readable
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(limited_df, cmap='RdYlGn', center=50, xticklabels=True, yticklabels=True, ax=ax)

# Each cell is the row character's win percentage against the column character
ax.set_title('Character vs. character win rate (%)')
ax.set_xlabel('Opponent character')
ax.set_ylabel('Character')
plt.show()

In [None]:
# The same tag can appear on several rows, i.e. duplicate player entries exist
is_hungrybox = players_df['tag'].eq('Hungrybox')
players_df.loc[is_hungrybox]

In [None]:
# At the end of the day, we will have to choose mappings for duplicate ids
# They will be stored here (old player_id -> canonical player_id).
id_mapping = {}

def merge_one_tag_groupby(data):
    """Collapse all rows of one player group into a single merged record.

    data: DataFrame with the columns game, player_id, tag, all_tags, prefixes,
          social, country, state, region, c_country, c_state, c_region, placings,
          characters and alias. The rows are expected to describe the same player
          (e.g. one group of players_df grouped by tag).

    Returns a pandas Series with those same fields.

    Side effect: every player_id in `data` (including the chosen one) is recorded
    in the module-level `id_mapping`, mapped to the canonical id of the group.
    """
    first_row = data.iloc[0]

    # Let's make this generalizable to games other than melee
    game = first_row['game']

    # Pick one id to be the definitive one.
    # To keep things "canonical", if an ID is not numeric, pick the first such ID.
    # Otherwise, pick the one with the smallest numeric value.
    player_ids = list(data['player_id'])
    player_id = next(
        (pid for pid in player_ids if re.fullmatch(r'\d+', pid) is None),
        None,
    )
    if player_id is None:
        # min(..., key=int) returns the original string, so the canonical id is
        # always one that really exists (str(int('007')) would give '7').
        player_id = min(player_ids, key=int)

    # Mapping old ids -> new id (the definitive one maps to itself)
    for pid in player_ids:
        id_mapping[pid] = player_id

    # These should all be the same anyways, if we grouped by tag
    tag = first_row['tag']

    # This function may be called again later on frames that do contain several
    # tags, so all of them are remembered. dict.fromkeys removes duplicates while
    # keeping first-seen order, which (unlike a set) is the same on every run.
    all_tags = list(dict.fromkeys(t for tag_list in data['all_tags'] for t in tag_list))

    # Might as well take all unique prefixes (same order-preserving de-duplication)
    prefixes = list(dict.fromkeys(p for prefix_list in data['prefixes'] for p in prefix_list))

    # Combine all socials into one. The first value seen for a key wins, except
    # that an empty list is replaced by a later value: we would rather throw away
    # a blank entry than one with info.
    # TODO: Maybe actually combine lists together instead of throwing info away?
    social = {}
    for social_dict in data['social']:
        for key, value in social_dict.items():
            if key not in social or social[key] == []:
                social[key] = value

    # These entries might be correlated and we can't pick the first "non-None" entry for each
    # as those might correspond to different rows. Take all of them from the first row.
    country   = first_row['country']
    state     = first_row['state']
    region    = first_row['region']
    c_country = first_row['c_country']
    c_state   = first_row['c_state']
    c_region  = first_row['c_region']

    # I'm gonna assume there are no duplicates among placings for duplicate tags
    placings = [placing for placing_list in data['placings'] for placing in placing_list]

    # There actually might be duplicates among the character list though
    # and we will need to compute totals among all of them
    characters = {}
    for character_dict in data['characters']:
        for name, count in character_dict.items():
            if name in characters:
                characters[name] += count
            else:
                characters[name] = count

    # First alias that is neither None nor the string 'None', if there is one
    aliases = [x for x in data['alias'] if x is not None and x != 'None']
    alias = aliases[0] if len(aliases) > 0 else None

    return pd.Series({'game': game,
                      'player_id': player_id,
                      'tag': tag,
                      'all_tags': all_tags,
                      'prefixes': prefixes,
                      'social': social,
                      'country': country,
                      'state': state,
                      'region': region,
                      'c_country': c_country,
                      'c_state': c_state,
                      'c_region': c_region,
                      'placings': placings,
                      'characters': characters,
                      'alias': alias})

# Merge every group of rows sharing a tag into a single row.
# The groups are iterated explicitly instead of going through groupby(...).apply(...):
# apply() is deprecated for functions that operate on the grouping column (and
# merge_one_tag_groupby reads 'tag'), which is what the include_groups warning is about.
# Iterating over the groupby yields the full sub-frames, grouping column included.
merged_df = pd.DataFrame(
    [merge_one_tag_groupby(group) for _, group in players_df.groupby('tag')]
)
# Keep a plain numeric index
merged_df = merged_df.reset_index(drop=True)

In [None]:
# Toggle for the sanity check (find_common_sets) that is run at the end of this cell.
# The comparison takes a few minutes; set this to False to skip it.
PERFORM_COMMON_SETS_COMPARISON = True

# Comparing integers instead of strings speeds things up DRAMATICALLY
def str_checksum(s):
    """Map a string to a deterministic non-negative integer below 2**63 - 1.

    The string is UTF-8 encoded and hashed with SHA-256; the digest, read as one
    big-endian integer, is reduced modulo 9223372036854775807 (2**63 - 1).
    """
    digest = hashlib.sha256(s.encode('utf-8')).digest()
    return int.from_bytes(digest, 'big') % (2**63 - 1)

def find_common_sets(sets=None, mapping=None):
    """Count merged id pairs that have a set recorded against each other.

    As a sanity check, this verifies that the "duplicates" we lumped together
    really never played against each other.

    sets:    DataFrame with 'p1_id' and 'p2_id' columns; defaults to the
             module-level sets_df.
    mapping: dict of old id -> canonical id; defaults to the module-level id_mapping.

    Prints a line for every offending entry, including its position i in the
    mapping's iteration order, then the overall total. Returns that total.
    """
    if sets is None:
        sets = sets_df
    if mapping is None:
        mapping = id_mapping

    # Collect every (p1, p2) pairing once. Each check below is then a constant-time
    # set lookup instead of two scans over the whole sets table.
    print("Indexing played pairs...")
    played_pairs = set(zip(sets['p1_id'], sets['p2_id']))

    print("Performing comparisons...")

    total_common = 0

    for i, (old_id, new_id) in enumerate(mapping.items()):
        # An id that maps to itself is the canonical one; nothing to check
        if old_id == new_id:
            continue

        # The two ids may appear in either player slot
        if (old_id, new_id) in played_pairs or (new_id, old_id) in played_pairs:
            total_common += 1
            print("Total of {0} at i={1}".format(total_common, i))

    print("Total of {0}".format(total_common))
    return total_common

# Only run the slow sanity check when it has been enabled via the flag above
if PERFORM_COMMON_SETS_COMPARISON:
    find_common_sets()

#players_df = dfs['players_df']
#ranking_df = dfs['ranking_df']
#ranking_seasons_df = dfs['ranking_seasons_df']
#sets_df = dfs['sets_df']
#tournament_info_df = dfs['tournament_info_df']

In [None]:
# Positions within id_mapping's iteration order of the entries to inspect
# (presumably the i values reported by find_common_sets - TODO confirm)
sus_values = [1802, 16601, 24232, 40731, 44471, 51213, 78252, 88475]
keys = list(id_mapping)

for value in sus_values:
    old_id = keys[value]
    new_id = id_mapping[old_id]
    canonical_rows = players_df.loc[players_df['player_id'] == new_id]

    # Show the old id, the id it was merged into, and the tag of the latter
    print(old_id, new_id, canonical_rows['tag'].iloc[0])

In [None]:
# All player ids that carry the tag 'Hungrybox'
players_df.loc[players_df['tag'] == 'Hungrybox', 'player_id'].tolist()