## Packages and default settings

In particular, `MAX_ROWS` sets the maximum number of rows to load, because the database is massive.

In [1]:
import json
import os
import sqlite3
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Row cap handed to the table loader (ten million rows).
MAX_ROWS = 10 ** 7
# Relative path of the SQLite database file.
path = "../data/melee_player_database.db"

## Load the database

In [None]:
# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would only surface later as a confusing "no such table"
# error. Fail fast with a clear message instead.
if not os.path.isfile(path):
    raise FileNotFoundError("Database file not found: {0}".format(path))

con = sqlite3.connect(path)
# One cursor on this connection, shared by the query helpers that use `cur`.
cur = con.cursor()

def get_column_names(table):
    """Return the column names of `table` as a list of strings.

    A one-row ``SELECT *`` is executed on the shared cursor `cur` purely so
    that ``cur.description`` gets populated; no rows are fetched. `table` is
    spliced into the SQL text unescaped, so it must be a trusted table name.
    """
    query = "SELECT * FROM {0} LIMIT 1".format(table)
    cur.execute(query)
    # The first field of each description entry is the column name.
    return [column_info[0] for column_info in cur.description]

def get_rows(table, max_rows=None):
    """Fetch rows of `table` through the shared cursor `cur`.

    Parameters
    ----------
    table : str
        Table name; spliced into the SQL text unescaped.
    max_rows : int or None, optional
        When given, a ``LIMIT`` clause with this value is appended; None
        (the default) fetches every row.

    Returns
    -------
    list of tuple
        The result of ``cur.fetchall()``.
    """
    query = "SELECT * FROM " + table
    if max_rows is not None:
        query += " LIMIT " + str(max_rows)

    cur.execute(query)
    return cur.fetchall()

def load_table(table, max_rows=None):
    """Load a database table into a DataFrame and print load statistics.

    Parameters
    ----------
    table : str
        Name of the table to read.
    max_rows : int or None, optional
        Maximum number of rows to fetch; None (the default) loads every row.

    Returns
    -------
    pandas.DataFrame
        The fetched rows, with the table's column names as columns.
    """
    # BENCHMARKING - BENCHMARKING - BENCHMARKING
    start_time = time.time()

    columns = get_column_names(table)
    # Pass the caller's limit through (not the global MAX_ROWS), so that
    # max_rows=None really loads the whole table.
    rows = get_rows(table=table, max_rows=max_rows)

    df = pd.DataFrame(rows, columns=columns)

    # BENCHMARKING - BENCHMARKING - BENCHMARKING
    print("--- Table: {0} ---".format(table))
    print("--- Duration of {0:.2f} seconds ---".format(time.time() - start_time))
    print("--- Memory size of {0:.2f} MB ---".format(float(sys.getsizeof(df)) / (1024*1024)))
    print("--- Total of {0} rows ---".format(len(df.index)))
    print("")

    return df

# Only `sets` is massive, so it is the only table requested with a row cap;
# the remaining tables are requested without an explicit limit.
df_sets = load_table(table="sets", max_rows=MAX_ROWS)
df_tournament_info = load_table(table="tournament_info")
df_players = load_table(table="players")
df_ranking_seasons = load_table(table="ranking_seasons")
df_ranking = load_table(table="ranking")

In [None]:
# Show the first row of the sets table to inspect its columns.
df_sets.head(1)

In [None]:
# Show the first row of the tournament_info table to inspect its columns.
df_tournament_info.head(1)

In [None]:
# Show the first row of the players table to inspect its columns.
df_players.head(1)

In [None]:
# Show the first row of the ranking table to inspect its columns.
df_ranking.head(1)

In [None]:
# Show the first row of the ranking_seasons table to inspect its columns.
df_ranking_seasons.head(1)

## Combine some relevant info into one table

Here, for convenience, we will consider most of the set info together with the tournament dates (unix time) and player tags.

In [None]:
# Per-set columns carried into the combined table.
set_columns = ['key', 'tournament_key', 'winner_id', 'p1_id', 'p2_id',
               'p1_score', 'p2_score', 'best_of', 'location_names']

# Attach each tournament's start/end and sort chronologically. Both frames
# have a 'key' column, so the merge suffixes them: 'key_x' is the set key
# (kept, renamed back to 'key') and 'key_y' is the tournament key (dropped,
# since it duplicates 'tournament_key').
df_past_wins = (
    df_sets[set_columns]
    .merge(df_tournament_info[['key', 'start', 'end']],
           left_on='tournament_key', right_on='key', how='left')
    .drop(labels=['key_y'], axis='columns')
    .rename(columns={"key_x": "key"})
    .sort_values(by=['start', 'end'])
)

# Add both players' tags (p1 first, then p2), for convenience when looking
# up the info online.
player_tags = df_players[['player_id', 'tag']]
for slot in ('p1', 'p2'):
    df_past_wins = (
        df_past_wins
        .merge(player_tags, left_on=slot + '_id', right_on='player_id', how='left')
        .drop(labels=['player_id'], axis='columns')
        .rename(columns={"tag": slot + '_tag'})
    )

df_past_wins

## Data cleanup

Immediately, in the above, we notice that
* There are some entries with an unusually early ``start`` for the tournament (before 1 January 2014, i.e. Unix time 1388534400). These are few and can be safely deleted.
* Some entries have a negative ``p1_score`` or ``p2_score``. These are removed as well.

In [None]:
# Unix timestamp of 2014-01-01 00:00:00 UTC. Tournaments that claim to start
# before this are treated as bad data: some of it seems to accidentally be
# 2001 data, despite the purported end date.
MIN_TOURNAMENT_START = 1388534400

is_early = df_past_wins['start'] < MIN_TOURNAMENT_START
print("Total of {0} rows with unusually early start date".format(int(is_early.sum())))

# Filter on the positive condition (rather than ~is_early) so that any rows
# with a missing start, for which both comparisons are False, are dropped too.
df_past_wins = df_past_wins[df_past_wins['start'] >= MIN_TOURNAMENT_START]

# Some sets have a negative score?
has_negative_score = (df_past_wins['p1_score'] < 0) | (df_past_wins['p2_score'] < 0)
num_negative = int(has_negative_score.sum())
num_total = len(df_past_wins.index)
# Guard the percentage: if no rows survived the date filter, the division
# would raise ZeroDivisionError.
pct_negative = 100 * num_negative / num_total if num_total else 0.0
print("{0:.2f}% have a negative score somewhere".format(pct_negative))

df_past_wins = df_past_wins[(df_past_wins['p1_score'] >= 0) & (df_past_wins['p2_score'] >= 0)]

df_past_wins

## More data cleanup, this time in players

It appears as if the same player can have several different ids, as can be seen with the following example.

In [None]:
# Example of the text stored in `all_tags`: '["Zain", "DontTestMe"]'

# Tag to search for among each player's known tags.
tag = "Zain"

def str_to_array(s):
    """Parse the text form of a list of tags into a list of strings.

    The text is first parsed as a JSON array (e.g. '["Zain", "DontTestMe"]'),
    which keeps tags containing commas, brackets or quotes intact. Text that
    is not a JSON array of strings falls back to the previous hand-rolled
    parsing: strip square brackets, split on commas, then trim whitespace and
    remove quote characters.

    Parameters
    ----------
    s : str or None
        Serialized list of tags. Non-string values (None, NaN) and blank
        strings yield an empty list.

    Returns
    -------
    list of str
        The individual tags.
    """
    if not isinstance(s, str) or not s.strip():
        return []

    try:
        parsed = json.loads(s)
    except ValueError:
        # json.JSONDecodeError is a ValueError: the text is not valid JSON.
        parsed = None

    if isinstance(parsed, list) and all(isinstance(e, str) for e in parsed):
        return parsed

    # Legacy fallback for non-JSON text such as "['a', 'b']".
    stripped = s.replace('[', '').replace(']', '')
    elements = stripped.split(',')
    return [e.strip().replace('\'', '').replace('\"', '') for e in elements]

# Boolean mask: True for players whose parsed `all_tags` list contains `tag`.
has_tag = df_players['all_tags'].apply(lambda all_tags: tag in str_to_array(all_tags))
df_specific_tag = df_players[has_tag]

df_specific_tag

#for p in df_specific_tag['placings']:
#    print(p)