## Packages and default settings

In particular, `MAX_ROWS` caps how many rows are loaded, because the database is massive.

In [1]:
import json
import sqlite3
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Upper bound on the number of rows fetched from a table (ten million).
MAX_ROWS = 10_000_000
# Path of the SQLite database file that the notebook reads.
path = "../data/melee_player_database.db"

## Load the database

In [2]:
# Open the SQLite database at `path` and create a single cursor. The query
# helpers defined in this cell run their statements through this shared `cur`.
con = sqlite3.connect(path)
cur = con.cursor()

# Slightly hacky: issues a throwaway one-row query just to read the column metadata.
def get_column_names(table):
    """Return the column names of `table` as listed in the cursor description.

    `table` is concatenated into the SQL text as-is, so only pass trusted
    table names. The query runs on the shared module-level cursor `cur`.
    """
    probe_query = "SELECT * FROM " + table + " LIMIT 1"
    cur.execute(probe_query)

    names = []
    for column_info in cur.description:
        # The first field of each description entry is the column name.
        names.append(column_info[0])
    return names

def get_rows(table, max_rows=None):
    """Fetch rows of `table` as a list of tuples.

    When `max_rows` is None every row is fetched; otherwise a
    ``LIMIT <max_rows>`` clause is appended to the query. `table` is
    concatenated into the SQL text as-is, so only pass trusted table names.
    The query runs on the shared module-level cursor `cur`.
    """
    query = "SELECT * FROM " + table
    if max_rows is not None:
        query = query + " LIMIT " + str(max_rows)

    cur.execute(query)
    return cur.fetchall()

def load_table(table, max_rows=None):
    """Load a database table into a DataFrame and print simple benchmarks.

    Parameters
    ----------
    table : str
        Name of the table to read.
    max_rows : int or None, optional
        Maximum number of rows to read. None (the default) reads the whole
        table. This argument is now actually forwarded to `get_rows`;
        previously the global MAX_ROWS was always used, so the default of
        None never meant "no limit".

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with the table's column names.

    Side effects
    ------------
    Prints the table name, the load duration, the size reported by
    ``sys.getsizeof`` for the frame, and the number of rows.
    """
    # BENCHMARKING - BENCHMARKING - BENCHMARKING
    start_time = time.time()

    columns = get_column_names(table)
    # Honour the caller's limit instead of the module-level MAX_ROWS.
    rows = get_rows(table=table, max_rows=max_rows)

    df = pd.DataFrame(rows, columns=columns)

    # BENCHMARKING - BENCHMARKING - BENCHMARKING
    print("--- Table: {0} ---".format(table))
    print("--- Duration of {0:.2f} seconds ---".format(time.time() - start_time))
    print("--- Memory size of {0:.2f} MB ---".format(float(sys.getsizeof(df)) / (1024*1024)))
    print("--- Total of {0} rows ---".format(len(df.index)))
    print("")

    return df

# Load every table. Only `sets` is massive, so it is the only call that is
# given an explicit row cap; the others are requested without one.
df_sets            = load_table("sets", max_rows=MAX_ROWS)
df_tournament_info = load_table("tournament_info", max_rows=None)
df_players         = load_table("players", max_rows=None)
df_ranking_seasons = load_table("ranking_seasons", max_rows=None)
df_ranking         = load_table("ranking", max_rows=None)

--- Table: sets ---
--- Duration of 6.10 seconds ---
--- Memory size of 1442.36 MB ---
--- Total of 1795681 rows ---

--- Table: tournament_info ---
--- Duration of 0.41 seconds ---
--- Memory size of 159.14 MB ---
--- Total of 39675 rows ---

--- Table: players ---
--- Duration of 0.37 seconds ---
--- Memory size of 142.89 MB ---
--- Total of 96689 rows ---

--- Table: ranking_seasons ---
--- Duration of 0.00 seconds ---
--- Memory size of 0.01 MB ---
--- Total of 5 rows ---

--- Table: ranking ---
--- Duration of 0.00 seconds ---
--- Memory size of 0.00 MB ---
--- Total of 1 rows ---



In [3]:
# Show the first row of the sets table to inspect its columns.
df_sets.head(1)

Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,5620,Chillin,3,1,"[""R1"", ""Round 1"", ""Round 1""]",,1,A,5.0,[]


In [4]:
# Show the first row of the tournament_info table to inspect its columns.
df_tournament_info.head(1)

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,country,state,city,entrants,placings,losses,bracket_types,online,lat,lng
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,1511683511,1511686089,US,VA,Fall's Church,10,"[\n [\n ""Rishi"",\n 1\n ],\n [\n 15...",{},b'{}',0,,


In [5]:
# Show the first row of the players table to inspect its columns.
df_players.head(1)

Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,"[""Rishi""]",[],"{""twitter"": []}",,,,,,,"[{""key"": ""mdva-invitational-2017-(challonge-mi...","""""",


In [6]:
# Show the first row of the ranking table to inspect its columns.
df_ranking.head(1)

Unnamed: 0,game,ranking_name,priority,region,seasons,tournaments,icon
0,melee,SSBMRank,0,world,"[""2015"", ""2016"", ""2017"", ""2018"", ""2019""]",[],miom


In [7]:
# Show the first row of the ranking_seasons table to inspect its columns.
df_ranking_seasons.head(1)

Unnamed: 0,game,ranking_name,season,start,end,total,by_id,by_placing,final,name
0,melee,SSBMRank,2015,1420070400,1451606399,100,"{""6189"": 1, ""1004"": 2, ""4465"": 3, ""1000"": 4, ""...","{""1"": ""6189"", ""2"": ""1004"", ""3"": ""4465"", ""4"": ""...",0,


## Combine some relevant info into one table

Here, for convenience, we will consider most of the set info together with the tournament dates (unix time) and player tags.

In [8]:
# Columns of the sets table that are carried over into the combined table.
set_columns = ['key', 'tournament_key', 'winner_id', 'p1_id', 'p2_id', 'p1_score', 'p2_score', 'best_of', 'location_names']
tournament_dates = df_tournament_info[['key', 'start', 'end']]

# Attach each tournament's start/end to its sets and sort chronologically.
# Both frames have a 'key' column, so the merge suffixes them: key_x is the
# set key (renamed back to 'key') and key_y is the tournament key coming from
# tournament_info (dropped, since tournament_key already holds it).
df_past_wins = (
    df_sets[set_columns]
    .merge(tournament_dates, left_on='tournament_key', right_on='key', how='left')
    .drop(columns=['key_y'])
    .rename(columns={'key_x': 'key'})
    .sort_values(by=['start', 'end'])
)

# Add player tags, for convenience when looking up the info online.
# The same lookup is applied to p1 first and then to p2.
player_tags = df_players[['player_id', 'tag']]
for side in ('p1', 'p2'):
    df_past_wins = (
        df_past_wins
        .merge(player_tags, left_on=side + '_id', right_on='player_id', how='left')
        .drop(columns=['player_id'])
        .rename(columns={'tag': side + '_tag'})
    )

df_past_wins

Unnamed: 0,key,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,best_of,location_names,start,end,p1_tag,p2_tag
0,gg__36789317,pooperbowl-ii-lula-birthday-bash__redemption-f...,2058790,2117535,2058790,0,2,3.0,"[""WQF"", ""Winners Quarters"", ""Winners Quarter-F...",987565500,1618718400,vel,Butternust
1,gg__36789318,pooperbowl-ii-lula-birthday-bash__redemption-f...,1377847,1063082,1377847,0,2,3.0,"[""WQF"", ""Winners Quarters"", ""Winners Quarter-F...",987565500,1618718400,tilted,Niaki17
2,gg__36789319,pooperbowl-ii-lula-birthday-bash__redemption-f...,62672,2239416,62672,0,2,3.0,"[""WQF"", ""Winners Quarters"", ""Winners Quarter-F...",987565500,1618718400,steib,zubbs
3,gg__36789320,pooperbowl-ii-lula-birthday-bash__redemption-f...,2165165,2165165,264336,0,-1,3.0,"[""WQF"", ""Winners Quarters"", ""Winners Quarter-F...",987565500,1618718400,Senseovernet,TBA
4,gg__36789321,pooperbowl-ii-lula-birthday-bash__redemption-f...,1377847,2058790,1377847,0,2,3.0,"[""WSF"", ""Winners Semis"", ""Winners Semi-Final""]",987565500,1618718400,Butternust,Niaki17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795676,gg__76275446,naonline-21__melee-singles,2092907,2092907,3137063,3,2,5.0,"[""WF"", ""Winners Final"", ""Winners Final""]",1719147600,1719176400,armando,Lwi
1795677,gg__76275447,naonline-21__melee-singles,3137063,2092907,3137063,1,3,5.0,"[""GF"", ""Grand Final"", ""Grand Final""]",1719147600,1719176400,armando,Lwi
1795678,gg__76275448,naonline-21__melee-singles,2092907,3137063,2092907,-1,0,5.0,"[""GFR"", ""GF Reset"", ""Grand Final Reset""]",1719147600,1719176400,Lwi,armando
1795679,gg__76275453,naonline-21__melee-singles,2558872,3910942,2558872,0,3,5.0,"[""LSF"", ""Losers Semis"", ""Losers Semi-Final""]",1719147600,1719176400,nyts,GokhanZ


## Data cleanup

Looking at the table above, two problems stand out immediately:
* Some entries have an unusually early ``start`` for the tournament. There are only a few of them, so they can be safely deleted.
* Some entries have a negative ``p1_score`` or ``p2_score``. These rows are removed as well.

In [9]:
# Some of this seems to accidentally be 2001 data, despite the purported end date.
# 1388534400 is 2014-01-01 00:00:00 UTC. Rows whose tournament starts before
# that are treated as bad data (some of them seem to accidentally carry 2001
# start dates, despite the purported end date).
earliest_valid_start = 1388534400
starts_too_early = df_past_wins['start'] < earliest_valid_start
print("Total of {0} rows with unusually early start date".format(len(df_past_wins[starts_too_early].index)))

# Keep only rows that start on or after the cutoff.
df_past_wins = df_past_wins[df_past_wins['start'] >= earliest_valid_start]

# Some sets have a negative score? Report their share of the remaining rows.
has_negative_score = (df_past_wins['p1_score'] < 0) | (df_past_wins['p2_score'] < 0)
num_negative = len(df_past_wins[has_negative_score].index)
num_total = len(df_past_wins.index)
print("{0:.2f}% have a negative score somewhere".format(100 * num_negative / num_total))

# Keep only rows where both scores are non-negative.
df_past_wins = df_past_wins[(df_past_wins['p1_score'] >= 0) & (df_past_wins['p2_score'] >= 0)]

df_past_wins

Total of 14 rows with unusually early start date
10.08% have a negative score somewhere


Unnamed: 0,key,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,best_of,location_names,start,end,p1_tag,p2_tag
14,,httpsbeastsmashchallongecomb5msb,1000,1000,Stelzig,2,0,3.0,"[""W1"", ""Winners 1"", ""Winners Round 1""]",1420985773,1421110948,Mang0,Stelzig
15,,httpsbeastsmashchallongecomb5msb,IVP,Flikkflakk,IVP,0,2,3.0,"[""W1"", ""Winners 1"", ""Winners Round 1""]",1420985773,1421110948,Flikkflakk,IVP
16,,httpsbeastsmashchallongecomb5msb,Humpe,Humpe,Yomi,2,0,3.0,"[""W1"", ""Winners 1"", ""Winners Round 1""]",1420985773,1421110948,Humpe,Yomi
17,,httpsbeastsmashchallongecomb5msb,Thomas,Thomas,Hao,2,0,3.0,"[""W1"", ""Winners 1"", ""Winners Round 1""]",1420985773,1421110948,Thomas,Hao
18,,httpsbeastsmashchallongecomb5msb,12870,12870,JJLinyard,2,0,3.0,"[""W1"", ""Winners 1"", ""Winners Round 1""]",1420985773,1421110948,Android,JJLinyard
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795675,gg__76275445,naonline-21__melee-singles,3137063,2558872,3137063,0,3,5.0,"[""WSF"", ""Winners Semis"", ""Winners Semi-Final""]",1719147600,1719176400,GokhanZ,Lwi
1795676,gg__76275446,naonline-21__melee-singles,2092907,2092907,3137063,3,2,5.0,"[""WF"", ""Winners Final"", ""Winners Final""]",1719147600,1719176400,armando,Lwi
1795677,gg__76275447,naonline-21__melee-singles,3137063,2092907,3137063,1,3,5.0,"[""GF"", ""Grand Final"", ""Grand Final""]",1719147600,1719176400,armando,Lwi
1795679,gg__76275453,naonline-21__melee-singles,2558872,3910942,2558872,0,3,5.0,"[""LSF"", ""Losers Semis"", ""Losers Semi-Final""]",1719147600,1719176400,nyts,GokhanZ


## More data cleanup, this time in players

It appears as if the same player can have several different ids, as can be seen with the following example.

In [10]:
# Example of an `all_tags` value as stored in the players table: '["Zain", "DontTestMe"]'

# Tag to look for among each player's known tags.
tag = "Zain"

def str_to_array(s):
    """Parse a stringified list such as '["Zain", "DontTestMe"]' into a list of strings.

    The text is decoded as JSON first, so commas, brackets, quotes and
    apostrophes inside an element are preserved and "[]" yields an empty
    list. (The previous split-on-comma parser cut elements at embedded
    commas, stripped every quote character and returned [''] for "[]".)

    Text that is not a JSON array, e.g. "['a', 'b']", falls back to the
    legacy parsing: brackets removed, split on commas, whitespace and quote
    characters stripped. Empty elements are dropped in that fallback.

    Parameters
    ----------
    s : str or None
        The stringified list. None and float NaN (a missing value) are
        treated as "no elements".

    Returns
    -------
    list of str
    """
    # A missing value has no tags; the old code raised AttributeError here.
    if s is None or (isinstance(s, float) and s != s):
        return []

    try:
        parsed = json.loads(s)
    except ValueError:
        # Not valid JSON (json.JSONDecodeError is a ValueError subclass).
        parsed = None

    if isinstance(parsed, list):
        # Original contract: every element is returned as a string.
        return [e if isinstance(e, str) else str(e) for e in parsed]

    # Legacy fallback for non-JSON text.
    stripped = s.replace('[', '').replace(']', '')
    elements = [e.strip().replace('\'', '').replace('\"', '') for e in stripped.split(',')]
    return [e for e in elements if e]

# Keep only the players whose parsed `all_tags` list contains `tag` exactly.
has_tag = df_players['all_tags'].apply(lambda raw_tags: tag in str_to_array(raw_tags))
df_specific_tag = df_players[has_tag]

df_specific_tag

# Optional debugging aid: print the raw placings of every matching player.
#for p in df_specific_tag['placings']:
#    print(p)

Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
102,melee,1000,Mang0,"[""C9 | Mango"", ""Mango"", ""C9. Mango"", ""C9|Mango...","[""C9""]","{""twitter"": [""C9Mang0""]}",United States,CA,SoCal,US,CA,Los Angeles,"[{""key"": ""slippi-champions-league-week-1__mele...","{""melee/falco"": 812, ""melee/fox"": 516, ""melee/...",
38507,melee,330371,Sol!d,"[""Sol!d"", ""Mang0""]","[""W&L"", ""SMB""]","{""twitter"": []}",Uruguay,,,UY,Montevideo Department,Montevideo,"[{""key"": ""montevideo-melee-monthly-2021-1__mel...","{""melee/captainfalcon"": 124, ""melee/mario"": 39...",
44464,melee,2396108,Lean_dad,"[""Mang0"", ""Lean_dad""]",[],"{""twitter"": []}",United States,PA,,US,PA,,"[{""key"": ""2021-philly-melee-arcadian-the-lab__...","""""",
57296,melee,148442,Mang0,"[""Mang0""]","[""C9""]","{""twitter"": []}",United States,CA,,US,CA,,"[{""key"": ""c9-mang0-5-2-stock-side-event__two-s...","""""",
77494,melee,1456076,Leffen,"[""Mang0"", ""Cody Schwab"", ""Leffen""]","[""C9"", ""TSM""]","{""twitter"": []}",United States,CA,,US,CA,Benicia,"[{""key"": ""blg-smash-tarn-36__melee-singles"", ""...","""""",
