# Baseline Test #
Our first baseline simply uses each player's Glicko-2 rating to predict the outcome of a set: the player with the higher rating going into the tournament is predicted to win.

In [1]:
!pip install numba


Collecting numba
  Downloading numba-0.60.0-cp312-cp312-win_amd64.whl.metadata (2.8 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba)
  Downloading llvmlite-0.43.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting numpy<2.1,>=1.22 (from numba)
  Downloading numpy-2.0.2-cp312-cp312-win_amd64.whl.metadata (59 kB)
Downloading numba-0.60.0-cp312-cp312-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 2.7/2.7 MB 26.0 MB/s eta 0:00:00
Downloading llvmlite-0.43.0-cp312-cp312-win_amd64.whl (28.1 MB)
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   ----------- ---------------------------- 8.1/28.1 MB 38.7 MB/s eta 0:00:01
   ----------------------- ---------------- 16.5/28.1 MB 40.0 MB/s eta 0:00:01
   ----------------------------------- ---- 24.6/28.1 MB 40.0 MB/s eta 0:00:01
   ---------------------------------------- 28.1/28.1 MB 37.9 MB/s eta 0:00:00
Downloading numpy-2.0.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import matplotlib.pyplot as plt
from numba import njit, prange

import sqlite3
import sys
import time
import tqdm

In [5]:
# Load the precomputed per-player Glicko-2 histories (indexed by player_id, with
# 'dates', 'rating_history', 'rd_history' and 'glicko2' columns, as shown below).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# produced by this project, never files from an untrusted source.
# NOTE(review): the 'glicko2' column holds glicko2.glicko2.Player objects, so the
# `glicko2` package presumably has to be importable for this to load -- TODO confirm.
glicko2_df = pd.read_pickle('../data/overall_players_ranking.pkl')
# Preview the first two players.
glicko2_df.head(2)

Unnamed: 0_level_0,dates,rating_history,rd_history,glicko2
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000,"[2015-01-13 01:02:28, 2015-02-06 18:58:36, 201...","[1669.3438572516538, 1699.3674585229132, 1882....","[171.20863576993426, 129.71212804199854, 83.73...",<glicko2.glicko2.Player object at 0x00000238C8...
Stelzig,"[2015-01-13 01:02:28, 2015-08-17 23:13:14]","[1252.6819167554638, 1165.5738869144382]","[253.40460822505193, 211.92970378109877]",<glicko2.glicko2.Player object at 0x00000238C8...


In [6]:
def get_table_names(conn):
    """Return the names of all tables in the SQLite database behind ``conn``.

    Parameters
    ----------
    conn
        An open database connection that ``pd.read_sql`` can query.

    Returns
    -------
    list
        The ``name`` value of every ``sqlite_master`` row whose type is ``'table'``.
    """
    master_rows = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    name_column = master_rows['name']
    return name_column.tolist()

def load_tables_to_dfs(conn):
    """Load every table of a SQLite database into its own DataFrame.

    Columns in which *every* value is a string and *every* value parses as JSON
    are replaced by their decoded Python objects (lists, dicts, numbers, ...).
    If any value is not a string, or any string is not valid JSON, the column
    is left exactly as it was read.

    Note that a column of purely numeric-looking strings (e.g. ``"123"``) is
    valid JSON and will therefore be converted to numbers.

    Parameters
    ----------
    conn
        An open database connection that ``pd.read_sql`` can query.

    Returns
    -------
    dict
        Maps ``"<table name>_df"`` to the DataFrame holding that table.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the identifier (doubling any embedded double quote) so that
        # table names containing spaces, punctuation or SQL keywords still load.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        for col in df.columns:
            values = df[col]

            # Only columns made up entirely of strings are candidates for JSON
            # decoding; columns containing NULLs or numbers are left untouched.
            if not values.apply(lambda x: isinstance(x, str)).all():
                continue

            try:
                decoded = values.apply(json.loads)
            except (json.JSONDecodeError, TypeError):
                # At least one value is not JSON: keep the raw strings.
                continue

            df[col] = decoded

        # Store the DataFrame under the table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Load the database only once per kernel session: the 'cell_has_run' flag is set at
# the end of this block, so re-running the cell is a no-op until the kernel restarts.
if 'cell_has_run' not in globals():
    path = "../data/melee_player_database.db"

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame. 'dfs' maps '<table name>_df' to the
        # corresponding DataFrame, e.g. dfs['players_df'] for a table called 'players'.
        dfs = load_tables_to_dfs(conn)
    finally:
        # Always release the connection, even if loading a table fails.
        conn.close()

    # Tournament start/end are stored as Unix timestamps in seconds; convert to datetimes.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

In [7]:
# Pull the individual tables out of the loaded dictionary for convenient access.
players_df, ranking_df, ranking_seasons_df, sets_df, tournament_info_df = (
    dfs[table_key]
    for table_key in (
        'players_df',
        'ranking_df',
        'ranking_seasons_df',
        'sets_df',
        'tournament_info_df',
    )
)

In [8]:
# Show the first row of each frame used by the baseline, one after another.
print(
    glicko2_df.head(1),
    tournament_info_df.head(1),
    sets_df.head(1),
    sep='\n',
)


                                                       dates  \
player_id                                                      
1000       [2015-01-13 01:02:28, 2015-02-06 18:58:36, 201...   

                                              rating_history  \
player_id                                                      
1000       [1669.3438572516538, 1699.3674585229132, 1882....   

                                                  rd_history  \
player_id                                                      
1000       [171.20863576993426, 129.71212804199854, 83.73...   

                                                     glicko2  
player_id                                                     
1000       <glicko2.glicko2.Player object at 0x00000238C8...  
    game                                        key  \
0  melee  mdva-invitational-2017-(challonge-mirror)   

                                cleaned_name     source  \
0  MDVA Invitational 2017 (Challonge Mirror)  challonge   

  

In [None]:
def _rating_before(glicko2_df, player_id, when):
    """Return the player's most recent rating recorded strictly before ``when``."""
    dates = glicko2_df.loc[player_id, 'dates']
    ratings = glicko2_df.loc[player_id, 'rating_history']

    # side='left' counts only entries dated strictly before `when`, so a rating
    # stamped with the tournament's own start time is never used.
    position = int(pd.DatetimeIndex(dates).searchsorted(when, side='left')) - 1

    # With no earlier entry, position is -1, which would silently wrap around to
    # the player's *latest* rating; fall back to the earliest recorded one instead.
    return ratings[max(position, 0)]


def check_set(glicko2_df, set, tournament_info_df, correct_predictions=None):
    """Score the "higher Glicko-2 rating wins" baseline on a single set.

    Parameters
    ----------
    glicko2_df : pd.DataFrame
        Indexed by player id; each row holds a chronologically sorted 'dates'
        sequence and a parallel 'rating_history' sequence.
    set : mapping or pd.Series
        One set, with 'tournament_key', 'p1_id', 'p2_id' and 'winner_id' entries.
        (The name shadows the builtin but is kept for backward compatibility.)
    tournament_info_df : pd.DataFrame
        Tournament table with 'key' and 'start' columns.
    correct_predictions : optional
        Unused; accepted only so existing call sites keep working.

    Returns
    -------
    int
        1 if the player with the strictly higher rating going into the tournament
        won the set, otherwise 0. Equal ratings give no prediction and score 0.

    Raises
    ------
    KeyError
        If the tournament key is not in ``tournament_info_df`` or a player id is
        not in ``glicko2_df``.
    """
    tournament = set['tournament_key']
    p1_id = set['p1_id']
    p2_id = set['p2_id']

    # The boolean-mask lookup yields a Series; reduce it to a scalar timestamp so
    # the history positions below are plain integers rather than arrays.
    starts = tournament_info_df.loc[tournament_info_df['key'] == tournament, 'start']
    if starts.empty:
        raise KeyError(f'No tournament with key {tournament!r} in tournament_info_df')
    start_date = starts.iloc[0]

    p1_rating = _rating_before(glicko2_df, p1_id, start_date)
    p2_rating = _rating_before(glicko2_df, p2_id, start_date)

    if p1_rating == p2_rating:
        return 0

    # Predict whichever player is rated higher -- either p1 or p2 -- and score
    # the prediction against the recorded winner.
    predicted_winner = p1_id if p1_rating > p2_rating else p2_id
    return int(set['winner_id'] == predicted_winner)
        
# Evaluate the baseline on a reproducible random sample of sets.
num_sets = 60000
random_set_sample = sets_df.sample(n=num_sets, random_state=42)

correct_predictions = 0
for j in tqdm.tqdm(range(num_sets)):
    # check_set returns 1 for a correct prediction and 0 otherwise.
    set_score = check_set(
        glicko2_df,
        random_set_sample.iloc[j],
        tournament_info_df,
        correct_predictions,
    )
    correct_predictions = correct_predictions + set_score

print(f'The higher Glicko-2 rating wins {correct_predictions/num_sets:0.1%} of the time')