In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
from joblib import Parallel, delayed

import sqlite3
import sys
import time
import math
#import tqdm
from tqdm.auto import tqdm
import datetime
import os
import pickle
from pathlib import Path

from glicko2 import Player
import multiprocessing

tqdm.pandas()

# Use the container-mounted data directory when it is present, otherwise the
# repository-relative one. The trailing slash is required because file names
# are appended to data_path by plain string concatenation.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'

## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```

In [None]:
# Function to get the table names
def get_table_names(conn):
    """Return the names of all tables in the SQLite database behind ``conn``.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        list[str]: Every ``name`` in ``sqlite_master`` whose type is 'table'.
    """
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    """Load every table of the database into a DataFrame.

    Columns made up entirely of strings are passed through ``json.loads``.
    The conversion is all-or-nothing per column: if any value fails to parse,
    the column is left as the original strings. Note that every valid JSON
    document is converted, not only lists and dictionaries (a column of
    numeric strings such as "12" becomes integers).

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        dict[str, pandas.DataFrame]: Maps ``"<table>_df"`` to that table's data.
    """
    table_names = get_table_names(conn)
    dataframes = {}

    for table in table_names:
        # Quote the identifier (doubling any embedded double quotes) so table
        # names containing spaces, punctuation or reserved words still produce
        # valid SQL.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Only columns in which every entry is a string are candidates.
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # The assignment only happens when every value parses.
                    df[col] = df[col].apply(json.loads)
                except (json.JSONDecodeError, TypeError):
                    # Not a JSON column: keep the raw strings.
                    pass

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Load the dictionary of DataFrames, preferring the pickled cache over
# re-reading the SQLite database.
if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # NOTE(review): pickle.load can execute arbitrary code. Only load a cache
    # file that was produced locally from the database.
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
# No cache file: read the database, unless the flag in the global scope shows
# that an earlier run of this cell already populated `dfs`.
elif 'cell_has_run' not in globals():
    path = data_path + "melee_player_database.db"

    # sqlite3.connect() would silently create an empty database for a missing
    # file, so fail early with a clear message instead.
    if not os.path.exists(path):
        raise FileNotFoundError(f"SQLite database not found: {path}")

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Close the connection even if loading one of the tables fails.
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # The tournament start/end columns are converted from Unix seconds to timestamps.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the column data types of the DataFrames so that each column has the correct type. (This will be updated as needed.)

In [None]:
# Sets without a recorded best_of value get 0 so the column can be cast to int.
dfs['sets_df']['best_of'] = (
    dfs['sets_df']['best_of']
    .fillna(0)
    .astype(int)
)

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games in which the player has played each character. (We verify this for Zain below.)

In [None]:
# Bind the players table to its own name and preview its first rows.
players_df = dfs['players_df']
players_df.head()

In [None]:
# Bind the ranking table to its own name and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head()

In [None]:
# Bind the ranking_seasons table to its own name and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

In [None]:
sets_df = dfs['sets_df']

# Share of sets whose game_data entry is non-empty.
# assumes every game_data value supports len() (e.g. a parsed JSON list) — TODO confirm
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)
# The format spec already appends "%", so the message must not say "percent" again.
print(f"{has_game_data.sum() / sets_df.shape[0]:0.01%} of sets have some game data")

sets_df.head()

In [None]:
# Bind the tournament_info table to its own name and preview its first rows.
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head()

In [None]:
# Code optimization by Dan
# Basically we want to replace this line in process_tournament with something more efficient:
#
#      tournament_sets_df = sets_df[sets_df['tournament_key'] == tournament_key]
#
# Instead, we can
# - Merge the tournament date info into ``sets_df``
# - Sort by date
# - Store the start/end positions of each tournament in a separate dictionary
# - Use tournament_sets_df = sets_df.iloc[start:end+1] instead.

# Rename the tournament key up front so the merge does not create key_x/key_y
# columns that have to be cleaned up afterwards.
tournament_dates_df = tournament_info_df[['key', 'start', 'end']].rename(columns={'key': 'tournament_key'})

# Always rebuild from the untouched dfs['sets_df'] so that re-running this cell
# gives the same result instead of merging the date columns in a second time.
sets_df = (
    dfs['sets_df']
    .merge(tournament_dates_df, on='tournament_key', how='left')
    .sort_values(by=['end', 'tournament_key'])  # Just in case there are tournaments with the exact same end date
)

In [None]:
# A bit of data cleanup
# TODO: Rerun!
min_date = datetime.datetime(2015, 1, 1)
max_date = datetime.datetime(2024, 12, 31)

# Keep only the sets whose tournament start and end both lie inside
# [min_date, max_date]; both bounds are inclusive.
sets_df = sets_df[sets_df['start'].between(min_date, max_date) &
                  sets_df['end'].between(min_date, max_date)]

In [None]:
# For now we ignore the L{n} location name.
top_8_locations = [
        ['WSF', 'Winners Semis', 'Winners Semi-Final'],
        ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
        ['WF', 'Winners Final', 'Winners Final'],
        ['LSF', 'Losers Semis', 'Losers Semi-Final'],
        ['LF', 'Losers Final', 'Losers Final'],
        ['GF', 'Grand Final', 'Grand Final'],
        ['GFR', 'GF Reset', 'Grand Final Reset']
    ]

# Lists are unhashable, so they cannot be matched with Series.isin. Compare
# hashable tuples instead: a set is flagged when its full name triple equals
# one of the top-8 triples.
# assumes each location_names entry is a list of name strings — TODO confirm
top_8_location_keys = {tuple(names) for names in top_8_locations}
top_8 = sets_df['location_names'].apply(
    lambda names: isinstance(names, (list, tuple)) and tuple(names) in top_8_location_keys
).astype(bool)

In [None]:
dataset_mini_df = pd.read_pickle(data_path + 'dataset_mini.pkl')

# Temporary bugfix, might have added stuff twice at some point
# dataset_mini_df = dataset_mini_df.loc[:,~dataset_mini_df.columns.duplicated()].copy()

# Reduced column set: drop every column whose name contains "m2" or "_alt_".
# (Alternative tried earlier: drop only the "m2" columns.)
minier_cols = [col for col in dataset_mini_df.columns if not ("m2" in col or "_alt_" in col)]
dataset_minier_df = dataset_mini_df[minier_cols]

dataset_df = dataset_minier_df

# Feature groups used by the models below.
features_default_elo = ['p1_elo', 'p2_elo']
features_all_elo = features_default_elo + ['p1/m1/m1_elo', 'p2/m1/m1_elo', 'p1/m1_elo', 'p2/m1_elo']
features_all_rd = [name.replace('elo', 'rd') for name in features_all_elo]
features_all_updates = [name.replace('elo', 'updates') for name in features_all_elo]
features_all_eru = features_all_elo + features_all_rd + features_all_updates
# Every column except the last one.
features_all_everything = dataset_df.columns[:-1].tolist()

# Filter by elos that actually have nontrivial data:
# a row passes only if none of its elo columns equals exactly 1500.0.
quality_filter = dataset_df[features_all_elo].ne(1500.0).all(axis=1)

In [None]:
# Models that only use the ELO scores
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb

from sklearn.metrics import log_loss, accuracy_score

# Years to train on. We will test on the next year.
years = range(2016, 2023+1)

# Note that 2015 data is probably not that good. Elo scores barely started getting accurate.
# Cumulative experiments therefore start training in this year.
first_train_year = 2016

# These models work best with normally distributed data, which appears to be the case for the various ELOs
models = {'lr': LogisticRegression(penalty=None, max_iter=10000),
          'lda': LinearDiscriminantAnalysis(),
          'xgb': xgb.XGBClassifier()}


def _sets_within_years(first_year, last_year):
    """Mask over sets_df: tournaments that start on or after Jan 1 of
    ``first_year`` and end before Jan 1 of ``last_year + 1``.

    The upper bound is exclusive on the following New Year rather than
    ``<= datetime(last_year, 12, 31)``: that value is midnight at the start of
    Dec 31, so it would drop every set played later that day.
    """
    return ((sets_df['start'] >= datetime.datetime(first_year, 1, 1)) &
            (sets_df['end'] < datetime.datetime(last_year + 1, 1, 1)))


def _score_models_by_year(features, cumulative, extra_train_mask=None):
    """Fit every model for each entry of ``years`` and score it on the next year.

    Args:
        features: Column names of dataset_df used as model inputs.
        cumulative: If True, train on first_train_year..y; otherwise on year y only.
        extra_train_mask: Optional boolean Series further restricting the
            training rows (the test rows are never restricted).

    Returns:
        (ll, acc): Two arrays of shape (len(years), len(models)) holding the
        rounded log loss and the rounded accuracy in percent.
    """
    ll = np.zeros(shape=(len(years), len(models)))
    acc = np.zeros(shape=(len(years), len(models)))

    for i, y in enumerate(years):
        # NOTE: It is assumed that dataset_df and sets_df share the same rows, only with different engineered features in dataset_df
        train_mask = _sets_within_years(first_train_year if cumulative else y, y)
        if extra_train_mask is not None:
            train_mask = train_mask & extra_train_mask
        dataset_train_df = dataset_df[train_mask]
        dataset_test_df = dataset_df[_sets_within_years(y + 1, y + 1)]

        for j, name in enumerate(models):
            models[name].fit(dataset_train_df[features], dataset_train_df['winner'])
            y_prob = models[name].predict_proba(dataset_test_df[features])
            y_pred = (y_prob[:,1] >= 0.5)

            ll[i,j] = round(log_loss(dataset_test_df['winner'], y_prob), 3)
            acc[i,j] = round(100.0 * accuracy_score(dataset_test_df['winner'], y_pred), 1)

    return ll, acc


def _print_scores(ll, acc):
    """Print one row per training year with the log-loss and accuracy columns of every model."""
    print(pd.DataFrame(np.concatenate([ll, acc], axis=1),
                 index=years,
                 columns=[x + "_ll" for x in models] + [x + "_acc" for x in models]))


# Just using default ELOs (training on all years from first_train_year up to y)
ll_scores, acc_scores = _score_models_by_year(features_default_elo, cumulative=True)
print("Scores involving just the default elos")
_print_scores(ll_scores, acc_scores)

# Using all ELOs (training on all years from first_train_year up to y)
ll_scores, acc_scores = _score_models_by_year(features_all_elo, cumulative=True)
print()
print("Scores involving all elos")
_print_scores(ll_scores, acc_scores)

# Using all ELOs, but with only the most recent year (y itself) for training
ll_scores, acc_scores = _score_models_by_year(features_all_elo, cumulative=False)
print()
print("Scores involving all elos, but only one year for training")
_print_scores(ll_scores, acc_scores)

# Using all ELOs, but with only the most recent year for training
# AND with the restriction of only looking at quality data for training
# (and testing still makes use of all data)
ll_scores, acc_scores = _score_models_by_year(features_all_elo, cumulative=False,
                                              extra_train_mask=quality_filter)
print()
print("Scores involving all elos, but only one year for training")
print("AND only using quality data for training")
_print_scores(ll_scores, acc_scores)

In [None]:
# More advanced models
import xgboost as xgb
import errorlda
import importlib

# Just in case we make changes to this model.
importlib.reload(errorlda)

# Years to train on. We will test on the next year.
years = range(2016, 2023+1)

models = {'errorlda': errorlda.ErrorLDA(),
          'xgb': xgb.XGBClassifier()}

ll_scores = np.zeros(shape=(len(years), len(models)))
acc_scores = np.zeros(shape=(len(years), len(models)))


def _rd_error_matrices(frame):
    """Return, for every row of ``frame``, a diagonal matrix of its squared RD columns."""
    return frame[features_all_rd].apply(lambda row: np.diag(row.to_numpy() ** 2), axis=1)


for i, y in enumerate(years):
    # Note that 2015 data is probably not that good. Elo scores barely started getting accurate.
    # NOTE: It is assumed that dataset_df and sets_df share the same rows, only with different engineered features in dataset_df
    #
    # Train on year y only: tournaments starting on or after Jan 1 of y.
    # (`start <= Jan 1 of y` would select everything *before* year y instead.)
    # Year ends are exclusive bounds on the next New Year, because
    # datetime(y, 12, 31) is midnight at the start of Dec 31 and would drop
    # the sets played later that day.
    dataset_train_df = dataset_df[(sets_df['start'] >= datetime.datetime(y,1,1)) &
                                  (sets_df['end'] < datetime.datetime(y+1,1,1)) &
                                  quality_filter & top_8]
    # Testing uses every set of the following year, without the quality/top-8 restriction.
    dataset_test_df = dataset_df[(sets_df['start'] >= datetime.datetime(y+1,1,1)) &
                                 (sets_df['end'] < datetime.datetime(y+2,1,1))]

    for j, name in enumerate(models):
        if name == 'errorlda': # Special syntax required here.
            models[name] = errorlda.ErrorLDA() # Not sure if I've implemented .fit() to reset everything upon every new fit.

            models[name].fit(dataset_train_df[features_all_elo], dataset_train_df['winner'],
                             X_train_errors=_rd_error_matrices(dataset_train_df),
                             error_scaling=False) # Note that error-scaling is slower by like a factor of 8, unfortunately

            # Literally just for numerical stability, in case some eigenvalues are near zero.
            # This will add 1 to each eigenvalue.
            models[name].variance += np.identity(len(features_all_rd))

            y_prob = models[name].predict_proba(dataset_test_df[features_all_elo],
                                                X_error=_rd_error_matrices(dataset_test_df))

            print("Trained errorlda, year {0}".format(y))

        else:
            # The other models receive the RDs as ordinary feature columns.
            models[name].fit(dataset_train_df[features_all_elo + features_all_rd], dataset_train_df['winner'])
            y_prob = models[name].predict_proba(dataset_test_df[features_all_elo + features_all_rd])

        y_pred = (y_prob[:,1] >= 0.5)

        ll_scores[i,j] = round(log_loss(dataset_test_df['winner'], y_prob), 3)
        acc_scores[i,j] = round(100.0 * accuracy_score(dataset_test_df['winner'], y_pred), 1)

print("Scores involving all ELOs and RDs")
print(pd.DataFrame(np.concatenate([ll_scores, acc_scores], axis=1),
             index=years,
             columns=[x + "_ll" for x in models] + [x + "_acc" for x in models]))