In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
from joblib import Parallel, delayed

import sqlite3
import sys
import time
import math
#import tqdm
from tqdm.auto import tqdm
import datetime
import os
import pickle
from pathlib import Path

from glicko2 import Player
import multiprocessing

tqdm.pandas()

# Use the container-mounted data directory when it is present; otherwise fall
# back to the repository-relative one. Both values keep their trailing slash
# because later cells build file names by plain string concatenation.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'

## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [None]:
# Function to get the table names
def get_table_names(conn):
    """Return the names of all tables listed in the database's ``sqlite_master``.

    Parameters
    ----------
    conn
        An open database connection that ``pd.read_sql`` can query.

    Returns
    -------
    list
        The values of the ``name`` column for every ``type='table'`` entry.
    """
    table_listing = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    name_column = table_listing['name']
    return name_column.tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    """Load every table of the database into a dictionary of DataFrames.

    Parameters
    ----------
    conn
        An open SQLite connection (table names come from ``get_table_names``).

    Returns
    -------
    dict
        Maps ``"<table>_df"`` to the DataFrame holding that table's rows.
        Columns in which *every* entry is a string and *every* entry parses
        as JSON are replaced by the parsed Python objects; all other columns
        are left exactly as ``pd.read_sql`` returned them.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the identifier (doubling embedded quotes) so that table names
        # containing spaces, punctuation or SQL keywords still form a valid query.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        for col in df.columns:
            # Only columns made up entirely of strings are JSON candidates;
            # a single NULL / non-string value excludes the whole column.
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # The assignment only happens if every entry parses, so a
                    # column is converted as a whole or not at all.
                    # Note: json.loads also accepts scalar JSON ("3", "true"),
                    # so all-numeric text columns are converted too.
                    df[col] = df[col].apply(json.loads)
                except (json.JSONDecodeError, TypeError):
                    # Not a JSON column: keep the raw strings.
                    pass

        # Store the DataFrame under table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle.
    # NOTE(review): pickle.load can execute arbitrary code; only use a
    # dfs_dict.pkl that was produced by this project.
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
# Check if the flag variable exists in the global scope so that the (slow)
# database import below does not run twice in the same kernel session.
if 'cell_has_run' not in globals():
    path = data_path + "melee_player_database.db"

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Always release the connection, even if loading a table fails.
        conn.close()

    # 'dfs' maps each table name with a '_df' suffix to its DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # 'start' / 'end' are converted from epoch seconds (unit='s') to timestamps.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [None]:
# Missing 'best_of' values are replaced by 0 so the column can be cast to int.
dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int) 

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played as each character. (We verify this for Zain below.)

In [None]:
# Short alias for the 'players' table loaded above, followed by a preview of its first rows.
players_df = dfs['players_df']
players_df.head()

In [None]:
# Short alias for the 'ranking' table loaded above, followed by a preview of its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head()

In [None]:
# Short alias for the 'ranking_seasons' table loaded above, followed by a preview of its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

In [None]:
sets_df = dfs['sets_df']

# Share of sets whose 'game_data' entry is non-empty. The mean of a boolean
# mask is exactly that fraction; the '%' format already appends the percent sign.
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)
print(f"{has_game_data.mean():.1%} of sets have some game data")

sets_df.head()

In [None]:
# Short alias for the 'tournament_info' table loaded above, followed by a preview of its first rows.
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head()

In [None]:
# Code optimization by Dan
# Basically we want to replace this line in process_tournament with something more efficient:
#
#      tournament_sets_df = sets_df[sets_df['tournament_key'] == tournament_key]
#
# Instead, we can
# - Merge the tournament date info into ``sets_df``
# - Sort by date
# - Store the start/end positions of each tournament in a separate dictionary
# - Use tournament_sets_df = sets_df.iloc[start:end+1] instead.

# Renaming the tournament's 'key' to 'tournament_key' lets us join on one shared
# column, so the sets' own 'key' column is never suffixed or dropped.
tournament_dates = tournament_info_df[['key', 'start', 'end']].rename(columns={'key': 'tournament_key'})

# Always start from the untouched table in ``dfs`` (not from ``sets_df`` itself),
# so that re-running this cell does not merge 'start'/'end' in a second time.
sets_df = (
    dfs['sets_df']
    .merge(tournament_dates, on='tournament_key', how='left')
    .sort_values(by=['end', 'tournament_key'])  # Just in case there are tournaments with the exact same end date
)

In [None]:
# A bit of data cleanup
# TODO: Rerun!
# Keep only sets whose tournament both starts and ends inside the
# [min_date, max_date] window (both bounds inclusive).
min_date = datetime.datetime(2015, 1, 1)
max_date = datetime.datetime(2024, 12, 31)

sets_df = sets_df[
    sets_df['start'].between(min_date, max_date)
    & sets_df['end'].between(min_date, max_date)
]

In [None]:
# For now we ignore the L{n} location name.
# Each entry is [short name, medium name, long name] of a top-8 bracket location.
top_8_locations = [
        ['WSF', 'Winners Semis', 'Winners Semi-Final'],
        ['LQF', 'Losers Quarters', 'Losers Quarter-Final'],
        ['WF', 'Winners Final', 'Winners Final'],
        ['LSF', 'Losers Semis', 'Losers Semi-Final'],
        ['LF', 'Losers Final', 'Losers Final'],
        ['GF', 'Grand Final', 'Grand Final'],
        ['GFR', 'GF Reset', 'Grand Final Reset']
    ]

# Series.isin hashes its candidates, and lists are unhashable, so compare
# tuples against a set of tuples instead.
# NOTE(review): assumes each 'location_names' cell is a list/tuple of the same
# three names as above - confirm against the data. Anything else maps to False.
top_8_location_keys = {tuple(names) for names in top_8_locations}
top_8 = sets_df['location_names'].apply(
    lambda names: isinstance(names, (list, tuple)) and tuple(names) in top_8_location_keys
)

In [None]:
# Pre-computed feature table.
# NOTE(review): read_pickle can execute arbitrary code; only load project-generated files.
dataset_mini_df = pd.read_pickle(data_path + 'dataset_mini.pkl')

# Earlier experiments (de-duplicating columns, keeping the "_alt_" columns) were
# tried here; the current selection drops every column mentioning "m2" or "_alt_".
minier_cols = [col for col in dataset_mini_df.columns if not ("m2" in col or "_alt_" in col)]
dataset_minier_df = dataset_mini_df[minier_cols]

dataset_df = dataset_minier_df

# Feature groups. The *_rd and *_updates names are derived from the elo names
# by plain substring replacement.
features_default_elo = ['p1_elo', 'p2_elo']
features_all_elo = ['p1_elo', 'p2_elo', 'p1/m1/m1_elo', 'p2/m1/m1_elo', 'p1/m1_elo', 'p2/m1_elo']
features_all_rd = [name.replace('elo', 'rd') for name in features_all_elo]
features_all_updates = [name.replace('elo', 'updates') for name in features_all_elo]
features_all_eru = features_all_elo + features_all_rd + features_all_updates
features_all_everything = dataset_df.columns[:-1].tolist()  # every column except the last one

# Filter by elos that actually have nontrivial data:
# a row qualifies only if *every* update counter passes the test.
update_counts = dataset_df[features_all_updates]
quality_filter = update_counts.ge(10.0).all(axis=1)

# "Low quality": every update counter lies in [2.0, 10.0] (both ends inclusive).
low_quality_filter = (update_counts.ge(2.0) & update_counts.le(10.0)).all(axis=1)

## An observation about the data

First, we note the motivation for including ``quality_filter`` as an option for the data. With it, ELO scores appear to (mostly) follow a multivariate normal distribution, while lower-quality data tends to cluster far more around the default elo values of 1500. This suggests, at least in the case of high-quality data, that very simplistic linear models might actually yield the best performance.

In [None]:
# Column pairs plotted against each other, one pair per row of the grid.
all_elos = ['p1_elo', 'p2_elo', 'p1/m1/m1_elo', 'p2/m1/m1_elo', 'p1/m1_elo', 'p2/m1_elo']

# Left column: sets won by p1 (winner == 1.0); right column: sets won by p2 (winner == 0.0).
outcome_panels = [(1.0, 'p1 wins'), (0.0, 'p2 wins')]

# Build the grid explicitly and put the title on the figure. Calling plt.title /
# plt.xticks before any subplot exists would attach them to an implicit extra
# axes instead of to the grid.
fig, axes = plt.subplots(3, 2)
fig.suptitle('default, alt2, alt3 ELOs for quality data')

for row in range(3):
    x_col, y_col = all_elos[2 * row], all_elos[2 * row + 1]

    for col, (winner_value, label) in enumerate(outcome_panels):
        ax = axes[row, col]

        # Select the rows once per panel instead of once per plotted column.
        panel_df = dataset_df[quality_filter & (dataset_df['winner'] == winner_value)]
        ax.scatter(panel_df[x_col], panel_df[y_col], s=0.3, alpha=0.2, label=label)

        ax.set_xlim(0, 3000)
        ax.set_ylim(0, 3000)
        ax.legend()

        # Only the bottom row keeps x ticks; only the left column keeps y ticks.
        if row != 2:
            ax.set_xticks([])
        if col == 1:
            ax.set_yticks([])

plt.show()

## Testing some basic models

Before we begin, some important remarks are in order. In some sense, this is time series data, and in another sense, it is not. It somewhat showcases the evolution of player ELO ratings (and related features) over time, albeit without directly linking them to certain players. However, these ELO scores, in some sense, already take into account all of the player's past performance up to a certain point. Likewise, RD values are meant to indicate the "uncertainty" in any player's current ELO score, and at any one point in time it takes into account all of their previous games, and even how long it has been since they've last played.

That being said, it is somewhat ill-advised to shuffle the data when training. A more advanced model, in particular, might be able to recognize that a very precise ELO score shows up in some future match, and conclude that the player must have won some earlier matches (note that we update ELO scores only once a week).

Here, we begin by training some basic models that have the goal of predicting the outcome of individual sets, with no ability to look back on any past performance (single-set models). We first want to observe the impact of the following factors, and are not yet interested in serious hyperparameter tuning:

* Is it better to train only on more recent data rather than on all data up to a certain point (perhaps the game meta, average ELO, etc. shift over time)?
* Are we actually gaining anything by including all of our engineered ELO scores, rather than just the default ones?
* What is the impact of considering only "high quality" data, which has received many updates to all ELO scores?

We also note that for this single-set predictor, we train on data up to 2022 (and test on 2023) for cross-validation and hyperparameter tuning. The secondary model, which takes tournament performance into account (and which will use this single-set model), will be tuned on 2023 data and have its final performance tested on 2024 data. However, in the interest of obtaining a final performance score for the single-set predictor, it should be safely testable on 2024 data as well.

In [None]:
# Models that only use the ELO scores
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import xgboost as xgb
from tqdm.auto import tqdm

from sklearn.metrics import log_loss, accuracy_score

# Years to train on. We will test on the next year.
years = range(2017, 2022+1)

# These models work best with normally distributed data, which appears to be the case for the various ELOs
models = {'lr': LogisticRegression(penalty=None, max_iter=10000),
          'lda': LinearDiscriminantAnalysis(),
          'qda': QuadraticDiscriminantAnalysis(),
          'xgb': xgb.XGBClassifier()}

# One row per training year, one column per model. Both tables are completely
# overwritten for every (training_mode, elo_mode, data_mode) combination.
ll_scores = np.zeros(shape=(len(years), len(models)))
acc_scores = np.zeros(shape=(len(years), len(models)))

training_modes = ['past_year', 'all_years'] # Train on just the past year, or all years (starting from 2016).
elo_modes = ['default_elo', 'all_elo'] # Only the default glicko2 elo scores, or all engineered ones
data_modes = ['quality_data', 'all_data'] # Whether or not each elo has had at least 10 updates

for training_mode in training_modes:
    for elo_mode in elo_modes:
        # Feature columns depend only on the elo mode, so pick them once here.
        feature_cols = features_default_elo if elo_mode == 'default_elo' else features_all_elo

        for data_mode in data_modes:
            # `True` is a no-op when AND-ed with the date masks below.
            row_filter = quality_filter if data_mode == 'quality_data' else True

            for i, y in enumerate(tqdm(years)):
                # Note that 2015 data is probably not that good. Elo scores barely started getting accurate.
                # NOTE: It is assumed that dataset_df and sets_df share the same rows, only with different engineered features in dataset_df
                first_train_year = 2016 if training_mode == 'all_years' else y

                in_train_window = ((sets_df['start'] >= datetime.datetime(first_train_year, 1, 1)) &
                                   (sets_df['end'] <= datetime.datetime(y, 12, 31)))
                in_test_window = ((sets_df['start'] >= datetime.datetime(y + 1, 1, 1)) &
                                  (sets_df['end'] <= datetime.datetime(y + 1, 12, 31)))

                dataset_train_df = dataset_df[in_train_window & row_filter]
                dataset_test_df = dataset_df[in_test_window & row_filter]

                for j, name in enumerate(models):
                    classifier = models[name]
                    classifier.fit(dataset_train_df[feature_cols], dataset_train_df['winner'])

                    y_prob = classifier.predict_proba(dataset_test_df[feature_cols])
                    y_pred = (y_prob[:,1] >= 0.5)

                    ll_scores[i,j] = round(log_loss(dataset_test_df['winner'], y_prob), 3)
                    acc_scores[i,j] = round(100.0 * accuracy_score(dataset_test_df['winner'], y_pred), 1)

            # Report this configuration: log-loss columns first, accuracy (in %) columns after.
            print("Using " + training_mode + " and " + elo_mode + " and " + data_mode)
            score_columns = [x + "_ll" for x in models] + [x + "_acc" for x in models]
            print(pd.DataFrame(np.concatenate([ll_scores, acc_scores], axis=1),
                        index=years,
                        columns=score_columns))

Of particular interest are the latter few years, because the data is of substantially higher quality, and it will also more closely follow years 2023 and 2024.

Without any need to run any statistical tests, a simple side-by-side comparison reveals that
* In all instances, training on more recent data is slightly favourable for both linear models and XGBoost. We will stick with that from now on.
* LDA appears favourable over QDA.
* When dealing with higher-quality data, basic linear models substantially outperform XGBoost, at least without any hyperparameter tuning.

## A closer look at linear models

Here, we test the following linear models in the case of high-quality data, and explicitly low-quality data. The models we will be testing are LogisticRegression (classification), LDA, and a custom ErrorLDA model which should take into account the various RD values (roughly interpreted as a measurement error on the player ELOs, with the "true" ELOs being unknown values).

In [None]:
# More advanced models
# Compares LogisticRegression, LDA, two ErrorLDA variants and XGBoost, training on
# one year and testing on the next, separately for low- and high-quality rows.
import xgboost as xgb
import errorlda
import importlib

# Just in case we make changes to this model.
importlib.reload(errorlda)

# Years to train on. We will test on the next year.
years = range(2020, 2022+1)

models = {'lr': LogisticRegression(penalty=None, max_iter=10000),
          'lda': LinearDiscriminantAnalysis(),
          'errorlda': errorlda.ErrorLDA(),
          'errorlda_scaling': errorlda.ErrorLDA(),
          'xgb': xgb.XGBClassifier()}

# One row per training year, one column per model; overwritten for each data mode.
ll_scores = np.zeros(shape=(len(years), len(models)))
acc_scores = np.zeros(shape=(len(years), len(models)))

data_modes = ['low_quality', 'high_quality']

for data_mode in data_modes:
    for i, y in enumerate(tqdm(years)):
        # Note that 2015 data is probably not that good. Elo scores barely started getting accurate.
        # NOTE: It is assumed that dataset_df and sets_df share the same rows, only with different engineered features in dataset_df
        # Train on sets that start and end within year y ...
        dataset_train_df = dataset_df[(sets_df['start'] >= datetime.datetime(y,1,1)) &
                                    (sets_df['end'] <= datetime.datetime(y,12,31)) &
                                    (low_quality_filter if data_mode == 'low_quality' else quality_filter)]
        # ... and test on sets that start and end within year y+1, using the same quality filter.
        dataset_test_df = dataset_df[(sets_df['start'] >= datetime.datetime(y+1,1,1)) &
                                    (sets_df['end'] <= datetime.datetime(y+1,12,31)) &
                                    (low_quality_filter if data_mode == 'low_quality' else quality_filter)]
        
        for j, name in enumerate(models):
            y_prob = None
            y_pred = None

            # Basically, all of these require slightly different syntax and restriction of features
            # * ErrorLDA to include the variances (RD values)
            # * LDA to just use the the ELO values without RD values (mainly to compare to ErrorLDA)
            match name:
                case 'lr' | 'lda':
                    models[name].fit(dataset_train_df[features_all_elo], dataset_train_df['winner'])
                    y_prob = models[name].predict_proba(dataset_test_df[features_all_elo])

                case 'errorlda' | 'errorlda_scaling':
                    models[name] = errorlda.ErrorLDA() # Not sure if I've implemented .fit() to reset everything upon every new fit.

                    # Experimental pre-scaling of the RD values
                    # (currently the identity matrix, so it leaves the error matrices unchanged)
                    pre_scaler = np.diag([1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

                    # Each row's error matrix is diag(RD ** 2) @ pre_scaler, built from the RD columns.
                    models[name].fit(dataset_train_df[features_all_elo], dataset_train_df['winner'],
                                    X_train_errors=dataset_train_df[features_all_rd].apply(lambda row: np.diag(row.to_numpy() ** 2) @ pre_scaler, axis=1),
                                    error_scaling=(True if name == 'errorlda_scaling' else False)) # Note that error-scaling is slower by like a factor of 8, unfortunately
                    
                    # Literally just for numerical stability, in case some eigenvalues are near zero.
                    # This will add 1 to each eigenvalue.
                    models[name].variance += np.identity(len(features_all_rd))
                    
                    y_prob = models[name].predict_proba(dataset_test_df[features_all_elo],
                                                        X_error=dataset_test_df[features_all_rd].apply(lambda row: np.diag(row.to_numpy() ** 2) @ pre_scaler, axis=1))

                case 'xgb': # Special syntax required here.
                    # XGBoost is the only model here that receives the RD columns as ordinary features.
                    models[name].fit(dataset_train_df[features_all_elo + features_all_rd], dataset_train_df['winner'])
                    y_prob = models[name].predict_proba(dataset_test_df[features_all_elo + features_all_rd])

            # Rest of the prediction code is the same among all models    
            # Column 1 of y_prob is thresholded at 0.5 to get the hard prediction.
            y_pred = (y_prob[:,1] >= 0.5)

            ll_scores[i,j] = round(log_loss(dataset_test_df['winner'], y_prob), 3)
            acc_scores[i,j] = round(100.0 * accuracy_score(dataset_test_df['winner'], y_pred), 1)

    # Printed once per data mode: log-loss columns first, accuracy (in %) columns after.
    print("Scores involving all ELOs and RDs in " + data_mode + " mode")
    print(pd.DataFrame(np.concatenate([ll_scores, acc_scores], axis=1),
                index=years,
                columns=[x + "_ll" for x in models] + [x + "_acc" for x in models]))

## The best linear model

At least from the looks of things, it does not appear that there is any noticeable difference in accuracy and log loss with the ErrorLDA model, compared to the other two linear models. As such, considering the enormous lack of speed in training these models, we will just stick with LogisticRegression() in such special cases for now.

In [None]:
# Split the data according to the various amount of updates ("no data/low quality data/high quality data")

def get_quality_encoding(row):
    """Encode a row's update counters as a string with one digit per counter.

    For every column in ``features_all_updates`` (in that order) the digit is
    '2' when the row's value is at least 5.0 and '0' otherwise. A middle tier
    ('1' for values of at least 2.0) existed at some point but is disabled.
    """
    return ''.join(
        '2' if row[update_col] >= 5.0 else '0'
        for update_col in features_all_updates
    )

dataset_df = dataset_df.copy() # Fixes "copy of a slice" nonsense
# Row-wise apply: each row gets the digit string returned by get_quality_encoding.
dataset_df['quality_class'] = dataset_df.apply(get_quality_encoding, axis=1)

# Display the resulting frame (now including the 'quality_class' column).
dataset_df

In [None]:
# Splits off the data into different classes and applies distinct models to each
class SplitRegression:
    """Route each row to a model chosen by its "quality class" string.

    A quality class is a string with one character per feature column, in
    column order. The character '0' means "ignore this feature for this
    class"; any other character means "use it".

    * Classes with at least ``min_data`` training rows ("large") each get their
      own LogisticRegression, trained only on the features the class uses.
    * Classes that use no feature at all ("zero") share one XGBClassifier.
    * All remaining classes ("small") share another XGBClassifier.
    """

    # min_data is the minimum number of occurences of a class to split it off
    def __init__(self, min_data=10000):
        self.features = None # Basically the list of columns of the training data
        self.min_data = min_data

        self.large_classes = None
        self.small_classes = None
        self.zero_classes = None

        self.large_models = {} # A dictionary of models, one for each class with lots of data
        self.small_model = None # Just bundle together all small classes (except zero) into one group and apply a more complex model to them
        self.zero_model = None # Apply another separate model to the "basically no data" group

    def class_to_features(self, c):
        """Return the feature names that quality class ``c`` actually uses.

        The i-th character of ``c`` belongs to the i-th entry of
        ``self.features``; positions holding '0' are skipped. The character's
        *position* selects the feature - its numeric value is only a quality level.
        """
        return [self.features[i] for i, level in enumerate(c) if level != '0']

    def patch_missing_outcome(self, X_train, y_train):
        """Make sure both outcomes (0.0 and 1.0) occur in the training labels.

        For every outcome missing from ``y_train`` (this also covers an empty
        training set) one fake row with all features set to 1500.0 is appended.
        When rows are appended, the returned frame and labels get a fresh
        0..n-1 index; they are only meant to be passed to ``fit``.

        Returns
        -------
        (X_train, y_train), possibly extended by up to two fake rows.
        """
        old_len = len(X_train.index)

        y_unique = list(y_train.unique()) # Could very well be empty, or just one value. This handles both cases.
        for y in [y for y in [0.0, 1.0] if y not in y_unique]:
            print("PATCHED {0}".format(y))
            # Most canonical fake data: a single *row* (hence a one-row DataFrame)
            # with every feature at 1500.0.
            fake_row = pd.DataFrame([[1500.0] * len(X_train.columns)], columns=X_train.columns)
            X_train = pd.concat([X_train, fake_row], ignore_index=True)
            y_train = pd.concat([y_train, pd.Series([y])], ignore_index=True)

        if len(X_train.index) > old_len:
            print(len(X_train.index))

        return X_train, y_train

    def fit(self, X, y, X_class):
        """Train the per-class models.

        Parameters
        ----------
        X : DataFrame of features.
        y : Series of outcomes (0.0 / 1.0), row-aligned with ``X``.
        X_class : Series of quality-class strings, row-aligned with ``X``,
            with one character per column of ``X``.
        """
        self.features = list(X.columns)

        # Start from a clean slate so that refitting the same instance does not
        # keep models of classes that were "large" only in an earlier fit.
        self.large_models = {}

        class_counts = X_class.value_counts()

        # The zero classes, treated as separate classes. Realistically, there should only be at most one.
        self.zero_classes = [c for c in class_counts.index if self.class_to_features(c) == []]

        self.large_classes = [c for c in class_counts[class_counts >= self.min_data].index if c not in self.zero_classes]
        self.small_classes = [c for c in class_counts[class_counts < self.min_data].index if c not in self.zero_classes]

        # As we've seen, LogisticRegression does quite well on classes with lots of data.
        # We kind of assume each class has slightly different means, hence the need to train separate models,
        # and also just skip out entirely on passing it info from features with basically no info.
        for c in tqdm(self.large_classes):
            class_features = self.class_to_features(c)
            class_rows = X_class == c

            # Just in case the outcome is constant on this training subset.
            X_train, y_train = self.patch_missing_outcome(X[class_rows][class_features], y[class_rows])

            self.large_models[c] = LogisticRegression(penalty=None, max_iter=10000)
            self.large_models[c].fit(X_train, y_train)

        # The zero classes (realistically at most one of them) get lumped together and share a single model.
        # Same for the small classes. Both receive all features.
        small_class_filter = X_class.isin(self.small_classes)
        zero_class_filter = X_class.isin(self.zero_classes)

        # TODO: Tuning of hyperparameters
        self.small_model = xgb.XGBClassifier()
        self.zero_model = xgb.XGBClassifier()

        # These subsets could technically be empty, or have just one outcome.
        X_train, y_train = self.patch_missing_outcome(X[small_class_filter], y[small_class_filter])
        self.small_model.fit(X_train, y_train)

        X_train, y_train = self.patch_missing_outcome(X[zero_class_filter], y[zero_class_filter])
        self.zero_model.fit(X_train, y_train)

    def predict_proba(self, X, X_class):
        """Predict outcome probabilities, one row per row of ``X`` (same order).

        Each row is sent to the model of its quality class: the class's own
        model if the class was "large" during ``fit``, the zero model if the
        class uses no feature, and the small model otherwise (this includes
        classes never seen during training).

        Returns
        -------
        numpy.ndarray with one column per outcome, as returned by the
        underlying models' ``predict_proba``.
        """
        # Match the class labels to the rows of X by index label when the two
        # are not already row-aligned.
        if not X_class.index.equals(X.index):
            X_class = X_class.reindex(X.index)
        class_values = X_class.to_numpy()

        all_features = list(X.columns)
        proba = None

        # Feed each class's rows to its model as one block (much faster than
        # predicting row by row) and write the block back at the rows' original
        # positions, which keeps the output in the order of X.
        for c in pd.unique(class_values):
            rows = class_values == c
            class_features = self.class_to_features(c)

            if c in self.large_models:
                # Large model: trained on the restricted feature set only.
                model, columns = self.large_models[c], class_features
            elif not class_features:
                model, columns = self.zero_model, all_features
            else:
                model, columns = self.small_model, all_features

            block = np.asarray(model.predict_proba(X.loc[rows, columns]))
            if proba is None:
                proba = np.empty((len(X), block.shape[1]), dtype=float)
            proba[rows] = block

        if proba is None:
            # No rows to predict.
            return np.empty((0, 2), dtype=float)

        return proba

In [None]:
# More advanced models
import xgboost as xgb
import errorlda
import importlib

# Just in case we make changes to this model.
importlib.reload(errorlda)

# Years to train on. We will test on the next year.
years = range(2018, 2022+1)

models = {'split': SplitRegression(),
          'xgb': xgb.XGBClassifier()}

ll_scores = np.zeros(shape=(len(years), len(models)))
acc_scores = np.zeros(shape=(len(years), len(models)))



for i, y in enumerate(tqdm(years)):
    # Note that 2015 data is probably not that good. Elo scores barely started getting accurate.
    # NOTE: It is assumed that dataset_df and sets_df share the same rows, only with different engineered features in dataset_df
    dataset_train_df = dataset_df[(sets_df['start'] >= datetime.datetime(y,1,1)) &
                                (sets_df['end'] <= datetime.datetime(y,12,31))]
    dataset_test_df = dataset_df[(sets_df['start'] >= datetime.datetime(y+1,1,1)) &
                                (sets_df['end'] <= datetime.datetime(y+1,12,31))]
    
    for j, name in enumerate(models):
        y_prob = None
        y_pred = None

        # Basically, all of these require slightly different syntax and restriction of features
        # * ErrorLDA to include the variances (RD values)
        # * LDA to just use the the ELO values without RD values (mainly to compare to ErrorLDA)
        match name:
            case 'split':
                models[name].fit(dataset_train_df[features_all_elo], dataset_train_df['winner'], dataset_train_df['quality_class'])
                y_prob = models[name].predict_proba(dataset_test_df[features_all_elo], dataset_test_df['quality_class'])
            case 'xgb': # Special syntax required here.
                models[name].fit(dataset_train_df[features_all_elo + features_all_rd], dataset_train_df['winner'])
                y_prob = models[name].predict_proba(dataset_test_df[features_all_elo + features_all_rd])

        # Rest of the prediction code is the same among all models    
        y_pred = (y_prob[:,-1] >= 0.5)

        ll_scores[i,j] = round(log_loss(dataset_test_df['winner'], y_prob), 3)
        acc_scores[i,j] = round(100.0 * accuracy_score(dataset_test_df['winner'], y_pred), 1)

print("Scores involving all ELOs and RDs in " + data_mode + " mode")
print(pd.DataFrame(np.concatenate([ll_scores, acc_scores], axis=1),
            index=years,
            columns=[x + "_ll" for x in models] + [x + "_acc" for x in models]))

In [None]:
# Show the 'p1_elo' column as a single wide row (a one-column frame, transposed).
dataset_df[['p1_elo']].T