In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
from joblib import Parallel, delayed

import sqlite3
import sys
import time
import math
import tqdm
import datetime
import os
import pickle
import random

from glicko2 import Player

# Choose the data directory: use the workspace mount when it exists,
# otherwise fall back to the relative data folder.
data_path = '/workspace/data_2/' if os.path.exists('/workspace/data_2') else '../data/'

In [None]:
# Load the sets DataFrame from the data directory.
# NOTE(review): pickle files should only be loaded from trusted sources.
sets_df = pd.read_pickle(data_path + 'sets_with_results_df.pkl')
# Show which columns are available.
sets_df.columns

In [None]:
# A bit of data cleanup: keep only sets whose start AND end both fall inside
# [min_date, max_date] (bounds inclusive).
min_date = datetime.datetime(2015, 1, 1)
max_date = datetime.datetime(2024, 12, 31)

starts_in_range = (sets_df['start'] >= min_date) & (sets_df['start'] <= max_date)
ends_in_range = (sets_df['end'] >= min_date) & (sets_df['end'] <= max_date)

# Take an explicit copy: later cells add feature columns to sets_df, and assigning
# columns to a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
sets_df = sets_df[starts_in_range & ends_in_range].copy()
sets_df

## Generate features

In [None]:
# Name of the target (label) column.
# Feature column names are collected separately, in the per-section lists defined in
# the feature-adding cells (features_matchup, features_default, features_pc, features_mains).
output = 'winner'

In [None]:
# Target label: 1.0 when the winner is p1, 0.0 otherwise (i.e. p2 won).
sets_df[output] = sets_df['winner_id'].eq(sets_df['p1_id']).astype(float)

In [None]:
# Head-to-head history: the past 10 matches between p1 and p2 and who won each
# (0.5 where there is no data). The source columns result_1..result_10 are renamed
# to matchup_1..matchup_10.
features_matchup = ['matchup_' + str(n) for n in range(1, 10+1)]
rename_mapper = {'result_' + str(n): 'matchup_' + str(n) for n in range(1, 10+1)}

sets_df.rename(columns=rename_mapper, inplace=True)
sets_df.columns

### "Default" ELO/RD features

In [None]:
# Overall ELO/RD as features
# Both frames are read by get_info as df.loc[date, player_id]: rows are dates,
# columns are player ids.
# NOTE(review): pickle files should only be loaded from trusted sources.
player_ratings_df = pd.read_pickle(data_path + 'overall_players_ranking_new_weekly.pkl')
player_rds_df = pd.read_pickle(data_path + 'overall_players_rds_new_weekly.pkl')

# Might as well compute a (mostly accurate) number of updates from the ratings dataframe
# This isn't 100% accurate, as sometimes (especially if default elo playing default elo)
# the elo might not change from round to round.
#
# Vectorized: flag every cell whose rating differs from the row directly above it
# (the first row has no predecessor and stays 0), then accumulate the flags down
# each column to obtain a running update count per player.
rating_values = player_ratings_df.to_numpy()
rating_changed = np.zeros(rating_values.shape, dtype=np.int64)
rating_changed[1:] = rating_values[1:] != rating_values[:-1]

player_updates_df = pd.DataFrame(
    rating_changed,
    index=player_ratings_df.index,
    columns=player_ratings_df.columns,
).cumsum()

# Drop the temporaries so they do not keep large arrays alive.
del rating_values
del rating_changed


# Names of the columns added to sets_df by the lookups further down in this cell.
features_default = ['p1_default_elo', 'p2_default_elo', 'p1_default_rd', 'p2_default_rd', 'p1_default_updates', 'p2_default_updates']
# Snapshot dates of the ratings frame (its row index), as a plain list.
dates = list(player_ratings_df.index)

# Efficiency purposes. We are assuming that constant intervals are used.
# The spacing is taken from the first two index entries only; get_info relies on
# every snapshot being initial_date plus a whole multiple of interval.
initial_date = dates[0]
interval = dates[1] - dates[0]

def get_info(df, player_id, start_date):
    """Return df's entry for player_id at the latest snapshot not after start_date.

    df: frame read as df.loc[date, player_id].
    player_id: column label of the player.
    start_date: start date of the specific set (tournament).

    The snapshot date is computed arithmetically from the module-level
    initial_date / interval / dates instead of searching the index, and is capped
    at the last available snapshot.
    """
    whole_intervals = int((start_date - initial_date) / interval)
    snapshot_date = min(initial_date + whole_intervals * interval, dates[-1])
    return df.loc[snapshot_date, player_id]

def _default_feature(stat_df, id_col):
    """Look up, for every set, the stat_df value of the player in id_col at the set's start."""
    return sets_df.apply(lambda row: get_info(stat_df, row[id_col], row['start']), axis=1)

# Rating, rating deviation and update count of both players as of each set's start.
sets_df['p1_default_elo'] = _default_feature(player_ratings_df, 'p1_id')
sets_df['p2_default_elo'] = _default_feature(player_ratings_df, 'p2_id')

sets_df['p1_default_rd'] = _default_feature(player_rds_df, 'p1_id')
sets_df['p2_default_rd'] = _default_feature(player_rds_df, 'p2_id')

sets_df['p1_default_updates'] = _default_feature(player_updates_df, 'p1_id')
sets_df['p2_default_updates'] = _default_feature(player_updates_df, 'p2_id')

# The lookup tables are huge in memory and no longer needed; release them.
del player_ratings_df, player_rds_df, player_updates_df

### Number of times each player has played each character

In [None]:
# Weekly grid used for the character-usage counts, starting on 2015-01-01.
initial_date = datetime.datetime(2015,1,1)
interval = datetime.timedelta(weeks=1)

# Last grid date: the number of weeks from initial_date to the latest set end,
# rounded up to a whole week.
end_date = initial_date + math.ceil((sets_df['end'].max() - initial_date) / interval) * interval

# Assumes game_data_extractor.ipynb was run
# NOTE(review): pickle files should only be loaded from trusted sources.
game_data = pd.read_pickle(data_path + 'individual_game_data.pkl')

In [None]:
# Character popularity: a Series indexed by character whose value is the number of
# games in which that character appeared (as winner or loser). value_counts returns
# it sorted from most to least played.
all_characters = pd.concat(
    [game_data['winner_char'], game_data['loser_char']],
    ignore_index=True,
).value_counts()

In [None]:
# Player/character combos, i.e. '<player_id>/<character>' keys.
# Built with vectorized string concatenation instead of a row-wise apply, which is
# very slow on a per-game table.
game_data['winner_pc'] = game_data['winner_id'] + '/' + game_data['winner_char']
game_data['loser_pc'] = game_data['loser_id'] + '/' + game_data['loser_char']

# What week number this is, for example: the number of whole weeks from initial_date
# to the game's end, rounded UP, so a game is counted at the first weekly boundary
# at or after it ended.
game_data['end_index'] = np.ceil((game_data['end'] - initial_date) / interval).astype(int)
game_data['end_date'] = initial_date + game_data['end_index'] * interval
# Duplicate of end_date that stays available inside groupby('end_date').apply(...,
# include_groups=False), where the grouping column itself is excluded.
game_data['end_date_copy'] = game_data['end_date'].copy()


In [None]:
# Build the weekly date index arithmetically (no reference dataframe needed):
# every grid date from initial_date up to and including end_date.
n_intervals = (end_date - initial_date) // interval
dates = [initial_date + step * interval for step in range(n_intervals + 1)]

pc_combos = list(set(list(game_data['winner_pc']) + list(game_data['loser_pc'])))

# Usage table: one row per weekly date, one column per player/character combo,
# holding how often that combo was played. Everything starts at zero.
character_usage_df = pd.DataFrame(0, index=dates, columns=pc_combos)

In [None]:
def update_row(df):
    """Write one date's player/character play counts into character_usage_df.

    df: the rows of game_data belonging to a single 'end_date' group.
    Side effect: overwrites, in the module-level character_usage_df row for that
    date, the entries of every player/character combo that occurs in the group.
    Returns None.
    """
    # All rows of a group share the same date; it is read from the copy column
    # because the grouping column is excluded (include_groups=False).
    # NOTE(review): .loc[date] raises KeyError if the date is not in
    # character_usage_df's index -- confirm game_data has no games outside the grid.
    date = df.iloc[0]['end_date_copy']

    # Each game contributes one appearance for the winner's combo and one for the loser's.
    pc_series = pd.concat([df['winner_pc'], df['loser_pc']], ignore_index=True)

    # Deprecation-proofing
    temp = character_usage_df.loc[date].copy()
    temp.update(pc_series.value_counts())
    
    character_usage_df.loc[date] = temp
    
# Called for its side effect on character_usage_df only; the return value is not used.
game_data.groupby('end_date').apply(update_row, include_groups=False)

In [None]:
# Turn the per-date counts into running totals up to and including each date.
# NOTE: this cell is not idempotent -- running it a second time accumulates the
# already-accumulated totals again.
character_usage_df = character_usage_df.cumsum()
character_usage_df

In [None]:
# Newest grid date that data may be taken from: the set's start, rounded down to a
# whole number of intervals since initial_date.
sets_df['start_index'] = sets_df['start'].map(lambda start: int((start - initial_date) / interval))
sets_df['start_date'] = sets_df['start_index'].map(lambda n_intervals: initial_date + n_intervals * interval)

In [None]:
# The output labels are identical for every row, so they are built once up front.
p1c_final_index = ['p1' + '_' + char + '_count' for char in all_characters.index]
p2c_final_index = ['p2' + '_' + char + '_count' for char in all_characters.index]

features_pc = p1c_final_index + p2c_final_index

def get_char_count(row):
    """Return, for one set, how often each player had played each character.

    row: a row of sets_df providing 'p1_id', 'p2_id' and 'start_date'.
    Returns a Series labelled p1c_final_index + p2c_final_index; combos that have no
    column in character_usage_df keep the value 0.
    """
    usage_at_start = character_usage_df.loc[row['start_date']]

    per_player = []
    for id_col, final_index in (('p1_id', p1c_final_index), ('p2_id', p2c_final_index)):
        # Start from zeros keyed by '<player_id>/<character>', then overwrite with
        # the known totals; the labels are swapped for the final names afterwards.
        counts = pd.Series(0, index=[row[id_col] + '/' + char for char in all_characters.index])
        counts.update(usage_at_start)
        counts.index = final_index
        per_player.append(counts)

    return pd.concat(per_player)

sets_df = pd.concat([sets_df, sets_df.apply(get_char_count, axis=1)], axis=1)

## Compute most-used characters for each individual player (mains)

In [None]:
# Per-set character usage counts, split by player (one column per character).
p1_char_usage_df = sets_df[p1c_final_index]
p2_char_usage_df = sets_df[p2c_final_index]

# Preview the p1 table.
p1_char_usage_df

In [None]:
def get_mains(row, player):
    """Return a player's two most-used characters and how often each was played.

    row: Series of usage counts whose labels look like '<player>_<character>_count'.
    player: the label prefix, e.g. 'p1' or 'p2'.

    Returns a Series labelled '<player>_m1', '<player>_m2', '<player>_m1_usage',
    '<player>_m2_usage' (character names first, then their counts).

    Raises IndexError if row has fewer than two entries.
    """
    index = [player + '_' + x for x in ['m1', 'm2', 'm1_usage', 'm2_usage']]

    # A stable sort keeps equally-used characters in their original column order, so
    # ties (for example an all-zero row) are resolved deterministically rather than
    # depending on the sorting algorithm's internals.
    row_sorted = row.sort_values(ascending=False, kind='stable')

    # First two entries are the actual mains (label stripped down to the character name)
    main_names = [
        row_sorted.index[i].removeprefix(player + '_').removesuffix('_count')
        for i in range(0, 2)
    ]

    # Next two are how many times they have been played
    main_usages = [row_sorted.iloc[i] for i in range(0, 2)]

    return pd.Series(main_names + main_usages, index=index)


# Mains and their usage counts for each set, computed per player.
p1_mains_df = p1_char_usage_df.apply(get_mains, axis=1, args=('p1',))
p2_mains_df = p2_char_usage_df.apply(get_mains, axis=1, args=('p2',))

In [None]:
# Working table for the per-matchup elo lookups: player ids and set start, plus the
# mains of both players.
player_mains_df = pd.concat(
    [sets_df[['p1_id', 'p2_id', 'start']], p1_mains_df, p2_mains_df],
    axis=1,
)

# TODO: Compute this properly!
start_date = datetime.datetime(2015,1,1)
interval = datetime.timedelta(weeks=1)

# Newest possible date to pull matchup info from: the set's start rounded down to a
# whole number of intervals since start_date.
player_mains_df['start_index'] = player_mains_df['start'].map(lambda start: int((start - start_date) / interval))
player_mains_df['newest_date'] = player_mains_df['start_index'].map(lambda n_intervals: start_date + n_intervals*interval)

player_mains_df

In [None]:
# Names of the mains-based feature columns. The lookup cells below append to this
# list, so re-running one of them without re-running this cell adds duplicates.
features_mains = ['p1_m1_usage', 'p1_m2_usage', 'p2_m1_usage', 'p2_m2_usage']

## player/char/char elos and updates, alt2 variant

In [None]:
# NOTE(review): pickle files should only be loaded from trusted sources.
alt2_df = pd.read_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt2.pkl')
alt2_rds_df = pd.read_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt2_rds.pkl')

# How many times each column was updated so far
# Keep everything float to not change the data type
# Row 0 counts as "no update"; each later row gets 1.0 where its value differs from
# the row before it (rows are compared by position) and 0.0 otherwise. The cumsum
# then turns those flags into a running count per column.
alt2_updates_df = alt2_df.copy()
alt2_updates_df.iloc[0] = 0.0
alt2_updates_df.iloc[1:] = (alt2_updates_df.iloc[1:].reset_index(drop=True) != alt2_updates_df.iloc[:-1].reset_index(drop=True)).astype(float).values
alt2_updates_df = alt2_updates_df.cumsum()

# NOTE(review): player_mains_df['newest_date'] was already computed in an earlier
# cell from a hard-coded 2015-01-01 weekly grid, so these two assignments do not
# change it -- confirm that alt2_df's index lies on that same grid.
start_date = alt2_df.index[0]
interval = alt2_df.index[1] - alt2_df.index[0]

In [None]:
# TODO: Pull number of times this specific matchup happened as well?

# For every (player, own main, opponent main) combination, add a column holding the
# lookup key '<player_id>/<own main>/<opponent main>'. The elos are fetched from
# these keys afterwards.
matchups = []

for player_num in [1,2]:
    opponent_num = 3-player_num # swap 1 and 2
    player_prefix = 'p' + str(player_num)
    opponent_prefix = 'p' + str(opponent_num)

    player = player_mains_df[player_prefix + '_id']

    for j in [1,2]: # index of the player's main
        player_main = player_mains_df[player_prefix + '_m' + str(j)]

        for k in [1,2]: # index of the opponent's main
            opponent_main = player_mains_df[opponent_prefix + '_m' + str(k)]

            col_name = player_prefix + '/m' + str(j) + '/m' + str(k)

            matchups.append(col_name)
            player_mains_df[col_name] = player + '/' + player_main + '/' + opponent_main

player_mains_df


In [None]:
# NOTE: This cell in particular likes to crash if you interrupt it.
#       Do NOT interrupt it.

def get_entry(row, matchup, df, default_value):
    """Fetch the value of df for this row's lookup key at the row's 'newest_date'.

    row: provides 'newest_date' and, under the label `matchup`, the column key.
    Returns default_value when df has no column for that key.
    """
    column = row[matchup]
    if column in df.columns:
        return df.at[row['newest_date'], column]
    return default_value

def _lookup_alt2(key_col, source_df, default_value):
    """Look up source_df for every row's key in key_col at the row's newest_date."""
    return player_mains_df[['newest_date', key_col]].apply(
        lambda row: get_entry(row, key_col, source_df, default_value), axis=1
    )

# p1/m1/m1, etc...: elo, rating deviation and update count for every matchup key.
for matchup in tqdm.tqdm(matchups):
    player_mains_df[matchup + '_alt2_elo'] = _lookup_alt2(matchup, alt2_df, 1500.0)
    features_mains.append(matchup + '_alt2_elo')

    player_mains_df[matchup + '_alt2_rd'] = _lookup_alt2(matchup, alt2_rds_df, 350.0)
    features_mains.append(matchup + '_alt2_rd')

    player_mains_df[matchup + '_alt2_updates'] = _lookup_alt2(matchup, alt2_updates_df, 0.0)
    features_mains.append(matchup + '_alt2_updates')

In [None]:
# Release the alt2 lookup tables; they are not used after this point.
del alt2_df, alt2_rds_df, alt2_updates_df

In [None]:
# Preview the working table after the alt2 columns were added.
player_mains_df

## player/char/char elos and updates, alt

In [None]:
# NOTE(review): pickle files should only be loaded from trusted sources.
alt_df = pd.read_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt.pkl')

# How many times each column was updated so far
# Keep everything float to not change the data type
# Row 0 counts as "no update"; each later row gets 1.0 where its value differs from
# the row before it (rows are compared by position) and 0.0 otherwise. The cumsum
# then turns those flags into a running count per column.
alt_updates_df = alt_df.copy()
alt_updates_df.iloc[0] = 0.0
alt_updates_df.iloc[1:] = (alt_updates_df.iloc[1:].reset_index(drop=True) != alt_updates_df.iloc[:-1].reset_index(drop=True)).astype(float).values
alt_updates_df = alt_updates_df.cumsum()

# NOTE(review): player_mains_df['newest_date'] was already computed in an earlier
# cell from a hard-coded 2015-01-01 weekly grid, so these two assignments do not
# change it -- confirm that alt_df's index lies on that same grid.
start_date = alt_df.index[0]
interval = alt_df.index[1] - alt_df.index[0]

In [None]:
# TODO: Pull number of times this specific matchup happened as well?

# For every (player, own main, opponent main) combination, add a column holding the
# lookup key '<player_id>/<own main>/<opponent main>'. The elos are fetched from
# these keys afterwards.
matchups = []
reference_matchups = [] # player/same_char/same_char to compare against

for player_num in [1,2]:
    opponent_num = 3-player_num # swap 1 and 2
    player_prefix = 'p' + str(player_num)
    opponent_prefix = 'p' + str(opponent_num)

    player = player_mains_df[player_prefix + '_id']

    for j in [1,2]: # index of the player's main
        player_main = player_mains_df[player_prefix + '_m' + str(j)]

        for k in [1,2]: # index of the opponent's main
            opponent_main = player_mains_df[opponent_prefix + '_m' + str(k)]

            col_name = player_prefix + '/m' + str(j) + '/m' + str(k)

            matchups.append(col_name)
            player_mains_df[col_name] = player + '/' + player_main + '/' + opponent_main

# opponent/omain/omain as a reference for player/pmain/omain:
# for each player and each of that player's two mains, add a '<col_name>_ref' column
# holding the mirror-matchup key '<player_id>/<main>/<main>'.
for i in [1,2]: # player_i
    for j in [1,2]: # main_j
        opponent_num = 3-i # swap 1 and 2

        opponent = player_mains_df['p' + str(opponent_num) + '_id']
        # The main must be selected with this loop's own index j. Reading the leftover
        # loop variable k from the matchup loop above would always pick main 2, making
        # the m1 reference columns duplicates of the m2 ones.
        opponent_main = player_mains_df['p' + str(opponent_num) + '_m' + str(j)]

        col_name = 'p' + str(opponent_num) + '/m' + str(j) + '/m' + str(j)

        reference_matchups.append(col_name)
        player_mains_df[col_name + '_ref'] = opponent + '/' + opponent_main + '/' + opponent_main

player_mains_df


In [None]:
# NOTE: This cell in particular likes to crash if you interrupt it.
#       Do NOT interrupt it.

def get_entry(row, matchup, df, default_value):
    """Return df[key] at the row's 'newest_date', where key = row[matchup].

    Falls back to default_value when key is not one of df's columns.
    """
    key = row[matchup]
    return df.at[row['newest_date'], key] if key in df.columns else default_value

def _lookup_alt(key_col, source_df, default_value):
    """Look up source_df for every row's key in key_col at the row's newest_date."""
    return player_mains_df[['newest_date', key_col]].apply(
        lambda row: get_entry(row, key_col, source_df, default_value), axis=1
    )

# p1/m1/m1, etc...: elo and update count for every matchup key.
for matchup in tqdm.tqdm(matchups):
    player_mains_df[matchup + '_alt_elo'] = _lookup_alt(matchup, alt_df, 1500.0)
    features_mains.append(matchup + '_alt_elo')

    player_mains_df[matchup + '_alt_updates'] = _lookup_alt(matchup, alt_updates_df, 0.0)
    features_mains.append(matchup + '_alt_updates')

# p1/same_m1/same_m1, etc...: the same two lookups for the '<name>_ref' key columns.
for reference_matchup in tqdm.tqdm(reference_matchups):
    player_mains_df[reference_matchup + '_alt_ref_elo'] = _lookup_alt(reference_matchup + '_ref', alt_df, 1500.0)
    features_mains.append(reference_matchup + '_alt_ref_elo')

    player_mains_df[reference_matchup + '_alt_ref_updates'] = _lookup_alt(reference_matchup + '_ref', alt_updates_df, 0.0)
    features_mains.append(reference_matchup + '_alt_ref_updates')

In [None]:
# Release the alt lookup tables; they are not used after this point.
del alt_df, alt_updates_df

In [None]:
# Preview the working table after the alt columns were added.
player_mains_df

## player/char "global" elos (also called "alt3")

In [None]:
# NOTE(review): pickle files should only be loaded from trusted sources.
pc_rankings_df = pd.read_pickle(data_path + 'player_char_overall_rankings_weekly.pkl')
pc_rds_df = pd.read_pickle(data_path + 'player_char_overall_rds_weekly.pkl')

# Running count of how often each column's ranking changed: row 0 counts as
# "no update"; each later row gets 1.0 where the ranking differs from the row before
# it (rows are compared by position) and 0.0 otherwise; cumsum accumulates the flags.
pc_rankings_updates_df = pc_rankings_df.copy()
pc_rankings_updates_df.iloc[0] = 0.0
pc_rankings_updates_df.iloc[1:] = (pc_rankings_df.iloc[1:].reset_index(drop=True) != pc_rankings_df.iloc[:-1].reset_index(drop=True)).astype(float).values
pc_rankings_updates_df = pc_rankings_updates_df.cumsum()

pc_rankings_updates_df

In [None]:
# For every (player, main) combination, add a column holding the lookup key
# '<player_id>/<main>'. The elos are fetched from these keys afterwards.
pc_combos = []

for player_num in [1,2]:
    player_prefix = 'p' + str(player_num)
    player = player_mains_df[player_prefix + '_id']

    for j in [1,2]: # index of the player's main
        player_main = player_mains_df[player_prefix + '_m' + str(j)]

        col_name = player_prefix + '/m' + str(j)

        pc_combos.append(col_name)
        player_mains_df[col_name] = player + '/' + player_main

player_mains_df

In [None]:
def get_entry(row, matchup, df, default_value):
    """Look up df at (row['newest_date'], row[matchup]).

    When df has no column named row[matchup], default_value is returned instead.
    """
    lookup_key = row[matchup]
    if lookup_key not in df.columns:
        return default_value
    return df.at[row['newest_date'], lookup_key]

def _lookup_alt3(key_col, source_df, default_value):
    """Look up source_df for every row's key in key_col at the row's newest_date."""
    return player_mains_df[['newest_date', key_col]].apply(
        lambda row: get_entry(row, key_col, source_df, default_value), axis=1
    )

# p1/m1, p1/m2, etc...: elo, rating deviation and update count per player/main key.
for pc_combo in tqdm.tqdm(pc_combos):
    player_mains_df[pc_combo + '_alt3_elo'] = _lookup_alt3(pc_combo, pc_rankings_df, 1500.0)
    features_mains.append(pc_combo + '_alt3_elo')

    player_mains_df[pc_combo + '_alt3_rd'] = _lookup_alt3(pc_combo, pc_rds_df, 350.0)
    features_mains.append(pc_combo + '_alt3_rd')

    player_mains_df[pc_combo + '_alt3_updates'] = _lookup_alt3(pc_combo, pc_rankings_updates_df, 0.0)
    features_mains.append(pc_combo + '_alt3_updates')

In [None]:
# Preview the working table after the alt3 columns were added.
player_mains_df

In [None]:
# Release the player/character lookup tables; they are not used after this point.
del pc_rankings_df, pc_rankings_updates_df, pc_rds_df

## Generate final dataset from above features

In [None]:
# Including both main1 and main2 for the mains-based data (alt, alt2, alt3) is a lot,
# so every feature whose name contains 'm2' is dropped.
# 'alt' also does not seem to be that good of a feature, so names containing '_alt_'
# are excluded for now ('_alt2_' and '_alt3_' names do not match and are kept).
features_mains_reduced = [
    name for name in features_mains
    if 'm2' not in name and '_alt_' not in name
]

In [None]:
# Might be useful info for when we are training the single-set data
# These sets_df columns are carried into the final dataset next to the feature
# columns; 'end' is also the column the dataset is sorted by.
extras = ['start', 'end', 'p1_score', 'p2_score', 'valid_score', 'best_of', 'top_8', 'top_8_location_names', 'valid_top_8_bracket', 'top_8_bracket_location_names']

In [None]:
# Assemble the final dataset column-wise: default and head-to-head features, the
# reduced mains-based features (first main only), the extra columns and the target;
# then order the rows by set end date.
dataset_df = pd.concat(
    [
        sets_df[features_default + features_matchup],
        player_mains_df[features_mains_reduced],
        sets_df[extras],
        sets_df[[output]],
    ],
    axis=1,
).sort_values(by='end')
dataset_df

In [None]:
# Persist the assembled dataset in the data directory.
dataset_df.to_pickle(data_path + 'dataset_full.pkl')

In [None]:
# List the columns of the saved dataset as a final check.
dataset_df.columns