In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
from joblib import Parallel, delayed

import sqlite3
import sys
import time
import math
#import tqdm
from tqdm.auto import tqdm
import datetime
import os
import pickle
from pathlib import Path

from glicko2 import Player
import multiprocessing

# Register tqdm's pandas integration (used for progress bars on pandas apply calls).
tqdm.pandas()

# Use the '/workspace/data' directory when it exists; otherwise fall back to
# the repository-relative data directory. Both values end with '/' because
# later cells build file names by plain string concatenation.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'

## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - For each column whose values are all strings, we try to parse every value with `json.loads()`, which turns JSON-formatted strings (e.g. lists or dictionaries) into the corresponding Python objects. If any value in the column fails to parse, the column is left unchanged.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [2]:
# Function to get the table names
def get_table_names(conn):
    """Return the names of all tables listed in ``sqlite_master`` as a list.

    Parameters
    ----------
    conn : open database connection accepted by ``pd.read_sql``.
    """
    table_listing = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table';",
        conn,
    )
    name_column = table_listing['name']
    return name_column.tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    """Load every table of the database into a DataFrame.

    Parameters
    ----------
    conn : open SQLite connection.

    Returns
    -------
    dict
        Maps ``"<table name>_df"`` to the DataFrame holding that table.

    Notes
    -----
    A column is JSON-decoded only when *every* value in it is a string and
    *every* value parses with ``json.loads``; otherwise the column is left
    untouched. Be aware that this also converts columns of plain JSON
    scalars (e.g. the string ``"123"`` becomes the integer ``123``).
    """
    table_names = get_table_names(conn)
    dataframes = {}

    for table in table_names:
        # Quote the identifier (doubling embedded quotes) so table names
        # containing spaces, keywords or quotes still form valid SQL.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Only all-string columns are candidates for JSON decoding.
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # Every value is a str here, so no null check is needed.
                    df[col] = df[col].apply(json.loads)
                except (json.JSONDecodeError, TypeError):
                    # At least one value is not valid JSON: keep the raw strings.
                    pass

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Prefer the cached pickle of all tables when it exists.
if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that you produced yourself.
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = data_path + "melee_player_database.db"

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would only surface later as a confusing KeyError.
    if not os.path.exists(path):
        raise FileNotFoundError(f"SQLite database not found: {path}")

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Close the connection even if loading fails
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # Tournament start/end are stored as Unix timestamps in seconds.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [3]:
# Missing best_of values are replaced with 0 so the column can be cast to int.
dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int) 

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games in which the player has used that character. (We verify this for Zain below.)

In [4]:
# Bind the players table to a short name and preview its first rows.
players_df = dfs['players_df']
players_df.head()

Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [5]:
# Bind the ranking table to a short name and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head()

Unnamed: 0,game,ranking_name,priority,region,seasons,tournaments,icon
0,melee,SSBMRank,0,world,"[2015, 2016, 2017, 2018, 2019]",[],miom


In [6]:
# Bind the ranking-seasons table to a short name and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

Unnamed: 0,game,ranking_name,season,start,end,total,by_id,by_placing,final,name
0,melee,SSBMRank,2015,1420070400,1451606399,100,"{'6189': 1, '1004': 2, '4465': 3, '1000': 4, '...","{'1': '6189', '2': '1004', '3': '4465', '4': '...",0,
1,melee,SSBMRank,2016,1451606400,1483228799,100,"{'6189': 1, '1004': 2, '1000': 3, '1003': 4, '...","{'1': '6189', '2': '1004', '3': '1000', '4': '...",0,
2,melee,SSBMRank,2017,1483228800,1514764799,100,"{'1004': 1, '6189': 2, '1000': 3, '1003': 4, '...","{'1': '1004', '2': '6189', '3': '1000', '4': '...",0,
3,melee,SSBMRank,2018,1514793600,1546329600,100,"{'1004': 1, '6189': 2, '4465': 3, '15990': 4, ...","{'1': '1004', '2': '6189', '3': '4465', '4': '...",0,
4,melee,SSBMRank,2019,1546329600,1577836800,100,"{'1004': 1, '4465': 2, '1000': 3, '16342': 4, ...","{'1': '1004', '2': '4465', '3': '1000', '4': '...",0,


In [7]:
sets_df = dfs['sets_df']

# Fraction of sets whose game_data list is non-empty (mean of a boolean mask).
share_with_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0).mean()
print(f"{share_with_game_data:.1%} of sets have some game data")

sets_df.head()

32.9% percent of sets have some game data)


Unnamed: 0,key,game,tournament_key,winner_id,p1_id,p2_id,p1_score,p2_score,location_names,bracket_name,bracket_order,set_order,best_of,game_data
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,5620,Chillin,3,1,"[R1, Round 1, Round 1]",,1,A,5,[]
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,2,3,"[R1, Round 1, Round 1]",,1,B,5,[]
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,6126,1097,3,0,"[R1, Round 1, Round 1]",,1,C,5,[]
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,1069,0,3,"[R1, Round 1, Round 1]",,1,D,5,[]
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Rishi,1,3,"[R1, Round 1, Round 1]",,1,E,5,[]


In [8]:
# Bind the tournament-info table to a short name and preview its first rows.
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head()

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,country,state,city,entrants,placings,losses,bracket_types,online,lat,lng
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,US,VA,Fall's Church,10,"[[Rishi, 1], [15634, 3], [6126, 4], [Chu, 8], ...",{},b'{}',0,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,US,MI,Ann Arbor,92,[],{},b'{}',0,,
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,,,,20,"[[1000, 1], [6126, 2], [4107, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,,,,20,"[[6126, 1], [4107, 2], [1000, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,,,,20,"[[6126, 1], [3359, 2], [19554, 3], [4107, 3], ...",{},b'{}',1,0.0,0.0


In [9]:
# Code optimization by Dan
# Basically we want to replace this line in process_tournament with something more efficient:
#
#      tournament_sets_df = sets_df[sets_df['tournament_key'] == tournament_key]
#
# Instead, we can
# - Merge the tournament date info into ``sets_df``
# - Sort by date
# - Store the start/end positions of each tournament in a separate dictionary
# - Use tournament_sets_df = sets_df.iloc[start:end+1] instead.

# Always start from the untouched table in ``dfs`` so that re-running this
# cell does not merge the date columns a second time (which would produce
# start_x/start_y columns and break the sort below).
# The tournament key column is renamed before merging so that both sides
# share one join column and no key_x/key_y suffixes are created.
tournament_dates = tournament_info_df[['key', 'start', 'end']].rename(columns={'key': 'tournament_key'})

sets_df = dfs['sets_df'].merge(tournament_dates, on='tournament_key', how='left')
sets_df = sets_df.sort_values(by=['end', 'tournament_key']) # Just in case there are tournaments with the exact same end date

In [10]:
# A bit of data cleanup
# TODO: Rerun!
min_date = datetime.datetime(2015, 1, 1)
max_date = datetime.datetime(2024, 12, 31)

# Keep only sets whose tournament both starts and ends inside
# [min_date, max_date] (bounds inclusive).
starts_in_window = sets_df['start'].between(min_date, max_date)
ends_in_window = sets_df['end'].between(min_date, max_date)

sets_df = sets_df[starts_in_window & ends_in_window]

In [11]:
# Example of game data. List of dictionaries.
# Show the game_data of the first set that has any.
has_game_data = sets_df['game_data'].apply(lambda games: games != [])
sets_df[has_game_data].iloc[0]['game_data']

[{'winner_id': 4643,
  'loser_id': 4532,
  'winner_score': 2,
  'loser_score': 0,
  'winner_char': 'melee/jigglypuff',
  'loser_char': 'melee/zelda',
  'stage': None},
 {'winner_id': 4643,
  'loser_id': 4532,
  'winner_score': 2,
  'loser_score': 0,
  'winner_char': 'melee/jigglypuff',
  'loser_char': 'melee/zelda',
  'stage': None}]

## A variation on the other character matchup

In short, the other character-vs-character matchup that I made relies on collecting all matches of one mirror matchup (for example, fox vs fox) and computing rankings for those, to see how fox players compare to each other. It then uses those Elo ratings to compute the Elo of someone playing, for example, yoshi against all fox players.

This variation is different. It uses the *general* elo of every player to compute character matchup elos instead.

In [12]:
# Assumes dataset_generation/game_data_extractor.ipynb was run
# (that notebook is expected to have written this pickle into data_path).
game_data_path = os.path.join(data_path, 'individual_game_data.pkl')
game_data_df = pd.read_pickle(game_data_path)

In [13]:
# Every character that appears on either side of a game.
# (Both the p1_char and the p2_char columns must be included; a character
# that only ever shows up as p2 would otherwise be missed.)
all_characters = list(
    set(game_data_df['p1_char'].unique()) | set(game_data_df['p2_char'].unique())
)
all_characters

['drmario',
 'ganondorf',
 'pichu',
 'bowser',
 'iceclimbers',
 'zelda',
 'younglink',
 'samus',
 'yoshi',
 'captainfalcon',
 'sheik',
 'ness',
 'random',
 'roy',
 'mewtwo',
 'falco',
 'luigi',
 'fox',
 'peach',
 'mario',
 'kirby',
 'marth',
 'link',
 'pikachu',
 'mrgameandwatch',
 'donkeykong',
 'jigglypuff']

## Compute the rankings for player/char/char

In [14]:
# Good for testing
# 1021/yoshi - aMSa
# 19554/fox - Cody

# First, we build the set of rounds with one player and one opponent.
# Each original row in game_data_df will contribute twice, with each player swapping the above roles.

# For convenience, we can restrict our attention to players who actually have a reasonable amount of data.
# NOTE(review): the count below is per player id (p1_id/p2_id), across all
# characters -- not per player/character combination.
MIN_GAMES = 1

game_players_series = pd.concat([game_data_df['p1_id'], game_data_df['p2_id']])
total_games = game_players_series.value_counts()
regular_players = total_games[total_games >= MIN_GAMES]

print("{0} regular player/char combos".format(len(regular_players.index)))


def _rounds_from_side(games, player_side, opponent_side, eligible_ids):
    """Return the games seen from one side's point of view.

    Parameters
    ----------
    games : DataFrame with p1_*/p2_* columns plus winner_id, start and end.
    player_side, opponent_side : 'p1' or 'p2'; which side plays which role.
    eligible_ids : ids a player must belong to for the row to be kept.

    Returns
    -------
    DataFrame with columns player_id, opponent_id, player_char,
    opponent_char, outcome (1 if the player won, else 0), start, end.
    """
    # Vectorized membership test instead of a Python-level apply per row.
    kept = games[games[player_side + '_id'].isin(eligible_ids)]
    renamed = kept.rename(columns={
        player_side + '_id': 'player_id',
        opponent_side + '_id': 'opponent_id',
        player_side + '_char': 'player_char',
        opponent_side + '_char': 'opponent_char',
    })
    renamed = renamed.assign(
        outcome=(renamed['winner_id'] == renamed['player_id']).astype(int)
    )
    return renamed[['player_id', 'opponent_id', 'player_char', 'opponent_char', 'outcome', 'start', 'end']]


# Lots of memory usage. Let's just reduce down to what we need.
rounds_df = game_data_df[['p1_id', 'p2_id', 'winner_id', 'p1_char', 'p2_char', 'winner_char', 'start', 'end']]

# Each row should contribute twice, swapping 'player' and 'opponent'.
# The two halves are temporaries, so they are released once concatenated.
rounds_df = pd.concat(
    [
        _rounds_from_side(rounds_df, 'p1', 'p2', regular_players.index),
        _rounds_from_side(rounds_df, 'p2', 'p1', regular_players.index),
    ],
    ignore_index=True,
)

33427 regular player/char combos


In [15]:
# Display the combined per-player rounds table (two rows per original game).
rounds_df

Unnamed: 0,player_id,opponent_id,player_char,opponent_char,outcome,start,end
0,4643,4532,jigglypuff,zelda,1,2015-04-11 21:30:00,2015-04-13 05:00:00
1,4643,4532,jigglypuff,zelda,1,2015-04-11 21:30:00,2015-04-13 05:00:00
2,4005,3349,mario,marth,1,2015-04-11 21:30:00,2015-04-13 05:00:00
3,4005,3349,mario,marth,1,2015-04-11 21:30:00,2015-04-13 05:00:00
4,4608,4624,zelda,pikachu,1,2015-04-11 21:30:00,2015-04-13 05:00:00
...,...,...,...,...,...,...,...
3170387,148873,1565365,marth,yoshi,0,2024-06-04 21:30:00,2024-06-26 03:59:00
3170388,55592,512704,falco,captainfalcon,0,2024-06-04 21:30:00,2024-06-26 03:59:00
3170389,55592,512704,falco,captainfalcon,0,2024-06-04 21:30:00,2024-06-26 03:59:00
3170390,55592,2496254,falco,fox,0,2024-06-04 21:30:00,2024-06-26 03:59:00


In [16]:
# Overall (character-independent) weekly ratings and rating deviations.
# As used below: the index holds the snapshot dates and the columns are player ids.
player_ratings_df = pd.read_pickle(os.path.join(data_path, 'overall_players_ranking_new_weekly.pkl'))
player_rds_df = pd.read_pickle(os.path.join(data_path, 'overall_players_rds_new_weekly.pkl'))

# Returns Rating, RD, and a bool for (actually found = True, default values = False)
def get_opponent_elo_rd(row, ratings_df=None, rds_df=None):
    """Look up the opponent's overall rating and RD at the time a game started.

    Parameters
    ----------
    row : mapping/Series with 'opponent_id' and 'start' (a timestamp).
    ratings_df : optional DataFrame of ratings (index: evenly spaced snapshot
        dates, columns: player ids). Defaults to the global ``player_ratings_df``.
    rds_df : optional DataFrame of rating deviations with the same layout.
        Defaults to the global ``player_rds_df``.

    Returns
    -------
    tuple
        ``(rating, rd, found)``. When the opponent is unknown, the start date
        is missing, or the start date precedes the first snapshot, the
        defaults ``(1500.0, 350.0, False)`` are returned.
    """
    if ratings_df is None:
        ratings_df = player_ratings_df
    if rds_df is None:
        rds_df = player_rds_df

    default = (1500.0, 350.0, False)
    opponent_id = row['opponent_id']
    start = row['start']
    snapshot_dates = ratings_df.index

    # Not in our main list of players
    if opponent_id not in ratings_df.columns:
        return default

    # No usable date, or no old enough data
    if len(snapshot_dates) == 0 or pd.isna(start) or snapshot_dates[0] > start:
        return default

    if len(snapshot_dates) < 2:
        # A single snapshot: there is no interval to measure, use that snapshot.
        newest_index = 0
    else:
        # We can take advantage of the fact that the index is always in regular intervals.
        first_date = snapshot_dates[0]
        interval = snapshot_dates[1] - first_date
        newest_index = int((start - first_date) / interval)

        # 'start' might be well beyond the dates we have data on.
        # In this case, just use the newest piece of data.
        newest_index = min(newest_index, len(snapshot_dates) - 1)

    # Select the opponent's column first and then the row position; this avoids
    # materialising a whole row across every player column on each call.
    return (ratings_df[opponent_id].iloc[newest_index],
            rds_df[opponent_id].iloc[newest_index],
            True)

# Look up (rating, rd, found) for every row, then unpack the tuples into
# three separate columns.
opponent_lookup = pd.DataFrame(
    rounds_df.apply(get_opponent_elo_rd, axis=1).tolist(),
    index=rounds_df.index,
    columns=['opponent_rating', 'opponent_rd', 'opponent_found'],
)

for lookup_column in opponent_lookup.columns:
    rounds_df[lookup_column] = opponent_lookup[lookup_column]

rounds_df

Unnamed: 0,player_id,opponent_id,player_char,opponent_char,outcome,start,end,opponent_rating,opponent_rd,opponent_found
0,4643,4532,jigglypuff,zelda,1,2015-04-11 21:30:00,2015-04-13 05:00:00,1500.000000,350.000000,True
1,4643,4532,jigglypuff,zelda,1,2015-04-11 21:30:00,2015-04-13 05:00:00,1500.000000,350.000000,True
2,4005,3349,mario,marth,1,2015-04-11 21:30:00,2015-04-13 05:00:00,1500.000000,350.000000,True
3,4005,3349,mario,marth,1,2015-04-11 21:30:00,2015-04-13 05:00:00,1500.000000,350.000000,True
4,4608,4624,zelda,pikachu,1,2015-04-11 21:30:00,2015-04-13 05:00:00,1500.000000,350.000000,True
...,...,...,...,...,...,...,...,...,...,...
3170387,148873,1565365,marth,yoshi,0,2024-06-04 21:30:00,2024-06-26 03:59:00,1198.733900,76.008231,True
3170388,55592,512704,falco,captainfalcon,0,2024-06-04 21:30:00,2024-06-26 03:59:00,1840.190400,45.418449,True
3170389,55592,512704,falco,captainfalcon,0,2024-06-04 21:30:00,2024-06-26 03:59:00,1840.190400,45.418449,True
3170390,55592,2496254,falco,fox,0,2024-06-04 21:30:00,2024-06-26 03:59:00,1505.854529,37.149149,True


In [17]:
# Compute weekly intervals to group by, quite easily.
start_date = datetime.datetime(2015,1,1)
# Spacing between consecutive snapshots of the overall ratings table.
interval = player_ratings_df.index[1] - player_ratings_df.index[0]

# "Copy of a slice" nonsense, this should fix it.
rounds_df = rounds_df.copy()

# Round up, as this computes the date that receives this elo update.
# Vectorized: elapsed time since start_date, in units of `interval`, rounded up.
rounds_df['end_index'] = np.ceil((rounds_df['end'] - start_date) / interval).astype('int64')

In [18]:
# Group by player/character (pc_combo), opponent character, week index
# pc_combo looks like "<player_id>/<player_char>".
rounds_df['pc_combo'] = rounds_df['player_id'] + '/' + rounds_df['player_char']

group_keys = ['pc_combo', 'opponent_char', 'end_index']
list_columns = ['opponent_rating', 'opponent_rd', 'outcome']

# One row per (pc_combo, opponent_char, week); each value column becomes the
# list of all games that fall into that group.
grouped_df = (
    rounds_df[group_keys + list_columns]
    .groupby(group_keys)
    .agg({column_name: list for column_name in list_columns})
    .reset_index()
)

# Collapse the two identifier columns into "<player_id>/<player_char>/<opponent_char>".
grouped_df['player_char_char'] = grouped_df['pc_combo'] + '/' + grouped_df['opponent_char']
grouped_df = grouped_df.drop(columns=['pc_combo', 'opponent_char'])
grouped_df

Unnamed: 0,end_index,opponent_rating,opponent_rd,outcome,player_char_char
0,15,"[1500.0, 1500.0, 1500.0, 1500.0]","[350.0, 350.0, 350.0, 350.0]","[1, 1, 1, 1]",1000/captainfalcon/captainfalcon
1,283,"[1982.4160953710762, 1982.4160953710762]","[65.51124507872034, 65.51124507872034]","[0, 0]",1000/captainfalcon/captainfalcon
2,116,"[1931.2751663011863, 1931.2751663011863, 1931....","[125.24224169278143, 125.24224169278143, 125.2...","[0, 0, 1, 1, 1]",1000/captainfalcon/falco
3,488,"[2368.460869466365, 2368.460869466365]","[39.39171475598802, 39.39171475598802]","[0, 1]",1000/captainfalcon/falco
4,116,"[1932.5226544016818, 1932.5226544016818, 1932....","[71.06657629446525, 71.06657629446525, 71.0665...","[0, 1, 0, 0]",1000/captainfalcon/fox
...,...,...,...,...,...
1015755,357,"[1500.0, 1500.0, 1500.0]","[350.0, 350.0, 350.0]","[0, 1, 1]",999888/sheik/jigglypuff
1015756,320,"[1570.589221451713, 1570.589221451713]","[43.39844123266524, 43.39844123266524]","[0, 0]",999888/sheik/marth
1015757,321,"[1319.6323690365448, 1319.6323690365448, 1319....","[145.77126593308802, 145.77126593308802, 145.7...","[0, 0, 0, 0, 0, 0]",999888/sheik/marth
1015758,320,"[1500.0, 1500.0, 1500.0]","[350.0, 350.0, 350.0]","[1, 1, 1]",999888/sheik/mrgameandwatch


In [19]:
# Actually start computing elos for player/char/char combos.
# TODO: This is REALLY slow. Optimize!

# To deal with include_groups=True being deprecated and disallowed soon,
# let's just create a copy of this column
grouped_df['pcc_duplicate'] = grouped_df['player_char_char']

player_char_char_elos = {}

# We will create a single table. Index is dates, columns is player/pchar/ochar.
unique_players = list(grouped_df['player_char_char'].unique())

initial_date = datetime.datetime(2015, 1, 1)
end_date = datetime.datetime(2024, 12, 31) # TODO: Properly compute this instead of just guessing
interval = datetime.timedelta(weeks=1)

# Bugfix stuff: ratings are clamped from below and RDs from above.
MIN_ELO = 500.0
MAX_RD = 350.0

# Week number -> date, for every whole interval that fits in
# [initial_date, end_date] (week 0 is initial_date itself).
week_count = (end_date - initial_date) // interval + 1
dates = {week: initial_date + week * interval for week in range(week_count)}

# Convenient store of glicko objects
glicko_objects = {player: Player() for player in unique_players}

# Pre-allocating the dataframe for maximum efficiency.
# NOTE: this rebinds player_ratings_df, replacing the overall ratings table
# loaded earlier with the per player/char/char table.
# A scalar is broadcast over the full (dates x combos) shape; passing a single
# row of data together with a multi-row index raises a shape ValueError.
player_ratings_df = pd.DataFrame(1500.0, index=list(dates.values()), columns=unique_players)

def compute_pcc_elo(x):
    """Compute the weekly rating history of one player/char/char combo.

    Parameters
    ----------
    x : DataFrame holding the rows of ``grouped_df`` for a single combo, with
        columns 'pcc_duplicate', 'end_index', 'opponent_rating',
        'opponent_rd' and 'outcome' (the last three are lists per week).

    Side effects
    ------------
    Advances the combo's entry in the global ``glicko_objects`` through every
    week in ``dates`` and writes the resulting ratings into the combo's column
    of the global ``player_ratings_df``. Returns None.
    """
    # player/char/char
    pcc = x.iloc[0]['pcc_duplicate']

    # More easily allow for getting the week number
    x = x.set_index('end_index')

    glicko_object = glicko_objects[pcc]

    # More efficient to keep track of where every occurring week number is (as an iloc).
    # -1 marks a week without games.
    weeknum_to_iloc = [-1]*len(dates)
    for position, week_number in enumerate(x.index):
        weeknum_to_iloc[week_number] = position

    # Rating after each week, in the same order as the rows of player_ratings_df.
    weekly_ratings = []

    for index in dates:
        position = weeknum_to_iloc[index]
        if position == -1:
            glicko_object.did_not_compete()
        else:
            # Fetch the week's row once instead of once per field.
            week_row = x.iloc[position]
            glicko_object.update_player(week_row['opponent_rating'],
                                        week_row['opponent_rd'],
                                        week_row['outcome'])

        # Bugfix stuff
        if glicko_object.getRating() < MIN_ELO:
            glicko_object.setRating(MIN_ELO)

        if glicko_object.getRd() > MAX_RD:
            glicko_object.setRd(MAX_RD)

        weekly_ratings.append(glicko_object.getRating())

    # One column write instead of one scalar .loc write per week.
    # Row k of player_ratings_df is the date initial_date + k*interval, which is
    # exactly the k-th entry of `dates`, so the positions line up.
    player_ratings_df[pcc] = weekly_ratings

# Parallelization, cause this be SLOW
hyperthreading = True
n_jobs = multiprocessing.cpu_count() // 2 if hyperthreading else multiprocessing.cpu_count()
# cpu_count() // 2 is 0 on a single-CPU machine; always produce at least one split.
n_jobs = max(1, n_jobs)

# Split into separate dataframes and save in separate files.
# This lets us easily run a multiprocessing script later on them.
unique_pcc_combos = list(grouped_df['player_char_char'].unique())

# First n-1 lists will have this length.
# Last one will have the remainder.
# This isn't the most even split, but it gets the job done.
default_length = len(unique_pcc_combos) // n_jobs

# List of lists to filter by
split_pcc_combos = [
    unique_pcc_combos[i*default_length : (i+1)*default_length]
    for i in range(n_jobs - 1)
]
split_pcc_combos.append(unique_pcc_combos[(n_jobs - 1)*default_length : ])

for i, split in enumerate(split_pcc_combos):
    # (named in_split rather than `filter`, which would shadow the builtin)
    in_split = grouped_df['player_char_char'].isin(split)
    split_grouped_df = grouped_df[in_split]
    split_grouped_df.to_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt2_temp_' + str(i) + '.pkl')

# grouped_df.groupby('player_char_char').progress_apply(compute_pcc_elo, include_groups=False)

In [None]:
# Good test: aMSa, as yoshi, vs fox
# Columns of this table are "<player_id>/<player_char>/<opponent_char>" strings.
player_ratings_df['1021/yoshi/fox']

In [23]:
# Persist the player/char/char rating table next to the other data files.
pcc_ratings_path = os.path.join(data_path, 'char_vs_char_player_rankings_weekly_alt2.pkl')
player_ratings_df.to_pickle(pcc_ratings_path)