# Player match history
We add columns to sets_df that track the player vs player history. The values are stored in arrays that have a 1 in the first position if player 1 won the most recent game and a 0 if player 2 won the most recent game, the second value corresponds to the second most recent game, ect. For each set, we only look at results from the previous week or earlier.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score
from sklearn.model_selection import train_test_split  # Correct import

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

if os.path.exists('/workspace/data_2'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data_2/'
else:
    data_path = '../data/'


### Load Data


In [2]:
sets_df = pd.read_pickle(data_path + '/labelled_sets_df.pkl') 
print(sets_df.columns)
sets_df.head(3)



Index(['key_x', 'game', 'tournament_key', 'winner_id', 'loser_id', 'p1_id',
       'p2_id', 'p1_score', 'p2_score', 'valid_score', 'best_of',
       'location_names', 'bracket_name', 'bracket_order', 'set_order',
       'game_data', 'top_8', 'top_8_location_names', 'valid_top_8_bracket',
       'top_8_bracket_location_names', 'major', 'key_y', 'start', 'end',
       'start_week', 'p1_characters', 'p2_characters', 'p1_consistent',
       'p2_consistent', 'matchup_strings', 'end_week'],
      dtype='object')


Unnamed: 0,key_x,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,valid_score,...,key_y,start,end,start_week,p1_characters,p2_characters,p1_consistent,p2_consistent,matchup_strings,end_week
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,Chillin,5620,1,3,True,...,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,2017-11-26 08:48:09,151,,,,,,152
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,15634,3,2,True,...,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,2017-11-26 08:48:09,151,,,,,,152
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,1097,6126,0,3,True,...,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,2017-11-26 08:48:09,151,,,,,,152


Order the players of a set alphabetically so that each pair of players that played a set against each other has only one label. Make a dataframe with information we will need.

In [3]:
players_np = sets_df[['p1_id', 'p2_id']].to_numpy()
players_sort_np = np.sort(players_np)

set_history_df = sets_df[['start_week','end_week','start', 'end', 'p1_id', 'p2_id', 'winner_id']].copy()
set_history_df['p1/p2'] = [pair[0]+'/'+pair[1] for pair in players_np]
set_history_df['sort'] = [pair[0]+'/'+pair[1] for pair in players_sort_np]
set_history_df['same'] = (set_history_df['p1/p2'] == set_history_df['sort'])

set_history_df.head()

Unnamed: 0,start_week,end_week,start,end,p1_id,p2_id,winner_id,p1/p2,sort,same
0,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,Chillin,5620,5620,Chillin/5620,5620/Chillin,False
1,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,Aglet,15634,Aglet,Aglet/15634,15634/Aglet,False
2,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,1097,6126,6126,1097/6126,1097/6126,True
3,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,1069,Chu,1069,1069/Chu,1069/Chu,True
4,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,Jerry,Rishi,Rishi,Jerry/Rishi,Jerry/Rishi,True


In [4]:
set_history_df.sort_values(['start_week'])
set_history_df.head()

Unnamed: 0,start_week,end_week,start,end,p1_id,p2_id,winner_id,p1/p2,sort,same
0,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,Chillin,5620,5620,Chillin/5620,5620/Chillin,False
1,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,Aglet,15634,Aglet,Aglet/15634,15634/Aglet,False
2,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,1097,6126,6126,1097/6126,1097/6126,True
3,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,1069,Chu,1069,1069/Chu,1069/Chu,True
4,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,Jerry,Rishi,Rishi,Jerry/Rishi,Jerry/Rishi,True


Filter ``sets_df`` to sets between players who play at least two sets against each other.

In [5]:
counts = set_history_df['sort'].value_counts()
multiple_sets = list(counts[counts > 1].index)
filtered_sets_df = set_history_df[set_history_df['sort'].isin(multiple_sets)]
print(f"There are {filtered_sets_df.shape[0]:,} sets between players who played against each other more than once and make up {filtered_sets_df.shape[0]/sets_df.shape[0]:.0%} of the sets.")
filtered_sets_df.head(5)

There are 824,986 sets between players who played against each other more than once and make up 46% of the sets.


Unnamed: 0,start_week,end_week,start,end,p1_id,p2_id,winner_id,p1/p2,sort,same
2,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,1097,6126,6126,1097/6126,1097/6126,True
6,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,5620,1069,5620,5620/1069,1069/5620,False
10,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,1097,5620,5620,1097/5620,1097/5620,True
19,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,15634,6126,15634,15634/6126,15634/6126,True
24,151,152,2017-11-26 08:05:11,2017-11-26 08:48:09,1097,1069,1097,1097/1069,1069/1097,False


In [6]:
mask = 1-(filtered_sets_df['start_week']==0) & (filtered_sets_df['end_week']>1)
mask = mask.astype(bool)
filtered_sets_df = filtered_sets_df.loc[mask]
filtered_sets_df.shape


(824690, 10)

In [7]:
def compute_results(filtered_sets_df):
    # Ensure the DataFrame is sorted by ranking_date_index for chronological processing
    filtered_sets_df = filtered_sets_df.sort_values(by='start_week').copy()
    
    # Initialize a dictionary to store the results for each row
    results_dict = {}

    # Group by the 'sort' column
    for _, group_df in tqdm(filtered_sets_df.groupby('sort')):
        # Initialize a list to store past results for this group
        results_so_far = []
        
        # Iterate through the rows of this group
        for idx, row in group_df.iterrows():
            # Determine the winner in terms of p1_id
            if row['same']:
                # If p1_id and p2_id are in alphabetical order
                result = 1 if row['winner_id'] == row['p1_id'] else 0
            else:
                # If p1_id and p2_id are not it alphabetical order
                result = 1 if row['winner_id'] == row['p2_id'] else 0
            
            # Add the results so far for this row
            results_dict[idx] = results_so_far.copy()
            
            # Update the results for future rows
            results_so_far.insert(0, result)  # Add the result to the front (most recent match first)
    
    # Add the results column to the DataFrame
    filtered_sets_df['results_sort'] = filtered_sets_df.index.map(results_dict)
    
    return filtered_sets_df

# Apply the function to your DataFrame
filtered_sets_df = compute_results(filtered_sets_df)


filtered_sets_df.head()

  0%|          | 0/223421 [00:00<?, ?it/s]

Unnamed: 0,start_week,end_week,start,end,p1_id,p2_id,winner_id,p1/p2,sort,same,results_sort
804,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,19573,6189,6189,19573/6189,19573/6189,True,[]
806,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,4465,6189,4465,4465/6189,4465/6189,True,[]
805,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,1000,6189,6189,1000/6189,1000/6189,True,[]
801,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,12870,5956,5956,12870/5956,12870/5956,True,[]
798,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,Beat,12870,12870,Beat/12870,12870/Beat,False,[]


In [8]:
def compute_results(filtered_sets_df):
    # Ensure the DataFrame is sorted by start_week for chronological processing
    filtered_sets_df = filtered_sets_df.sort_values(by='start_week').copy()
    
    # Initialize a dictionary to store the results for each row
    results_dict = {}

    # Group by the 'sort' column
    for _, group_df in tqdm(filtered_sets_df.groupby('sort')):
        # Initialize a list to store past results for this group
        results_so_far = []
        
        # Iterate through the rows of this group in chronological order
        for idx, row in group_df.iterrows():
            # Filter results_so_far to include only sets with end_week < current start_week
            results_so_far = [
                result for result_idx, result in zip(results_dict.keys(), results_so_far)
                if filtered_sets_df.loc[result_idx, 'end_week'] < row['start_week']
            ]
            
            # Determine the winner in terms of p1_id
            if row['same']:
                # If p1_id and p2_id are in alphabetical order
                result = 1 if row['winner_id'] == row['p1_id'] else 0
            else:
                # If p1_id and p2_id are not in alphabetical order
                result = 1 if row['winner_id'] == row['p2_id'] else 0
            
            # Add the results so far for this row
            results_dict[idx] = results_so_far.copy()
            
            # Update the results for future rows
            results_so_far.insert(0, result)  # Add the result to the front (most recent match first)
    
    # Add the results column to the DataFrame
    filtered_sets_df['results_sorted'] = filtered_sets_df.index.map(results_dict)
    
    return filtered_sets_df

# Apply the function to your DataFrame
filtered_sets_df = compute_results(filtered_sets_df)

# Display the updated DataFrame
print(filtered_sets_df.head())


  0%|          | 0/223421 [00:00<?, ?it/s]

     start_week  end_week               start                 end     p1_id  \
804           1         2 2015-01-11 14:16:13 2015-01-13 01:02:28     19573   
741           1         2 2015-01-11 14:16:13 2015-01-13 01:02:28     19573   
715           1         2 2015-01-11 14:16:13 2015-01-13 01:02:28  NamiNami   
717           1         2 2015-01-11 14:16:13 2015-01-13 01:02:28   Fauster   
725           1         2 2015-01-11 14:16:13 2015-01-13 01:02:28      4465   

     p2_id winner_id           p1/p2            sort   same results_sort  \
804   6189      6189      19573/6189      19573/6189   True           []   
741   1000      1000      19573/1000      1000/19573  False           []   
715  12870     12870  NamiNami/12870  12870/NamiNami  False           []   
717  19573     19573   Fauster/19573   19573/Fauster  False           []   
725  Eagle      4465      4465/Eagle      4465/Eagle   True           []   

    results_sorted  
804             []  
741             []  
715  

In [9]:
# Convert the 'results' column to a column of NumPy arrays
filtered_sets_df['results_sorted'] = filtered_sets_df['results_sorted'].apply(np.array)
filtered_sets_df['results'] = filtered_sets_df['results_sorted'].copy()

# Swap the 1s and 0s if the order of p1 and p2 are not alphabetical.
filtered_sets_df.loc[(filtered_sets_df['same'] == False), 'results'] = filtered_sets_df.loc[(filtered_sets_df['same'] == False), 'results'].apply(lambda x: 1-x)

# Display the updated DataFrame
filtered_sets_df.head()

Unnamed: 0,start_week,end_week,start,end,p1_id,p2_id,winner_id,p1/p2,sort,same,results_sort,results_sorted,results
804,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,19573,6189,6189,19573/6189,19573/6189,True,[],[],[]
741,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,19573,1000,1000,19573/1000,1000/19573,False,[],[],[]
715,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,NamiNami,12870,12870,NamiNami/12870,12870/NamiNami,False,[],[],[]
717,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,Fauster,19573,19573,Fauster/19573,19573/Fauster,False,[],[],[]
725,1,2,2015-01-11 14:16:13,2015-01-13 01:02:28,4465,Eagle,4465,4465/Eagle,4465/Eagle,True,[],[],[]


In [10]:
# Copy the original DataFrame
sets_with_player_history_df = sets_df.copy()

# Identify rows where players have history
filtered_sets_df['players_have_history'] = filtered_sets_df['results'].apply(lambda x: len(x) > 0)

# Update `players_have_history` column in `sets_with_player_history_df`
sets_with_player_history_df['players_have_history'] = False
sets_with_player_history_df.loc[filtered_sets_df.index[filtered_sets_df['players_have_history']], 'players_have_history'] = True

# Rename columns
col_rename = {'sort': '(p1/p2)_sorted', 'same': '(p1/p2)_was_sorted'}
filtered_sets_df = filtered_sets_df.rename(columns=col_rename)

# Define columns to merge
col_to_merge = ['(p1/p2)_sorted', '(p1/p2)_was_sorted', 'results_sorted', 'results']

# Merge the two DataFrames
sets_with_player_history_df = pd.merge(
    sets_with_player_history_df,
    filtered_sets_df[col_to_merge],
    how='left',
    left_index=True,
    right_index=True
)

# Display the updated DataFrame
sets_with_player_history_df.head()


Unnamed: 0,key_x,game,tournament_key,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,valid_score,...,p2_characters,p1_consistent,p2_consistent,matchup_strings,end_week,players_have_history,(p1/p2)_sorted,(p1/p2)_was_sorted,results_sorted,results
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),5620,Chillin,Chillin,5620,1,3,True,...,,,,,152,False,,,,
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),Aglet,15634,Aglet,15634,3,2,True,...,,,,,152,False,,,,
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),6126,1097,1097,6126,0,3,True,...,,,,,152,False,1097/6126,True,[],[]
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),1069,Chu,1069,Chu,3,0,True,...,,,,,152,False,,,,
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),Rishi,Jerry,Jerry,Rishi,1,3,True,...,,,,,152,False,,,,


In [11]:
# index = sets_with_player_history_df[sets_with_player_history_df['players_have_history']==True].index
# # Create a list of empty arrays, one for each index
# empty_arrays = [[] for _ in range(len(index))]
# # Assign the list of empty arrays to the column
# sets_with_player_history_df.loc[index, 'results_sorted'] = empty_arrays

In [12]:
index = sets_with_player_history_df[sets_with_player_history_df['players_have_history'] == False].index

# Assign an empty numpy array to each row in 'results_sorted'
for i in index:
    sets_with_player_history_df.at[i, 'results_sorted'] = np.array([])

# Assign an empty numpy array to each row in 'results_sorted'
for i in index:
    sets_with_player_history_df.at[i, 'results'] = np.array([])

In [None]:
def history_cols(results):
    past_10 = np.full(10,.5)
    past_10[:min(results.shape[0],10)] = results[:10]
    return past_10

tqdm.pandas()

# Apply the history_cols function to generate the 10 most recent results
sets_with_player_history_df[['result_1', 'result_2', 'result_3', 'result_4', 'result_5',
                              'result_6', 'result_7', 'result_8', 'result_9', 'result_10']] = (
    sets_with_player_history_df['results'].progress_apply(history_cols).progress_apply(pd.Series)
)

# Display the updated DataFrame
print(sets_with_player_history_df.head())


    

  0%|          | 0/1795681 [00:00<?, ?it/s]

       key_x   game                             tournament_key winner_id  \
0  104675843  melee  mdva-invitational-2017-(challonge-mirror)      5620   
1  104675844  melee  mdva-invitational-2017-(challonge-mirror)     Aglet   
2  104675845  melee  mdva-invitational-2017-(challonge-mirror)      6126   
3  104675846  melee  mdva-invitational-2017-(challonge-mirror)      1069   
4  104675847  melee  mdva-invitational-2017-(challonge-mirror)     Rishi   

  loser_id    p1_id  p2_id  p1_score  p2_score  valid_score  ...  result_1  \
0  Chillin  Chillin   5620         1         3         True  ...       0.5   
1    15634    Aglet  15634         3         2         True  ...       0.5   
2     1097     1097   6126         0         3         True  ...       0.5   
3      Chu     1069    Chu         3         0         True  ...       0.5   
4    Jerry    Jerry  Rishi         1         3         True  ...       0.5   

  result_2 result_3 result_4 result_5 result_6  result_7 result_8  result_

In [22]:
sets_with_player_history_df[sets_with_player_history_df['results'].apply(lambda x: x.shape[0]>3)][['result_1', 'result_2', 'result_3', 'result_4', 'result_5',
                              'result_6', 'result_7', 'result_8', 'result_9', 'result_10']]

Unnamed: 0,result_1,result_2,result_3,result_4,result_5,result_6,result_7,result_8,result_9,result_10
228,0.0,0.0,1.0,0.0,0.0,0.5,0.5,0.5,0.5,0.5
230,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5
231,1.0,1.0,1.0,0.0,0.5,0.5,0.5,0.5,0.5,0.5
234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...
1795667,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.5,0.5,0.5
1795671,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,0.5
1795672,1.0,1.0,0.0,1.0,1.0,1.0,0.5,0.5,0.5,0.5
1795676,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.5


In [21]:
sets_with_player_history_df.to_pickle(data_path + 'sets_with_results_df.pkl')




In [None]:
top_player_id = {
    'aMSa': '1021',
    'Cody': '19554',
    'Mang0': '1000',
    'Zain': '6126',
    'Armada': '6189',
    'Wizzrobe': '1028',
    'Mew2King': '1003',
    'PPMD': '1002',
    'Hungrybox': '1004',
    'Plup': '15990',
    'Axe': '16342'
}

In [None]:
df = filtered_sets_df[filtered_sets_df['sort'] == (top_player_id['Mang0'] + '/' + top_player_id['Hungrybox'])]

overall = df['results_sort'][1:].apply(lambda x: np.sum(x) / x.shape[0]) 
running = overall.copy()
running[9:] = df['results_sort'][10:].apply(lambda x: np.sum(x[:10]) / 10) 
x = np.arange(0,overall.shape[0])
plt.plot(x, running, label = 'Mang0 Previous 10', alpha=.7)
plt.plot(x, overall, label = 'Mang0 Lifetime', alpha=.7)

plt.ylim([0,1])
# Add a horizontal line at y=0
plt.axhline(y=.5, color='gray', linestyle='--', linewidth=1)
plt.title("Mang0's Win Rate vs Hungrybox")
plt.legend()


KeyError: 'sort'

In [None]:
df = filtered_sets_df[filtered_sets_df['sort'] == (top_player_id['Mang0'] + '/' + top_player_id['Armada'])]

overall = df['results_sort'][1:].apply(lambda x: np.sum(x) / x.shape[0]) 
running = overall.copy()
running[9:] = df['results_sort'][10:].apply(lambda x: np.sum(x[:10]) / 10) 
x = np.arange(0,overall.shape[0])
plt.plot(x, running, label = 'Mang0 Previous 10', alpha=.7)
plt.plot(x, overall, label = 'Mang0 Lifetime', alpha=.7)

plt.ylim([0,1])
# Add a horizontal line at y=0
plt.axhline(y=.5, color='gray', linestyle='--', linewidth=1)
plt.title("Mang0's Win Rate vs Armada")
plt.legend()


KeyError: 'sort'

In [None]:
df = filtered_sets_df[filtered_sets_df['sort'] == (top_player_id['Cody'] + '/' + top_player_id['Zain'])]

overall = df['results_sort'][1:].apply(lambda x: np.sum(x) / x.shape[0]) 
running = overall.copy()
running[9:] = df['results_sort'][10:].apply(lambda x: np.sum(x[:10]) / 10) 
x = np.arange(0,overall.shape[0])
plt.plot(x, running, label = 'Cody Previous 10', alpha=.7)
plt.plot(x, overall, label = 'Cody Lifetime', alpha=.7)

plt.ylim([0,1])
# Add a horizontal line at y=0
plt.axhline(y=.5, color='gray', linestyle='--', linewidth=1)
plt.title("Cody's Win Rate vs Zain")
plt.legend()

KeyError: 'sort'

In [None]:
df = filtered_sets_df[filtered_sets_df['sort'] == (top_player_id['Mang0'] + '/' + top_player_id['aMSa'])]

overall = df['results_sort'][1:].apply(lambda x: np.sum(x) / x.shape[0]) 
running = overall.copy()
running[9:] = df['results_sort'][10:].apply(lambda x: np.sum(x[:10]) / 10) 
x = np.arange(0,overall.shape[0])
plt.plot(x, running, label = 'Mang0 Previous 10', alpha=.7)
plt.plot(x, overall, label = 'Mang0 Lifetime', alpha=.7)

plt.ylim([0,1])
# Add a horizontal line at y=0
plt.axhline(y=.5, color='gray', linestyle='--', linewidth=1)
plt.title("Mang0's Win Rate vs aMSa")
plt.legend()

KeyError: 'sort'