# Predict winner with matchup data.
Here we try to use what we learned in winner_prediction_with_matchup_rating.ipynb to improve the prediction on a large subset of the sets. We will try various things:
- We will try restricting to the most popular 5-7 characters.
- We will try to find a subset of the sets with character data of players who played a fixed character against each other (or maybe two characters). Then we expand our training/test set to sets between those players without character data. We can train a model that uses only the top 5, 6, and 7 characters separately, and then apply the models to the players that appear in each set. So we use the model with the top 5 characters when both players only play top-5 characters against each other. We use the model with the top 6 characters only when at least one of the players plays puff, and the top 7 model only when one player plays peach.
- We will train the models on the score.
We will test the model to see how it does on sets where we do not have the information about characters and sets.
- We could test randomizing the player order.
- We could try players who only play one character (aMSa, Axe)

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, log_loss, root_mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  # Correct import
from prettytable import PrettyTable

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

# Pick the data directory: use the /workspace mount when it exists,
# otherwise fall back to the repository-relative folder.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'

In [74]:
# Load the labelled players table and preview its first rows.
players_pickle_path = data_path + '/labelled_data/players_df.pkl'
players_df = pd.read_pickle(players_pickle_path)
players_df.head(n=5)

Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [75]:
# Load the labelled sets table, report the share of sets that carry any
# per-game data, then show the table's shape and a preview.
sets_pickle_path = data_path + '/labelled_data/sets_df_3.pkl'
sets_df = pd.read_pickle(sets_pickle_path)

# True for sets whose 'game_data' entry is non-empty.
has_game_data = sets_df['game_data'].apply(lambda games: len(games) > 0)
share_with_game_data = sets_df[has_game_data].shape[0] / sets_df.shape[0]

print(f"{share_with_game_data:0.01%} percent of sets have some game data")
print(sets_df.shape)
sets_df

32.9% percent of sets have some game data
(1795681, 27)


Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,charcter_data,charcter_change,p1_characters,p2_characters
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,5620,Chillin,5620,Chillin,3,1,...,False,,False,,False,151.0,False,,,
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,Aglet,15634,15634,Aglet,2,3,...,False,,False,,False,151.0,False,,,
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,6126,1097,6126,1097,3,0,...,False,,False,,False,151.0,False,,,
3,104675846,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,1069,Chu,Chu,1069,0,3,...,False,,False,,False,151.0,False,,,
4,104675847,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,Rishi,Jerry,Jerry,Rishi,1,3,...,False,,False,,False,151.0,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795676,gg__76279646,melee,dice-dance-17__1v1-melee,2024-06-23 13:00:00,908884,3196854,908884,3196854,3,1,...,True,LN,True,LN_B,False,494.0,False,,,
1795677,gg__76279647,melee,dice-dance-17__1v1-melee,2024-06-23 13:00:00,2791218,495503,2791218,495503,3,1,...,True,LQF,True,LQF_A,False,494.0,False,,,
1795678,gg__76279648,melee,dice-dance-17__1v1-melee,2024-06-23 13:00:00,908884,2407110,2407110,908884,0,3,...,True,LQF,True,LQF_B,False,494.0,False,,,
1795679,gg__76279649,melee,dice-dance-17__1v1-melee,2024-06-23 13:00:00,2791218,908884,2791218,908884,3,0,...,True,LSF,True,LSF,False,494.0,False,,,


In [76]:
# Load the labelled tournament metadata, print its shape, and preview three rows.
tournament_info_pickle_path = data_path + '/labelled_data/tournament_info_df.pkl'
tournament_info_df = pd.read_pickle(tournament_info_pickle_path)
print(tournament_info_df.shape)
tournament_info_df.head(n=3)

(39675, 37)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets,major
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,...,,,,,,,,,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,Ginger,"[(32, True), (62, True), (77, False), (164, Tr...","[(39, True), (65, True), (78, False), (165, Tr...","[(47, True), (69, True), (80, False), (159, Tr...","[(40, True), (66, True), (79, True), (85, Fals...","[(28, True), (60, True), (76, True), (84, True)]","[(36, True), (64, True), (78, True), (85, True)]","[(44, True), (68, True), (80, True), (86, True)]","[(52, True), (72, True), (82, True), (87, True)]",
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,...,,,,,,,,,,True


In [77]:
# Load the weekly overall player ratings and show three randomly chosen rows.
overall_ranking_pickle_path = data_path + 'overall_players_ranking_new_weekly.pkl'
overall_players_ranking_new_weekly_df = pd.read_pickle(overall_ranking_pickle_path)
overall_players_ranking_new_weekly_df.sample(n=3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2017-04-13,1500.0,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104
2021-05-13,1305.337441,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1206.449862,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104
2019-11-28,1500.0,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104


In [78]:
# Load the weekly "rds" table that accompanies the overall player ratings
# (presumably rating deviations - TODO confirm) and show three random rows.
overall_rds_pickle_path = data_path + 'overall_players_rds_new_weekly.pkl'
overall_players_rds_new_weekly_df = pd.read_pickle(overall_rds_pickle_path)
overall_players_rds_new_weekly_df.sample(n=3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2016-09-01,350.0,350.0,350.0,350.0,350.0,223.647664,350.0,350.0,350.0,350.0,...,350.0,350.0,300.975483,241.171966,350.0,350.0,350.0,350.0,191.183689,187.22719
2020-01-09,350.0,350.0,350.0,350.0,282.066563,262.734056,350.0,350.0,350.0,350.0,...,350.0,350.0,331.056343,277.802204,254.183249,350.0,350.0,350.0,235.709325,202.023897
2021-11-25,255.537296,350.0,350.0,350.0,69.004895,282.268121,350.0,350.0,350.0,350.0,...,92.376671,350.0,346.763689,296.343967,274.325862,350.0,350.0,191.935423,257.299802,226.842584


In [79]:
# Load the weekly character-vs-character player ratings and show three random rows.
char_vs_char_pickle_path = data_path + 'char_vs_char_player_rankings_weekly_alt2.pkl'
char_vs_char_player_rankings_weekly_alt2_df = pd.read_pickle(char_vs_char_pickle_path)
char_vs_char_player_rankings_weekly_alt2_df.sample(n=3)

Unnamed: 0,3688504/sheik/roy,3688504/sheik/samus,3688504/sheik/sheik,3689802/sheik/marth,3689821/falco/falco,3689821/falco/fox,3689821/falco/jigglypuff,3689821/falco/kirby,3689821/falco/marth,3689821/falco/mewtwo,...,368847/fox/pikachu,368847/fox/samus,368847/fox/sheik,3688504/sheik/captainfalcon,3688504/sheik/falco,3688504/sheik/fox,3688504/sheik/jigglypuff,3688504/sheik/luigi,3688504/sheik/marth,3688504/sheik/peach
2017-01-26,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2023-09-28,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1596.896931,1638.465668,1938.327843,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2022-07-14,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


In [80]:
# Load the "rds" companion of the character-vs-character ratings
# (presumably rating deviations - TODO confirm) and show three random rows.
char_vs_char_rds_pickle_path = data_path + 'char_vs_char_player_rankings_weekly_alt2_rds.pkl'
char_vs_char_player_rankings_weekly_alt2_rds_df = pd.read_pickle(char_vs_char_rds_pickle_path)
char_vs_char_player_rankings_weekly_alt2_rds_df.sample(n=3)

Unnamed: 0,1960158/fox/mewtwo,1960158/fox/roy,1960158/jigglypuff/falco,1960158/jigglypuff/fox,1960158/jigglypuff/iceclimbers,1960158/jigglypuff/jigglypuff,1960158/jigglypuff/link,1960158/jigglypuff/luigi,1960158/jigglypuff/marth,1960158/jigglypuff/sheik,...,999886/luigi/falco,999886/marth/captainfalcon,999886/sheik/falco,999888/sheik/captainfalcon,999888/sheik/falco,999888/sheik/fox,999888/sheik/jigglypuff,999888/sheik/marth,999888/sheik/mrgameandwatch,999888/sheik/samus
2023-04-27,350.0,350.0,153.859733,108.881937,309.26452,161.779446,307.757331,257.099827,187.436279,182.673489,...,272.235722,247.392791,286.325016,193.668639,219.083612,223.746585,245.413683,171.292358,253.480081,253.480081
2023-08-24,350.0,350.0,159.745065,117.042386,312.237064,167.390512,310.7434,260.66659,192.298639,187.659834,...,275.606924,251.097387,289.532677,198.393614,223.265077,227.83688,249.147699,176.6022,257.098413,257.098413
2016-06-30,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0


In [81]:
# Load the sets in which neither player changed character, and preview three rows.
no_change_sets_pickle_path = data_path + '/labelled_data/sets_without_character_changes_df.pkl'
sets_without_character_changes_df = pd.read_pickle(no_change_sets_pickle_path)
sets_without_character_changes_df.head(n=3)

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik


In [82]:
# Load the six pickles of the matchup-prediction dataset from a single folder.
# The files are read in the order listed, one per target name.
path = data_path + 'predict_matchup_dataset/'
(
    overall_elos,
    matchup_sets_df,
    p1_alt_2,
    p2_alt_2,
    p1_alt_2_rds,
    p2_alt_2_rds,
) = (
    pd.read_pickle(path + file_name)
    for file_name in (
        'overall_elos.pkl',
        'matchup_sets_df.pkl',
        'p1_alt_2.pkl',
        'p2_alt_2.pkl',
        'p1_alt_2_rds.pkl',
        'p2_alt_2_rds.pkl',
    )
)

In [83]:
# Print the shape of each matchup-dataset table, one per line, so the row
# counts can be compared by eye.
for matchup_table in (
    overall_elos,
    matchup_sets_df,
    p1_alt_2,
    p2_alt_2,
    p1_alt_2_rds,
    p2_alt_2_rds,
):
    print(matchup_table.shape)

(437200, 4)
(437200, 27)
(437200, 678)
(437200, 678)
(437200, 678)
(437200, 678)


In [84]:
# Preview the first five sets of the matchup dataset.
matchup_sets_df.head(n=5)


Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik
19628,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,15990,1000,15990,1000,2,0,...,True,WSF,True,WSF_A,True,187,2,sheik,falco,sheik/falco
19629,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,1028,1004,1028,2,1,...,True,LQF,True,LQF_B,True,187,3,jigglypuff,captainfalcon,jigglypuff/captainfalcon


Find the top n most popular characters, filter the dataset, then count the number of times the matchup was played.

In [118]:

# For every N from 1 to 26, restrict the sets to the N most popular characters
# and record the rarest matchup between them. Results are stored in two
# parallel lists where position N - 1 holds the answer for the top N characters.
least_frequent_matchup_list = []
least_frequent_count_list = []

# Popularity is measured by how often a character appears in the p1 slot.
# The ranking does not depend on N, so it is computed once rather than on
# every iteration.
character_popularity = sets_without_character_changes_df['p1_characters'].value_counts()

# NOTE: the loop variable keeps the name `num_pop` because it stays defined
# after the loop and other cells may read it.
for num_pop in range(1, 27):

    # Keep only sets where both players used one of the N most popular characters.
    characters = character_popularity.head(num_pop).index.values
    filtered_sets_df = sets_without_character_changes_df[
        sets_without_character_changes_df['p1_characters'].isin(characters) &
        sets_without_character_changes_df['p2_characters'].isin(characters)
    ].copy()

    # Debugging step to ensure characters are correct
    print(characters)

    # Sort each (p1, p2) pair alphabetically so that e.g. fox/marth and
    # marth/fox are counted as the same matchup.
    matchups = np.sort(
        filtered_sets_df[['p1_characters', 'p2_characters']].to_numpy(),
        axis=1,
    )
    matchups_df = pd.Series(['-'.join(pair) for pair in matchups], dtype=object)

    # value_counts() only lists matchups that occur at least once. Count
    # against every possible pairing instead, so a matchup that was never
    # played is reported with a count of 0 rather than being skipped (and so
    # an empty selection does not raise).
    ordered_characters = sorted(characters)
    all_matchups = [
        f"{first}-{second}"
        for position, first in enumerate(ordered_characters)
        for second in ordered_characters[position:]
    ]
    matchup_counts = (
        matchups_df.value_counts()
        .reindex(all_matchups, fill_value=0)
        .sort_values(ascending=False, kind='mergesort')
    )

    # Counts are sorted in descending order, so the last entry is the rarest.
    least_frequent_matchup = matchup_counts.index[-1]  # Get the matchup string
    least_frequent_count = matchup_counts.iloc[-1]    # Get the count
    least_frequent_matchup_list.append(least_frequent_matchup)
    least_frequent_count_list.append(least_frequent_count)

    print(f"Least frequent matchup between the {num_pop} most popular characters is: {least_frequent_matchup}")
    print(f"Count: {least_frequent_count}")
    print()


['fox']
Least frequent matchup between the 1 most popular characters is: fox-fox
Count: 17883

['fox' 'falco']
Least frequent matchup between the 2 most popular characters is: falco-falco
Count: 14010

['fox' 'falco' 'marth']
Least frequent matchup between the 3 most popular characters is: marth-marth
Count: 6078

['fox' 'falco' 'marth' 'sheik']
Least frequent matchup between the 4 most popular characters is: sheik-sheik
Count: 3750

['fox' 'falco' 'marth' 'sheik' 'captainfalcon']
Least frequent matchup between the 5 most popular characters is: captainfalcon-captainfalcon
Count: 3384

['fox' 'falco' 'marth' 'sheik' 'captainfalcon' 'jigglypuff']
Least frequent matchup between the 6 most popular characters is: jigglypuff-jigglypuff
Count: 1161

['fox' 'falco' 'marth' 'sheik' 'captainfalcon' 'jigglypuff' 'peach']
Least frequent matchup between the 7 most popular characters is: jigglypuff-jigglypuff
Count: 1161

['fox' 'falco' 'marth' 'sheik' 'captainfalcon' 'jigglypuff' 'peach'
 'luigi']


We should try testing with the 1, 2, 3, 5, and 7 most popular characters.



In [86]:
# Build a player-centric view of the sets: every set appears twice, once from
# each player's side, so "which character did a player use against a given
# opponent character" becomes a plain column filter.
p1_focus_df = sets_without_character_changes_df[['p1_id', 'p2_id', 'p1_characters', 'p2_characters']].copy()
# Swapping the p1/p2 labels gives the same sets seen from player 2's side.
new_cols = {'p1_id':'p2_id','p2_id':'p1_id', 'p1_characters':'p2_characters','p2_characters':'p1_characters'}
p2_focus_df = p1_focus_df.rename(columns = new_cols)
player_characters_df = pd.concat([p1_focus_df, p2_focus_df])

new_cols = {'p1_id':'player_id','p2_id':'opponent_id', 'p1_characters':'player_character','p2_characters':'opponent_character'}
player_characters_df = player_characters_df.rename(columns = new_cols)

num_pop_opponent = 5
# Use this cell's own setting. The previous code sliced with `num_pop`, a loop
# variable left over from another cell, so the result depended on which cells
# had been run before this one.
characters = sets_without_character_changes_df['p1_characters'].value_counts().head(num_pop_opponent).index.values

# For each popular opponent character, list the 14 characters most often played against it.
for character in characters:

    player_character_vs_character_df = player_characters_df[player_characters_df['opponent_character']==character]

    print('Opponent character:', character)
    print(player_character_vs_character_df['player_character'].value_counts().iloc[:14])
    print()



Opponent character: fox
player_character
fox              35766
falco            33265
marth            22489
sheik            19453
captainfalcon    15686
jigglypuff       13678
peach            11222
luigi             5484
samus             5086
iceclimbers       3298
ganondorf         3191
drmario           2582
yoshi             2324
pikachu           2133
Name: count, dtype: int64

Opponent character: falco
player_character
fox              33265
falco            28020
marth            19559
sheik            15692
captainfalcon    13992
jigglypuff        8698
peach             8411
luigi             4522
samus             4052
ganondorf         2591
iceclimbers       2319
drmario           2047
yoshi             1857
pikachu           1689
Name: count, dtype: int64

Opponent character: marth
player_character
fox              22489
falco            19559
marth            12156
sheik            11582
captainfalcon     9497
jigglypuff        6657
peach             6006
luigi         

We want players who have only played one of ['fox' 'falco' 'marth'] against each of those three characters.

In [87]:
# Build a player-centric view of the sets: every set appears twice, once from
# each player's side.
p1_focus_df = sets_without_character_changes_df[['p1_id', 'p2_id', 'p1_characters', 'p2_characters']].copy()
# Swapping the p1/p2 labels gives the same sets seen from player 2's side.
new_cols = {'p1_id':'p2_id','p2_id':'p1_id', 'p1_characters':'p2_characters','p2_characters':'p1_characters'}
p2_focus_df = p1_focus_df.rename(columns = new_cols)
player_characters_df = pd.concat([p1_focus_df, p2_focus_df])

new_cols = {'p1_id':'player_id','p2_id':'opponent_id', 'p1_characters':'player_character','p2_characters':'opponent_character'}
player_characters_df = player_characters_df.rename(columns = new_cols)

num_pop_opponent = 3
# Use this cell's own setting. The previous code sliced with `num_pop`, a loop
# variable left over from another cell, so the "top 3" filter silently used
# whatever value that loop had finished on.
characters = sets_without_character_changes_df['p1_characters'].value_counts().head(num_pop_opponent).index.values

# Rows where the opponent used one of the popular characters, then the 100
# players with the most such rows.
player_char_vs_pop_char_df = player_characters_df[player_characters_df['opponent_character'].isin(characters)]
print(player_char_vs_pop_char_df['player_id'].value_counts().iloc[:100].to_string())



player_id
65777      2875
31487      2319
7277       1807
694497     1732
147811     1375
1482652    1358
1007304    1280
153599     1278
2244671    1251
154396     1242
25924      1156
133666     1147
1959305    1088
4390       1049
51240      1041
45356      1034
2600974    1029
220377     1028
61570      1025
2177269    1001
657464      997
542693      995
65566       977
958716      973
5702        954
257406      947
55304       933
5869        927
674084      921
1945856     906
1524828     885
1375808     880
701767      877
25950       875
2326935     875
1030        874
1432116     866
2146437     863
1693244     861
68390       850
1194651     843
1004        841
1222893     839
2086583     830
2934299     822
54316       815
1987739     813
4317        807
255496      800
2188672     781
132938      775
916636      770
51488       758
3561        749
318568      742
1400299     742
188953      737
2792879     715
45726       710
1620271     705
26187       700
53647       69

In [None]:
# p1_focus_df = sets_without_character_changes_df[['p1_id', 'p2_id', 'p1_characters', 'p2_characters']].copy()
# p2_focus_df = sets_without_character_changes_df[['p1_id', 'p2_id', 'p1_characters', 'p2_characters']].copy()
# new_cols = {'p1_id':'p2_id','p2_id':'p1_id', 'p1_characters':'p2_characters','p2_characters':'p1_characters'}
# p2_focus_df = p1_focus_df.rename(columns = new_cols)
# player_characters_df = pd.concat([p1_focus_df, p2_focus_df])

# new_cols = {'p1_id':'player_id','p2_id':'opponent_id', 'p1_characters':'player_character','p2_characters':'opponent_character'}
# player_characters_df = player_characters_df.rename(columns = new_cols)

# num_pop_opponent = 6
# characters = sets_without_character_changes_df['p1_characters'].value_counts().head(num_pop_opponent).index.values
# multi_char = []
# for character in characters:
#     player_char_vs_pop_char_df = player_characters_df[player_characters_df['opponent_character']==character]
#     # print(player_char_vs_pop_char_df['player_id'].value_counts().iloc[:100].to_string())

#     # make two lists of players: one with players who played only one character against the current character, and another with players who played more than one
#     # Group by 'player_id' and calculate the number of unique characters played
#     character_counts = player_char_vs_pop_char_df.groupby('player_id')['player_character'].nunique()
#     # Separate the players into two lists
#     players_one_character = character_counts[character_counts == 1].index.tolist()
#     players_multiple_characters = character_counts[character_counts > 1].index.tolist()
#         # Update dictionaries

#     multi_char.append(players_multiple_characters)

#     # Output the results
#     print(f"Number of players who played only one character vs {character}: {len(players_one_character)}, {len(players_one_character)/character_counts.shape[0]:.1%}")
#     print(f"Number of players who played multiple characters vs {character}: {len(players_multiple_characters)}, {len(players_multiple_characters)/character_counts.shape[0]:.1%}")
#     print()

Number of players who played only one character vs fox: 16282, 80.6%
Number of players who played multiple characters vs fox: 3915, 19.4%

Number of players who played only one character vs falco: 15901, 80.9%
Number of players who played multiple characters vs falco: 3745, 19.1%

Number of players who played only one character vs marth: 14280, 82.8%
Number of players who played multiple characters vs marth: 2976, 17.2%

Number of players who played only one character vs sheik: 12963, 83.6%
Number of players who played multiple characters vs sheik: 2538, 16.4%

Number of players who played only one character vs captainfalcon: 12304, 83.7%
Number of players who played multiple characters vs captainfalcon: 2403, 16.3%

Number of players who played only one character vs jigglypuff: 10263, 85.4%
Number of players who played multiple characters vs jigglypuff: 1751, 14.6%



Find the players who have only played one character against fox, falco, and marth.

In [120]:
from itertools import chain

# For each N from 2 to 26: find the players who used exactly one character
# against each of the N most popular characters, then report how many sets in
# the full sets table were played between such players.
#
# Relies on `player_characters_df`, `least_frequent_matchup_list` and
# `least_frequent_count_list` built in earlier cells.

# Popularity (appearances in the p1 slot) does not depend on N; compute it once.
character_popularity = sets_without_character_changes_df['p1_characters'].value_counts()

# opponent character -> players who used more than one character against it.
# Cached because the same character is needed again for every larger N.
multi_char_players_by_character = {}

for num_pop_opponent in range(2, 27):
    characters = character_popularity.head(num_pop_opponent).index.values

    for character in characters:
        if character in multi_char_players_by_character:
            continue
        player_char_vs_pop_char_df = player_characters_df[player_characters_df['opponent_character']==character]
        # Number of distinct characters each player used against this character.
        character_counts = player_char_vs_pop_char_df.groupby('player_id')['player_character'].nunique()
        multi_char_players_by_character[character] = character_counts[character_counts > 1].index.tolist()

    # Players who used several characters against at least one of the popular
    # characters. A set gives constant-time membership tests below; the earlier
    # flat list made the filtering step quadratic.
    multi_char_players = set(
        chain.from_iterable(multi_char_players_by_character[character] for character in characters)
    )

    # Filter player_characters_df for rows where opponent_character is in the specified characters
    player_char_vs_pop_chars_df = player_characters_df[player_characters_df['opponent_character'].isin(characters)]

    # Get the unique list of players
    players = list(player_char_vs_pop_chars_df['player_id'].unique())

    # Players who never used more than one character against any single popular character.
    one_char_list = [player for player in players if player not in multi_char_players]

    one_char_player_vs_pop_chars_df = player_char_vs_pop_chars_df[player_char_vs_pop_chars_df['player_id'].isin(one_char_list)]

    # Of those, keep the rows where the player's own character is also a popular one.
    pop_one_char_player_vs_pop_chars_df = one_char_player_vs_pop_chars_df[one_char_player_vs_pop_chars_df['player_character'].isin(characters)]

    # Sets from the full table (with or without character data) in which both players qualify.
    valid_players = pop_one_char_player_vs_pop_chars_df['player_id'].unique()
    sets_between_valid_players_df = sets_df[sets_df['p1_id'].isin(valid_players) & sets_df['p2_id'].isin(valid_players)]
    print(f"The most popular {num_pop_opponent} characters are {characters}.")
    print(f"There are {len(one_char_list)} out of {players_df.shape[0]} players who play only one of characters against each of those characters, or {len(one_char_list) / players_df.shape[0]:.1%}.")
    print(f"There are {sets_between_valid_players_df.shape[0]} sets played amongst those players making up {sets_between_valid_players_df.shape[0] / sets_df.shape[0]:.1%} of sets.")
    print(f"Least frequent matchup between the {num_pop_opponent} most popular characters is: {least_frequent_matchup_list[num_pop_opponent-1]}")
    print(f"Count: {least_frequent_count_list[num_pop_opponent-1]}")
    print()
    

The most popular 2 characters are ['fox' 'falco'].
There are 19478 out of 96689 players who play only one of characters against each of those characters, or 20.1%.
There are 47130 sets played amongst those players making up 2.6% of sets.
Least frequent matchup between the 2 most popular characters is: falco-falco
Count: 14010

The most popular 3 characters are ['fox' 'falco' 'marth'].
There are 20760 out of 96689 players who play only one of characters against each of those characters, or 21.5%.
There are 82673 sets played amongst those players making up 4.6% of sets.
Least frequent matchup between the 3 most popular characters is: marth-marth
Count: 6078

The most popular 4 characters are ['fox' 'falco' 'marth' 'sheik'].
There are 21490 out of 96689 players who play only one of characters against each of those characters, or 22.2%.
There are 111925 sets played amongst those players making up 6.2% of sets.
Least frequent matchup between the 4 most popular characters is: sheik-sheik
Cou

37.6% of sets vs ['fox' 'falco' 'marth' 'sheik' 'captainfalcon' 'jigglypuff'] were played by players who only play one character
43.2% of sets play one character in matches between ['fox' 'falco' 'marth' 'sheik' 'captainfalcon' 'jigglypuff']


The most popular 6 characters are ['fox' 'falco' 'marth' 'sheik' 'captainfalcon' 'jigglypuff'].
There are 22152 out of 96689 players who play only one of characters against each of those characters, or 22.9%.
There are 160829 sets played amongst those players making up 9.0% of sets.
