# Predict winner with matchup data.
Here we try to use what we learned in winner_prediction_with_matchup_rating.ipynb to improve the prediction on a large subset of the sets. We will try various things:
- We will try restricting to the most popular 5-7 characters.
- We will try to find a subset of the sets with character data of players who played a fixed character against each other (or maybe two characters). Then we expand our training/test set to sets between those players without character data. We can train separate models that use only the top 5, 6, and 7 characters, and then apply the models to the players that appear in each set. So we use the model with the top 5 characters when both players only play top 5 characters against each other. We use the model with the top 6 characters only when at least one of the players plays puff and the top 7 model only when one player plays peach.
- We will train the models on the score.
We will test the model to see how it does on sets where we do not have the information about characters and sets.
- We could test randomizing the player order.
- We could try players who only play one character (aMSa, Axe)

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, log_loss, root_mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  # Correct import
from prettytable import PrettyTable

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

# Choose the data directory: the container mount when it is present,
# otherwise the folder relative to this notebook. Both alternatives end
# with a slash.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'

In [17]:
# Load the player table. The path is assembled with os.path.join so it no
# longer contains a doubled slash ('../data//labelled_data/...') and does not
# rely on data_path ending with a separator.
# NOTE: loading a pickle can execute arbitrary code; only read trusted files.
players_df = pd.read_pickle(os.path.join(data_path, 'labelled_data', 'players_df.pkl'))
players_df.head()

Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [18]:
# Load the sets table. os.path.join avoids the doubled slash that plain
# concatenation produced ('../data//labelled_data/...').
sets_df = pd.read_pickle(os.path.join(data_path, 'labelled_data', 'sets_df_3.pkl'))

# Boolean mask: True where the 'game_data' entry of a set is non-empty.
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)

# Report the share of sets with game data, then the frame's shape.
print(f"{len(sets_df[has_game_data]) / sets_df.shape[0]:0.01%} percent of sets have some game data")
print(sets_df.shape)
sets_df.head(3)

32.9% percent of sets have some game data
(1795681, 27)


Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,charcter_data,charcter_change,p1_characters,p2_characters
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,5620,Chillin,5620,Chillin,3,1,...,False,,False,,False,151.0,False,,,
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,Aglet,15634,15634,Aglet,2,3,...,False,,False,,False,151.0,False,,,
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,6126,1097,6126,1097,3,0,...,False,,False,,False,151.0,False,,,


In [5]:
# Load the tournament metadata table. os.path.join avoids the doubled slash
# that plain concatenation produced ('../data//labelled_data/...').
tournament_info_df = pd.read_pickle(
    os.path.join(data_path, 'labelled_data', 'tournament_info_df.pkl')
)
print(tournament_info_df.shape)
tournament_info_df.head(3)

(39675, 37)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets,major
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,...,,,,,,,,,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,Ginger,"[(32, True), (62, True), (77, False), (164, Tr...","[(39, True), (65, True), (78, False), (165, Tr...","[(47, True), (69, True), (80, False), (159, Tr...","[(40, True), (66, True), (79, True), (85, Fals...","[(28, True), (60, True), (76, True), (84, True)]","[(36, True), (64, True), (78, True), (85, True)]","[(44, True), (68, True), (80, True), (86, True)]","[(52, True), (72, True), (82, True), (87, True)]",
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,...,,,,,,,,,,True


In [6]:
# Load the weekly overall player ranking frame. os.path.join is used so the
# path stays valid whether or not data_path ends with a separator.
overall_players_ranking_new_weekly_df = pd.read_pickle(
    os.path.join(data_path, 'overall_players_ranking_new_weekly.pkl')
)
# sample() is unseeded, so the three preview rows differ between runs.
overall_players_ranking_new_weekly_df.sample(3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2015-01-15,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2017-04-20,1500.0,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104
2018-12-27,1500.0,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104


In [7]:
# Load the weekly overall "rds" frame (presumably rating deviations that
# accompany the rankings -- TODO confirm). os.path.join is used so the path
# stays valid whether or not data_path ends with a separator.
overall_players_rds_new_weekly_df = pd.read_pickle(
    os.path.join(data_path, 'overall_players_rds_new_weekly.pkl')
)
# sample() is unseeded, so the three preview rows differ between runs.
overall_players_rds_new_weekly_df.sample(3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2020-07-09,239.743488,350.0,350.0,350.0,287.031627,268.055334,350.0,350.0,350.0,350.0,...,350.0,350.0,335.295319,282.839935,259.679524,350.0,350.0,350.0,241.625504,208.896021
2022-03-31,259.3355,350.0,350.0,350.0,53.472049,285.710868,350.0,350.0,350.0,350.0,...,95.417083,350.0,349.571999,299.624892,277.866846,350.0,350.0,196.962448,261.071377,231.111576
2023-03-09,269.403895,350.0,182.297796,350.0,52.14006,294.879219,350.0,350.0,350.0,350.0,...,61.94074,350.0,350.0,308.379502,287.285157,145.557225,350.0,210.038444,271.072712,242.352041


In [8]:
# Load the weekly character-vs-character player ranking frame. os.path.join
# is used so the path stays valid whether or not data_path ends with a
# separator.
char_vs_char_player_rankings_weekly_alt2_df = pd.read_pickle(
    os.path.join(data_path, 'char_vs_char_player_rankings_weekly_alt2.pkl')
)
# sample() is unseeded, so the three preview rows differ between runs.
char_vs_char_player_rankings_weekly_alt2_df.sample(3)

Unnamed: 0,3688504/sheik/roy,3688504/sheik/samus,3688504/sheik/sheik,3689802/sheik/marth,3689821/falco/falco,3689821/falco/fox,3689821/falco/jigglypuff,3689821/falco/kirby,3689821/falco/marth,3689821/falco/mewtwo,...,368847/fox/pikachu,368847/fox/samus,368847/fox/sheik,3688504/sheik/captainfalcon,3688504/sheik/falco,3688504/sheik/fox,3688504/sheik/jigglypuff,3688504/sheik/luigi,3688504/sheik/marth,3688504/sheik/peach
2016-11-17,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2022-04-14,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2023-10-19,1500.0,1500.0,1500.0,500.0,1633.049974,1500.0,1500.0,1500.0,1624.167982,1500.0,...,1596.896931,1638.465668,1938.327843,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


In [9]:
# Load the "rds" companion of the character-vs-character ranking frame
# (presumably rating deviations -- TODO confirm). os.path.join is used so the
# path stays valid whether or not data_path ends with a separator.
char_vs_char_player_rankings_weekly_alt2_rds_df = pd.read_pickle(
    os.path.join(data_path, 'char_vs_char_player_rankings_weekly_alt2_rds.pkl')
)
# sample() is unseeded, so the three preview rows differ between runs.
char_vs_char_player_rankings_weekly_alt2_rds_df.sample(3)

Unnamed: 0,1960158/fox/mewtwo,1960158/fox/roy,1960158/jigglypuff/falco,1960158/jigglypuff/fox,1960158/jigglypuff/iceclimbers,1960158/jigglypuff/jigglypuff,1960158/jigglypuff/link,1960158/jigglypuff/luigi,1960158/jigglypuff/marth,1960158/jigglypuff/sheik,...,999886/luigi/falco,999886/marth/captainfalcon,999886/sheik/falco,999888/sheik/captainfalcon,999888/sheik/falco,999888/sheik/fox,999888/sheik/jigglypuff,999888/sheik/marth,999888/sheik/mrgameandwatch,999888/sheik/samus
2017-04-27,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
2016-08-18,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
2017-04-13,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0


In [10]:
# Load the sets-without-character-changes table. os.path.join avoids the
# doubled slash that plain concatenation produced
# ('../data//labelled_data/...').
sets_without_character_changes_df = pd.read_pickle(
    os.path.join(data_path, 'labelled_data', 'sets_without_character_changes_df.pkl')
)
sets_without_character_changes_df.head(3)

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik


In [11]:
# Directory holding the pre-built matchup prediction dataset. The trailing ''
# makes os.path.join keep a final separator, so code that still does
# `path + filename` continues to work.
path = os.path.join(data_path, 'predict_matchup_dataset', '')

# Load each saved frame of the dataset from that directory.
overall_elos = pd.read_pickle(os.path.join(path, 'overall_elos.pkl'))
matchup_sets_df = pd.read_pickle(os.path.join(path, 'matchup_sets_df.pkl'))
p1_alt_2 = pd.read_pickle(os.path.join(path, 'p1_alt_2.pkl'))
p2_alt_2 = pd.read_pickle(os.path.join(path, 'p2_alt_2.pkl'))
p1_alt_2_rds = pd.read_pickle(os.path.join(path, 'p1_alt_2_rds.pkl'))
p2_alt_2_rds = pd.read_pickle(os.path.join(path, 'p2_alt_2_rds.pkl'))

In [12]:
# Print the (rows, columns) shape of each loaded matchup frame, in load order.
for loaded_frame in (
    overall_elos,
    matchup_sets_df,
    p1_alt_2,
    p2_alt_2,
    p1_alt_2_rds,
    p2_alt_2_rds,
):
    print(loaded_frame.shape)

(437200, 4)
(437200, 27)
(437200, 678)
(437200, 678)
(437200, 678)
(437200, 678)


In [14]:
# Preview the first five rows of the matchup sets frame.
matchup_sets_df.head(n=5)


Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik
19628,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,15990,1000,15990,1000,2,0,...,True,WSF,True,WSF_A,True,187,2,sheik,falco,sheik/falco
19629,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,1028,1004,1028,2,1,...,True,LQF,True,LQF_B,True,187,3,jigglypuff,captainfalcon,jigglypuff/captainfalcon
