In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, log_loss, root_mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  # Correct import
from prettytable import PrettyTable

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

if os.path.exists('/workspace/data'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data/'
else:
    data_path = '../data/'

In [52]:
players_df = pd.read_pickle(data_path + '/labelled_data/players_df.pkl')
players_df.head()

Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [53]:
sets_df = pd.read_pickle(data_path + '/labelled_data/sets_df_2.pkl')
print(f"{sets_df[sets_df['game_data'].apply(lambda x: len(x) > 0)].shape[0] / sets_df.shape[0]:0.01%} percent of sets have some game data")
print(sets_df.shape)
sets_df.head(3)

32.9% percent of sets have some game data
(1795681, 23)


Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,bracket_order,set_order,best_of,game_data,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index
0,104675843,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,5620,Chillin,5620,Chillin,3,1,...,1,A,5,[],False,,False,,False,151
1,104675844,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,Aglet,15634,15634,Aglet,2,3,...,1,B,5,[],False,,False,,False,151
2,104675845,melee,mdva-invitational-2017-(challonge-mirror),2017-11-26 08:05:11,6126,1097,6126,1097,3,0,...,1,C,5,[],False,,False,,False,151


In [54]:
tournament_info_df = pd.read_pickle(data_path + '/labelled_data/tournament_info_df.pkl')
print(tournament_info_df.shape)
tournament_info_df.head(3)

(39675, 37)


Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,...,WSF_B_p2,LN_A_p1_non_top_8_sets,LN_A_p2_non_top_8_sets,LN_B_p1_non_top_8_sets,LN_B_p2_non_top_8_sets,WSF_A_p1_non_top_8_sets,WSF_A_p2_non_top_8_sets,WSF_B_p1_non_top_8_sets,WSF_B_p2_non_top_8_sets,major
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,...,,,,,,,,,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,...,Ginger,"[(32, True), (62, True), (77, False), (164, Tr...","[(39, True), (65, True), (78, False), (165, Tr...","[(47, True), (69, True), (80, False), (159, Tr...","[(40, True), (66, True), (79, True), (85, Fals...","[(28, True), (60, True), (76, True), (84, True)]","[(36, True), (64, True), (78, True), (85, True)]","[(44, True), (68, True), (80, True), (86, True)]","[(52, True), (72, True), (82, True), (87, True)]",
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,...,,,,,,,,,,True


In [55]:
overall_players_ranking_new_weekly_df = pd.read_pickle(data_path + 'overall_players_ranking_new_weekly.pkl')
overall_players_ranking_new_weekly_df.sample(3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2018-07-12,1500.0,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104
2020-06-25,1305.337441,1500.0,1500.0,1500.0,1147.194923,1362.829389,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1337.689105,1400.124736,1397.772865,1500.0,1500.0,1500.0,1559.078031,1482.878104
2015-06-04,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


In [56]:
overall_players_rds_new_weekly_df = pd.read_pickle(data_path + 'overall_players_rds_new_weekly.pkl')
overall_players_rds_new_weekly_df.sample(3)

Unnamed: 0,1617201,1497667,2653190,3657740,41449,6039,3621289,3682293,3332271,2034855,...,1911774,4106746,Black hayato,The boy,138692,3293641,1701726,2408932,SmyD,15495
2018-05-24,350.0,350.0,350.0,350.0,265.186689,244.530806,350.0,350.0,350.0,350.0,...,350.0,350.0,316.802577,260.654108,235.320495,350.0,350.0,350.0,215.236109,177.712404
2019-03-07,350.0,350.0,350.0,350.0,273.458858,253.474455,350.0,350.0,350.0,350.0,...,350.0,350.0,323.756277,269.062019,244.60067,350.0,350.0,350.0,225.343762,189.828238
2022-07-21,262.665578,350.0,350.0,350.0,62.202897,288.736627,350.0,350.0,350.0,350.0,...,41.011294,350.0,350.0,302.511399,280.976928,350.0,350.0,201.325564,264.378723,234.841096


In [57]:
char_vs_char_player_rankings_weekly_alt2_df = pd.read_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt2.pkl')
char_vs_char_player_rankings_weekly_alt2_df.sample(3)

Unnamed: 0,3688504/sheik/roy,3688504/sheik/samus,3688504/sheik/sheik,3689802/sheik/marth,3689821/falco/falco,3689821/falco/fox,3689821/falco/jigglypuff,3689821/falco/kirby,3689821/falco/marth,3689821/falco/mewtwo,...,368847/fox/pikachu,368847/fox/samus,368847/fox/sheik,3688504/sheik/captainfalcon,3688504/sheik/falco,3688504/sheik/fox,3688504/sheik/jigglypuff,3688504/sheik/luigi,3688504/sheik/marth,3688504/sheik/peach
2021-12-23,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2018-11-01,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2021-04-29,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


In [58]:
char_vs_char_player_rankings_weekly_alt2_rds_df = pd.read_pickle(data_path + 'char_vs_char_player_rankings_weekly_alt2_rds.pkl')
char_vs_char_player_rankings_weekly_alt2_rds_df.sample(3)

Unnamed: 0,1960158/fox/mewtwo,1960158/fox/roy,1960158/jigglypuff/falco,1960158/jigglypuff/fox,1960158/jigglypuff/iceclimbers,1960158/jigglypuff/jigglypuff,1960158/jigglypuff/link,1960158/jigglypuff/luigi,1960158/jigglypuff/marth,1960158/jigglypuff/sheik,...,999886/luigi/falco,999886/marth/captainfalcon,999886/sheik/falco,999888/sheik/captainfalcon,999888/sheik/falco,999888/sheik/fox,999888/sheik/jigglypuff,999888/sheik/marth,999888/sheik/mrgameandwatch,999888/sheik/samus
2023-03-23,350.0,350.0,152.085416,106.362718,308.38479,160.091709,306.873545,256.041323,185.981982,181.180802,...,271.236217,246.292599,285.374724,192.256843,217.838495,222.52925,244.304582,169.699024,252.405994,252.405994
2020-04-23,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0
2015-04-16,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,...,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0,350.0


In [59]:
sets_without_character_changes_df=pd.read_pickle(data_path + '/labelled_data/sets_without_character_changes_df.pkl')
sets_without_character_changes_df.head(3)

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik


In [60]:
path = data_path + 'predict_matchup_dataset/'
overall_elos = pd.read_pickle(path + 'overall_elos.pkl')
matchup_sets_df = pd.read_pickle(path + 'matchup_sets_df.pkl')
p1_alt_2 = pd.read_pickle(path + 'p1_alt_2.pkl')
p2_alt_2 = pd.read_pickle(path + 'p2_alt_2.pkl')
p1_alt_2_rds = pd.read_pickle(path + 'p1_alt_2_rds.pkl')
p2_alt_2_rds = pd.read_pickle(path + 'p2_alt_2_rds.pkl')

In [61]:
print(overall_elos.shape)
print(matchup_sets_df.shape)
print(p1_alt_2.shape)
print(p2_alt_2.shape)
print(p1_alt_2_rds.shape)
print(p2_alt_2_rds.shape)

(437200, 4)
(437200, 27)
(437200, 678)
(437200, 678)
(437200, 678)
(437200, 678)


In [62]:
matchup_sets_df['p1_characters'].value_counts().index.values

array(['fox', 'falco', 'marth', 'sheik', 'captainfalcon', 'jigglypuff',
       'peach', 'luigi', 'samus', 'ganondorf', 'iceclimbers', 'drmario',
       'yoshi', 'pikachu', 'link', 'mario', 'mrgameandwatch',
       'donkeykong', 'roy', 'zelda', 'kirby', 'ness', 'younglink',
       'pichu', 'bowser', 'mewtwo'], dtype=object)

Add the matchup ratings of the players.


In [63]:
matchup_sets_rating_df = matchup_sets_df.copy()
matchup_sets_rating_df.head(3)

Unnamed: 0,key,game,tournament_key,tournament_start_date,winner_id,loser_id,p1_id,p2_id,p1_score,p2_score,...,top_8,top_8_location_names,valid_top_8_bracket,top_8_bracket_location_names,major,ranking_date_index,length_gamedata,p1_characters,p2_characters,matchup
19575,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,6126,1009,1009,6126,0,2,...,False,,False,,True,187,2,fox,marth,fox/marth
19582,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1004,6126,1004,6126,2,0,...,False,,False,,True,187,2,jigglypuff,marth,jigglypuff/marth
19626,,melee,evo-2018__evo-2018-1,2018-08-03 15:00:00,1028,1055,1028,1055,2,0,...,True,LN,True,LN_B,True,187,2,captainfalcon,sheik,captainfalcon/sheik


In [64]:
tqdm.pandas()

# Ensure matchup key exists
matchup_sets_rating_df['p1_matchup_key'] = matchup_sets_rating_df['p1_characters'] + '/' + matchup_sets_rating_df['p2_characters']

# Check for matchups that are valid in `p1_alt_2` columns
valid_mask = matchup_sets_rating_df['p1_matchup_key'].isin(p1_alt_2.columns)
matchup_sets_rating_df['p1_matchup_rating'] = None  # Initialize with None

# For valid matchups, retrieve the corresponding values in a vectorized manner
matchup_sets_rating_df.loc[valid_mask, 'p1_matchup_rating'] = matchup_sets_rating_df[valid_mask].progress_apply(
    lambda row: p1_alt_2.at[row.name, row['p1_matchup_key']], axis=1
)

matchup_sets_rating_df['p1_matchup_rds'] = None  # Initialize with None

# For valid matchups, retrieve the corresponding values in a vectorized manner
matchup_sets_rating_df.loc[valid_mask, 'p1_matchup_rds'] = matchup_sets_rating_df[valid_mask].progress_apply(
    lambda row: p1_alt_2_rds.at[row.name, row['p1_matchup_key']], axis=1
)





  0%|          | 0/437200 [00:00<?, ?it/s]

  0%|          | 0/437200 [00:00<?, ?it/s]

In [65]:
tqdm.pandas()

# Ensure matchup key exists
matchup_sets_rating_df['p2_matchup_key'] = matchup_sets_rating_df['p2_characters'] + '/' + matchup_sets_rating_df['p1_characters']

# Check for matchups that are valid in `p1_alt_2` columns
valid_mask = matchup_sets_rating_df['p2_matchup_key'].isin(p1_alt_2.columns)
matchup_sets_rating_df['p2_matchup_rating'] = None  # Initialize with None

# For valid matchups, retrieve the corresponding values in a vectorized manner
matchup_sets_rating_df.loc[valid_mask, 'p2_matchup_rating'] = matchup_sets_rating_df[valid_mask].progress_apply(
    lambda row: p1_alt_2.at[row.name, row['p2_matchup_key']], axis=1
)

matchup_sets_rating_df['p2_matchup_rds'] = None  # Initialize with None

# For valid matchups, retrieve the corresponding values in a vectorized manner
matchup_sets_rating_df.loc[valid_mask, 'p2_matchup_rds'] = matchup_sets_rating_df[valid_mask].progress_apply(
    lambda row: p1_alt_2_rds.at[row.name, row['p2_matchup_key']], axis=1
)



  0%|          | 0/437200 [00:00<?, ?it/s]

  0%|          | 0/437200 [00:00<?, ?it/s]

In [66]:
matchup_sets_rating_df[overall_elos.columns] = overall_elos

In [67]:
matchup_sets_rating_df.columns

Index(['key', 'game', 'tournament_key', 'tournament_start_date', 'winner_id',
       'loser_id', 'p1_id', 'p2_id', 'p1_score', 'p2_score', 'valid_score',
       'location_names', 'bracket_name', 'bracket_order', 'set_order',
       'best_of', 'game_data', 'top_8', 'top_8_location_names',
       'valid_top_8_bracket', 'top_8_bracket_location_names', 'major',
       'ranking_date_index', 'length_gamedata', 'p1_characters',
       'p2_characters', 'matchup', 'p1_matchup_key', 'p1_matchup_rating',
       'p1_matchup_rds', 'p2_matchup_key', 'p2_matchup_rating',
       'p2_matchup_rds', 'p1_elo', 'p2_elo', 'p1_rd', 'p2_rd'],
      dtype='object')

In [68]:
matchup_sets_rating_df['cts_score']  = matchup_sets_rating_df['p1_score'] / (matchup_sets_rating_df['p1_score'] + matchup_sets_rating_df['p2_score'])
matchup_sets_rating_df['cts_score'].describe()

count    437200.000000
mean          0.660805
std           0.394799
min           0.000000
25%           0.333333
50%           0.750000
75%           1.000000
max           1.000000
Name: cts_score, dtype: float64

Train a model

In [133]:
num_pop_list = [18]


# min_overall_rds_list = [300, 200, 150, 100]
min_overall_rds_list = [150]
min_matchup_rds_list = [150]
# min_matchup_rds_list = [350, 300, 200, 150, 100]

for num_pop in num_pop_list:
       for min_overall_rds in min_overall_rds_list:
              for min_matchup_rds in min_matchup_rds_list:
                     characters = matchup_sets_rating_df['p1_characters'].value_counts().head(num_pop).index.values
                     filtered_sets_df = matchup_sets_rating_df[matchup_sets_rating_df['p1_characters'].isin(characters) & matchup_sets_rating_df['p2_characters'].isin(characters)].copy()

                     filtered_sets_df = matchup_sets_rating_df[(matchup_sets_rating_df['p1_matchup_rds']<min_matchup_rds) & (matchup_sets_rating_df['p2_matchup_rds']<min_matchup_rds)]
                     filtered_sets_df = matchup_sets_rating_df[(matchup_sets_rating_df['p1_rd']<min_overall_rds) & (matchup_sets_rating_df['p2_rd']<min_overall_rds)]


                     filtered_sets_df = filtered_sets_df[filtered_sets_df['valid_score']==True]
                     filtered_sets_df = filtered_sets_df[(filtered_sets_df['best_of']==3) | (filtered_sets_df['best_of']==5)]
                     print(f"There are {filtered_sets_df.shape[0]} sets, or {filtered_sets_df.shape[0] / matchup_sets_rating_df.shape[0]:.2%}")

                     features = ['p1_matchup_rating',
                            'p1_matchup_rds', 'p2_matchup_rating',
                            'p2_matchup_rds', 'p1_elo', 'p2_elo', 'p1_rd', 'p2_rd']

                     X = filtered_sets_df[features].astype(float)

                     y =  (filtered_sets_df['winner_id'] == filtered_sets_df['p1_id']).to_numpy()

                     # Perform train-test split with stratification
                     X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42, stratify=y)


                     classifier = XGBClassifier(eval_metric='mlogloss', random_state=42, n_estimators=100, tree_method='hist', max_depth=7)

                     # Train the classifier on the training data
                     classifier.fit(X_train, y_train)

                     # Make predictions on the test set
                     y_pred = classifier.predict(X_test)
                     y_pred_proba = classifier.predict_proba(X_test)  # Get probabilities for log_loss

                     accuracy_1 = accuracy_score(y_test, y_pred)
                     log_loss_score_1 = log_loss(y_test, y_pred_proba)  # Use y_pred_proba here

                     print(f'{num_pop} most popular characters')
                     print(f'min overall rds = {min_overall_rds}')
                     print(f'min matchup rds = {min_matchup_rds}')
                     print()
                     print(f'Predict Outcome Only')
                     print('---------------------------')
                     print(f"Test Accuracy with matchup: {accuracy_1:.4%}")
                     print(f"Log Loss Score with matchup: {log_loss_score_1:.4f}")
                     features = ['p1_elo', 'p2_elo', 'p1_rd', 'p2_rd']

                     X = filtered_sets_df[features].astype(float)

                     # Encode the labels as integers

                     y =  (filtered_sets_df['winner_id'] == filtered_sets_df['p1_id']).to_numpy()

                     # Perform train-test split with stratification
                     X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42, stratify=y)


                     classifier = XGBClassifier(eval_metric='mlogloss', random_state=42, n_estimators=100, tree_method='hist', max_depth=7)

                     # Train the classifier on the training data
                     classifier.fit(X_train, y_train)

                     # Make predictions on the test set
                     y_pred = classifier.predict(X_test)
                     y_pred_proba = classifier.predict_proba(X_test)  # Get probabilities for log_loss

                     accuracy_2 = accuracy_score(y_test, y_pred)
                     log_loss_score_2 = log_loss(y_test, y_pred_proba)  # Use y_pred_proba here

                     print(f"Test Accuracy without matchup: {accuracy_2:.4%}")
                     print(f"Log Loss Score without matchup: {log_loss_score_2:.4f}")
                     print()
                     if accuracy_1 - accuracy_2> 0:
                            print(f"Predict with matchup did better by {accuracy_1 - accuracy_2:.4%}")
                     else:
                            print(f"Predict without matchup did better by {-(accuracy_1 - accuracy_2):.4%}") 
                     print(f"Difference Log Loss Score: {log_loss_score_2 - log_loss_score_1:.4f}")
                     print()

                     features = ['p1_matchup_rating',
                            'p1_matchup_rds', 'p2_matchup_rating',
                            'p2_matchup_rds', 'p1_elo', 'p2_elo', 'p1_rd', 'p2_rd']

                     X = filtered_sets_df[features].astype(float)

                     y =  filtered_sets_df['cts_score']

                     # Perform train-test split with stratification
                     X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42, stratify=y)

                     regressor = XGBRegressor(eval_metric='rmse', random_state=42, n_estimators=100, tree_method='hist', max_depth=7)

                     # Train the classifier on the training data
                     regressor.fit(X_train, y_train)

                     # Make predictions on the test set
                     y_pred_score = regressor.predict(X_test)
                     y_pred_results = y_pred_score > .5

                     accuracy_3 = accuracy_score((y_test > .5), y_pred_results)
                     rmse_1 = root_mean_squared_error(y_test, y_pred_score)  # Use y_pred_proba here
                     print(f'Predict Score')
                     print('---------------------------')
                     print(f"Test Accuracy with matchup: {accuracy_3:.4%}")
                     print(f"RMSE with matchup: {rmse_1:.4f}")

                     features = ['p1_elo', 'p2_elo', 'p1_rd', 'p2_rd']

                     X = filtered_sets_df[features].astype(float)

                     # Perform train-test split with stratification
                     X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42, stratify=y)

                     regressor = XGBRegressor(eval_metric='rmse', random_state=42, n_estimators=100, tree_method='hist', max_depth=7)

                     # Train the classifier on the training data
                     regressor.fit(X_train, y_train)

                     # Make predictions on the test set
                     y_pred_score = regressor.predict(X_test)
                     y_pred_results = y_pred_score > .5

                     accuracy_4 = accuracy_score((y_test > .5), y_pred_results)
                     rmse_2 = root_mean_squared_error(y_test, y_pred_score)  # Use y_pred_proba here

                     print(f"Test Accuracy with matchup: {accuracy_4:.4%}")
                     print(f"RMSE with matchup: {rmse_2:.4f}")
                     print()

                     if accuracy_3 - accuracy_2> 0:
                            print(f"Predict with matchup did better by {accuracy_3 - accuracy_4:.4%}")
                     else:
                            print(f"Predict without matchup did better by {-(accuracy_3 - accuracy_4):.4%}") 
                     print(f"Difference Log Loss: {rmse_2 -rmse_1:.4f}")
                     print()
                     print('Summary:')
                     print('---------------------------')
                     if accuracy_1 - accuracy_3 > 0:
                            print(f"Matchup: Predict outcome did better by {accuracy_1 - accuracy_3:.4%}")
                     else:
                            print(f"Matchup: Predict score did better by {-(accuracy_1 - accuracy_3):.4%}")   
                     if accuracy_2 - accuracy_4 > 0:
                            print(f"No Matchup: Predict outcome did better by {accuracy_2 - accuracy_4:.4%}")
                     else:
                            print(f"No Matchup: Predict score did better by {-(accuracy_2 - accuracy_4):.4%}") 
                     print(f'Best Accuracy: {np.max(np.array([accuracy_1, accuracy_2, accuracy_3, accuracy_4])):.4%} vs Baseline {np.max(np.array([accuracy_1, accuracy_2, accuracy_3, accuracy_4]))-accuracy_2:.4%}')
                     print('\n++++++++++++++++++++++++++++++++++++++++++++++\n')
       print('\n=====================================================\n')

There are 344568 sets, or 78.81%
18 most popular characters
min overall rds = 150
min matchup rds = 150

Predict Outcome Only
---------------------------
Test Accuracy with matchup: 78.9201%
Log Loss Score with matchup: 0.4370
Test Accuracy without matchup: 78.6894%
Log Loss Score without matchup: 0.4451

Predict with matchup did better by 0.2307%
Difference Log Loss Score: 0.0081

Predict Score
---------------------------
Test Accuracy with matchup: 79.3075%
RMSE with matchup: 0.2992
Test Accuracy with matchup: 78.8998%
RMSE with matchup: 0.3025

Predict with matchup did better by 0.4078%
Difference Log Loss: 0.0032

Summary:
---------------------------
Matchup: Predict score did better by 0.3874%
No Matchup: Predict score did better by 0.2104%
Best Accuracy: 79.3075% vs Baseline 0.6182%

++++++++++++++++++++++++++++++++++++++++++++++



