# Explore the full dataset

In [1]:
# Standard library imports
import datetime
import os
from collections import deque
import time

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns

from xgboost import XGBClassifier


# from sklearn.model_selection import train_test_split


from tqdm import tqdm

# Use the mounted workspace data directory when it exists; otherwise fall back
# to the data directory one level above the notebook.
workspace_data_dir = '/workspace/data_2'
data_path = '/workspace/data_2/' if os.path.exists(workspace_data_dir) else '../data/'
    
# if torch.cuda.is_available() == False:
#     RuntimeError("GPU detected: False")
#     print("GPU detected: False")
# else:
#     device = torch.device("cuda")
#     print("The GPU is detected.")



### Load Data

In [3]:
# Read the full dataset pickle from the selected data directory.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
dataset_file = data_path + 'dataset_full.pkl'
dataset_df = pd.read_pickle(dataset_file)

In [None]:
# Read the 'labelled_sets_df.pkl' pickle from the selected data directory.
sets_file = data_path + 'labelled_sets_df.pkl'
sets_df = pd.read_pickle(sets_file)

In [None]:
# Count every (matchup_1, winner) combination and print the untruncated table.
matchup_winner_counts = dataset_df[['matchup_1', 'winner']].value_counts()
print(matchup_winner_counts.to_string())

Print out the columns using a loop for ease of reading.

In [None]:
# Show each column name next to its positional index.
for i, col in enumerate(dataset_df.columns):
    print(f"{i} {col}")


Make a list of features and target variable.

In [None]:
# Features are selected by column position: 36-45, 47-52, and 55 onwards.
# NOTE(review): positional slices shift silently if upstream columns are added or reordered.
column_index = dataset_df.columns
features = column_index[36:46].append([column_index[47:53], column_index[55:]])
target = 'winner'

# Keep only the rows where every selected feature is present.
df = dataset_df.dropna(subset=features)

## Feature Importance
Train an `XGBClassifier` and look at its feature importances.

In [None]:
# Build a float feature matrix and an integer label vector as numpy arrays.
X = df[features].to_numpy().astype(float)
y = df['winner'].to_numpy().astype(int)

# Fit a classifier with the library's default hyperparameters.
model = XGBClassifier()
model.fit(X, y)



In [None]:
# Pull the three importance metrics from the trained booster.
booster = model.get_booster()
importance_weight = booster.get_score(importance_type='weight')
importance_gain = booster.get_score(importance_type='gain')
importance_cover = booster.get_score(importance_type='cover')

# Score keys look like 'f0', 'f1', ...; the number after the 'f' is used as a
# position in `features`. One row is built per key of the weight scores.
importance_rows = [
    {
        'Feature': features[int(score_key[1:])],
        'Weight': importance_weight.get(score_key, 0),
        'Gain': importance_gain.get(score_key, 0),
        'Cover': importance_cover.get(score_key, 0),
    }
    for score_key in importance_weight
]
importance_df = pd.DataFrame(
    importance_rows, columns=['Feature', 'Weight', 'Gain', 'Cover']
).sort_values(by='Weight', ascending=False)

# Show the 15 most frequently used features.
print(importance_df.head(15))

**Weight:** Number of times a feature is used in a split across all trees. High weight means the feature is frequently used in decision-making.


In [None]:
# Top 20 features ordered by the 'Weight' column, largest first.
ranked_by_weight = importance_df.sort_values(by='Weight', ascending=False)
print(ranked_by_weight.head(20))

**Gain:** The average gain (reduction in the training loss) across all splits that use the feature. High gain means the feature has a strong impact on predictions.

In [None]:
# Top 20 features ordered by the 'Gain' column, largest first.
ranked_by_gain = importance_df.sort_values(by='Gain', ascending=False)
print(ranked_by_gain.head(20))

**Cover:** The average coverage across all splits that use the feature, which roughly reflects how many samples those splits affect.
High cover means the feature is applied to a large number of samples.

In [None]:
# Top 15 features ordered by the 'Cover' column, largest first.
ranked_by_cover = importance_df.sort_values(by='Cover', ascending=False)
print(ranked_by_cover.head(15))

In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = ['matchup_1'] # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Plot a subsample to keep the KDE grid tractable. The fixed seed makes the
# figure reproducible on re-run, and the size is capped because
# DataFrame.sample raises when n exceeds the number of rows.
pairplot_rows = df.sample(n=min(10_000, len(df)), random_state=42)

# Create the pair plot
sns.pairplot(
    pairplot_rows,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=target,  # Color the densities by the set outcome (`winner`)
    kind='kde',
)

# Add a title
plt.suptitle("Pair Plot", y=1.02)
plt.show()


In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = ['matchup_1'] # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Filter rows: keep sets where neither player's default Elo equals 1500.0.
# NOTE(review): 1500.0 is presumably the starting rating, so this keeps players seen before — confirm.
plotting_df = df[(df['p1_default_elo'] != 1500.0) & (df['p2_default_elo'] != 1500.0)]

# Balanced sample: up to n_samples / 2 rows per outcome. The seed keeps the
# figure reproducible; the cap avoids a ValueError when a class is small.
n_samples = 1_000
rows_per_outcome = n_samples // 2
won_rows = plotting_df[plotting_df['winner'] == 1.0]
lost_rows = plotting_df[plotting_df['winner'] == 0.0]
winner_sample = won_rows.sample(n=min(rows_per_outcome, len(won_rows)), random_state=42)
loser_sample = lost_rows.sample(n=min(rows_per_outcome, len(lost_rows)), random_state=42)
sample_df = pd.concat([winner_sample, loser_sample])

# Create the pair plot
sns.pairplot(
    sample_df,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=target,  # Color the densities by the set outcome (`winner`)
    kind='kde',
)

# Add a title
plt.suptitle("Both players appeared in the dataset before", y=1.02)
plt.show()

In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = 'matchup_1' # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Rows where matchup_1 equals the outcome of this set.
agree_win_rows = df[(df['matchup_1'] == 1) & (df['winner'] == 1)]
agree_loss_rows = df[(df['matchup_1'] == 0) & (df['winner'] == 0)]

print("Rows for (matchup_1 == 1) & (winner == 1):", agree_win_rows.shape[0])
print("Rows for (matchup_1 == 0) & (winner == 0):", agree_loss_rows.shape[0])

# Up to `i` rows per group. The seed keeps the figure reproducible; the cap
# avoids a ValueError when a group has fewer than `i` rows.
i = 500
winner_sample = agree_win_rows.sample(n=min(i, len(agree_win_rows)), random_state=42)
loser_sample = agree_loss_rows.sample(n=min(i, len(agree_loss_rows)), random_state=42)
sample_df = pd.concat([winner_sample, loser_sample])

# Create the pair plot
sns.pairplot(
    sample_df,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=extra_variable,  # Use matchup_1 to color the plots
    kind='kde',
)

# Add a title
plt.suptitle("Player most recent set agrees with the outcome of this set", y=1.02)
plt.show()

In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = 'matchup_1' # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Rows where matchup_1 is the opposite of the outcome of this set.
disagree_win_rows = df[(df['matchup_1'] == 0) & (df['winner'] == 1)]
disagree_loss_rows = df[(df['matchup_1'] == 1) & (df['winner'] == 0)]

# The labels now state the filters that are actually counted; they were
# previously copied from the "agrees" cell and described the wrong rows.
print("Rows for (matchup_1 == 0) & (winner == 1):", disagree_win_rows.shape[0])
print("Rows for (matchup_1 == 1) & (winner == 0):", disagree_loss_rows.shape[0])

# Up to `i` rows per group. The seed keeps the figure reproducible; the cap
# avoids a ValueError when a group has fewer than `i` rows.
i = 500
winner_sample = disagree_win_rows.sample(n=min(i, len(disagree_win_rows)), random_state=42)
loser_sample = disagree_loss_rows.sample(n=min(i, len(disagree_loss_rows)), random_state=42)
sample_df = pd.concat([winner_sample, loser_sample])

# Create the pair plot
sns.pairplot(
    sample_df,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=extra_variable,  # Use matchup_1 to color the plots
    kind='kde',
)

# Add a title
plt.suptitle("Player's most recent set disagrees with the outcome of this set", y=1.02)
plt.show()

In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = 'matchup_1' # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Rows per group; set here so the cell does not rely on a value left behind
# by an earlier cell.
i = 500

# matchup_1 equals the outcome while matchup_2 is its opposite, matching the
# figure title. The same filter now drives both the population and the sample
# size, so `n` can never exceed the number of available rows.
# NOTE(review): the sampled rows previously used matchup_2 == matchup_1 while the
# size check used the filter below — confirm this is the intended population.
agree_then_disagree_win = df[
    (df['matchup_1'] == 1) & (df['matchup_2'] == 0) & (df['winner'] == 1)
]
agree_then_disagree_loss = df[
    (df['matchup_1'] == 0) & (df['matchup_2'] == 1) & (df['winner'] == 0)
]

# Filter and sample rows for winner
winner_sample = agree_then_disagree_win.sample(
    n=min(i, len(agree_then_disagree_win)), random_state=42
)

# Filter and sample rows for loser
loser_sample = agree_then_disagree_loss.sample(
    n=min(i, len(agree_then_disagree_loss)), random_state=42
)

# Rebuild the plotted frame; without this the figure would show whatever
# `sample_df` an earlier cell produced.
sample_df = pd.concat([winner_sample, loser_sample])

# Create the pair plot
sns.pairplot(
    sample_df,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=extra_variable,  # Use matchup_1 to color the plots
    kind='kde',
)

# Add a title
plt.suptitle("The most recent set agrees while the second most recent set dissagrees with the outcome of the set", y=1.02)
plt.show()

In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = 'matchup_1' # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Rows per group; set here so the cell does not rely on a value left behind
# by an earlier cell.
i = 500

# matchup_1 is the opposite of the outcome while matchup_2 equals it.
disagree_then_agree_win = df[
    (df['matchup_1'] == 0) & (df['matchup_2'] == 1) & (df['winner'] == 1)
]
disagree_then_agree_loss = df[
    (df['matchup_1'] == 1) & (df['matchup_2'] == 0) & (df['winner'] == 0)
]

# Filter and sample rows for winner
winner_sample = disagree_then_agree_win.sample(
    n=min(i, len(disagree_then_agree_win)), random_state=42
)

# Filter and sample rows for loser
loser_sample = disagree_then_agree_loss.sample(
    n=min(i, len(disagree_then_agree_loss)), random_state=42
)

# Rebuild the plotted frame; without this the figure would show whatever
# `sample_df` an earlier cell produced.
sample_df = pd.concat([winner_sample, loser_sample])

# Create the pair plot
sns.pairplot(
    sample_df,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=extra_variable,  # Use matchup_1 to color the plots
    kind='kde',
)

# Add a title
plt.suptitle("The most recent set disagrees while the second most recent set agrees with the outcome of the set", y=1.02)
plt.show()

In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = ['matchup_1'] # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Filter rows: player 1's default Elo differs from 1500.0 while player 2's is
# exactly 1500.0.
# NOTE(review): 1500.0 is presumably the starting rating, i.e. player 2 is new — confirm.
plotting_df = df[(df['p1_default_elo'] != 1500.0) & (df['p2_default_elo'] == 1500.0)]

# Up to `i` rows per outcome. The seed keeps the figure reproducible; the cap
# avoids a ValueError when an outcome has fewer than `i` rows.
i = 500
won_rows = plotting_df[plotting_df['winner'] == 1.0]
lost_rows = plotting_df[plotting_df['winner'] == 0.0]
winner_sample = won_rows.sample(n=min(i, len(won_rows)), random_state=42)
loser_sample = lost_rows.sample(n=min(i, len(lost_rows)), random_state=42)
sample_df = pd.concat([winner_sample, loser_sample])

# Create the pair plot
sns.pairplot(
    sample_df,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=target,  # Color the densities by the set outcome (`winner`)
    kind='kde',
)

# Add a title that states the filter applied above (the previous title
# described a different filter, with both players away from 1500).
plt.suptitle("Player 1 Elo != 1500, player 2 Elo == 1500", y=1.02)
plt.show()

In [None]:
# Player-1 `*_elo` / `*_rd` columns go on the x-axis, the matching player-2
# columns on the y-axis.
x_vars = ['p1_default_elo',  'p1/m1_alt3_elo', 'p1/m1/m1_alt2_elo', 'p1_default_rd', 'p1/m1_alt3_rd',  'p1/m1/m1_alt2_rd']
y_vars = ['p2_default_elo',  'p2/m1_alt3_elo', 'p2/m1/m1_alt2_elo', 'p2_default_rd','p2/m1_alt3_rd',  'p2/m1/m1_alt2_rd']
extra_variable = 'matchup_1' # Can be either 1.0, 0.0, or 0.5
target = 'winner' # Can be 1 or 0

# Filter rows: neither default Elo equals 1500.0 and winner == 1.0.
# The frame is rebuilt from `df` so the result no longer depends on which
# earlier cell last assigned `plotting_df`.
# NOTE(review): the both-players filter is taken from this cell's title and its
# previously commented-out filter line — confirm this is the intended population.
plotting_df = df[
    (df['p1_default_elo'] != 1500.0)
    & (df['p2_default_elo'] != 1500.0)
    & (df['winner'] == 1.0)
]

# Up to n_samples / 2 rows per matchup_1 value. The seed keeps the figure
# reproducible; the cap avoids a ValueError when a group is small.
n_samples = 1_000
rows_per_group = n_samples // 2
matchup_one_rows = plotting_df[plotting_df['matchup_1'] == 1.0]
matchup_zero_rows = plotting_df[plotting_df['matchup_1'] == 0.0]
winner_sample = matchup_one_rows.sample(n=min(rows_per_group, len(matchup_one_rows)), random_state=42)
loser_sample = matchup_zero_rows.sample(n=min(rows_per_group, len(matchup_zero_rows)), random_state=42)
sample_df = pd.concat([winner_sample, loser_sample])

# Create the pair plot
sns.pairplot(
    sample_df,
    x_vars=x_vars,
    y_vars=y_vars,
    hue=extra_variable,  # Use matchup_1 to color the plots
    kind='kde',
)

# Add a title
plt.suptitle("Both players appeared in the dataset before", y=1.02)
plt.show()

**TODO:** Shade the regions above and overlay new outlined regions for `matchup_1 = 1.0`, then see how that shifts the regions.

### Look at the default Elo

In [None]:
# Work on a copy holding only the columns used in the default-Elo analysis.
default_elo_columns = ['winner', 'p1_default_elo', 'p2_default_elo', 'p1_default_rd', 'p2_default_rd', 'p1_default_updates', 'p2_default_updates', 'matchup_1']
default_elo_df = df[default_elo_columns].copy()
default_elo_df['elo_difference'] = default_elo_df['p1_default_elo'] - default_elo_df['p2_default_elo']

# Drop sets whose Elo difference is exactly zero. The explicit copy makes the
# column assignments below target an independent frame rather than a filtered
# slice (which triggers SettingWithCopyWarning on pandas < 3).
default_elo_df = default_elo_df[default_elo_df['elo_difference'] != 0].copy()

# Smaller and larger of the two players' RD values, computed from the
# filtered frame itself so no index re-alignment against `df` is needed.
default_elo_df['min_rd'] = np.minimum(default_elo_df['p1_default_rd'], default_elo_df['p2_default_rd'])
default_elo_df['max_rd'] = np.maximum(default_elo_df['p1_default_rd'], default_elo_df['p2_default_rd'])

# Elo difference with the sign flipped on rows where winner == 0.0.
p2_won = default_elo_df['winner']==0.0
default_elo_df['winning_difference'] = default_elo_df['elo_difference'].where(
    ~p2_won, -default_elo_df['elo_difference']
)

In [None]:
# Preview the first five rows of the derived frame.
default_elo_df.head(n=5)

In [None]:

# Kernel density estimate of the 'winning_difference' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(
    data=default_elo_df,
    x='winning_difference',
    fill=True,
    color='blue',
    alpha=0.6,
    ax=ax,
)
ax.set_title('Distribution of Winning Difference')
ax.set_xlabel('Winning Difference')
ax.set_ylabel('Density')
ax.grid()
plt.show()


In [None]:
# np, pd, sns and plt come from the notebook's import cell.

# Define bins for Elo difference: fixed-width edges from the minimum upwards.
elo_bin_size = 100
elo_difference = default_elo_df['elo_difference']
elo_bin_edges = np.arange(
    elo_difference.min(),
    elo_difference.max() + elo_bin_size,
    elo_bin_size,
)
# include_lowest=True keeps the row(s) sitting exactly on the first edge;
# with the default right-closed intervals they would be left unbinned (NaN).
default_elo_df['elo_bin'] = pd.cut(elo_difference, bins=elo_bin_edges, include_lowest=True)

# Mean of `winner` within each bin. observed=False is passed explicitly so
# empty bins are kept (as NaN) regardless of the pandas default.
binned_probabilities = (
    default_elo_df.groupby('elo_bin', observed=False)['winner']
    .mean()
    .reset_index()
)

# Add bin midpoints for the x-axis. The cast to float gives the plot a
# numeric column; mapping a categorical column can otherwise stay categorical.
binned_probabilities['elo_bin_midpoint'] = (
    binned_probabilities['elo_bin']
    .apply(lambda interval: (interval.left + interval.right) / 2)
    .astype(float)
)

# Plot the per-bin probabilities
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=binned_probabilities,
    x='elo_bin_midpoint',
    y='winner',
    label='P(Winner = 1.0 | Elo Bin)',
    color='blue'
)
plt.axhline(0.5, color='red', linestyle='--', label='Baseline (50%)')
plt.title('Probability of Winning by Binned Elo Difference')
# elo_difference is p1_default_elo - p2_default_elo, so the axis is signed.
plt.xlabel('Elo Difference (p1 - p2)')
plt.ylabel('Probability Winner = 1.0')
plt.legend()
plt.grid()
plt.show()


In [None]:
elo_bin_size = 25
max_elo_bin_value = 500

# Signed Elo bins. Subtracting half a bin before the ceil shifts the bin
# boundaries by half a bin size, and the magnitude is capped at
# max_elo_bin_value.
elo_magnitude = np.abs(default_elo_df['elo_difference'])
elo_bin_steps = np.minimum(
    np.ceil((elo_magnitude - elo_bin_size / 2) / elo_bin_size),
    max_elo_bin_value / elo_bin_size,
)
default_elo_df['binned_elo_difference'] = (
    np.sign(default_elo_df['elo_difference']) * elo_bin_steps * elo_bin_size
).astype(np.int16)

rd_bin_size = 100
# NOTE(review): max_rd_bin_value is defined but never applied, so RD bins are not capped — confirm whether a cap was intended.
max_rd_bin_value = 300

# Round each player's RD up to the next multiple of rd_bin_size.
default_elo_df['p1_binned_rd'] = (
    np.ceil(default_elo_df['p1_default_rd'] / rd_bin_size) * rd_bin_size
).astype(np.int16)

default_elo_df['p2_binned_rd'] = (
    np.ceil(default_elo_df['p2_default_rd'] / rd_bin_size) * rd_bin_size
).astype(np.int16)

# Store the binned min/max RD under the names the later groupby cells read
# ('binned_min_rd' / 'binned_max_rd'). Writing them to 'min_rd' / 'max_rd'
# left those cells without their columns and overwrote the raw RD values.
default_elo_df['binned_min_rd'] = np.minimum(default_elo_df['p1_binned_rd'], default_elo_df['p2_binned_rd'])
default_elo_df['binned_max_rd'] = np.maximum(default_elo_df['p1_binned_rd'], default_elo_df['p2_binned_rd'])

In [None]:
# Preview the first five rows, now including the binned columns.
default_elo_df.head(n=5)

In [None]:
# sns, plt and pd come from the notebook's import cell.

# Mean of `winner` per Elo-difference bin over all rows.
overall_probabilities = (
    default_elo_df.groupby('binned_elo_difference')['winner']
    .mean()
    .reset_index()
    .rename(columns={'winner': 'probability_winner_1'})
)

# The same mean, additionally split by player 1's binned RD.
p1_probabilities = (
    default_elo_df.groupby(['binned_elo_difference', 'p1_binned_rd'])['winner']
    .mean()
    .reset_index()
    .rename(columns={'winner': 'probability_winner_1'})
)

# The same mean, additionally split by player 2's binned RD.
p2_probabilities = (
    default_elo_df.groupby(['binned_elo_difference', 'p2_binned_rd'])['winner']
    .mean()
    .reset_index()
    .rename(columns={'winner': 'probability_winner_1'})
)

# One figure per player: the overall curve in black with one curve per RD bin
# drawn over it.
rd_panels = [
    ('p1_binned_rd', p1_probabilities, 'viridis'),
    ('p2_binned_rd', p2_probabilities, 'plasma'),
]
for rd_column, rd_probabilities, palette_name in rd_panels:
    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=overall_probabilities,
        x='binned_elo_difference',
        y='probability_winner_1',
        label='Overall',
        color='black',
        linewidth=2,
    )
    sns.lineplot(
        data=rd_probabilities,
        x='binned_elo_difference',
        y='probability_winner_1',
        hue=rd_column,
        palette=palette_name,
    )
    plt.title(f'Effect of {rd_column} on Probability of Winner Being 1.0')
    plt.xlabel('Binned Elo Difference')
    plt.ylabel('Probability Winner = 1.0')
    plt.legend(title=rd_column)
    plt.grid()
    plt.show()


In [None]:
# sns and plt come from the notebook's import cell.

# Mean of `winner` for every (binned_elo_difference, binned_min_rd) pair.
# NOTE(review): expects a 'binned_min_rd' column on default_elo_df — confirm it is created upstream.
probabilities = (
    default_elo_df
    .groupby(['binned_elo_difference', 'binned_min_rd'])['winner']
    .mean()
    .rename('probability_winner_1')
    .reset_index()
)

# One panel per binned_min_rd value, three panels per row, shared y-axis.
facet_options = dict(col="binned_min_rd", col_wrap=3, sharey=True, height=4, aspect=1.5)
g = sns.FacetGrid(probabilities, **facet_options)
g.map(sns.lineplot, 'binned_elo_difference', 'probability_winner_1', marker='o')

# Axis labels and per-panel titles
g.set_axis_labels('Binned Elo Difference', 'Probability Winner = 1.0')
g.set_titles('Min RD = {col_name}')
g.tight_layout()

plt.show()


In [None]:
# sns and plt come from the notebook's import cell.

# Mean of `winner` for every (binned_elo_difference, binned_max_rd) pair.
# NOTE(review): expects a 'binned_max_rd' column on default_elo_df — confirm it is created upstream.
probabilities = (
    default_elo_df
    .groupby(['binned_elo_difference', 'binned_max_rd'])['winner']
    .mean()
    .rename('probability_winner_1')
    .reset_index()
)

# One panel per binned_max_rd value, three panels per row, shared y-axis.
facet_options = dict(col="binned_max_rd", col_wrap=3, sharey=True, height=4, aspect=1.5)
g = sns.FacetGrid(probabilities, **facet_options)
g.map(sns.lineplot, 'binned_elo_difference', 'probability_winner_1', marker='o')

# Axis labels and per-panel titles
g.set_axis_labels('Binned Elo Difference', 'Probability Winner = 1.0')
g.set_titles('Max RD = {col_name}')
g.tight_layout()

plt.show()

In [None]:
# pd, sns and plt come from the notebook's import cell.

# Mean of `winner` per (Elo bin, min-RD bin), tagged with its RD type.
# NOTE(review): expects 'binned_min_rd' / 'binned_max_rd' columns on default_elo_df — confirm they are created upstream.
min_rd_probabilities = (
    default_elo_df.groupby(['binned_elo_difference', 'binned_min_rd'])['winner']
    .mean()
    .reset_index()
    .rename(columns={'winner': 'probability_winner_1'})
    .assign(**{'RD Type': 'Min RD'})
)

# Same aggregation keyed on the max-RD bin.
max_rd_probabilities = (
    default_elo_df.groupby(['binned_elo_difference', 'binned_max_rd'])['winner']
    .mean()
    .reset_index()
    .rename(columns={'winner': 'probability_winner_1'})
    .assign(**{'RD Type': 'Max RD'})
)

# Stack both tables so 'RD Type' can drive the hue and marker style.
combined_probabilities = pd.concat(
    [min_rd_probabilities, max_rd_probabilities],
    ignore_index=True,
)

# Single line plot with one line per RD type.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(
    data=combined_probabilities,
    x='binned_elo_difference',
    y='probability_winner_1',
    hue='RD Type',
    style='RD Type',
    markers=True,
    dashes=False,
)

# Titles and labels
ax.set_title('Probability of Winner Being 1.0 by Binned Elo Difference (Min RD vs Max RD)')
ax.set_xlabel('Binned Elo Difference')
ax.set_ylabel('Probability Winner = 1.0')
ax.legend(title='RD Type')
ax.grid()

# Show the plot
plt.show()


In [None]:
# sns and plt come from the notebook's import cell.

# Mean of `winner` for every (binned_elo_difference, matchup_1) pair.
probabilities = (
    default_elo_df
    .groupby(['binned_elo_difference', 'matchup_1'])['winner']
    .mean()
    .rename('probability_winner_1')
    .reset_index()
)

# One line per matchup_1 value.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(
    data=probabilities,
    x='binned_elo_difference',
    y='probability_winner_1',
    hue='matchup_1',
    marker='o',
    palette='viridis',
)

# Titles and labels
ax.set_title('Probability of Winner Being 1.0 by Binned Elo Difference and Matchup')
ax.set_xlabel('Binned Elo Difference')
ax.set_ylabel('Probability Winner = 1.0')
ax.legend(title='Matchup 1')
ax.grid()

# Show the plot
plt.show()
