# Additional columns for sets_df
We have already run jaspar_label_majors.ipynb, jaspar_top_8_tournamen_path.ipynb, and jaspar_top_8.ipynb,
which produced the following files:
- (data_path + 'top_8_tournament_previous_sets_and_results_df') 
- (data_path + 'sets_top_8_labeled_df.pkl')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score
from sklearn.model_selection import train_test_split  # Correct import

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

# Use the '/workspace/data' directory when it exists; otherwise fall back to
# the data directory relative to this notebook.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [None]:
def get_table_names(conn):
    """Return the names of all tables in the SQLite database.

    Args:
        conn: An open database connection accepted by ``pd.read_sql``
            (for example a ``sqlite3`` connection).

    Returns:
        list: The table names listed in ``sqlite_master``.
    """
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()


def load_tables_to_dfs(conn):
    """Load every table of the database into a pandas DataFrame.

    Columns in which every value is a string are passed through
    ``json.loads``. If all values of such a column parse, the column is
    replaced by the decoded Python objects; if any value fails to parse,
    the whole column is left untouched. Note that ``json.loads`` also
    accepts JSON scalars, so a column of strings such as ``"12"`` or
    ``"true"`` is decoded as well, not only lists and dictionaries.

    Args:
        conn: An open database connection accepted by ``pd.read_sql``.

    Returns:
        dict: Maps ``"<table name>_df"`` to the DataFrame for that table.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the identifier (doubling embedded quotes) so that table names
        # containing spaces, punctuation or SQL keywords still form a valid
        # query.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        for col in df.columns:
            # Only columns made up entirely of strings are JSON candidates.
            if not df[col].apply(lambda x: isinstance(x, str)).all():
                continue
            try:
                df[col] = df[col].apply(json.loads)
            except (json.JSONDecodeError, TypeError):
                # At least one value is not valid JSON. The assignment above
                # never happened, so the column keeps its original strings.
                continue

        # Store the DataFrame under the table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Populate `dfs` (a dict mapping '<table>_df' to a DataFrame).
# Prefer the cached pickle; otherwise read the tables from the SQLite database.
# `cell_has_run` is a module-level flag that stops the database load from
# being repeated when this cell is executed again.
if os.path.exists(data_path + 'dfs_dict.pkl'):
    # NOTE: unpickling can execute arbitrary code, so only load a cache file
    # that this project wrote itself.
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
    # Set the flag only after the cache was read successfully.
    cell_has_run = True
elif 'cell_has_run' not in globals():
    # The original line had a stray unary '+' in front of data_path, which
    # raises TypeError for a string operand.
    path = data_path + "melee_player_database.db"

    # Connect to the database and make sure the connection is closed even if
    # loading one of the tables fails.
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # The start/end columns are parsed as Unix timestamps in seconds.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Load Data


In [None]:
# players_df = pd.read_pickle(data_path + '/labelled_data/players_df.pkl')
# players_df.head()


In [None]:
# Load the labelled sets written by the earlier notebooks.
sets_path = data_path + '/sets_top_8_labeled_df.pkl'
sets_df = pd.read_pickle(sets_path)
# sets_df = dfs['sets_df']

# Report the share of sets that carry at least one game record.
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)
game_data_share = sets_df[has_game_data].shape[0] / sets_df.shape[0]
print(f"{game_data_share:0.01%} percent of sets have some game data")
print(sets_df.shape)
sets_df.head(3)


In [None]:
# Load the tournament table written by the earlier notebooks and show its size.
tournament_info_path = data_path + '/top_8_tournament_previous_sets_and_results_df'
tournament_info_df = pd.read_pickle(tournament_info_path)
print(tournament_info_df.shape)
tournament_info_df.head(3)


### Add loser_id column

In [None]:
# The loser is Player 2 whenever Player 1 is the recorded winner; in every
# other case the loser is taken to be Player 1.
p1_is_winner = sets_df['winner_id'] == sets_df['p1_id']
sets_df['loser_id'] = sets_df['p1_id'].mask(p1_is_winner, sets_df['p2_id'])

# Reorder the columns so that loser_id sits next to winner_id.
column_order = [
    'key', 'game', 'tournament_key', 'winner_id', 'loser_id', 'p1_id', 'p2_id',
    'p1_score', 'p2_score', 'location_names', 'bracket_name',
    'bracket_order', 'set_order', 'best_of', 'game_data', 'top_8',
    'top_8_location_names', 'valid_top_8_bracket',
    'top_8_bracket_location_names', 'major',
]
sets_df = sets_df[column_order]
sets_df.head()

### Add valid_score column
This column will be true if the match was a best of 3 or best of 5 with one player getting the score needed to win.

In [None]:
# A score is "valid" when the set is a best of 3 or a best of 5, the recorded
# winner has exactly the number of game wins that format requires, and the
# other player has fewer wins than that.
#
# The flag is built as one boolean mask aligned row-by-row with sets_df. It is
# then attached positionally, so the result does not depend on the index
# labels being unique.
wins_needed_by_format = {3: 2, 5: 3}

p1_is_winner = sets_df['winner_id'] == sets_df['p1_id']
p2_is_winner = sets_df['winner_id'] == sets_df['p2_id']

valid_score = pd.Series(False, index=sets_df.index)
for best_of, wins_needed in wins_needed_by_format.items():
    # Scores the losing player may have: 0 .. wins_needed - 1
    losing_scores = list(range(wins_needed))
    p1_valid = (
        p1_is_winner
        & (sets_df['p1_score'] == wins_needed)
        & sets_df['p2_score'].isin(losing_scores)
    )
    p2_valid = (
        p2_is_winner
        & (sets_df['p2_score'] == wins_needed)
        & sets_df['p1_score'].isin(losing_scores)
    )
    valid_score |= (sets_df['best_of'] == best_of) & (p1_valid | p2_valid)

# Attach the flag and place it next to the score columns.
sets_df = sets_df.assign(valid_score=valid_score.to_numpy())[[
    'key', 'game', 'tournament_key', 'winner_id', 'loser_id', 'p1_id',
    'p2_id', 'p1_score', 'p2_score', 'valid_score', 'best_of', 'location_names', 'bracket_name',
    'bracket_order', 'set_order', 'game_data', 'top_8',
    'top_8_location_names', 'valid_top_8_bracket',
    'top_8_bracket_location_names', 'major',
]]

# All three shares below are relative to the whole dataset.
print(f"Sets with a valid score make up {sets_df['valid_score'].sum() / sets_df.shape[0]:.2%} of the dataset.")
print(f"Best of 3s with a valid score make up {sets_df[sets_df['best_of']==3]['valid_score'].sum() / sets_df.shape[0]:.2%} of the dataset.")
print(f"Best of 5s with a valid score make up {sets_df[sets_df['best_of']==5]['valid_score'].sum() / sets_df.shape[0]:.2%} of the dataset.")

sets_df.head(3)


### Make some plots

In [None]:
# Score lines (p1_score, p2_score) of the valid sets, split by format.
bo3_results = sets_df[(sets_df['best_of']==3) & (sets_df['valid_score']==True)][['p1_score','p2_score']]
bo3_result_ordered_labels = ['(2, 0)', '(2, 1)', '(1, 2)', '(0, 2)']

bo5_results = sets_df[(sets_df['best_of']==5) & (sets_df['valid_score']==True)][['p1_score','p2_score']]
bo5_result_ordered_labels = ['(3, 0)', '(3, 1)', '(3, 2)', '(2, 3)', '(1, 3)', '(0, 3)']

# Calculate counts and percentages for Best of 3. Reindexing fixes the bar
# order and gives score lines that never occur a count of 0.
bo3_counts = bo3_results.value_counts(subset=['p1_score', 'p2_score']).reindex(
    [(2, 0), (2, 1), (1, 2), (0, 2)], fill_value=0
)
bo3_percentages = (bo3_counts / bo3_counts.sum()) * 100

# Calculate counts and percentages for Best of 5
bo5_counts = bo5_results.value_counts(subset=['p1_score', 'p2_score']).reindex(
    [(3, 0), (3, 1), (3, 2), (2, 3), (1, 3), (0, 3)], fill_value=0
)
bo5_percentages = (bo5_counts / bo5_counts.sum()) * 100

# Create the figure and axes
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# One panel per format: (axis, title, bar labels, bar heights, bar colour)
panels = [
    (axes[0], 'Best of 3 Results', bo3_result_ordered_labels, bo3_percentages, 'blue'),
    (axes[1], 'Best of 5 Results', bo5_result_ordered_labels, bo5_percentages, 'green'),
]
for ax, title, labels, percentages, color in panels:
    ax.bar(labels, percentages, color=color, alpha=0.7)
    ax.set_title(title)
    ax.set_ylabel('Percentage (%)')
    ax.set_xlabel('Results')
    ax.set_ylim(0, 100)
    for i, pct in enumerate(percentages):
        # Round to the nearest percent. int(pct) truncated (49.9 -> 49%) and
        # raised ValueError on NaN, which occurs when a format has no valid sets.
        ax.text(i, pct + 1, f'{pct:.0f}%', ha='center')

# Adjust layout and show the plot
plt.tight_layout()
plt.show()


We tried some more advanced plots, but they did not quite work. We may revisit them later.

In [None]:
# Score lines (p1_score, p2_score) of the valid sets, split by format.
bo3_results = sets_df[(sets_df['best_of']==3) & (sets_df['valid_score']==True)][['p1_score','p2_score']]
bo3_result_ordered_labels = ['(2, 0)', '(2, 1)', '(1, 2)', '(0, 2)']

bo5_results = sets_df[(sets_df['best_of']==5) & (sets_df['valid_score']==True)][['p1_score','p2_score']]
bo5_result_ordered_labels = ['(3, 0)', '(3, 1)', '(3, 2)', '(2, 3)', '(1, 3)', '(0, 3)']

# Calculate counts and percentages for Best of 3
bo3_counts = bo3_results.value_counts(subset=['p1_score', 'p2_score']).reindex(
    [(2, 0), (2, 1), (1, 2), (0, 2)], fill_value=0
)
bo3_percentages = (bo3_counts / bo3_counts.sum()) * 100

# Calculate aggregate percentages for Player 1 and Player 2 wins in Best of 3
bo3_p1_wins_percent = bo3_percentages.loc[(2, 0)] + bo3_percentages.loc[(2, 1)]
bo3_p2_wins_percent = bo3_percentages.loc[(1, 2)] + bo3_percentages.loc[(0, 2)]

# Calculate counts and percentages for Best of 5
bo5_counts = bo5_results.value_counts(subset=['p1_score', 'p2_score']).reindex(
    [(3, 0), (3, 1), (3, 2), (2, 3), (1, 3), (0, 3)], fill_value=0
)
bo5_percentages = (bo5_counts / bo5_counts.sum()) * 100

# Calculate aggregate percentages for Player 1 and Player 2 wins in Best of 5
bo5_p1_wins_percent = bo5_percentages.loc[(3, 0)] + bo5_percentages.loc[(3, 1)] + bo5_percentages.loc[(3, 2)]
bo5_p2_wins_percent = bo5_percentages.loc[(2, 3)] + bo5_percentages.loc[(1, 3)] + bo5_percentages.loc[(0, 3)]

# Create the figure and axes
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One panel per format. In both label lists the first half of the score lines
# are Player 1 wins and the second half are Player 2 wins.
# Each aggregate entry is (total percentage, fill colour, label colour).
panels = [
    {
        'ax': axes[0],
        'title': 'Best of 3 Results',
        'labels': bo3_result_ordered_labels,
        'percentages': bo3_percentages,
        'bar_color': 'blue',
        'p1': (bo3_p1_wins_percent, 'lightblue', 'blue'),
        'p2': (bo3_p2_wins_percent, 'lightcoral', 'red'),
    },
    {
        'ax': axes[1],
        'title': 'Best of 5 Results',
        'labels': bo5_result_ordered_labels,
        'percentages': bo5_percentages,
        'bar_color': 'green',
        'p1': (bo5_p1_wins_percent, 'lightgreen', 'green'),
        'p2': (bo5_p2_wins_percent, 'orange', 'orange'),
    },
]

for panel in panels:
    ax = panel['ax']
    labels = panel['labels']

    # Plot on numeric positions (0, 1, 2, ...) and attach the score lines as
    # tick labels afterwards. This lets an aggregate bar be placed at a
    # fractional x position between two individual bars.
    positions = np.arange(len(labels))
    half = len(labels) // 2

    ax.bar(
        positions,
        panel['percentages'].to_numpy(),
        color=panel['bar_color'],
        alpha=0.7,
        label='Individual Result',
    )

    aggregate_groups = (
        (panel['p1'], positions[:half], 'Player 1 Wins'),
        (panel['p2'], positions[half:], 'Player 2 Wins'),
    )
    for (total, fill_color, text_color), group, legend_label in aggregate_groups:
        # Draw ONE background bar per player, centred on the player's group
        # and exactly as wide as the group. Previously one wide bar was drawn
        # per score line, so the bars overlapped each other and spilled over
        # the other player's results.
        center = group.mean()
        ax.bar(
            center,
            total,
            width=len(group),
            color=fill_color,
            alpha=0.5,
            zorder=0,  # keep the aggregate behind the individual bars
            label=legend_label,
        )
        # Label the aggregate bar at its centre.
        ax.text(center, total + 2, f'{total:.0f}%', ha='center', color=text_color)

    # Label the individual bars (rounded to the nearest percent).
    for i, pct in enumerate(panel['percentages']):
        ax.text(i, pct + 1, f'{pct:.0f}%', ha='center')

    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.set_title(panel['title'])
    ax.set_ylabel('Percentage (%)')
    ax.set_xlabel('Results')
    ax.set_ylim(0, 100)
    ax.legend(loc="upper left")

# Adjust layout and show the plot
plt.tight_layout()
plt.show()



## Extract character data
First restrict to sets with game data.

In [None]:
# Keep the sets that have a non-empty game_data list and a valid score.
has_game_data = sets_df['game_data'].apply(lambda x: x != [])
has_valid_score = sets_df['valid_score'] == True
candidate_sets = sets_df[has_game_data & has_valid_score].copy()

# Record how many games each set lists and keep only sets with 2 to 5 games.
candidate_sets['length_gamedata'] = candidate_sets['game_data'].apply(len)
plausible_length = candidate_sets['length_gamedata'].isin([2,3,4,5])
sets_with_game_data_df = candidate_sets[plausible_length]

print(f"Sets with game data comprise {sets_with_game_data_df.shape[0]/sets_df.shape[0]:.2%} of the data.")

In [None]:
# Rebind `tqdm` to the class from the top-level tqdm package and register its
# pandas integration, which supplies the `progress_apply` method used later
# in this notebook.
from tqdm import tqdm

tqdm.pandas()

def _character_name(raw_char):
    """Return the character part of a ``'<prefix>/<character>'`` identifier.

    A missing/empty identifier yields ``None``. An identifier without a
    ``'/'`` is returned unchanged instead of raising ``IndexError``.
    """
    if not raw_char:
        return None
    parts = raw_char.split('/')
    return parts[1] if len(parts) > 1 else parts[0]


def extract_character_data(game_data, p1_id, p2_id):
    """Summarise the characters used in one set.

    Args:
        game_data: Iterable of per-game dicts with the keys ``'winner_id'``,
            ``'loser_id'``, ``'winner_char'`` and ``'loser_char'``.
        p1_id: Id of Player 1. Compared as a string, so ints are accepted.
        p2_id: Id of Player 2. Compared as a string, so ints are accepted.

    Returns:
        tuple: ``(p1_characters, p2_characters, p1_changed, p2_changed,
        matchup_strings)`` where

        * ``p1_characters`` / ``p2_characters`` are sorted lists of the
          distinct characters each player used,
        * ``p1_changed`` / ``p2_changed`` are True when that player used more
          than one character,
        * ``matchup_strings`` has one ``"<p1 char>/<p2 char>/<winner>"`` entry
          per counted game, with winner ``'0'`` for Player 1 and ``'1'`` for
          Player 2.

    Games are skipped when either character is missing, or when neither the
    winner id nor the loser id matches one of the two players. The latter
    used to be recorded as a Player 2 win with the loser's character on both
    sides.
    """
    # Game ids are stringified below, so compare against strings here too.
    p1_id, p2_id = str(p1_id), str(p2_id)

    p1_characters = set()
    p2_characters = set()
    matchup_strings = []

    for game in game_data:
        winner_char = _character_name(game['winner_char'])
        loser_char = _character_name(game['loser_char'])

        # Skip if either character is missing
        if not winner_char or not loser_char:
            continue

        winner_id = str(game['winner_id'])
        loser_id = str(game['loser_id'])

        # Work out who won. Either id is enough to attribute the game: in a
        # two-player set, "Player 2 lost" implies "Player 1 won".
        if winner_id == p1_id or loser_id == p2_id:
            p1_char, p2_char, winner = winner_char, loser_char, '0'
        elif winner_id == p2_id or loser_id == p1_id:
            p1_char, p2_char, winner = loser_char, winner_char, '1'
        else:
            # The game cannot be attributed to this set's players.
            continue

        p1_characters.add(p1_char)
        p2_characters.add(p2_char)
        matchup_strings.append(f"{p1_char}/{p2_char}/{winner}")

    return (
        sorted(p1_characters),       # Unique characters played by Player 1
        sorted(p2_characters),       # Unique characters played by Player 2
        len(p1_characters) > 1,      # True if Player 1 changed characters
        len(p2_characters) > 1,      # True if Player 2 changed characters
        matchup_strings              # List of matchup strings
    )

# Columns filled from the tuple returned by extract_character_data, in order.
extracted_columns = ['p1_characters', 'p2_characters', 'p1_changed', 'p2_changed', 'matchup_strings']


def _character_features(row):
    """Run extract_character_data on one set (row), passing both ids as strings."""
    return pd.Series(extract_character_data(row['game_data'], str(row['p1_id']), str(row['p2_id'])))


# Row-wise extraction with a progress bar.
sets_with_game_data_df[extracted_columns] = sets_with_game_data_df.progress_apply(
    _character_features,
    axis=1
)

# A player is "consistent" when they did not change characters.
for changed_col, consistent_col in (('p1_changed', 'p1_consistent'), ('p2_changed', 'p2_consistent')):
    sets_with_game_data_df[consistent_col] = ~sets_with_game_data_df[changed_col]

# Display the structure of the updated DataFrame
preview_columns = ['p1_characters', 'p2_characters', 'p1_consistent', 'p2_consistent', 'matchup_strings']
sets_with_game_data_df[preview_columns].head()


In [None]:
# from collections import Counter
# import pandas as pd
# import matplotlib.pyplot as plt

# # Combine all matchup strings into a single list
# all_matchup_strings = [matchup for matchup_list in sets_with_game_data_df['matchup_strings'] for matchup in matchup_list]

# # Count occurrences of each matchup string
# matchup_string_counts = Counter(all_matchup_strings)

# # Convert the results to a DataFrame for analysis
# matchup_string_df = pd.DataFrame.from_records(
#     list(matchup_string_counts.items()),  # Convert dict_items to a list
#     columns=['matchup_string', 'count']
# )

# # Display the DataFrame
# print(matchup_string_df)

# # Optional: Visualize the top 10 matchups by count
# top_matchups = matchup_string_df.sort_values('count', ascending=False).head(10)
# top_matchups.plot(
#     x='matchup_string', y='count', kind='bar', figsize=(12, 6), legend=False
# )
# plt.title('Top 10 Matchup Strings')
# plt.ylabel('Count')
# plt.xlabel('Matchup String')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import TwoSlopeNorm
from collections import Counter

# Combine all matchup strings into a single list. There is one string per
# game, formatted "<p1 char>/<p2 char>/<winner>" with winner '0' for Player 1
# and '1' for Player 2.
all_matchup_strings = [matchup for matchup_list in sets_with_game_data_df['matchup_strings'] for matchup in matchup_list]

# Count occurrences of each matchup string
matchup_string_counts = Counter(all_matchup_strings)

# Prepare data for heatmaps
matchup_data = []
for matchup, count in matchup_string_counts.items():
    p1_char, p2_char, winner = matchup.split('/')
    p1_wins = int(winner == '0') * count  # Count wins for Player 1
    p2_wins = int(winner == '1') * count  # Count wins for Player 2
    matchup_data.append((p1_char, p2_char, count, p1_wins, p2_wins))

# Convert to DataFrame
matchup_df = pd.DataFrame(matchup_data, columns=['p1_char', 'p2_char', 'count', 'p1_wins', 'p2_wins'])

# Group by characters to calculate total counts and win rates.
# NOTE: 'total_sets' keeps its original name but counts games, because each
# matchup string describes a single game.
grouped = matchup_df.groupby(['p1_char', 'p2_char']).agg(
    total_sets=('count', 'sum'),
    p1_wins=('p1_wins', 'sum'),
    p2_wins=('p2_wins', 'sum')
).reset_index()

# Calculate win rate for Player 1
grouped['p1_win_rate'] = grouped['p1_wins'] / grouped['total_sets'] * 100

# Pivot tables for heatmaps
heatmap_data_counts = grouped.pivot(index='p1_char', columns='p2_char', values='total_sets')
heatmap_data_win_rate = grouped.pivot(index='p1_char', columns='p2_char', values='p1_win_rate')

# Make both tables square over every character seen on either side. Without
# this, a character that only ever appears as Player 1 (or only as Player 2)
# makes the .loc[char_order, char_order] selection raise KeyError.
all_chars = heatmap_data_counts.index.union(heatmap_data_counts.columns)
heatmap_data_counts = heatmap_data_counts.reindex(index=all_chars, columns=all_chars).fillna(0)

# Order characters by frequency (number of games as Player 1)
char_order = heatmap_data_counts.sum(axis=1).sort_values(ascending=False).index
heatmap_data_counts = heatmap_data_counts.loc[char_order, char_order]

# Matchups that were never played stay NaN, so the heatmap leaves those cells
# blank instead of showing them as a 0% win rate.
heatmap_data_win_rate = heatmap_data_win_rate.reindex(index=char_order, columns=char_order)

# Plot first heatmap (counts)
plt.figure(figsize=(20, 15), dpi=150)
sns.heatmap(heatmap_data_counts, annot=True, fmt='.0f', cmap='Blues', cbar_kws={'label': 'Count'})
plt.title('Counts of Games by Character Matchup')
plt.ylabel('Player 1 Character')
plt.xlabel('Player 2 Character')
plt.tight_layout()
plt.show()

# Plot second heatmap (win percentage)
plt.figure(figsize=(20, 15), dpi=150)
# Centre the diverging colormap on a 50% win rate
norm = TwoSlopeNorm(vmin=0, vcenter=50, vmax=100)
sns.heatmap(
    heatmap_data_win_rate,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    norm=norm,
    cbar_kws={'label': 'Win %'}
)
plt.title('Player 1 Win Percentage by Character Matchup')
plt.ylabel('Player 1 Character')
plt.xlabel('Player 2 Character')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import TwoSlopeNorm
from collections import Counter

# Combine all matchup strings into a single list. There is one string per
# game, formatted "<p1 char>/<p2 char>/<winner>" with winner '0' for Player 1
# and '1' for Player 2.
all_matchup_strings = [matchup for matchup_list in sets_with_game_data_df['matchup_strings'] for matchup in matchup_list]

# Count occurrences of each matchup string
matchup_string_counts = Counter(all_matchup_strings)

# Prepare data for heatmaps
matchup_data = []
for matchup, count in matchup_string_counts.items():
    p1_char, p2_char, winner = matchup.split('/')
    p1_wins = int(winner == '0') * count  # Count wins for Player 1
    p2_wins = int(winner == '1') * count  # Count wins for Player 2
    matchup_data.append((p1_char, p2_char, count, p1_wins, p2_wins))

# Convert to DataFrame
matchup_df = pd.DataFrame(matchup_data, columns=['p1_char', 'p2_char', 'count', 'p1_wins', 'p2_wins'])

# Group by characters to calculate total counts and win rates.
# NOTE: 'total_sets' keeps its original name but counts games, because each
# matchup string describes a single game.
grouped = matchup_df.groupby(['p1_char', 'p2_char']).agg(
    total_sets=('count', 'sum'),
    p1_wins=('p1_wins', 'sum'),
    p2_wins=('p2_wins', 'sum')
).reset_index()

# Calculate win rate for Player 1
grouped['p1_win_rate'] = grouped['p1_wins'] / grouped['total_sets'] * 100

# Pivot tables for heatmaps
heatmap_data_counts = grouped.pivot(index='p1_char', columns='p2_char', values='total_sets')
heatmap_data_win_rate = grouped.pivot(index='p1_char', columns='p2_char', values='p1_win_rate')

# Make both tables square over every character seen on either side. Without
# this, a character that only ever appears on one side makes the
# .loc[char_order, char_order] selection raise KeyError.
all_chars = heatmap_data_counts.index.union(heatmap_data_counts.columns)
heatmap_data_counts = heatmap_data_counts.reindex(index=all_chars, columns=all_chars).fillna(0)
heatmap_data_win_rate = heatmap_data_win_rate.reindex(index=all_chars, columns=all_chars)

# Cell (A, B) = Player 1 win rate of A-vs-B minus Player 1 win rate of
# B-vs-A. Unplayed matchups are deliberately NOT filled with 0 beforehand:
# the difference is only defined when both orientations were played, and
# every other cell stays NaN (left blank by the heatmap).
heatmap_data_win_rate = heatmap_data_win_rate - heatmap_data_win_rate.T

# Order characters by frequency (number of games as Player 1)
char_order = heatmap_data_counts.sum(axis=1).sort_values(ascending=False).index
heatmap_data_counts = heatmap_data_counts.loc[char_order, char_order]
heatmap_data_win_rate = heatmap_data_win_rate.loc[char_order, char_order]

# Plot the win-rate difference heatmap
plt.figure(figsize=(20, 15), dpi=150)
# Centre the diverging colormap on a difference of 0
norm = TwoSlopeNorm(vmin=-50, vcenter=0, vmax=50)
sns.heatmap(
    heatmap_data_win_rate,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    norm=norm,
    cbar_kws={'label': 'Win % difference'}
)
plt.title('Win Rate Asymmetry Between Player 1 and Player 2 Matchups')
plt.ylabel('Player 1 Character')
plt.xlabel('Player 2 Character')
plt.tight_layout()
plt.show()


In [None]:
# Character columns to carry over from sets_with_game_data_df into sets_df.
character_columns = ['p1_characters', 'p2_characters', 'p1_consistent', 'p2_consistent', 'matchup_strings']

# Create each column with None as the default value.
for column_name in character_columns:
    sets_df[column_name] = None

# Overwrite the defaults, matching rows by index, for the sets that are
# present in sets_with_game_data_df.
sets_df.update(sets_with_game_data_df[character_columns])

# Display the updated sets_df to verify
sets_df[character_columns].head()


### Save the dataframe