In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict

import sqlite3
import sys
import time
import math
import tqdm

from glicko2 import Player

## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - Each column whose values are all strings is tried as JSON: if every value parses with `json.loads()` (for example into lists or dictionaries), the column is replaced by the parsed Python objects; otherwise the column is left unchanged.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [2]:
def get_table_names(conn):
    """Return the names of all tables listed in the database's ``sqlite_master`` catalogue.

    Parameters
    ----------
    conn : database connection accepted by ``pd.read_sql``.

    Returns
    -------
    list
        Table names, in the order the catalogue query returns them.
    """
    catalogue = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    return catalogue['name'].tolist()

def load_tables_to_dfs(conn):
    """Load every table of the database into a pandas DataFrame.

    Columns in which *every* value is a string are tried as JSON: if all values
    parse with ``json.loads`` the column is replaced by the parsed objects,
    otherwise the column is left as it was read.

    Parameters
    ----------
    conn : open database connection (as accepted by ``pd.read_sql``).

    Returns
    -------
    dict
        Maps ``"<table name>_df"`` to the DataFrame read from that table.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the identifier so names that are SQL keywords or contain
        # spaces/quotes still form a valid statement.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        for col in df.columns:
            # Only columns made up entirely of strings are JSON candidates;
            # a single NULL / non-string value excludes the column.
            if not df[col].apply(lambda x: isinstance(x, str)).all():
                continue
            try:
                df[col] = df[col].apply(json.loads)
            except (json.JSONDecodeError, TypeError):
                # At least one value is not valid JSON: keep the raw strings.
                pass

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Load the database only once per kernel session: the flag at the bottom is set
# after the first successful run, so re-executing this cell is a no-op.
if 'cell_has_run' not in globals():
    path = "../data/melee_player_database.db"

    # Connect to the database and convert each table into a DataFrame.
    # try/finally guarantees the connection is closed even when loading fails.
    conn = sqlite3.connect(path)
    try:
        dfs = load_tables_to_dfs(conn)
    finally:
        conn.close()

    # 'dfs' maps each table name with a '_df' suffix to its DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # 'start' and 'end' are converted to datetimes, interpreting the stored
    # numbers as seconds (unit='s').
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the column data types of the dataframes so that each column has the correct type. (This will be updated as needed.)

In [3]:
# Missing 'best_of' values become 0 so the column can be stored as integers.
dfs['sets_df']['best_of'] = (
    dfs['sets_df']['best_of']
    .fillna(0)
    .astype(int)
)

### Here we make dataframes that we will use and print the head.

The integers in `characters` count the number of games the player has played as each character. (We verify this for Zain below.)

In [None]:
# Bind the 'players' table to its own name and preview its first rows.
players_df = dfs['players_df']
players_df.head()

In [None]:
# Bind the 'ranking' table to its own name and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head()

In [None]:
# Bind the 'ranking_seasons' table to its own name and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

In [None]:
sets_df = dfs['sets_df']

# Share of sets whose 'game_data' holds at least one entry. The ratio is
# computed outside the f-string: reusing the same quote character inside an
# f-string expression is a SyntaxError before Python 3.12.
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)
game_data_share = int(has_game_data.sum()) / sets_df.shape[0]
print(f"{game_data_share:0.01%} percent of sets have some game data")
sets_df.head()

In [None]:
# Bind the 'tournament_info' table to its own name and preview its first rows.
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head()

In [9]:
# Add up every player's per-character counts into one overall tally.
total_character_counts = {}

for char_dict in players_df['characters']:
    # Entries that are not dicts (e.g. empty or NaN) contribute nothing.
    if not isinstance(char_dict, dict):
        continue
    for character, count in char_dict.items():
        try:
            total_character_counts[character] += count
        except KeyError:
            # First time this character is seen.
            total_character_counts[character] = count

# Characters ordered from most to least played; ties keep first-seen order.
sorted_characters = sorted(
    total_character_counts.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
sorted_character_keys = [name for name, _count in sorted_characters]

In [None]:
# Tournaments ordered by end time; first_date is the 'start' value of the
# tournament that ends first.
sorted_tournament_info_df = tournament_info_df.sort_values('end').reset_index(drop=True)
first_date = sorted_tournament_info_df['start'].iloc[0]

# Sets that carry at least one game record.
has_game_data = sets_df['game_data'].apply(lambda games: len(games) > 0)
game_data_sets_df = sets_df[has_game_data]

# Distinct tournament keys of those sets, in order of first appearance.
tournament_keys = list(dict.fromkeys(game_data_sets_df['tournament_key']))

# Filter tournament_info_df to include only matching tournament keys
with_game_data = tournament_info_df['key'].isin(tournament_keys)
game_data_tournaments_df = (
    tournament_info_df[with_game_data]
    .sort_values('end')
    .reset_index(drop=True)
)

print(game_data_sets_df.head(1))
print(f"{game_data_sets_df.iloc[0:1]['game_data'].values[0]}")
print(first_date)

In [12]:


def _get_matchup_player(glicko2_dict, own_char, other_char, tau):
    """Return the rating object stored for ``own_char`` vs ``other_char``, creating it on first use."""
    by_opponent = glicko2_dict.setdefault(own_char, {})
    if other_char not in by_opponent:
        matchup_player = Player()
        # Every newly created rating object gets the same tau, wherever it is created.
        matchup_player._tau = tau
        by_opponent[other_char] = matchup_player
    return by_opponent[other_char]


def _explode_games(tournament_sets_df):
    """Return one row per game with 'winner_id', 'loser_id', 'winner_char' and 'loser_char' columns."""
    per_game_fields = ['winner_id', 'loser_id', 'winner_char', 'loser_char']

    exploded = tournament_sets_df.explode('game_data').reset_index(drop=True)
    game_fields_df = pd.json_normalize(exploded['game_data'].tolist())

    columns = {}
    for field in per_game_fields:
        if field in game_fields_df.columns:
            # A value recorded on the game itself takes precedence over a
            # same-named set-level column: the set winner does not have to be
            # the winner of every individual game.
            columns[field] = game_fields_df[field]
        else:
            # Fall back to the set-level column (KeyError if it does not exist).
            columns[field] = exploded[field]
    return pd.DataFrame(columns)


def process_tournament(tournament_key, player_ratings_matchup_df, tournament_info_df, sets_df, tau=0.5):
    """
    Process one tournament and update the character-matchup ratings of its players.

    Every game of the tournament is viewed once from the winner's and once from
    the loser's side. For each (player, player character, opponent character)
    combination the games are passed in a single ``update_player`` call, using
    the opponents' ratings and RDs as they were before any update of this
    tournament was applied.

    Relies on two names from the notebook's global scope: ``Player`` (the rating
    class) and ``sorted_character_keys`` (the recognised character names).

    Parameters
    ----------
    tournament_key : value matched against ``sets_df['tournament_key']`` and
        ``tournament_info_df['key']``.
    player_ratings_matchup_df : DataFrame with the columns 'player_id',
        'dates_dict', 'rating_history_dict', 'rd_history_dict',
        'game_count_dict' and 'glicko2_dict'. Row positions are used as index
        labels, so it must carry a default 0..n-1 index. The dictionaries
        stored in it are modified in place.
    tournament_info_df : DataFrame with 'key' and 'end' columns.
    sets_df : DataFrame with 'tournament_key' and 'game_data' columns;
        'game_data' holds a list of per-game dicts.
    tau : value assigned to the ``_tau`` attribute of every newly created
        ``Player`` (presumably the Glicko-2 system constant -- confirm against
        the glicko2 package). Defaults to 0.5.

    Returns
    -------
    DataFrame
        The ratings table, extended by rows for players seen for the first time.
        It is returned unchanged when the tournament has no usable game data or
        no end date.
    """
    perspective_columns = ['player_id', 'opponent_id', 'player_char', 'opponent_char']
    matchup_keys = ['player_index', 'player_char', 'opponent_char']

    # Sets of this tournament that carry a non-empty list of game records.
    tournament_sets_df = sets_df[sets_df['tournament_key'] == tournament_key]
    has_game_data = tournament_sets_df['game_data'].apply(
        lambda x: isinstance(x, list) and len(x) > 0
    ).astype(bool)
    tournament_sets_df = tournament_sets_df[has_game_data]

    if tournament_sets_df.empty:
        print(f"No game data for tournament {tournament_key}.")
        return player_ratings_matchup_df

    tournament_games_df = _explode_games(tournament_sets_df)

    # One row per game and perspective: outcome 1 for the winner, 0 for the loser.
    winner_games = tournament_games_df.rename(columns={
        'winner_id': 'player_id',
        'loser_id': 'opponent_id',
        'winner_char': 'player_char',
        'loser_char': 'opponent_char'
    })[perspective_columns].assign(outcome=1)
    loser_games = tournament_games_df.rename(columns={
        'loser_id': 'player_id',
        'winner_id': 'opponent_id',
        'loser_char': 'player_char',
        'winner_char': 'opponent_char'
    })[perspective_columns].assign(outcome=0)
    player_games_df = pd.concat([winner_games, loser_games], ignore_index=True)

    # Keep only complete rows whose characters are both recognised. The explicit
    # copy makes the later column assignments operate on an independent frame.
    player_games_df = player_games_df.dropna(subset=perspective_columns)
    recognised = (
        player_games_df['player_char'].isin(sorted_character_keys) &
        player_games_df['opponent_char'].isin(sorted_character_keys)
    )
    player_games_df = player_games_df[recognised].copy()

    if player_games_df.empty:
        print(f"No valid player games data for tournament {tournament_key} after filtering.")
        return player_ratings_matchup_df

    # Get the tournament end date
    end_date_series = tournament_info_df.loc[
        tournament_info_df['key'] == tournament_key, 'end'
    ]
    if end_date_series.empty:
        print(f"No end date found for tournament {tournament_key}.")
        return player_ratings_matchup_df
    end_date = pd.to_datetime(end_date_series.values[0])

    # Add a row with empty dictionaries for every player not yet in the table.
    tournament_player_ids = pd.unique(
        player_games_df[['player_id', 'opponent_id']].values.ravel('K')
    )
    known_player_ids = set(player_ratings_matchup_df['player_id'])  # set: O(1) membership
    new_rows = [
        {
            'player_id': player_id,
            'dates_dict': {},
            'rating_history_dict': {},
            'rd_history_dict': {},
            'game_count_dict': {},
            'glicko2_dict': {}
        }
        for player_id in tournament_player_ids
        if player_id not in known_player_ids
    ]
    if new_rows:
        player_ratings_matchup_df = pd.concat(
            [player_ratings_matchup_df, pd.DataFrame(new_rows)], ignore_index=True
        )

    # Every id in player_games_df now has a row, so the mapping is total.
    player_index_map = {
        player_id: position
        for position, player_id in enumerate(player_ratings_matchup_df['player_id'])
    }
    player_games_df['player_index'] = player_games_df['player_id'].map(player_index_map).astype(int)
    player_games_df['opponent_index'] = player_games_df['opponent_id'].map(player_index_map).astype(int)

    # Accumulate game counts. The dictionaries stored in the table are mutated
    # in place, so nothing has to be written back to the DataFrame.
    game_counts = player_games_df.groupby(matchup_keys).size()
    for (row_index, p_char, o_char), count in game_counts.items():
        game_count_dict = player_ratings_matchup_df.at[row_index, 'game_count_dict']
        counts_for_char = game_count_dict.setdefault(p_char, {})
        counts_for_char[o_char] = counts_for_char.get(o_char, 0) + int(count)

    # Snapshot every opponent's rating and RD before any rating is updated.
    opponent_ratings = []
    opponent_rds = []
    for opp_index, opp_char, player_char in zip(
        player_games_df['opponent_index'],
        player_games_df['opponent_char'],
        player_games_df['player_char'],
    ):
        opp_glicko2_dict = player_ratings_matchup_df.at[opp_index, 'glicko2_dict']
        opponent_glicko = _get_matchup_player(opp_glicko2_dict, opp_char, player_char, tau)
        opponent_ratings.append(opponent_glicko.getRating())
        opponent_rds.append(opponent_glicko.getRd())
    player_games_df['opponent_rating'] = opponent_ratings
    player_games_df['opponent_rd'] = opponent_rds

    # Drop rows with NaN opponent ratings
    player_games_df = player_games_df.dropna(subset=['opponent_rating', 'opponent_rd'])
    if player_games_df.empty:
        print(f"No valid opponent ratings data for tournament {tournament_key} after dropping NaNs.")
        return player_ratings_matchup_df

    # One rating update per (player, player character, opponent character).
    for (player_index, player_char, opponent_char), group in player_games_df.groupby(matchup_keys):
        glicko2_dict = player_ratings_matchup_df.at[player_index, 'glicko2_dict']
        p_vs_char_glicko = _get_matchup_player(glicko2_dict, player_char, opponent_char, tau)

        p_vs_char_glicko.update_player(
            group['opponent_rating'].tolist(),
            group['opponent_rd'].tolist(),
            group['outcome'].tolist(),
        )

        # Append the post-update state to the three parallel history dictionaries.
        history_entries = (
            ('dates_dict', end_date),
            ('rating_history_dict', p_vs_char_glicko.getRating()),
            ('rd_history_dict', p_vs_char_glicko.getRd()),
        )
        for column, value in history_entries:
            history = player_ratings_matchup_df.at[player_index, column]
            history.setdefault(player_char, {}).setdefault(opponent_char, []).append(value)

    # Return the updated DataFrame
    return player_ratings_matchup_df


In [None]:
# Start from an empty ratings table; process_tournament adds one row per player.
player_ratings_matchup_df = pd.DataFrame(
    columns=[
        'player_id',
        'dates_dict',           # {player_char: {opponent_char: [dates]}}
        'rating_history_dict',  # {player_char: {opponent_char: [ratings]}}
        'rd_history_dict',      # {player_char: {opponent_char: [RDs]}}
        'game_count_dict',      # {player_char: {opponent_char: game_count}}
        'glicko2_dict',         # {player_char: {opponent_char: Player()}}
    ]
)

# Feed the tournaments through process_tournament in the order of
# game_data_tournaments_df, carrying the ratings table from one call to the next.
for tournament_key in tqdm.tqdm(game_data_tournaments_df['key'], total=game_data_tournaments_df.shape[0]):
    player_ratings_matchup_df = process_tournament(
        tournament_key, player_ratings_matchup_df, tournament_info_df, sets_df
    )

In [None]:
# Preview the first rows of the accumulated matchup ratings table.
player_ratings_matchup_df.head()

In [15]:
# player_ratings_matchup_df.to_pickle('../data/player_ratings_matchup_tau_5_df.pkl')

In [16]:
# loaded =  pd.read_pickle('../data/player_ratings_matchup_tau_5_df.pkl')
# loaded.head()

In [17]:
# im