In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict

import sqlite3
import sys
import time

## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [2]:
# List the tables stored in the database
def get_table_names(conn):
    """Return the names of all tables listed in ``sqlite_master`` as a list of strings."""
    table_listing = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table';",
        conn,
    )
    name_column = table_listing['name']
    return name_column.tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    """
    Load every table reachable through `conn` into a pandas DataFrame.

    Columns whose non-null entries are all strings and all parse with
    `json.loads` are replaced by the parsed Python objects; NULL entries in
    such columns are left untouched. Columns where any entry fails to parse
    are kept exactly as loaded.

    Parameters:
    - conn: open database connection accepted by `pd.read_sql`

    Returns:
    - dict mapping '<table name>_df' to the DataFrame for that table
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the identifier (doubling embedded quotes) so table names containing
        # spaces, punctuation or reserved words do not break the query.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            non_null = df[col].dropna()

            # Decide on the non-null entries only: a single NULL must not stop an
            # otherwise JSON-encoded column from being parsed.
            if non_null.empty or not non_null.map(lambda value: isinstance(value, str)).all():
                continue

            try:
                # Build the whole parsed column first; it is only assigned if every entry parses.
                parsed = df[col].map(lambda value: json.loads(value) if isinstance(value, str) else value)
            except (json.JSONDecodeError, TypeError):
                # Not a JSON column: keep the raw strings.
                continue
            df[col] = parsed

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = "../data/melee_player_database.db"

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Close the connection even if loading fails, so a failed run does not leave it open
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # 'start' and 'end' are interpreted as Unix timestamps in seconds (unit='s')
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag only after everything above succeeded, so a failed load is retried on the next run
    cell_has_run = True

### Here we adjust column data types in the DataFrames where the loaded type is not the one we need. (This will be updated as needed.)

In [3]:
# Missing best_of values become 0 so the column can be cast to int.
dfs['sets_df']['best_of'] = (
    dfs['sets_df']['best_of']
    .fillna(0)
    .astype(int)
)

### Here we make dataframes that we will use and print the head.

The integers in `characters` count the number of games the player has played as each character. (We verify this for Zain below.)

In [None]:
# Alias the players table and preview its first rows.
players_df = dfs['players_df']
players_df.head(5)

In [None]:
# Alias the ranking table and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head(5)

In [None]:
# Alias the ranking seasons table and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head(5)

In [None]:
sets_df = dfs['sets_df']

# Boolean mask of sets whose game_data list is non-empty; its mean is the share of such sets.
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)

# Double-quoted f-string with no nested quotes, so the cell also parses on Python versions before 3.12.
print(f"{has_game_data.mean():.1%} of sets have some game data")
sets_df.head()

In [None]:
# Alias the tournament info table and preview its first rows.
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head(5)


In [None]:
# Spot-check one tournament row: the entrants value next to the length and content of its placings.
n = 1206
example_tournament = tournament_info_df.loc[n]
print(example_tournament['entrants'])
print(len(example_tournament['placings']))
print(example_tournament['placings'])


## Investigate data for Zain
We choose Zain because he is the best player shown in the head of `players_df`.

In [None]:
# Row at position 2 of players_df (a Series, despite the _df suffix).
# NOTE(review): this relies on the row order of players_df — confirm it is Zain's row.
zain_df = players_df.iloc[2]
zain_placings = zain_df['placings']
print('Zain has played in', len(zain_placings), 'tournaments.')

# Tournaments whose key appears in Zain's placings.
zain_tournament_keys = [placing['key'] for placing in zain_placings]
played_by_zain = tournament_info_df['key'].isin(zain_tournament_keys)
zain_tournament_info_df = tournament_info_df[played_by_zain]

In [None]:
# DataFrame.info() writes its summary itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
zain_tournament_info_df.info()
zain_tournament_info_df.head()

Here we make a DataFrame containing all the sets that Zain has played in, and add some columns so that we can identify him and his opponent more easily.

In [None]:
# Zain's player ID, defined before its first use so the filter and the derived columns share it
zain_id = zain_df['player_id']

# Filter sets_df to the sets with Zain in either player slot; copy so the new columns do not touch sets_df
zain_sets_df = sets_df[(sets_df['p1_id'] == zain_id) | (sets_df['p2_id'] == zain_id)].copy()
print('Zain has played', zain_sets_df.shape[0], 'sets.')

# Which slot Zain occupies in each set
zain_is_p1 = zain_sets_df['p1_id'] == zain_id
zain_is_p2 = zain_sets_df['p2_id'] == zain_id

# The derived columns use column-wise operations instead of row-wise apply(axis=1):
# same values, faster, and well-defined when the selection is empty.

# Add a 'zain_win' column: True when Zain's score is strictly higher than his opponent's
zain_sets_df['zain_win'] = (
    (zain_is_p1 & (zain_sets_df['p1_score'] > zain_sets_df['p2_score']))
    | (zain_is_p2 & (zain_sets_df['p2_score'] > zain_sets_df['p1_score']))
)

# Add an 'opponent' column: p1_id where Zain is p2, otherwise p2_id
zain_sets_df['opponent'] = zain_sets_df['p1_id'].where(zain_is_p2, zain_sets_df['p2_id'])

# Add 'zain_score' and 'opponent_score' columns
zain_sets_df['zain_score'] = zain_sets_df['p1_score'].where(zain_is_p1, zain_sets_df['p2_score'])
zain_sets_df['opponent_score'] = zain_sets_df['p2_score'].where(zain_is_p1, zain_sets_df['p1_score'])

# Filter the zain_sets_df for rows where 'game_data' is not an empty list
game_data_zain_sets_df = zain_sets_df[zain_sets_df['game_data'].apply(lambda x: len(x) > 0)]

# Display the result
print('Zain has played', game_data_zain_sets_df.shape[0], 'sets with game_data.')


Here we check that `zain_df['characters']` counts the number of games that Zain has played as each character.

In [None]:
# Ensure zain_id is a string to match the data in game_data
zain_id = str(zain_id)

# Count the games Zain played as each character
zain_characters_count = defaultdict(int)

# Loop through the 'game_data' for each set Zain played
for game_data in game_data_zain_sets_df['game_data']:
    for game in game_data:
        # Zain is either the winner or the loser of a game; count the character on his side
        if str(game['winner_id']) == zain_id:  # Compare as strings
            zain_characters_count[game['winner_char']] += 1
        elif str(game['loser_id']) == zain_id:  # Compare as strings
            zain_characters_count[game['loser_char']] += 1

# Convert the defaultdict to a regular dict so missing keys are not created by lookups below
zain_characters_count = dict(zain_characters_count)

# Extract the characters dictionary from zain_df
zain_characters_actual = zain_df['characters']

# Compare over the union of keys: a character present in only one of the two
# dictionaries is a discrepancy too and is reported with a count of 0 on the missing side.
all_characters = set(zain_characters_count) | set(zain_characters_actual)

# Sorted (by string form) so the report order is the same on every run
for key in sorted(all_characters, key=str):
    counted = zain_characters_count.get(key, 0)
    actual = zain_characters_actual.get(key, 0)
    if counted == actual:
        print(f"{key}: Match - {counted} games")
    else:
        print(f"{key}: Mismatch - counted {counted} games, actual {actual} games")

## ChatGPT's output when asked to write some code to calculate Glicko-2 
Use this as a rough baseline for how long it will take to compute.

In [12]:
from joblib import Parallel, delayed
import tqdm

In [13]:
# Short aliases for the frames used by the rating code
players_df = dfs['players_df']
sets_df = dfs['sets_df']

# Tournaments ordered by end date, with a fresh 0..n-1 index
tournaments_df = (
    dfs['tournament_info_df']
    .sort_values('end')
    .reset_index(drop=True)
)

In [14]:
# Initialize player ratings DataFrame, indexed by player_id
# Columns: ['rating', 'rd', 'volatility', 'last_updated']
# The column order matters: process_tournament adds unseen players with a
# positional 4-element list [rating, rd, volatility, last_updated].
player_ratings = pd.DataFrame({
    'player_id': players_df['player_id'],
    'rating': 1500.0,        # Initial rating
    'rd': 350.0,             # Initial rating deviation
    'volatility': 0.06,      # Initial volatility
    'last_updated': pd.Timestamp.min  # Keep track of the last update time
}).set_index('player_id')

# Initialize an empty DataFrame to store ratings history
ratings_history = pd.DataFrame(columns=[
    'player_id', 'tournament_key', 'tournament_end_date', 'rating', 'rd', 'volatility'
])

# Initialize a nested dictionary to store matchup ratings
# matchup_ratings[player_id][matchup_key] = {'rating': 1500, 'rd': 350, 'volatility': 0.06, 'last_updated': pd.Timestamp.min}
matchup_ratings = defaultdict(lambda: defaultdict(lambda: {
    'rating': 1500.0,
    'rd': 350.0,
    'volatility': 0.06,
    'last_updated': pd.Timestamp.min
}))

# Initialize an empty DataFrame for matchup ratings history
matchup_ratings_history = pd.DataFrame(columns=[
    'player_id', 'matchup_key', 'tournament_key', 'tournament_end_date', 'rating', 'rd', 'volatility'
])


In [18]:
# Initialize a data frame that stores each player's rating history as lists.
# When a player is in a tournament, the tournament's end date is appended to the
# list in 'dates', and the updated values are appended to the lists in the other
# three columns once they have been calculated.
#
# This frame has its own name: binding it to `player_ratings` would replace the
# frame indexed by player_id (with 'rating'/'rd'/'volatility' columns) that
# process_tournament reads and updates.
n_players = players_df.shape[0]
player_rating_history = pd.DataFrame({
    'player_id': players_df['player_id'],
    'dates': [[0] for _ in range(n_players)],  # A list with one element (0) for each player
    'rating_history': [[1500.0] for _ in range(n_players)],  # Initialize with 1500 rating
    'rd_history': [[350.0] for _ in range(n_players)],  # Initialize with 350 RD
    'volatility_history': [[0.06] for _ in range(n_players)],  # Initialize with 0.06 volatility, matching the initial volatility used elsewhere
})

In [51]:
# Constants shared by the four scale conversions below
GLICKO2_BASE_RATING = 1500
GLICKO2_SCALE = 173.7178


# Original scale -> Glicko-2 scale
def convert_rating_to_glicko2_scale(rating):
    """Shift a rating by the base rating and divide by the scale factor."""
    centered = rating - GLICKO2_BASE_RATING
    return centered / GLICKO2_SCALE


def convert_rd_to_glicko2_scale(rd):
    """Divide a rating deviation by the scale factor."""
    return rd / GLICKO2_SCALE


# Glicko-2 scale -> original scale
def convert_rating_from_glicko2_scale(rating):
    """Multiply a Glicko-2 scale rating by the scale factor and add the base rating."""
    return rating * GLICKO2_SCALE + GLICKO2_BASE_RATING


def convert_rd_from_glicko2_scale(rd):
    """Multiply a Glicko-2 scale rating deviation by the scale factor."""
    return rd * GLICKO2_SCALE


In [52]:
def update_player_rating(player, opponent_results, tau=0.5):
    """
    Update player's rating based on opponent_results.

    This is a simplified Glicko-2 update: the volatility step is not
    implemented, so the player's volatility is returned unchanged and `tau`
    is currently unused.

    Parameters:
    - player: Series or dict providing 'rating', 'rd' and 'volatility'
    - opponent_results: List of tuples [(opponent_rating, opponent_rd, score), ...]
      with ratings and RDs on the original (1500-centred) scale
    - tau: System constant, typically between 0.3 and 1.2 (unused, see above)

    Returns:
    - (new_rating, new_rd, volatility) on the original scale. When there are no
      results to learn from, the rating is returned unchanged and only the RD
      grows by the volatility.
    """
    # Step 1: Convert ratings to Glicko-2 scale
    mu = convert_rating_to_glicko2_scale(player['rating'])
    phi = convert_rd_to_glicko2_scale(player['rd'])
    sigma = player['volatility']

    # Step 2: Accumulate the inverse of the estimated variance (1 / v) and the
    # weighted sum of (score - expected score) over all results
    v_inv = 0.0
    weighted_surprise = 0.0
    for opp_rating, opp_rd, score in opponent_results:
        mu_j = convert_rating_to_glicko2_scale(opp_rating)
        phi_j = convert_rd_to_glicko2_scale(opp_rd)
        g_phi_j = 1 / np.sqrt(1 + 3 * phi_j**2 / np.pi**2)
        E_mu_j = 1 / (1 + np.exp(-g_phi_j * (mu - mu_j)))
        v_inv += (g_phi_j**2) * E_mu_j * (1 - E_mu_j)
        weighted_surprise += g_phi_j * (score - E_mu_j)

    # Step 3: Update volatility (sigma) -- not implemented; sigma is kept as is

    # Step 4: Update rating and RD
    phi_star = np.sqrt(phi**2 + sigma**2)

    if v_inv == 0:
        # No results carry information (e.g. an empty list). Dividing by v_inv
        # would fail, so keep the rating and only let the deviation grow.
        return player['rating'], convert_rd_from_glicko2_scale(phi_star), sigma

    phi_new = 1 / np.sqrt(1 / phi_star**2 + v_inv)
    # Same value as mu + phi_new**2 * delta / v with delta = v * weighted_surprise
    mu_new = mu + phi_new**2 * weighted_surprise

    # Convert back to original scale
    new_rating = convert_rating_from_glicko2_scale(mu_new)
    new_rd = convert_rd_from_glicko2_scale(phi_new)

    # Return updated values
    return new_rating, new_rd, sigma  # sigma remains unchanged for simplicity


In [23]:
import math

def compute_ancillary_quantities(current_rating, ratings_deviation, num_opponents, opponent_ratings, opponent_rds, resulting_scores):
    """
    First step of the Glicko-2 algorithm as found on Wikipedia.

    All ratings and deviations are expected on the Glicko-2 scale.

    Parameters:
    - current_rating is mu
    - ratings_deviation is phi (not needed for v and delta; kept in the signature)
    - num_opponents is m
    - opponent_ratings are mu_j
    - opponent_rds are phi_j
    - resulting_scores are s_j

    Returns:
    - (v, delta): the estimated variance and the estimated improvement.
      With no informative games (for example num_opponents == 0) the variance
      is unbounded and (math.inf, 0.0) is returned.
    """
    def g_function(phi_j):
        # g(phi) = 1 / sqrt(1 + 3 * phi^2 / pi^2)
        return 1 / math.sqrt(1 + 3 * phi_j ** 2 / math.pi ** 2)

    def E_function(mu, mu_j, phi_j):
        # Expected score against an opponent with rating mu_j and deviation phi_j
        return 1 / (1. + math.exp(-g_function(phi_j) * (mu - mu_j)))

    v_inv = 0.0
    improvement = 0.0  # sum over j of g(phi_j) * (s_j - E_j)
    for j in range(num_opponents):
        # g depends on the opponent's deviation, not on the opponent's rating
        g_j = g_function(opponent_rds[j])
        E_j = E_function(current_rating, opponent_ratings[j], opponent_rds[j])
        v_inv += g_j ** 2 * E_j * (1 - E_j)
        improvement += g_j * (resulting_scores[j] - E_j)

    if v_inv == 0:
        # Nothing to invert: avoid a ZeroDivisionError
        return math.inf, 0.0

    v = 1 / v_inv
    delta = v * improvement

    return v, delta

def determinte_new_rating_volatility(current_rating, ratings_deviation, rating_volatility, num_opponents, opponent_ratings, opponent_rds, resulting_scores, tau=.2, epsilon=1e-6):
    """
    Second step of the Glicko-2 algorithm as found on Wikipedia.

    Solves f(x) = 0 for x = ln(sigma'^2) with the bracketing (regula falsi /
    Illinois) iteration and returns the new volatility sigma'.

    Parameters:
    - current_rating is mu
    - ratings_deviation is phi
    - rating_volatility is sigma
    - num_opponents is m
    - opponent_ratings are mu_j
    - opponent_rds are phi_j
    - resulting_scores are s_j
    - tau constrains how much the volatility can change between rating periods
    - epsilon is the convergence tolerance on x

    Returns:
    - the new volatility sigma'; the current volatility is returned unchanged
      when there are no games to learn from
    """
    if num_opponents == 0:
        return rating_volatility

    v, delta = compute_ancillary_quantities(current_rating, ratings_deviation, num_opponents, opponent_ratings, opponent_rds, resulting_scores)
    if math.isinf(v):
        # No informative games: nothing to update
        return rating_volatility

    phi_sq = ratings_deviation ** 2
    delta_sq = delta ** 2
    a = math.log(rating_volatility ** 2)

    def f(x):
        exp_x = math.exp(x)
        return (exp_x * (delta_sq - phi_sq - v - exp_x)) / (2 * (phi_sq + v + exp_x) ** 2) - (x - a) / tau ** 2

    # Choose initial values A and B that bracket the root of f
    A = a
    if delta_sq > phi_sq + v:
        B = math.log(delta_sq - phi_sq - v)
    else:
        k = 1
        while f(a - k * tau) < 0:
            k += 1
        B = a - k * tau

    f_A = f(A)
    f_B = f(B)

    # Narrow the bracket until it is smaller than epsilon
    while abs(B - A) > epsilon:
        C = A + (A - B) * f_A / (f_B - f_A)
        f_C = f(C)
        if f_C * f_B <= 0:
            A, f_A = B, f_B
        else:
            # Halving f_A keeps the bracket end from getting stuck
            f_A /= 2
        B, f_B = C, f_C

    return math.exp(A / 2)

In [53]:
def process_tournament(tournament_key):
    """
    Update overall player ratings from all sets of one tournament.

    Reads the module-level `sets_df`, `tournaments_df` and `player_ratings`,
    updates `player_ratings` in place and appends one row per updated player
    to the module-level `ratings_history`.

    NOTE: a second `process_tournament` is defined later in this same cell
    (it additionally tracks matchup ratings). After the cell runs, the name
    refers to that later definition, not to this one.

    Parameters:
    - tournament_key: value matched against sets_df['tournament_key'] and tournaments_df['key']
    """
    global ratings_history

    # Filter sets for this tournament
    tournament_sets = sets_df[sets_df['tournament_key'] == tournament_key]

    # Get the tournament end date
    tournament_end_date = tournaments_df.loc[
        tournaments_df['key'] == tournament_key, 'end'].values[0]
    tournament_end_date = pd.to_datetime(tournament_end_date)

    # Collect results for each player
    player_results = defaultdict(list)
    for idx, row in tournament_sets.iterrows():
        p1_id = row['p1_id']
        p2_id = row['p2_id']
        p1_score = row['p1_score']
        p2_score = row['p2_score']

        # Ensure both players are in the ratings DataFrame.
        # Positional values: [rating, rd, volatility, last_updated]; floats match the initial frame.
        for new_id in (p1_id, p2_id):
            if new_id not in player_ratings.index:
                player_ratings.loc[new_id] = [1500.0, 350.0, 0.06, pd.Timestamp.min]

        # Determine outcome
        if p1_score > p2_score:
            score_p1, score_p2 = 1, 0
        elif p1_score < p2_score:
            score_p1, score_p2 = 0, 1
        else:
            score_p1 = score_p2 = 0.5  # Handle ties if applicable

        # Append results. Ratings are only written after this loop, so every
        # opponent is recorded with the rating held before the tournament.
        opp_p2 = player_ratings.loc[p2_id]
        player_results[p1_id].append((opp_p2['rating'], opp_p2['rd'], score_p1))

        opp_p1 = player_ratings.loc[p1_id]
        player_results[p2_id].append((opp_p1['rating'], opp_p1['rd'], score_p2))

    # Update player ratings
    updated_players = []
    for player_id, results in player_results.items():
        player = player_ratings.loc[player_id]

        # Apply time decay before updating (optional)
        # apply_time_decay(player, tournament_end_date)

        new_rating, new_rd, new_sigma = update_player_rating(player, results)

        # Update the player_ratings DataFrame
        player_ratings.loc[player_id, 'rating'] = new_rating
        player_ratings.loc[player_id, 'rd'] = new_rd
        # player_ratings.loc[player_id, 'volatility'] = new_sigma  # Update if volatility changes
        player_ratings.loc[player_id, 'last_updated'] = tournament_end_date

        # Add to updated players list
        updated_players.append({
            'player_id': player_id,
            'tournament_key': tournament_key,
            'tournament_end_date': tournament_end_date,
            'rating': new_rating,
            'rd': new_rd,
            'volatility': player_ratings.loc[player_id, 'volatility']
        })

    # Nothing to record for a tournament without sets
    if not updated_players:
        return

    # Append updated players' ratings to ratings_history DataFrame.
    # Concatenating with an empty frame is avoided: pandas deprecates ignoring
    # empty entries when it determines the result dtypes.
    ratings_history_df = pd.DataFrame(updated_players)
    if ratings_history.empty:
        ratings_history = ratings_history_df
    else:
        ratings_history = pd.concat([ratings_history, ratings_history_df], ignore_index=True)
    
def process_tournament(tournament_key):
    """
    Update overall and character-matchup ratings from all sets of one tournament.

    Reads the module-level `sets_df`, `tournaments_df`, `player_ratings` and
    `matchup_ratings`; updates `player_ratings` and `matchup_ratings` in place
    and appends the new values to the module-level `ratings_history` and
    `matchup_ratings_history`.

    For matchup ratings, each player's character in a set is the one they used
    in the most games of that set, and a player's matchup key is the
    opponent's character. Sets without game data only count towards the
    overall ratings. Volatility is carried over unchanged.

    Parameters:
    - tournament_key: value matched against sets_df['tournament_key'] and tournaments_df['key']
    """
    global ratings_history, matchup_ratings_history

    # Imported once per call instead of once per set; Counter is not imported at the top of the notebook
    from collections import Counter

    # Filter sets for this tournament
    tournament_sets = sets_df[sets_df['tournament_key'] == tournament_key]

    # Get the tournament end date
    tournament_end_date = tournaments_df.loc[
        tournaments_df['key'] == tournament_key, 'end'].values[0]
    tournament_end_date = pd.to_datetime(tournament_end_date)

    # Initialize data structures for player results and matchup results
    player_results = defaultdict(list)
    matchup_results = defaultdict(lambda: defaultdict(list))

    # Loop through each set in the tournament
    for idx, row in tournament_sets.iterrows():
        p1_id = row['p1_id']
        p2_id = row['p2_id']
        p1_score = row['p1_score']
        p2_score = row['p2_score']
        game_data = row['game_data']

        # Ensure both players are in the ratings DataFrame.
        # Positional values: [rating, rd, volatility, last_updated]; floats match the initial frame.
        for new_id in (p1_id, p2_id):
            if new_id not in player_ratings.index:
                player_ratings.loc[new_id] = [1500.0, 350.0, 0.06, pd.Timestamp.min]

        # Determine outcome
        if p1_score > p2_score:
            score_p1, score_p2 = 1, 0
        elif p1_score < p2_score:
            score_p1, score_p2 = 0, 1
        else:
            score_p1 = score_p2 = 0.5  # Handle ties if applicable

        # Append results for overall ratings. Ratings are only written after this
        # loop, so every opponent is recorded with the rating held before the tournament.
        opp_p2 = player_ratings.loc[p2_id]
        player_results[p1_id].append((opp_p2['rating'], opp_p2['rd'], score_p1))

        opp_p1 = player_ratings.loc[p1_id]
        player_results[p2_id].append((opp_p1['rating'], opp_p1['rd'], score_p2))

        # Now handle character-specific matchups.
        # Without game data there is no character information: skip the matchup
        # update for this set (the overall result was already recorded above).
        if not game_data:
            continue

        # Extract the character each player used in every game of the set
        p1_characters = []
        p2_characters = []
        for game in game_data:
            # Check if p1 is winner or loser in the game; IDs are compared as strings
            if str(game['winner_id']) == str(p1_id):
                p1_characters.append(game['winner_char'])
                p2_characters.append(game['loser_char'])
            elif str(game['loser_id']) == str(p1_id):
                p1_characters.append(game['loser_char'])
                p2_characters.append(game['winner_char'])
            # Games that do not reference p1 are ignored

        # Determine the most common character used by each player in this set
        p1_char = Counter(p1_characters).most_common(1)[0][0] if p1_characters else 'Unknown'
        p2_char = Counter(p2_characters).most_common(1)[0][0] if p2_characters else 'Unknown'

        # Create matchup keys
        # For p1: matchup against p2's character
        p1_matchup_key = p2_char
        # For p2: matchup against p1's character
        p2_matchup_key = p1_char

        # Ensure matchup ratings exist
        if p1_id not in matchup_ratings:
            matchup_ratings[p1_id] = {}
        if p1_matchup_key not in matchup_ratings[p1_id]:
            matchup_ratings[p1_id][p1_matchup_key] = {'rating': 1500.0, 'rd': 350.0, 'volatility': 0.06, 'last_updated': pd.Timestamp.min}

        if p2_id not in matchup_ratings:
            matchup_ratings[p2_id] = {}
        if p2_matchup_key not in matchup_ratings[p2_id]:
            matchup_ratings[p2_id][p2_matchup_key] = {'rating': 1500.0, 'rd': 350.0, 'volatility': 0.06, 'last_updated': pd.Timestamp.min}

        # Append results for matchup ratings
        opp_p2_matchup = matchup_ratings[p2_id][p2_matchup_key]
        matchup_results[p1_id][p1_matchup_key].append((opp_p2_matchup['rating'], opp_p2_matchup['rd'], score_p1))

        opp_p1_matchup = matchup_ratings[p1_id][p1_matchup_key]
        matchup_results[p2_id][p2_matchup_key].append((opp_p1_matchup['rating'], opp_p1_matchup['rd'], score_p2))

    # Update overall player ratings
    updated_players = []
    for player_id, results in player_results.items():
        player = player_ratings.loc[player_id]

        # Apply time decay before updating (optional)
        # apply_time_decay(player, tournament_end_date)

        new_rating, new_rd, new_sigma = update_player_rating(player, results)

        # Update the player_ratings DataFrame
        player_ratings.loc[player_id, 'rating'] = new_rating
        player_ratings.loc[player_id, 'rd'] = new_rd
        # player_ratings.loc[player_id, 'volatility'] = new_sigma  # Update if volatility changes
        player_ratings.loc[player_id, 'last_updated'] = tournament_end_date

        # Add to updated players list
        updated_players.append({
            'player_id': player_id,
            'tournament_key': tournament_key,
            'tournament_end_date': tournament_end_date,
            'rating': new_rating,
            'rd': new_rd,
            'volatility': player_ratings.loc[player_id, 'volatility']
        })

    # Append updated players' ratings to ratings_history DataFrame.
    # Concatenating with an empty frame is avoided: pandas deprecates ignoring
    # empty entries when it determines the result dtypes.
    if updated_players:
        ratings_history_df = pd.DataFrame(updated_players)
        if ratings_history.empty:
            ratings_history = ratings_history_df
        else:
            ratings_history = pd.concat([ratings_history, ratings_history_df], ignore_index=True)

    # Now update matchup ratings
    updated_matchups = []
    for player_id, matchups in matchup_results.items():
        for matchup_key, results in matchups.items():
            player_matchup = matchup_ratings[player_id][matchup_key]

            # Apply time decay to matchup rating if necessary
            # apply_time_decay_matchup(player_matchup, tournament_end_date)

            new_rating, new_rd, new_sigma = update_player_rating(player_matchup, results)

            # Update the matchup rating (player_matchup is the stored dict itself)
            player_matchup['rating'] = new_rating
            player_matchup['rd'] = new_rd
            # player_matchup['volatility'] = new_sigma  # Update if volatility changes
            player_matchup['last_updated'] = tournament_end_date

            # Add to updated matchups list
            updated_matchups.append({
                'player_id': player_id,
                'matchup_key': matchup_key,
                'tournament_key': tournament_key,
                'tournament_end_date': tournament_end_date,
                'rating': new_rating,
                'rd': new_rd,
                'volatility': player_matchup['volatility']
            })

    # Append updated matchup ratings to matchup_ratings_history DataFrame (same empty-frame rule as above)
    if updated_matchups:
        matchup_ratings_history_df = pd.DataFrame(updated_matchups)
        if matchup_ratings_history.empty:
            matchup_ratings_history = matchup_ratings_history_df
        else:
            matchup_ratings_history = pd.concat([matchup_ratings_history, matchup_ratings_history_df], ignore_index=True)
    


In [None]:
# Loop over the first N_TOURNAMENTS tournaments in tournaments_df order.
# The progress-bar total is the length of the slice being iterated, so the
# bar reaches 100% when the loop finishes.
N_TOURNAMENTS = 1000
tournament_keys = tournaments_df['key'].iloc[:N_TOURNAMENTS]
for tournament_key in tqdm.tqdm(tournament_keys, total=len(tournament_keys)):
    process_tournament(tournament_key)
