In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os

if os.path.exists('/workspace/data'):
    # Load the dictionary of DataFrames from the pickle
    data_path = '/workspace/data/'
else:
    data_path = '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - We check each column to see if any data is JSON-formatted (lists or dictionaries). If so, we convert these columns from strings into their corresponding Python objects using `json.loads()`.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']


In [2]:
# Function to get the table names
def get_table_names(conn):
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    return pd.read_sql(query, conn)['name'].tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    table_names = get_table_names(conn)
    dataframes = {}
    
    for table in table_names:
        # Load table into a DataFrame
        df = pd.read_sql(f"SELECT * FROM {table}", conn)
        
        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Check if any entry in the column is a valid JSON (list or dictionary)
            if df[col].apply(lambda x: isinstance(x, str)).all():
                try:
                    # Try parsing the column as JSON
                    df[col] = df[col].apply(lambda x: json.loads(x) if pd.notnull(x) else x)
                except (json.JSONDecodeError, TypeError):
                    # If it fails, skip the column
                    pass
        
        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df
        
    return dataframes

if os.path.exists(data_path + 'dfs_dict.pkl'):
    cell_has_run = True
    # Load the dictionary of DataFrames from the pickle
    with open(data_path + 'dfs_dict.pkl', 'rb') as f:
        dfs = pickle.load(f)
# Check if the flag variable exists in the global scope so that this code does not run twice
if 'cell_has_run' not in globals():
    path = + data_path + "melee_player_database.db"
    
    # Connect to the database
    conn = sqlite3.connect(path)

    # Convert each table into a DataFrame
    dfs = load_tables_to_dfs(conn)

    # Close the connection
    conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    
    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [3]:
dfs['sets_df']['best_of'] = dfs['sets_df']['best_of'].fillna(0).astype(int) 

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played that character. (We verify this for Zain below.)

In [4]:
players_df = dfs['players_df']
players_df.head()

Unnamed: 0,game,player_id,tag,all_tags,prefixes,social,country,state,region,c_country,c_state,c_region,placings,characters,alias
0,melee,Rishi,Rishi,[Rishi],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
1,melee,15634,lloD,"[lloD, VGz | lloD, Llod]",[],{'twitter': ['lloD74']},United States,VA,,US,CA,Laurel,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/peach': 1089, 'melee/falco': 1, 'melee...",
2,melee,6126,Zain,"[Zain, DontTestMe]",[PG],{'twitter': ['PG_Zain']},United States,VA,,US,CA,Los Angeles,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/marth': 1065, 'melee/pichu': 1, 'melee...",DontTestMe
3,melee,Chu,Chu,[Chu],[],{'twitter': []},,,,,,,[{'key': 'mdva-invitational-2017-(challonge-mi...,,
4,melee,5620,Junebug,"[Junebug, LS | VGz Junebug]",[],{'twitter': ['arJunebug']},United States,VA,,US,VA,Richmond,[{'key': 'mdva-invitational-2017-(challonge-mi...,"{'melee/sheik': 46, 'melee/falco': 4, 'melee/g...",


In [5]:
ranking_df = dfs['ranking_df']
ranking_df.head()

Unnamed: 0,game,ranking_name,priority,region,seasons,tournaments,icon
0,melee,SSBMRank,0,world,"[2015, 2016, 2017, 2018, 2019]",[],miom


In [6]:
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

Unnamed: 0,game,ranking_name,season,start,end,total,by_id,by_placing,final,name
0,melee,SSBMRank,2015,1420070400,1451606399,100,"{'6189': 1, '1004': 2, '4465': 3, '1000': 4, '...","{'1': '6189', '2': '1004', '3': '4465', '4': '...",0,
1,melee,SSBMRank,2016,1451606400,1483228799,100,"{'6189': 1, '1004': 2, '1000': 3, '1003': 4, '...","{'1': '6189', '2': '1004', '3': '1000', '4': '...",0,
2,melee,SSBMRank,2017,1483228800,1514764799,100,"{'1004': 1, '6189': 2, '1000': 3, '1003': 4, '...","{'1': '1004', '2': '6189', '3': '1000', '4': '...",0,
3,melee,SSBMRank,2018,1514793600,1546329600,100,"{'1004': 1, '6189': 2, '4465': 3, '15990': 4, ...","{'1': '1004', '2': '6189', '3': '4465', '4': '...",0,
4,melee,SSBMRank,2019,1546329600,1577836800,100,"{'1004': 1, '4465': 2, '1000': 3, '16342': 4, ...","{'1': '1004', '2': '4465', '3': '1000', '4': '...",0,


In [15]:
sets_df = dfs['sets_df']
print(sets_df.shape)
# print(f'{sets_df[sets_df['game_data'].apply(lambda x: len(x) > 0)].shape[0] / sets_df.shape[0]:0.01%} percent of sets have some game data')
# sets_df.head()

(1795681, 14)


In [8]:
tournament_info_df = dfs['tournament_info_df']
tournament_info_df.head()

Unnamed: 0,game,key,cleaned_name,source,tournament_name,tournament_event,season,rank,start,end,country,state,city,entrants,placings,losses,bracket_types,online,lat,lng
0,melee,mdva-invitational-2017-(challonge-mirror),MDVA Invitational 2017 (Challonge Mirror),challonge,https://challonge.com/mdva_invitational_2017,,17,,2017-11-26 08:05:11,2017-11-26 08:48:09,US,VA,Fall's Church,10,"[[Rishi, 1], [15634, 3], [6126, 4], [Chu, 8], ...",{},b'{}',0,,
1,melee,s@sh7,S@SH7,challonge,https://challonge.com/sash7,,17,,2017-06-13 10:27:01,2017-06-13 10:27:01,US,MI,Ann Arbor,92,[],{},b'{}',0,,
2,melee,slippi-champions-league-week-1__melee-singles,Slippi Champions League Week 1,pgstats,slippi-champions-league-week-1,melee-singles,20,,2020-10-11 14:00:00,2020-10-11 14:00:00,,,,20,"[[1000, 1], [6126, 2], [4107, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
3,melee,slippi-champions-league-week-2__melee-singles,Slippi Champions League Week 2,pgstats,slippi-champions-league-week-2,melee-singles,20,,2020-10-18 14:00:00,2020-10-18 14:00:00,,,,20,"[[6126, 1], [4107, 2], [1000, 3], [19554, 3], ...",{},b'{}',1,0.0,0.0
4,melee,slippi-champions-league-week-3__melee-singles,Slippi Champions League Week 3,pgstats,slippi-champions-league-week-3,melee-singles,20,,2020-10-25 14:00:00,2020-10-25 14:00:00,,,,20,"[[6126, 1], [3359, 2], [19554, 3], [4107, 3], ...",{},b'{}',1,0.0,0.0


We want to filter to only sets with valid score.

In [14]:
# print(sets_df['p1_score'].value_counts())

filtered_sets_df = sets_df[sets_df['p1_score'].isin([0,1,2,3])]
filtered_sets_df = filtered_sets_df[filtered_sets_df['p2_score'].isin([0,1,2,3])]
filtered_sets_df = filtered_sets_df[filtered_sets_df['best_of'].isin([3,5])]

mask = np.max(filtered_sets_df[['p1_score','p2_score']], axis = 1) == (filtered_sets_df['best_of'] // 2 + 1)

filtered_sets_df = filtered_sets_df[mask]
print(filtered_sets_df['p1_score'].info())
filtered_sets_df[['p1_score','p2_score','best_of']].value_counts()


<class 'pandas.core.series.Series'>
Index: 1194287 entries, 0 to 1795642
Series name: p1_score
Non-Null Count    Dtype
--------------    -----
1194287 non-null  int64
dtypes: int64(1)
memory usage: 18.2 MB
None


p1_score  p2_score  best_of
2         0         3          612390
0         2         3          196067
2         1         3          147365
1         2         3           94181
3         0         5           48454
          1         5           26862
0         3         5           20836
1         3         5           17268
3         2         5           17182
2         3         5           13678
          2         3               4
Name: count, dtype: int64

In [10]:
tau = .5

def process_tournament(tournament_key, player_ratings_df, tournament_info_df, sets_df):
    """
    Process a tournament to update player ratings.
    """
    # Get the sets for this tournament
    tournament_sets_df = sets_df[sets_df['tournament_key'] == tournament_key]
    
    # Extract the unique player IDs from the sets
    tournament_players = pd.unique(tournament_sets_df[['p1_id', 'p2_id']].values.ravel())
   

    # Add any new players to the player_ratings_df
    new_players = [player for player in tournament_players if player not in player_ratings_df.index]
    if new_players:
        new_player_df = pd.DataFrame({
            'dates': [[datetime.datetime(2015, 1, 11, 14, 16, 0)] for _ in new_players],
            'rating_history': [[1500.0] for _ in new_players],
            'rd_history': [[350.0] for _ in new_players],
            'glicko2': [Player() for _ in new_players]
        }, index=new_players)
        new_player_df['glicko2']._tau = tau
        # Check the IDs of the Player objects
    
        player_ratings_df = pd.concat([player_ratings_df, new_player_df], ignore_index=False)
    

    # Ensure the index name is set to 'player_id'
    player_ratings_df.index.name = 'player_id'

    # Create a mapping from player_id to their Glicko2 player object
    player_map = player_ratings_df.loc[tournament_players]['glicko2'].to_dict()

    # Create a snapshot of ratings before the tournament
    ratings_snapshot = player_ratings_df.loc[tournament_players][['glicko2']].copy()
    ratings_snapshot['rating'] = ratings_snapshot['glicko2'].apply(lambda x: x.getRating())
    ratings_snapshot['rd'] = ratings_snapshot['glicko2'].apply(lambda x: x.getRd())
    # Add 'opponent_id' column from the index
    ratings_snapshot.reset_index(inplace=True)
    ratings_snapshot.rename(columns={'player_id': 'opponent_id'}, inplace=True)
    ratings_snapshot = ratings_snapshot[['opponent_id', 'rating', 'rd']]
    # print(ratings_snapshot)
    
    # Prepare player matches DataFrame
    df_p1 = tournament_sets_df[['p1_id', 'p2_id', 'p1_score', 'p2_score']].copy()
    df_p1.rename(columns={'p1_id': 'player_id', 'p2_id': 'opponent_id', 'p1_score': 'player_score', 'p2_score':'opponent_score'}, inplace=True)
    df_p2 = tournament_sets_df[['p2_id', 'p1_id',  'p2_score', 'p1_score']].copy()
    df_p2.rename(columns={'p2_id': 'player_id', 'p1_id': 'opponent_id', 'p2_score': 'player_score', 'p1_score':'opponent_score'}, inplace=True)
    
    player_matches = pd.concat([df_p1, df_p2], ignore_index=True)
   
    player_matches_win = pd.DataFrame({'player_id': player_matches.player_id.repeat(player_matches['player_score'])})
    player_matches_win['opponent_id'] = player_matches.opponent_id.repeat(player_matches['player_score'])
    player_matches_win['outcome'] = 1
    player_matches_lose = pd.DataFrame({'player_id': player_matches.player_id.repeat(player_matches['opponent_score'])})
    player_matches_lose['opponent_id'] = player_matches.opponent_id.repeat(player_matches['opponent_score'])
    player_matches_lose['outcome'] = 0
    
    player_matches_expanded = pd.concat([player_matches_win, player_matches_lose])

    player_matches_expanded = player_matches_expanded.merge(
        ratings_snapshot,
        on='opponent_id', how='left'
    )
    
    player_matches_expanded.rename(columns={'rating': 'opponent_rating', 'rd': 'opponent_rd'}, inplace=True)

    grouped = player_matches_expanded.groupby('player_id').agg({
        'opponent_rating': list,
        'opponent_rd': list,
        'outcome': list
    }).reset_index()
    
    # Get the tournament end date
    end_date = tournament_info_df.loc[tournament_info_df['key'] == tournament_key, 'end'].values[0]
    end_date = pd.to_datetime(end_date)
    
    def process_player(row):
            player_id = row['player_id']
            rating_list = row['opponent_rating']
            rd_list = row['opponent_rd']
            outcome_list = row['outcome']
            
            # Update Glicko rating for the player
            player_glicko = player_map[player_id]
            if rating_list and len(rating_list) > 0:  # Ensure player has matches to process
                try:
                    # Use a safe update to avoid division by zero errors
                    player_glicko.update_player(rating_list, rd_list, outcome_list)
                except ZeroDivisionError:
                    print(f"ZeroDivisionError: player_id={player_id}, tournament_key={tournament_key}")
                except Exception as e:
                    print(f"Unexpected error for player_id={player_id}, tournament_key={tournament_key}: {e}")
            
            # Update the player's history
            player_ratings_df.at[player_id, 'dates'].append(end_date)
            player_ratings_df.at[player_id, 'rating_history'].append(player_glicko.getRating())
            player_ratings_df.at[player_id, 'rd_history'].append(player_glicko.getRd())
            
    # Apply the function to each row using apply
    grouped.apply(process_player, axis=1)   
    
    return player_ratings_df  # Return the updated DataFrame

sorted_tournament_info_df = tournament_info_df.sort_values('end').reset_index(drop=True)
sorted_tournament_info_df.shape[0]

# Initialize player ratings DataFrame
player_ratings_df = pd.DataFrame(columns=[
     'dates', 'rating_history', 'rd_history', 'glicko2'
])




# Loop over tournaments
for  tournament_key in tqdm.tqdm(sorted_tournament_info_df['key'], total=sorted_tournament_info_df.shape[0]):
    # print(tournament)
    player_ratings_df = process_tournament(tournament_key, player_ratings_df, tournament_info_df, filtered_sets_df)
    
# Loop over tournaments
# for idx, tournament in tqdm.tqdm(sorted_tournament_info_df[:1000].iterrows(), total=sorted_tournament_info_df.shape[0]):
#     player_ratings_df = process_tournament(tournament['key'], player_ratings_df, tournament_info_df, sets_df)


AttributeError: type object 'tqdm' has no attribute 'tqdm'

In [None]:
player_ratings_df

In [None]:
df = player_ratings_df.copy()

In [None]:
# Convert lists to NumPy arrays in the specified columns
for col in ['dates', 'rating_history', 'rd_history']:
    print(col)
    player_ratings_df[col] = player_ratings_df[col].apply(np.array)
# Save the DataFrame to a pickle file
# player_ratings_df.to_pickle(data_path + 'overall_players_ranking_based_on_games.pkl')


In [None]:
# Load the DataFrame to verify
loaded_df = pd.read_pickle(data_path + 'overall_players_ranking_based_on_games.pkl')
# loaded_df.head()

In [None]:
# max_rating = 0

# for index, row in loaded_df.iterrows():
#     index_max_rating = np.argmax(row['rating_history'].astype(int))  # Correcting the typo to `astype`
#     if row['rating_history'][index_max_rating] > max_rating:
#         best_row = row
#         best_index = index
#         max_rating = row['rating_history'][index_max_rating]
#         print(players_df[players_df['player_id'] == index]['tag'], max_rating)
#         # print(max_rating)


        
# print(max_rating)


In [None]:
# players_df[players_df['player_id'] == best_index]