
# Label Majors
In this notebook we want to:
- Label the major tournaments based on https://liquipedia.net/smash/Major_Tournaments/Melee

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime 

import sqlite3
import sys
import time
import tqdm
from tqdm.auto import tqdm
import pickle
import joblib
import os
import re
from datetime import datetime

# Use the container's data directory when it is present; otherwise fall back
# to the data folder relative to the notebook.
data_path = '/workspace/data/' if os.path.exists('/workspace/data') else '../data/'


## Loading SQLite Database into Pandas DataFrames

The following code connects to an SQLite database (`melee_player_database.db`) and converts each table within the database into a pandas DataFrame. The DataFrames will be stored in a dictionary, where each key corresponds to the table name with `_df` appended, and the values are the respective DataFrames.

### Steps:

1. **Database Connection**: We use the `sqlite3` library to connect to the SQLite database file.
2. **Retrieve Table Names**: A query retrieves all the table names in the database.
3. **Convert Tables to DataFrames**: For each table:
   - The table is loaded into a pandas DataFrame using `pd.read_sql()`.
   - For every column whose entries are all strings, we try to parse each entry with `json.loads()`. If every entry parses, the column is replaced by the parsed Python objects (e.g. lists or dictionaries); if any entry fails to parse, the column is left unchanged.
4. **Store DataFrames**: The DataFrames are stored in a dictionary, where the key is the table name with a `_df` suffix, and the value is the DataFrame.
5. **Database Connection Closed**: Once all tables are loaded into DataFrames, the database connection is closed.

### Example:
If the database contains a table named `players`, the corresponding DataFrame will be stored in the dictionary with the key `players_df`, and can be accessed as:

```python
players_df = dfs['players_df']
```


In [None]:
# Function to get the table names
def get_table_names(conn):
    """Return the names of all tables listed in ``sqlite_master`` for ``conn``."""
    table_rows = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
    name_column = table_rows['name']
    return name_column.tolist()

# Function to load tables into DataFrames
def load_tables_to_dfs(conn):
    """Load every table of the SQLite database behind ``conn`` into a DataFrame.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to read from. It is not closed here.

    Returns
    -------
    dict
        Maps ``"<table name>_df"`` to the DataFrame holding that table's rows.

    Notes
    -----
    A column is JSON-decoded only when every entry is a string AND every entry
    parses with ``json.loads``; otherwise the column is left untouched. Any
    valid JSON is accepted, so a column of strings such as ``"123"`` is
    converted to numbers as well, not just lists and dictionaries.
    """
    dataframes = {}

    for table in get_table_names(conn):
        # Quote the identifier (doubling embedded quotes) so table names that
        # contain spaces or collide with SQL keywords can still be selected.
        quoted_table = '"' + table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)

        # Detect and convert JSON formatted columns (if any)
        for col in df.columns:
            # Only all-string columns are candidates for JSON decoding.
            if not df[col].apply(lambda x: isinstance(x, str)).all():
                continue
            try:
                # All-or-nothing: one unparsable entry leaves the column as is.
                df[col] = df[col].apply(json.loads)
            except (json.JSONDecodeError, TypeError):
                pass

        # Store the DataFrame with table name + '_df'
        dataframes[f"{table}_df"] = df

    return dataframes

# Load the tables once per kernel session: prefer the cached pickle, otherwise
# read the SQLite database. `cell_has_run` marks that `dfs` is already loaded
# so that re-running this cell does not reload the database.
pickle_path = data_path + 'dfs_dict.pkl'

if os.path.exists(pickle_path):
    # NOTE: pickle.load can execute arbitrary code; only load a pickle that
    # this project produced itself.
    with open(pickle_path, 'rb') as f:
        dfs = pickle.load(f)
    cell_has_run = True
# Check if the flag variable exists in the global scope so that this code does not run twice
elif 'cell_has_run' not in globals():
    path = data_path + "melee_player_database.db"

    # sqlite3.connect would silently create an empty database for a missing
    # file, so fail early with a clear error instead.
    if not os.path.exists(path):
        raise FileNotFoundError(f"SQLite database not found: {path}")

    # Connect to the database
    conn = sqlite3.connect(path)
    try:
        # Convert each table into a DataFrame
        dfs = load_tables_to_dfs(conn)
    finally:
        # Close the connection even if loading a table fails
        conn.close()

    # Now, you have a dictionary 'dfs' where each key is the table name with '_df' suffix and value is the corresponding DataFrame.
    # For example, to access the DataFrame for a table called 'players':
    # players_df = dfs['players_df']

    # The start/end columns are converted from seconds-based timestamps to datetimes.
    dfs['tournament_info_df']['start'] = pd.to_datetime(dfs['tournament_info_df']['start'], unit='s')
    dfs['tournament_info_df']['end'] = pd.to_datetime(dfs['tournament_info_df']['end'], unit='s')

    # Set the flag to indicate that the cell has been run
    cell_has_run = True

### Here we adjust the data types of the dataframes so that they are the correct type. (This will be updated as needed.)

In [None]:
# Missing best_of values are replaced by 0 so the column can be cast to int.
sets_table = dfs['sets_df']
sets_table['best_of'] = sets_table['best_of'].fillna(0).astype(int)

In [None]:
# # Save the dictionary of DataFrames as a pickle
# with open(data_path + 'dfs_dict.pkl', 'wb') as f:
#     pickle.dump(dfs, f)

### Here we make dataframes that we will use and print the head.

The integers in 'characters' count the number of games the player has played with that character. (We verify this for Zain below.)

In [None]:
# Alias the players table from the loaded dictionary and preview its first rows.
players_df = dfs['players_df']
players_df.head()


In [None]:
# Alias the ranking table from the loaded dictionary and preview its first rows.
ranking_df = dfs['ranking_df']
ranking_df.head()

In [None]:
# Alias the ranking seasons table from the loaded dictionary and preview its first rows.
ranking_seasons_df = dfs['ranking_seasons_df']
ranking_seasons_df.head()

In [None]:
sets_df = dfs['sets_df']
# Share of sets whose game_data holds at least one entry.
# NOTE(review): assumes every game_data value supports len() — confirm there are no nulls.
has_game_data = sets_df['game_data'].apply(lambda x: len(x) > 0)
print(f"{has_game_data.mean():.1%} of sets have some game data")
sets_df.shape



In [None]:
# Alias the tournament table from the loaded dictionary, then print its shape
# and its first rows.
tournament_info_df = dfs['tournament_info_df']
print(tournament_info_df.shape)
print(tournament_info_df.head())


In [None]:
# Spot-check a single event by its cleaned name.
is_dreamhack_denver = tournament_info_df['cleaned_name'] == 'DreamHack Denver 2017'
tournament_info_df[is_dreamhack_denver]

We copied the information from Liquipedia into a spreadsheet and saved it as a CSV, which we load as a DataFrame.

In [None]:
# Read the majors CSV from the working directory and skip its first six rows.
# NOTE(review): the reason for skipping six rows is not recorded here — confirm.
majors_df = pd.read_csv('melee_majors.csv').iloc[6:]
majors_df.shape

### Clean Up the Tournament Names in Your List
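First, let's clean the tournament names in the list: when a name starts with a phrase that is immediately repeated (e.g. "Genesis 9 Genesis 9"), we drop the first copy of the phrase.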

In [None]:
# Plain Python list of the values in the 'Tournament' column of the majors table.
tournament_list = list(majors_df['Tournament'])

# Function to remove duplicate phrases
def remove_duplicate_phrases(name):
    """Drop a leading phrase that is immediately repeated in ``name``.

    The name is split on whitespace. For the shortest leading run of words
    that is directly followed by an identical run, the first copy is removed
    and the remaining words are joined with single spaces. A name without
    such a repetition is returned unchanged.
    """
    tokens = name.split()
    repeat_len = next(
        (size for size in range(1, len(tokens)) if tokens[:size] == tokens[size:2 * size]),
        None,
    )
    if repeat_len is None:
        return name
    return ' '.join(tokens[repeat_len:])

# De-duplicate every name, then show each original next to its cleaned form.
cleaned_tournament_list = list(map(remove_duplicate_phrases, tournament_list))

print("Cleaned Tournament Names:")
for original, cleaned in zip(tournament_list, cleaned_tournament_list):
    print(f"Original: {original}\nCleaned: {cleaned}\n")

In [None]:

import pandas as pd
import re

# Function to clean tournament names
def clean_tournament_name(name):
    """Normalise a tournament name so that two spellings can be compared.

    Every character other than ASCII letters, digits and whitespace is
    removed, the result is lower-cased, runs of whitespace collapse to a
    single space, and leading/trailing whitespace is stripped.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', name)
    single_spaced = re.sub(r'\s+', ' ', alnum_only.lower())
    return single_spaced.strip()

# Normalise the Liquipedia names and the database names the same way so that
# they can be compared by exact match.
major_tournaments_cleaned = list(map(clean_tournament_name, cleaned_tournament_list))
tournament_info_df['cleaned_name_cleaned'] = tournament_info_df['cleaned_name'].apply(clean_tournament_name)

# A tournament is labelled a major when its normalised name is in the majors list.
tournament_info_df['major'] = tournament_info_df['cleaned_name_cleaned'].isin(major_tournaments_cleaned)

# Report which tournaments were labelled.
majors_in_df = tournament_info_df.loc[tournament_info_df['major']]
print(f"Number of majors found: {len(majors_in_df)}")
print("Majors found:", majors_in_df['cleaned_name'].unique(), sep="\n")


In [None]:
# Majors from the Liquipedia list whose normalised name matches no tournament.
# The set is built once so each major is a constant-time lookup instead of a
# full scan of the column.
known_cleaned_names = set(tournament_info_df['cleaned_name_cleaned'])
missing_majors = [major for major in major_tournaments_cleaned if major not in known_cleaned_names]

print(len(missing_majors))
for major in missing_majors:
    print(major)

In [None]:
# Independent copy of the rows currently labelled as majors, for inspection.
labelled_major_mask = tournament_info_df['major'].eq(True)
temp_df = tournament_info_df[labelled_major_mask].copy()
temp_df

### Search for missing majors one by one

In [None]:
# temp_df = tournament_info_df.copy()
# # temp_df = tournament_info_df[tournament_info_df['city']=='Los Vagas']
# year = 2015
# temp_df = temp_df[temp_df['start']>=datetime(year,1,1)]
# temp_df = temp_df[temp_df['start']<datetime(year,12,30)]
# # temp_df = temp_df[temp_df['entrants']==16]
# # temp_df.sort_values('entrants',inplace=True)
# print(temp_df.shape)
# temp_df

## Add missing majors

In [None]:
# Majors that the name match did not find, identified by hand.
# Entries marked "no id listed" appear on the majors list but have no id here.
missing_majors_5 = [
    39443,  # Tipped Off 15
    38456,  # Get On My Level X
    28389,  # Riptide 2023
    26646,  # Get On My Level 2023
    26137,  # Ludwig 2023 main event
    24918,  # Tipped Off 14
    22595,  # Back in Blood major upset
    17129,  # Smash Summit 14
    15764,  # Lost Tech City 2022
    12948,  # Double Down 2022
    12779,  # Get On My Level 2022
    11293,  # Smash Summit 13
    7532,   # Smash World Tour
    6377,   # SWT 2021 NA East Regional Finals
    5168,   # Riptide 2021
    1233,   # Galint Melee Open: Spring Edition
    2,      # Slippi Champions League Week 1
    3,      # Slippi Champions League Week 2
    4,      # Slippi Champions League Week 3
    5,      # Slippi Champions League Week 4
    667,    # Get On My Line 2020
    167,    # GameTyrant Expo 2018
    30,     # EVO 2018
    41,     # Enthusiast Gaming Live Expo 2018
    51,     # GameTyrant Expo 2017
    # (no id listed) Genesis Fuse doubles circuit finals
    58,     # EVO 2017
    # (no id listed) Shine 2016
    26,     # EVO 2016
    141,    # Super Smash Con
    25,     # EVO 2015
    # (no id listed) WTFox
    165,    # FC Smash 15XR: Return
    14,     # Paragon 2015
]

# Flag the hand-picked rows as majors. `.loc` selects rows by index label, so
# the ids in missing_majors_5 must be index labels of tournament_info_df.
# NOTE(review): assumes the index still lines up with the ids looked up by hand — confirm.
tournament_info_df.loc[missing_majors_5, 'major'] = True


In [None]:
# All tournaments labelled as majors after adding the hand-picked ones.
major_tournament_info_df = tournament_info_df.loc[tournament_info_df['major'].eq(True)]
major_tournament_info_df

## Remove non-majors
Going through the list on the website and comparing it to the majors we found, we remove the ones that were mislabelled. That completes the list of majors.

In [None]:
# Rows that matched a major's name but are side events of that major.
not_actually_majors = [
    36389,  # battle-of-bc-6-7__lowtier-bracket-melee
    16526,  # ludwig-smash-invitational__melee-singles-lcq
]
# Clear the major flag on those rows. `.loc` selects rows by index label, so
# the ids in not_actually_majors must be index labels of tournament_info_df.
tournament_info_df.loc[not_actually_majors, 'major'] = False


We seem to be missing 2 majors.

In [None]:
# Final selection of majors, after the manual additions and removals.
final_major_mask = tournament_info_df['major'].eq(True)
major_tournament_info_df = tournament_info_df[final_major_mask]
major_tournament_info_df

In [None]:
# Persist the labelled majors next to the other data files.
majors_pickle_path = data_path + 'major_tournament_info_df.pkl'
major_tournament_info_df.to_pickle(majors_pickle_path)