In [117]:
import pandas as pd

In [135]:
def json_to_df(filepath, filename):
    """Load a JSON file into a pandas DataFrame.

    Parameters
    ----------
    filepath : str
        Directory prefix. It is concatenated with ``filename`` as-is, so it
        must already end with a path separator.
    filename : str
        Name of the JSON file located under ``filepath``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when the file cannot be opened or
        parsed. In that case "Invalid file" is printed.
    """
    full_path = filepath + filename
    try:
        return pd.read_json(full_path)
    except (OSError, ValueError):
        # Only file-access (OSError) and parse/decoding (ValueError) failures
        # count as an "invalid file". A bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and hide genuine programming errors.
        print("Invalid file")
        return None

def general_dataset_info(dataset):
    """Print a short data-quality summary of ``dataset``.

    Three sections are printed, in order: the per-column count of null
    values, the number of duplicated rows, and the total number of rows.
    Nothing is returned.
    """
    # Each value is computed right before it is printed, one section at a time.
    sections = (
        ("----------Null Values----------\n", lambda: dataset.isnull().sum()),
        ("\n----------Duplicate Values----------\n", lambda: dataset.duplicated().sum()),
        ("\nTotal entries:", lambda: len(dataset)),
    )
    for heading, compute in sections:
        print(heading, compute())

def _require_frame(filepath, filename):
    """Load ``filename`` through ``json_to_df`` and fail loudly if nothing was loaded."""
    frame = json_to_df(filepath, filename)
    if frame is None:
        # json_to_df signals a failed load with None; stop here with a clear
        # message instead of an obscure "'NoneType' is not subscriptable".
        raise ValueError(f"Could not load {filename!r} from {filepath!r}")
    return frame


def _join_unique(values):
    """Join the distinct non-null entries of ``values`` with ', ' (None if there are none)."""
    distinct = values.dropna().astype(str).unique()
    return ', '.join(distinct) if len(distinct) else None


def join_players_and_team(filepath):
    """Build one row per player, enriched with team and league information.

    Reads ``teams.json``, ``players.json`` and ``leagues.json`` from
    ``filepath`` (via ``json_to_df``) and left-joins players -> teams ->
    leagues, so players without a matching team or league are kept with
    null team/league fields.

    Parameters
    ----------
    filepath : str
        Directory prefix passed unchanged to ``json_to_df``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``handle``, ``first_name``, ``last_name``, ``team_name``,
        ``acronym``, ``region``, ``league_name``.

    Raises
    ------
    ValueError
        If any of the three input files could not be loaded.
    """
    teams = _require_frame(filepath, "teams.json")
    teams = teams[['id', 'name', 'acronym', 'home_league_id']]

    # A team id can appear on several rows; collapse them to one row per id,
    # combining the distinct names/acronyms into a comma-separated string.
    # Null entries are skipped so a missing name/acronym cannot break the join.
    teams_by_id = (
        teams.groupby('id')
        .agg({
            'name': _join_unique,
            'acronym': _join_unique,
            'home_league_id': 'first',
        })
        .reset_index()
        .rename(columns={'name': 'team_name'})
    )

    players = _require_frame(filepath, "players.json")
    players = players[['handle', 'first_name', 'last_name', 'home_team_id']].drop_duplicates()

    leagues = _require_frame(filepath, "leagues.json")
    # Rename up front so the result does not depend on the automatic
    # ``_x`` / ``_y`` suffixes pandas adds to clashing column names.
    leagues = leagues[['league_id', 'region', 'name']].rename(columns={'name': 'league_name'})

    merged_df = players.merge(teams_by_id, left_on='home_team_id', right_on='id', how='left')
    merged_df = merged_df.merge(leagues, left_on='home_league_id', right_on='league_id', how='left')

    return merged_df[['handle', 'first_name', 'last_name', 'team_name', 'acronym', 'region', 'league_name']]


In [119]:
# Dataset locations on the Z: drive. Each value ends with a backslash because
# json_to_df builds the full path by plain string concatenation.
# Every backslash is escaped: an unescaped "\e" is an invalid escape sequence
# that Python only tolerates with a warning.
STATS = 'Z:\\'
CH_FOLDER = 'Z:\\vct-challengers\\esports-data\\'
GC_FOLDER = 'Z:\\game-changers\\esports-data\\'
INTL_FOLDER = 'Z:\\vct-international\\esports-data\\'

In [120]:
# Load stats_data.json from the STATS location; json_to_df returns None
# (after printing a message) if the file cannot be read.
stats_df = json_to_df(filepath=STATS, filename="stats_data.json")

In [137]:
# Build one player/team/league table per circuit and drop fully duplicated rows.
ch_df = join_players_and_team(CH_FOLDER).drop_duplicates()
gc_df = join_players_and_team(GC_FOLDER).drop_duplicates()
intl_df = join_players_and_team(INTL_FOLDER).drop_duplicates()

# Print the same quality report for each table under its own header. The
# Game Changers header was previously commented out, which made that report
# appear to belong to the Challengers section.
for section_title, section_df in (
    ("CHALLENGERS DATA\n", ch_df),
    ("\nGAME CHANGERS DATA\n", gc_df),
    ("\nINTERNATIONAL DATA\n", intl_df),
):
    print(section_title)
    general_dataset_info(section_df)

----------Null Values----------
 handle           0
first_name      33
last_name        6
home_team_id     0
dtype: int64

----------Duplicate Values----------
 0

Total entries: 5321
----------Null Values----------
 handle          0
first_name      0
last_name       0
home_team_id    0
dtype: int64

----------Duplicate Values----------
 0

Total entries: 2619
----------Null Values----------
 handle          0
first_name      0
last_name       0
home_team_id    0
dtype: int64

----------Duplicate Values----------
 0

Total entries: 1542
CHALLENGERS DATA

----------Null Values----------
 handle          0
first_name     33
last_name       6
team_name       0
acronym         0
region          0
league_name     0
dtype: int64

----------Duplicate Values----------
 0

Total entries: 5321
----------Null Values----------
 handle         0
first_name     0
last_name      0
team_name      7
acronym        7
region         7
league_name    7
dtype: int64

----------Duplicate Values----------
 

In [None]:
# Stack the three circuit tables into a single frame. ignore_index=True already
# gives the result a fresh 0..n-1 index, so no separate in-place reset_index
# call is needed.
combined_df = pd.concat([ch_df, gc_df, intl_df], ignore_index=True)

general_dataset_info(combined_df)