In [2]:
import pandas as pd
import json

In [3]:
def resetDict():
    """Return a fresh column store for per-player game statistics.

    Each key is a column name of the DataFrame built later and maps to its
    own new, empty list, ready to receive one value per statistics record.
    """
    column_names = (
        "gameId", "teamId", "points", "playerId", "min",
        "fgm", "fga", "ftm", "fta", "tpm", "tpa",
        "offReb", "defReb", "assists", "steals", "blocks",
        "pFouls", "turnovers", "plusMinus",
    )
    # A comprehension (not dict.fromkeys) so that no two columns share a list.
    return {name: [] for name in column_names}

def fillDataFrame(games, json_file):
    """Flatten API statistics payloads into a DataFrame.

    Parameters
    ----------
    games : dict
        Column store mapping each statistic name listed below to a list.
        The lists are appended to in place.
    json_file : iterable
        Parsed API responses; each one is indexed as
        ``response["api"]["statistics"]`` to obtain its list of records.

    Returns
    -------
    pandas.DataFrame
        Built from ``games`` once every record has been appended. Values are
        stored exactly as found in the records; no type conversion happens here.

    Raises
    ------
    KeyError
        If a response, a record or ``games`` lacks one of the expected keys.
    """
    stat_fields = (
        "gameId", "teamId", "points", "playerId", "min",
        "fgm", "fga", "ftm", "fta", "tpm", "tpa",
        "offReb", "defReb", "assists", "steals", "blocks",
        "pFouls", "turnovers", "plusMinus",
    )

    for response in json_file:
        for record in response["api"]["statistics"]:
            # One value per column for every statistics record.
            for field in stat_fields:
                games[field].append(record[field])

    return pd.DataFrame(games)

def toIntMin(pStr):
    """Return the whole-minutes part of a playing-time string.

    Parameters
    ----------
    pStr : str
        Playing time such as ``"41:05"``; surrounding whitespace is ignored.

    Returns
    -------
    int
        The number before the first colon, the whole value when there is no
        colon, or 0 for an empty / whitespace-only string.

    Raises
    ------
    ValueError
        If the minutes part is not a valid integer literal.
    """
    text = pStr.strip()
    if not text:
        return 0

    # partition() hands back the full string when ":" is absent. The earlier
    # find()-based slice used index -1 in that case and silently dropped the
    # last character ("12" became 1, "5" raised ValueError).
    minutes, _, _ = text.partition(":")
    return int(minutes)

def toIntSec(pStr):
    """Return the seconds part of a playing-time string.

    Parameters
    ----------
    pStr : str
        Playing time such as ``"41:05"``; surrounding whitespace is ignored.

    Returns
    -------
    int
        The number after the first colon, or 0 when the string is empty,
        whitespace-only, or contains no colon at all.

    Raises
    ------
    ValueError
        If the text after the colon is not a valid integer literal.
    """
    text = pStr.strip()
    if not text:
        return 0

    _, colon, seconds = text.partition(":")
    if not colon:
        # No seconds component present. The earlier find()-based slice
        # returned the whole value here, reporting minutes as seconds.
        return 0
    return int(seconds)

def convertTodigit(digit, isInt=True):
    """Parse a numeric string, mapping blank input to 0.

    Parameters
    ----------
    digit : str
        Text to convert; surrounding whitespace is ignored.
    isInt : bool, default True
        Parse with ``int`` when truthy, otherwise with ``float``.

    Returns
    -------
    int or float
        The parsed number. Blank input yields the int ``0`` in both modes.
    """
    text = digit.strip()
    if not text:
        return 0

    parse = int if isInt else float
    return parse(text)

def convertDFdatatypes(df):
    """Derive minutes/seconds columns and convert the statistic columns.

    ``df`` is modified in place and also returned. The raw ``"min"`` column is
    kept; ``"minutes"`` and ``"seconds"`` are added from it via ``toIntMin``
    and ``toIntSec``. Every other statistic column listed below is overwritten
    with the result of ``convertTodigit`` (integer mode).
    """
    # Split the raw playing time into two new columns.
    for new_col, splitter in (("minutes", toIntMin), ("seconds", toIntSec)):
        df[new_col] = df["min"].apply(splitter)

    numeric_cols = (
        "points", "fgm", "fga", "ftm", "fta", "tpm", "tpa",
        "offReb", "defReb", "assists", "steals", "blocks",
        "pFouls", "turnovers", "plusMinus",
    )
    for col in numeric_cols:
        df[col] = df[col].apply(convertTodigit)

    return df

In [4]:
# player info
def resetPlayerDict():
    """Return a fresh column store for player profile data.

    Each key is a column name of the players DataFrame built later and maps
    to its own new, empty list.
    """
    profile_fields = (
        "playerId", "firstName", "lastName", "country",
        "dateOfBirth", "startNba", "heightInMeters", "weightInKilograms",
    )
    # A comprehension (not dict.fromkeys) so that no two columns share a list.
    return {field: [] for field in profile_fields}

def populatePlayersDF(players, json_file):
    """Build a de-duplicated player profile DataFrame from API responses.

    Parameters
    ----------
    players : dict
        Column store mapping each profile field listed below to a list.
        The lists are appended to in place.
    json_file : iterable
        Parsed API responses; the first element of
        ``response["api"]["players"]`` is read from each one.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``playerId`` (first occurrence kept, original
        row labels retained), with missing values and empty strings replaced
        by the string ``"0"``.
    """
    profile_fields = (
        "playerId", "firstName", "lastName", "country",
        "dateOfBirth", "startNba", "heightInMeters", "weightInKilograms",
    )

    for response in json_file:
        profile = response["api"]["players"][0]
        for field in profile_fields:
            players[field].append(profile[field])

    return (
        pd.DataFrame(players)
        .drop_duplicates(subset="playerId", keep="first")
        # Nulls and empty strings both become the string "0".
        .fillna("0")
        .replace("", "0")
    )

def convertPlayerInfo(df):
    """Convert the numeric player-profile columns from strings to numbers.

    ``df`` is modified in place and also returned. ``"startNba"`` goes through
    ``convertTodigit`` in integer mode; ``"heightInMeters"`` and
    ``"weightInKilograms"`` go through it in float mode.
    """
    def _as_float(raw):
        return convertTodigit(raw, isInt=False)

    df["startNba"] = df["startNba"].apply(convertTodigit)
    for col in ("heightInMeters", "weightInKilograms"):
        df[col] = df[col].apply(_as_float)

    return df

In [5]:
def process(file_name, games=True):
    """Load a JSON dump of API responses and return it as a converted DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Path of the JSON file to read.
    games : bool, default True
        When truthy the file is handled as game statistics
        (``resetDict`` -> ``fillDataFrame`` -> ``convertDFdatatypes``);
        otherwise as player profiles
        (``resetPlayerDict`` -> ``populatePlayersDF`` -> ``convertPlayerInfo``).

    Returns
    -------
    pandas.DataFrame
        The frame produced by the selected pipeline.
    """
    # JSON interchange text is UTF-8. Without an explicit encoding, open()
    # falls back to the platform locale, which can mis-decode or reject
    # non-ASCII characters on some systems.
    with open(file_name, encoding="utf-8") as json_file:
        data = json.load(json_file)

    if games:
        # Own local name so the boolean `games` flag is not rebound to a dict.
        stat_columns = resetDict()
        df = fillDataFrame(stat_columns, data)
        df = convertDFdatatypes(df)
    else:
        playerInfo = resetPlayerDict()
        df = populatePlayersDF(playerInfo, data)
        df = convertPlayerInfo(df)
    return df

In [6]:
# Parse each raw game-details dump into its own converted DataFrame.
df1, df2, df3, df4, df5 = map(process, (
    '../datasets/game_details.json',
    '../datasets/game_details_2.json',
    '../datasets/game_details_3.json',
    '../datasets/game_details_4.json',
    '../datasets/game_details_5.json',
))

In [7]:
# games=False selects the player-profile pipeline inside process().
playerInfo = process('../datasets/playerinfo.json', games=False)

In [8]:
# Sanity check on the player table: (rows, columns), then the dtype of each column.
print(playerInfo.shape)
playerInfo.dtypes

(132, 8)


playerId              object
firstName             object
lastName              object
country               object
dateOfBirth           object
startNba               int64
heightInMeters       float64
weightInKilograms    float64
dtype: object

In [9]:
# Earlier attempt, left disabled:
# df.append(df2, ignore_index=True)
# df.append(df3, ignore_index=True)
# df.append(df4, ignore_index=True)
# df.append(df5, ignore_index=True)
# Stack the five per-file frames into a single frame.
# NOTE(review): ignore_index is not passed, so every frame keeps its own row
# labels and labels repeat across frames — confirm that is intended before
# relying on label-based lookups or on the index column of the exported CSV.
frames = [df1, df2, df3, df4, df5]
df = pd.concat(frames)

In [10]:
# Size of the combined frame, then the number of stat rows per game, ordered by gameId.
print(df.shape)
df["gameId"].value_counts().sort_index()

(44514, 21)


4308    32
4309    39
4310    37
4311    40
4312    33
        ..
6217    26
6218    26
6219    26
6220    26
6221    26
Name: gameId, Length: 1381, dtype: int64

In [11]:
# Spot-check a single game. The id is compared as a string because gameId is
# appended as-is by fillDataFrame and is not among the columns that
# convertDFdatatypes converts.
df[df['gameId'] == "6221"]

Unnamed: 0,gameId,teamId,points,playerId,min,fgm,fga,ftm,fta,tpm,...,offReb,defReb,assists,steals,blocks,pFouls,turnovers,plusMinus,minutes,seconds
1554,6221,38,22,314,41:05,7,16,7,8,1,...,1,5,3,2,1,4,2,-2,41,5
1555,6221,38,26,479,46:10,10,17,3,4,3,...,2,8,3,1,1,2,2,2,46,10
1556,6221,38,3,184,26:34,0,5,3,4,0,...,3,6,4,0,0,4,1,-7,26,34
1557,6221,38,0,203,17:43,0,0,0,0,0,...,0,1,3,1,0,1,1,7,17,43
1558,6221,38,26,327,41:42,9,16,4,6,4,...,2,5,10,3,0,5,3,16,41,42
1559,6221,38,0,732,0:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1560,6221,38,15,255,22:07,7,12,1,2,0,...,2,1,2,0,0,4,1,9,22,7
1561,6221,38,0,320,0:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1562,6221,38,0,345,0:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1563,6221,38,0,358,0:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Persist the combined game statistics. No index argument is given, so the row
# index is written too (to_csv default).
df.to_csv('../datasets/gamedetails.csv')

In [13]:
# Persist the player profile table. No index argument is given, so the row
# index is written too (to_csv default).
playerInfo.to_csv('../datasets/playersinfo.csv')

## GameId

In [13]:
# Load the raw game-schedule response. The encoding is explicit because JSON
# interchange text is UTF-8, while open() otherwise falls back to the platform
# locale and can mis-decode or reject non-ASCII characters on some systems.
with open('../datasets/gameid.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [20]:
# Inspect the game records stored under data["api"]["games"].
data["api"]["games"]

[{'seasonYear': '2018',
  'league': 'standard',
  'gameId': '4308',
  'startTimeUTC': '2018-09-28T23:00:00.000Z',
  'endTimeUTC': '2018-09-29T01:24:00.000Z',
  'arena': 'Wells Fargo Center',
  'city': 'Philadelphia',
  'country': 'USA',
  'clock': '',
  'gameDuration': '2:08',
  'currentPeriod': '4/4',
  'halftime': '0',
  'EndOfPeriod': '0',
  'seasonStage': '1',
  'statusShortGame': '3',
  'statusGame': 'Finished',
  'vTeam': {'teamId': '18',
   'shortName': 'MEL',
   'fullName': 'Melbourne United',
   'nickName': 'United',
   'logo': 'https://upload.wikimedia.org/wikipedia/en/thumb/1/1b/Melbourne_United_logo.svg/220px-Melbourne_United_logo.svg.png',
   'score': {'points': '84'}},
  'hTeam': {'teamId': '27',
   'shortName': 'PHI',
   'fullName': 'Philadelphia 76ers',
   'nickName': '76ers',
   'logo': 'https://upload.wikimedia.org/wikipedia/fr/4/48/76ers_2016.png',
   'score': {'points': '104'}}},
 {'seasonYear': '2018',
  'league': 'standard',
  'gameId': '4309',
  'startTimeUTC': '

In [17]:
# Column store with one empty list per column; presumably filled from
# data["api"]["games"] in code further down — TODO confirm.
# NOTE(review): the key is spelled "startTimeUTS" while the records displayed
# above use 'startTimeUTC', and they expose 'vTeam'/'hTeam' rather than "aTeam".
# Confirm the spellings are intentional before renaming, since later cells may
# read these exact keys.
games = dict()
games["gameId"] = list()
games["startTimeUTS"] = list()
games["aTeam"] = list()
games["hTeam"] = list()