In [1]:
import pandas as pd
import requests
import time

In [2]:
SLEEP = 0
SEASONS = [2021, 2022, 2023]
with open('../balldontlie.apikey', 'r') as f:
    API_KEY = f.read()
    f.close()



# Helpers

In [7]:
def get_players_df(cursor):
    resp = requests.get(
        url='https://api.balldontlie.io/v1/players',
        params={
            'per_page': 100,
            'cursor': cursor
        },
        headers={
            'Authorization': API_KEY
        }
    )

    if resp.ok:
        print(f'on cursor {cursor}, meta: {resp.json()['meta']}')
        
        df = pd.Series(resp.json()['data']).apply(pd.Series)
        #df['current_team_id'] = df.team.apply(lambda x: x['id'])
        #df = df.drop(columns='team')

        if 'next_cursor' in resp.json()['meta'].keys():
            next_cursor = resp.json()['meta']['next_cursor']
        else:
            next_cursor = None

        return df, next_cursor
    
    else:
        print(f'FAIL cursor {cursor}')

        return None, None
    


In [12]:
def get_other_df(dfname, cursor):
    resp = requests.get(
        url=f'https://api.balldontlie.io/v1/{dfname}',
        params={
            'per_page': 100,
            'cursor': cursor,
            'seasons': SEASONS
        },
        headers={
            'Authorization': API_KEY
        }
    )

    if resp.ok:
        print(f'on cursor {cursor}, meta: {resp.json()['meta']}')
        
        df = pd.Series(resp.json()['data']).apply(pd.Series)
        #df['current_team_id'] = df.team.apply(lambda x: x['id'])
        #df = df.drop(columns='team')

        if 'next_cursor' in resp.json()['meta'].keys():
            next_cursor = resp.json()['meta']['next_cursor']
        else:
            next_cursor = None

        return df, next_cursor
    
    else:
        print(f'FAIL cursor {cursor}')

        return None, None
    


# Team Data

In [5]:
resp = requests.get(
    url='https://api.balldontlie.io/v1/teams',
    params={
        'per_page': 100,
        'cursor': 0
    },
    headers={
        'Authorization': API_KEY
    }
)

df = pd.Series(resp.json()['data']).apply(pd.Series)
df.to_csv('../data/data_raw/teams.csv', index=False)

# Player Data

In [8]:
dfs = []
cursor = 0

df, next_cursor = get_players_df(cursor)
dfs.append(df)

cursor = next_cursor

while cursor:

    df, next_cursor = get_players_df(cursor)
    dfs.append(df)

    cursor = next_cursor

    time.sleep(SLEEP)

df = pd.concat(dfs)
df.to_csv('../data/data_raw/players.csv', index=False)

on cursor 0, meta: {'next_cursor': 100, 'per_page': 100}
on cursor 100, meta: {'prev_cursor': 100, 'next_cursor': 200, 'per_page': 100}
on cursor 200, meta: {'prev_cursor': 200, 'next_cursor': 300, 'per_page': 100}
on cursor 300, meta: {'prev_cursor': 300, 'next_cursor': 400, 'per_page': 100}
on cursor 400, meta: {'prev_cursor': 400, 'next_cursor': 500, 'per_page': 100}
on cursor 500, meta: {'prev_cursor': 500, 'next_cursor': 600, 'per_page': 100}
on cursor 600, meta: {'prev_cursor': 600, 'next_cursor': 700, 'per_page': 100}
on cursor 700, meta: {'prev_cursor': 700, 'next_cursor': 800, 'per_page': 100}
on cursor 800, meta: {'prev_cursor': 800, 'next_cursor': 900, 'per_page': 100}
on cursor 900, meta: {'prev_cursor': 900, 'next_cursor': 1000, 'per_page': 100}
on cursor 1000, meta: {'prev_cursor': 1000, 'next_cursor': 1100, 'per_page': 100}
on cursor 1100, meta: {'prev_cursor': 1100, 'next_cursor': 1200, 'per_page': 100}
on cursor 1200, meta: {'prev_cursor': 1200, 'next_cursor': 1300, 'p

# Games Data

In [13]:
dfs = []
cursor = 0

df, next_cursor = get_other_df('games', cursor)
dfs.append(df)

cursor = next_cursor

while cursor:

    df, next_cursor = get_other_df('games', cursor)
    dfs.append(df)

    cursor = next_cursor

    time.sleep(SLEEP)

df = pd.concat(dfs)
df.to_csv('../data/data_raw/games.csv', index=False)

on cursor 0, meta: {'next_cursor': 473417, 'per_page': 100}
on cursor 473417, meta: {'prev_cursor': 473417, 'next_cursor': 473515, 'per_page': 100}
on cursor 473515, meta: {'prev_cursor': 473515, 'next_cursor': 473608, 'per_page': 100}
on cursor 473608, meta: {'prev_cursor': 473608, 'next_cursor': 473709, 'per_page': 100}
on cursor 473709, meta: {'prev_cursor': 473709, 'next_cursor': 473809, 'per_page': 100}
on cursor 473809, meta: {'prev_cursor': 473809, 'next_cursor': 473909, 'per_page': 100}
on cursor 473909, meta: {'prev_cursor': 473909, 'next_cursor': 474013, 'per_page': 100}
on cursor 474013, meta: {'prev_cursor': 474013, 'next_cursor': 474106, 'per_page': 100}
on cursor 474106, meta: {'prev_cursor': 474106, 'next_cursor': 474206, 'per_page': 100}
on cursor 474206, meta: {'prev_cursor': 474206, 'next_cursor': 474312, 'per_page': 100}
on cursor 474312, meta: {'prev_cursor': 474312, 'next_cursor': 474403, 'per_page': 100}
on cursor 474403, meta: {'prev_cursor': 474403, 'next_cursor

In [30]:
gamesdf = pd.read_csv('../data/data_raw/games.csv')
gamesdf = gamesdf.sort_values('date')

# Stats Data

Doing it this way, because using cursors sometimes leads to "dead-ends" with stats data, aka not all games get loaded.

In [42]:
dfs = []
invalid_games = []

for i, row in gamesdf.iterrows():

    print(f'processing date={row.date}, game={row.id}')
    
    resp = requests.get(
        url='https://api.balldontlie.io/v1/stats',
        params={
            'game_ids[]': row.id
        },
        headers={
            'Authorization': API_KEY
        }
    )

    if resp.ok:
        df = pd.Series(resp.json()['data']).apply(pd.Series)
        dfs.append(df)
    else:
        invalid_games.append(row.id)

df = pd.concat(dfs)
df.to_csv('../data/data_raw/stats.csv', index=False)

processing date=2021-08-03, game=448625
processing date=2021-08-03, game=448628
processing date=2021-08-04, game=449206
processing date=2021-08-04, game=449208
processing date=2021-08-06, game=450930
processing date=2021-08-08, game=451505
processing date=2021-08-08, game=451506
processing date=2021-08-08, game=451507
processing date=2021-08-08, game=451508
processing date=2021-08-08, game=451510
processing date=2021-08-08, game=451511
processing date=2021-08-08, game=451512
processing date=2021-08-09, game=452857
processing date=2021-08-09, game=452672
processing date=2021-08-09, game=452671
processing date=2021-08-09, game=452670
processing date=2021-08-09, game=452667
processing date=2021-08-09, game=452668
processing date=2021-08-09, game=452666
processing date=2021-08-09, game=452665
processing date=2021-08-09, game=452669
processing date=2021-08-10, game=455103
processing date=2021-08-10, game=455104
processing date=2021-08-10, game=455105
processing date=2021-08-10, game=455106


In [49]:
# processing INVALID games
dfs = []
invalid_games_twice = []

for gid in invalid_games:

    
    resp = requests.get(
        url='https://api.balldontlie.io/v1/stats',
        params={
            'game_ids[]': gid
        },
        headers={
            'Authorization': API_KEY
        }
    )

    if resp.ok:
        df = pd.Series(resp.json()['data']).apply(pd.Series)
        dfs.append(df)
        print(f'processing game={gid}, status OK')
    else:
        invalid_games_twice.append(gid)
        print(f'processing game={gid}, status INVALID')

df = pd.concat(dfs)
df.to_csv('../data/data_raw/stats_invalid.csv', index=False)

processing game=473978, status OK
processing game=473984, status OK
processing game=473980, status OK
processing game=473986, status OK
processing game=473988, status OK
processing game=473989, status OK
processing game=473987, status OK
processing game=473991, status OK
processing game=473992, status OK
processing game=473993, status OK
processing game=473995, status OK
processing game=473990, status OK
processing game=473998, status OK
processing game=473996, status OK
processing game=473994, status OK
processing game=473997, status OK
processing game=474002, status OK
processing game=474003, status OK
processing game=473999, status OK
processing game=474001, status OK
processing game=474004, status OK
processing game=474000, status OK
processing game=474010, status OK
processing game=474014, status OK
processing game=474011, status OK
processing game=474009, status OK
processing game=474006, status OK
processing game=474008, status OK
processing game=474007, status OK
processing gam

# Advanced Stats Data

In [None]:
dfs = []
invalid_games = []

for i, row in gamesdf.iterrows():
    
    resp = requests.get(
        url='https://api.balldontlie.io/v1/stats/advanced',
        params={
            'game_ids[]': row.id
        },
        headers={
            'Authorization': API_KEY
        }
    )

    if resp.ok:
        df = pd.Series(resp.json()['data']).apply(pd.Series)
        dfs.append(df)
        print(f'processing date={row.date}, game={row.id}, status=OK')
    else:
        invalid_games.append(row.id)
        print(f'processing date={row.date}, game={row.id}, status=INVALID')

df = pd.concat(dfs)
df.to_csv('../data/data_raw/advanced.csv', index=False)

In [19]:
df['date'] = df.game.apply(lambda x: x['date'])

In [21]:
df.date.max()

'2022-03-11'

In [26]:
get_other_df('games', 10851598)

on cursor 10851598, meta: {'prev_cursor': 10851598, 'per_page': 100}


(Series([], dtype: object), None)