In [2]:
from nba_api.stats.endpoints import playergamelog
from nba_api.stats.static import players
import pandas as pd
import time
from tqdm import tqdm

In [3]:
player_names = ['Jayson Tatum', 'Jaylen Brown', 'Cam Thomas', 'Nic Claxton',
                'Jalen Brunson', 'Mikal Bridges', 'Joel Embiid', 'Tyrese Maxey',
                'Scottie Barnes', 'Brandon Ingram', 'Josh Giddey', 'Alex Caruso',
                'Donovan Mitchell', 'Cade Cunningham', 'Tobias Harris', 'Tyrese Haliburton',
                'Pascal Siakam', 'Giannis Antetokounmpo', 'Khris Middleton', 'Trae Young',
                'Clint Capela', 'Lamelo Ball', 'Brandon Miller', 'Tyler Herro', 
                'Bam Adebayo', 'Paulo Banchero', 'Franz Wagner', 'CJ McCollum',
                'Marvin Bagley III', 'Nikola Jokic', 'Russell Westbrook', 'Anthony Edwards',
                'Karl Anthony-Towns', 'Shai Gilgeous-Alexander', 'Chet Holmgren', 'Stephen Curry',
                'Jordan Poole', 'Lebron James', 'Luka Doncic', 'Anthony Davis',
                'Kyrie Irving', 'Devin Booker', 'Kevin Durant', 'Bradley Beal',
                'James Harden', 'Steven Adams', 'Fred Vanvleet', 'Collin Sexton',
                'Duncan Robinson', 'Jimmy Butler']

In [4]:
# Get player IDs
all_players = players.get_players()
player_ids = []
for name in player_names:
    player = [p for p in all_players if p['full_name'] == name]
    if player:
        player_ids.append(player[0]['id'])

In [5]:
# Collect game logs for 2023-24 season
all_gamelogs = []
for player_id in tqdm(player_ids):
    try:
        gamelog = playergamelog.PlayerGameLog(
            player_id=str(player_id),
            season='2023-24',
            season_type_all_star='Regular Season'
        )
        df = gamelog.get_data_frames()[0]
        df['PLAYER_ID'] = player_id
        all_gamelogs.append(df)
        time.sleep(1.0)  # Rate limit: 1 req/sec
    except Exception as e:
        print(f"Error for player {player_id}: {e}")
        continue

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:52<00:00,  1.18s/it]


In [7]:
# Combine and save
gamelogs_df = pd.concat(all_gamelogs, ignore_index=True)
gamelogs_df.to_parquet('../data/raw/player_gamelogs_2023-24_sample.parquet')
print(f"Collected {len(gamelogs_df)} game records for {len(player_ids)} players")

Collected 3003 game records for 45 players


  gamelogs_df = pd.concat(all_gamelogs, ignore_index=True)


In [12]:
# === DATA INSPECTION ===
print("="*60)
print("DATA COLLECTION SUMMARY")
print("="*60)

# Reload to verify it saved correctly
df_check = pd.read_parquet('../data/raw/player_gamelogs_2023-24_sample.parquet')

print(f"\n File saved successfully!")
print(f"  - Total games: {len(df_check):,}")
print(f"  - Unique players: {df_check['PLAYER_ID'].nunique()}")
print(f"  - Date range: {df_check['GAME_DATE'].min()} to {df_check['GAME_DATE'].max()}")
print(f"  - Columns: {len(df_check.columns)}")

print(f"\n Target Variable Ranges:")
print(f"  - Points (PTS): {df_check['PTS'].min():.0f} - {df_check['PTS'].max():.0f} (avg: {df_check['PTS'].mean():.1f})")
print(f"  - Rebounds (REB): {df_check['REB'].min():.0f} - {df_check['REB'].max():.0f} (avg: {df_check['REB'].mean():.1f})")
print(f"  - Assists (AST): {df_check['AST'].min():.0f} - {df_check['AST'].max():.0f} (avg: {df_check['AST'].mean():.1f})")

print(f"\n Sample data:")
display(df_check[['GAME_DATE','PLAYER_ID', 'MATCHUP', 'PTS', 'REB','AST', 'MIN']].head(10))

print(f"\n Ready for Phase 2: Exploration!")

DATA COLLECTION SUMMARY

 File saved successfully!
  - Total games: 3,003
  - Unique players: 44
  - Date range: Apr 01, 2024 to Oct 31, 2023
  - Columns: 28

 Target Variable Ranges:
  - Points (PTS): 0 - 73 (avg: 21.2)
  - Rebounds (REB): 0 - 25 (avg: 6.1)
  - Assists (AST): 0 - 23 (avg: 5.1)

 Sample data:


Unnamed: 0,GAME_DATE,PLAYER_ID,MATCHUP,PTS,REB,AST,MIN
0,"Apr 11, 2024",1628369,BOS vs. NYK,18,4,7,32
1,"Apr 09, 2024",1628369,BOS @ MIL,22,5,6,37
2,"Apr 05, 2024",1628369,BOS vs. SAC,17,6,5,33
3,"Apr 03, 2024",1628369,BOS vs. OKC,24,7,3,29
4,"Apr 01, 2024",1628369,BOS @ CHA,25,10,4,34
5,"Mar 30, 2024",1628369,BOS @ NOP,23,9,4,36
6,"Mar 28, 2024",1628369,BOS @ ATL,31,13,6,45
7,"Mar 25, 2024",1628369,BOS @ ATL,37,8,5,39
8,"Mar 23, 2024",1628369,BOS @ CHI,26,2,6,37
9,"Mar 20, 2024",1628369,BOS vs. MIL,31,8,4,39



 Ready for Phase 2: Exploration!
