In [1]:
import mlcroissant as mlc
import pandas as pd

# Fetch the Croissant JSON-LD
croissant_dataset = mlc.Dataset("nba-play-by-play-data-1997-2023-metadata.json")

# Check what record sets are in the dataset
record_sets = croissant_dataset.metadata.record_sets
print(record_sets)

# Fetch the records and put them in a DataFrame
record_set_df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[-1].uuid))
record_set_df.head()


  -  [Metadata(NBA Play-by-Play Data (1997-2023))] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


[RecordSet(uuid="pbp1997.csv"), RecordSet(uuid="pbp1998.csv"), RecordSet(uuid="pbp1999.csv"), RecordSet(uuid="pbp2000.csv"), RecordSet(uuid="pbp2001.csv"), RecordSet(uuid="pbp2002.csv"), RecordSet(uuid="pbp2003.csv"), RecordSet(uuid="pbp2004.csv"), RecordSet(uuid="pbp2005.csv"), RecordSet(uuid="pbp2006.csv"), RecordSet(uuid="pbp2007.csv"), RecordSet(uuid="pbp2008.csv"), RecordSet(uuid="pbp2009.csv"), RecordSet(uuid="pbp2010.csv"), RecordSet(uuid="pbp2011.csv"), RecordSet(uuid="pbp2012.csv"), RecordSet(uuid="pbp2013.csv"), RecordSet(uuid="pbp2014.csv"), RecordSet(uuid="pbp2015.csv"), RecordSet(uuid="pbp2016.csv"), RecordSet(uuid="pbp2017.csv"), RecordSet(uuid="pbp2018.csv"), RecordSet(uuid="pbp2019.csv"), RecordSet(uuid="pbp2020.csv"), RecordSet(uuid="pbp2021.csv"), RecordSet(uuid="pbp2022.csv"), RecordSet(uuid="pbp2023.csv")]


Unnamed: 0,pbp2023.csv/gameid,pbp2023.csv/period,pbp2023.csv/clock,pbp2023.csv/h_pts,pbp2023.csv/a_pts,pbp2023.csv/team,pbp2023.csv/playerid,pbp2023.csv/player,pbp2023.csv/type,pbp2023.csv/subtype,pbp2023.csv/result,pbp2023.csv/x,pbp2023.csv/y,pbp2023.csv/dist,pbp2023.csv/desc,pbp2023.csv/season
0,b'22200001',1,b'PT12M00.00S',0.0,0.0,,b'0',,b'period',b'start',,0,0,0,b'Start of 1st Period (7:36 PM EST)',2023
1,b'22200001',1,b'PT12M00.00S',,,b'BOS',b'201143',b'A. Horford',b'Jump Ball',,,0,0,0,b'Jump Ball Horford vs. Embiid: Tip to Harris',2023
2,b'22200001',1,b'PT11M38.00S',,,b'PHI',b'203954',b'J. Embiid',b'Missed Shot',b'Turnaround Fadeaway shot',b'Missed',-118,50,13,"b""MISS Embiid 13' Turnaround Fadeaway Shot""",2023
3,b'22200001',1,b'PT11M38.00S',,,b'BOS',b'1627759',b'J. Brown',,,,0,0,0,b'Brown BLOCK (1 BLK)',2023
4,b'22200001',1,b'PT11M35.00S',,,b'PHI',b'200782',b'P. Tucker',b'Rebound',b'Unknown',,0,0,0,b'Tucker REBOUND (Off:1 Def:0)',2023


In [82]:
# Helper Functions 
def convert_nba_time(time_str):
# Convert NBA Time to seconds
    if not isinstance(time_str, str ) or not time_str:
            return None, None
    
    # Remove 'PT' prefix
    time_str = time_str.replace('PT', '')
    
    # Extract minutes and seconds
    minutes = int(time_str.split('M')[0])
    seconds = float(time_str.split('M')[1].replace('S', ''))
    
    # Return in readable format and total seconds
    readable = f"{minutes}:{seconds:02.0f}"
    total_seconds = minutes * 60 + seconds
    
    return readable, total_seconds

def get_scores(row_id, dataframe):
    # For a given play, get the previous score and the following score
    # iteratively go back plays until there is a score change
    next_play = dataframe.iloc[row_id + 1]
    while True:
        row_id = row_id - 1
        prev_play = dataframe.iloc[row_id]
        if prev_play['pbp2023.csv/h_pts'] != None:
            break
    return f"{prev_play['pbp2023.csv/h_pts']}-{prev_play['pbp2023.csv/a_pts']}", f"{next_play['pbp2023.csv/h_pts']}-{next_play['pbp2023.csv/a_pts']}"
            

In [31]:

record_set = record_set_df.copy()
# Convert columns to strings
record_set['pbp2023.csv/type'] = record_set_df['pbp2023.csv/type'].str.decode('utf-8') 
record_set['pbp2023.csv/type'] = record_set['pbp2023.csv/type'].fillna("")
record_set['pbp2023.csv/clock'] = record_set_df['pbp2023.csv/clock'].str.decode('utf-8')
record_set['pbp2023.csv/gameid'] = record_set_df['pbp2023.csv/gameid'].str.decode('utf-8')


# Clock in seconds
record_set["clock_seconds"] = record_set["pbp2023.csv/clock"].apply(lambda x: convert_nba_time(x)[1])



In [33]:
# Get Timeout with conditions (4th Qtr or OT and > 60 seconds left)
timeouts = record_set[record_set["pbp2023.csv/type"].str.contains("Timeout", na=False)]
print("Number of timeouts:", len(timeouts))
print("\nSample timeout types:")
print(timeouts["pbp2023.csv/type"].unique())

periods = record_set[record_set["pbp2023.csv/period"].isin([4, 5, 6, 7])]
print("\nUnique periods found:", periods["pbp2023.csv/period"].unique())

last_minute = record_set[record_set['clock_seconds'] < 60]
print("\nRange of clock seconds in last minute:")
print("Min:", last_minute['clock_seconds'].min())
print("Max:", last_minute['clock_seconds'].max())

# Now try combining them one at a time
timeouts_in_late_periods = record_set[
    (record_set["pbp2023.csv/type"].str.contains("Timeout", na=False)) &
    (record_set["pbp2023.csv/period"].isin([4, 5, 6, 7]))
]
print("\nTimeouts in late periods:", len(timeouts_in_late_periods))

# Final combined filter
timeouts_set = record_set[
    (record_set["pbp2023.csv/type"].str.contains("Timeout", na=False)) &
    (record_set["pbp2023.csv/period"].isin([4, 5, 6, 7])) &
    (record_set['clock_seconds'] < 60)
]
print("\nFinal result count:", len(timeouts_set))

Number of timeouts: 14461

Sample timeout types:
['Timeout']

Unique periods found: [4 5 6]

Range of clock seconds in last minute:
Min: 0.0
Max: 59.9

Timeouts in late periods: 5678

Final result count: 1420


In [None]:
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.library.parameters import Season
from nba_api.stats.library.parameters import SeasonType

# Get Game Data using NBA API
gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=None,
                            season_nullable=None,
                            season_type_nullable=SeasonType.regular)  


games = gamefinder.get_data_frames()[0]

# Get 2022-2023 games only
games_2223 = games[games.SEASON_ID.str[-4:] == '2022']

# Drop leading zeros on game id to match the other dataset
games_2223.GAME_ID = games_2223.GAME_ID.astype(str).str[2:] 

# Only get games that are in the timeout dataset
timeout_games_2223 = games_2223[games_2223.GAME_ID.isin(timeouts_set['pbp2023.csv/gameid'])]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_2223.GAME_ID = games_2223.GAME_ID.astype(str).str[2:]


In [83]:
get_scores(602202, dataframe=record_set)

('128.0-117.0', 'nan-nan')

In [68]:
len(timeout_games_2223)
timeout_games_2223.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
5684,22022,1610612747,LAL,Los Angeles Lakers,22201228,2023-04-09,LAL vs. UTA,W,240,128,...,0.762,10,39,49,28,4,7,10,9,11.0
5686,22022,1610612762,UTA,Utah Jazz,22201228,2023-04-09,UTA @ LAL,L,240,117,...,0.833,7,38,45,31,4,4,11,16,-11.0
5688,22022,1610612752,NYK,New York Knicks,22201220,2023-04-09,NYK vs. IND,L,241,136,...,0.773,19,34,53,29,8,8,15,24,-5.0
5689,22022,1610612754,IND,Indiana Pacers,22201220,2023-04-09,IND @ NYK,W,241,141,...,0.789,8,36,44,33,8,9,15,24,5.0
5698,22022,1610612756,PHX,Phoenix Suns,22201229,2023-04-09,PHX vs. LAC,L,240,114,...,0.727,12,35,47,29,4,3,7,21,-5.0


In [70]:
len(timeouts_set['pbp2023.csv/gameid'])
timeouts_set[timeouts_set['pbp2023.csv/gameid'] == '22201228']


Unnamed: 0,pbp2023.csv/gameid,pbp2023.csv/period,pbp2023.csv/clock,pbp2023.csv/h_pts,pbp2023.csv/a_pts,pbp2023.csv/team,pbp2023.csv/playerid,pbp2023.csv/player,pbp2023.csv/type,pbp2023.csv/subtype,pbp2023.csv/result,pbp2023.csv/x,pbp2023.csv/y,pbp2023.csv/dist,pbp2023.csv/desc,pbp2023.csv/season,clock_seconds
602202,22201228,4,PT00M39.70S,,,,b'1610612762',,Timeout,b'Regular',,0,0,0,b'Jazz Timeout: Regular (Reg.4 Short 0)',2023,39.7


In [47]:
# games_2223.GAME_ID = games_2223.GAME_ID.astype(str)[-1:]
timeouts_set[timeouts_set['pbp2023.csv/gameid'] == 22301199]

Unnamed: 0,pbp2023.csv/gameid,pbp2023.csv/period,pbp2023.csv/clock,pbp2023.csv/h_pts,pbp2023.csv/a_pts,pbp2023.csv/team,pbp2023.csv/playerid,pbp2023.csv/player,pbp2023.csv/type,pbp2023.csv/subtype,pbp2023.csv/result,pbp2023.csv/x,pbp2023.csv/y,pbp2023.csv/dist,pbp2023.csv/desc,pbp2023.csv/season,clock_seconds
