In [1]:
import pandas as pd

#importing the building datasets
GQS25= pd.read_csv("../target/GQS/2025_GQS.csv")
GQS24= pd.read_csv("../target/GQS/2024_GQS.csv")
GQS23= pd.read_csv("../target/GQS/2023_GQS.csv")
GQS= pd.concat([GQS25,GQS24,GQS23], axis=0,ignore_index=True)

AS_GAMELOGS= pd.read_csv("../predictive/allstars_gamelogs.csv")
RIVALS= pd.read_csv("../predictive/rivalries.csv")
GAMELOGS= pd.read_csv("../predictive/gamelogs.csv")
SCHEDULE= pd.read_csv("../predictive/schedule_info.csv")

In [2]:
#ADD GAME ID TO GQS DF

import pandas as pd
from nba_api.stats.static import teams

# Get official NBA teams
nba_teams = teams.get_teams()

teams_df = pd.DataFrame(nba_teams)[
    ["id", "abbreviation"]
].rename(columns={
    "id": "TEAM_ID",
    "abbreviation": "ABBREVIATION"
})

# Dict for fast mapping
abbr_to_id = dict(zip(teams_df["ABBREVIATION"], teams_df["TEAM_ID"]))

GQS["TEAM1_ID"] = GQS["TEAM1"].map(abbr_to_id)
GQS["TEAM2_ID"] = GQS["TEAM2"].map(abbr_to_id)
GQS = GQS[
    ["GAME_ID", "DATE", "TEAM1", "TEAM1_ID", "TEAM2", "TEAM2_ID", "GQS"]
]

SCHEDULE["TEAM1_ID"]= SCHEDULE["TEAM1"].map(abbr_to_id)
SCHEDULE["TEAM2_ID"]= SCHEDULE["TEAM2"].map(abbr_to_id)



In [3]:
gqs = GQS.copy()
rivals = RIVALS.copy()
schedule = SCHEDULE.copy()
as_logs = AS_GAMELOGS.copy()
gamelogs = GAMELOGS.copy()

In [4]:
#....Rivals.....

import numpy as np
import ast
def parse_rivals_id(x):
    """
    Converts:
     dict: team_id -> set of rival_ids
    """
    if pd.isna(x):
        return set()

    if isinstance(x, (list, set)):
        return {int(i) for i in x}

    try:
        parsed = ast.literal_eval(x)
        return {int(i) for i in parsed}
    except Exception:
        return set()


rivals_clean = rivals.copy()

# Ensure team_id is int
rivals_clean["team_id"] = rivals_clean["team_id"].astype(int)
rivals_clean["rivals_set"] = rivals_clean["rivals_id"].apply(parse_rivals_id)

rivals_dict = {}

for _, row in rivals_clean.iterrows():
    team = row["team_id"]
    rivals_set = row["rivals_set"]

    rivals_dict.setdefault(team, set()).update(rivals_set)
    # assure reverse direction
    for r in rivals_set:
        rivals_dict.setdefault(r, set()).add(team)

rival_pairs = set(
    (team, rival)
    for team, rivals in rivals_dict.items()
    for rival in rivals
)

gqs["RIVALS"] = gqs.apply(
    lambda x: int(
        (x["TEAM1_ID"], x["TEAM2_ID"]) in rival_pairs
    ),
    axis=1
)

In [5]:
#.....ALL STARS.....
as_count = (
    as_logs.groupby("Game_ID")
    .size()
    .rename("TOTAL_ALLSTARS")
    .reset_index()
)

gqs = gqs.merge(
    as_count,
    left_on="GAME_ID",
    right_on="Game_ID",
    how="left"
)

gqs["TOTAL_ALLSTARS"] = gqs["TOTAL_ALLSTARS"].fillna(0)
gqs.drop(columns="Game_ID", inplace=True)

In [6]:
#.....Schedule info.....
schedule_cols = ["GAME_ID", "NATIONAL_TV", "DAY", "TIME"]
gqs = gqs.merge(schedule[schedule_cols], on="GAME_ID", how="left")

#normalize the data
def extract_hour(t):
    if pd.isna(t):
        return np.nan
    t = str(t).strip().lower()

    # "19"
    if t.isdigit():
        return int(t)

    # "19:00"
    if ":" in t and "am" not in t and "pm" not in t:
        return int(t.split(":")[0])

    # "07:00pm"
    if "am" in t or "pm" in t:
        hour = int(t.split(":")[0])
        if "pm" in t and hour != 12:
            hour += 12
        if "am" in t and hour == 12:
            hour = 0
        return hour

    return np.nan

gqs["GAME_HOUR"] = gqs["TIME"].apply(extract_hour)

day_map = {
    "Monday": 0, "Tuesday": 1, "Wednesday": 2,
    "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6, "Mon": 0, "Tue": 1, "Wed": 2,
    "Thu": 3, "Fri": 4, "Sat": 5, "Sun": 6, "MONDAY": 0, "TUESDAY": 1, "WEDNESDAY": 2,
    "THURSDAY": 3, "FRIDAY": 4, "SATURDAY": 5, "SUNDAY": 6
}

gqs["DAY_NUM"] = gqs["DAY"].map(day_map)

In [7]:
#Script to get DAY, TIME, NATIONAL TV for rows with wrong GAME_ID
corrections = gqs.loc[gqs["DAY"].isna()].copy()
gqs = gqs.loc[gqs["DAY"].notna()].copy()

def matchup_key(team1, team2):
    return tuple(sorted([team1, team2]))

#Using matchup keys and using Date to merge because the GAME IDS are wrong
corrections["MATCHUP_KEY"] = corrections.apply(
    lambda x: matchup_key(x["TEAM1_ID"], x["TEAM2_ID"]),
    axis=1
)

schedule = schedule.copy()
schedule["MATCHUP_KEY"] = schedule.apply(
    lambda x: matchup_key(x["TEAM1_ID"], x["TEAM2_ID"]),
    axis=1
)

corrections["DATE"] = pd.to_datetime(corrections["DATE"])
schedule["DATE"] = pd.to_datetime(schedule["DATE"])


schedule_fix = schedule[
    ["MATCHUP_KEY", "DATE", "DAY", "TIME", "NATIONAL_TV"]
]
corrections.drop(columns=["DAY", "TIME", "NATIONAL_TV"], inplace=True)

fixed = corrections.merge(
    schedule_fix,
    on=["MATCHUP_KEY", "DATE"],
    how="left"
)
fixed.drop(columns="MATCHUP_KEY", inplace=True)

fixed["GAME_HOUR"] = fixed["TIME"].apply(extract_hour)
fixed["DAY_NUM"] = fixed["DAY"].map(day_map)

gqs = pd.concat([gqs, fixed], ignore_index=True)

In [8]:
#.....STATS.....

# Sort gamelogs chronologically per team
gamelogs["GAME_DATE"] = pd.to_datetime(gamelogs["GAME_DATE"])
gamelogs = gamelogs.sort_values(["TEAM_ID", "GAME_DATE"])

#Select stat columns
exclude_cols = {
    "SEASON_YEAR", "TEAM_ID", "TEAM_ABBREVIATION",
    "GAME_ID", "GAME_DATE", "MATCHUP", "WL", "MIN"
}

stat_cols = [c for c in gamelogs.columns if c not in exclude_cols]


#Get the last 15-game averages
rolling_means = (
    gamelogs
    .groupby("TEAM_ID")[stat_cols]
    .shift(1)
    .rolling(15, min_periods=5)
    .mean()
    .reset_index()
)

# Attach TEAM_ID and GAME_ID
rolling_means = pd.concat(
    [gamelogs[["TEAM_ID", "GAME_ID"]].reset_index(drop=True),
     rolling_means[stat_cols]],
    axis=1
)


# Merge rolling stats into GQS
team1_stats = rolling_means.add_prefix("T1_")
team2_stats = rolling_means.add_prefix("T2_")

gqs = gqs.merge(
    team1_stats,
    left_on=["TEAM1_ID", "GAME_ID"],
    right_on=["T1_TEAM_ID", "T1_GAME_ID"],
    how="left"
)

gqs = gqs.merge(
    team2_stats,
    left_on=["TEAM2_ID", "GAME_ID"],
    right_on=["T2_TEAM_ID", "T2_GAME_ID"],
    how="left"
)


#Compute SUM and DIFF features
# -----------------------------
t1 = gqs[[f"T1_{s}" for s in stat_cols]]
t2 = gqs[[f"T2_{s}" for s in stat_cols]]

t1.columns = stat_cols
t2.columns = stat_cols

sum_df = t1.add(t2)
sum_df.columns = [f"{c}_SUM" for c in sum_df.columns]

diff_df = t1.sub(t2)
diff_df.columns = [f"{c}_DIFF" for c in diff_df.columns]

gqs = pd.concat([gqs, sum_df, diff_df], axis=1)


#Remove temporary columns

drop_cols = [c for c in gqs.columns if c.startswith("T1_") or c.startswith("T2_")]
gqs.drop(columns=drop_cols + ["TIME", "DAY"], inplace=True)

gqs = gqs.copy()


In [9]:

gqs

Unnamed: 0,GAME_ID,DATE,TEAM1,TEAM1_ID,TEAM2,TEAM2_ID,GQS,RIVALS,TOTAL_ALLSTARS,NATIONAL_TV,...,PCT_AST_FGM_DIFF,PCT_UAST_FGM_DIFF,PTS_OFF_TOV_DIFF,PTS_2ND_CHANCE_DIFF,PTS_FB_DIFF,PTS_PAINT_DIFF,OPP_PTS_OFF_TOV_DIFF,OPP_PTS_2ND_CHANCE_DIFF,OPP_PTS_FB_DIFF,OPP_PTS_PAINT_DIFF
0,22500326,2025-12-03,SAS,1610612759,ORL,1610612753,0.316262,0,1.0,0.0,...,-0.013800,0.013800,-0.200000,0.000000,-4.933333,-4.266667,-1.733333,4.000000,-5.466667,-5.200000
1,22500326,2025-12-03,SAS,1610612759,ORL,1610612753,0.316262,0,1.0,0.0,...,-0.013800,0.013800,-0.200000,0.000000,-4.933333,-4.266667,-1.733333,4.000000,-5.466667,-5.200000
2,22500325,2025-12-03,IND,1610612754,DEN,1610612743,0.138732,0,2.0,0.0,...,-0.065000,0.065000,2.066667,-2.533333,-1.400000,-5.733333,0.733333,1.666667,0.533333,3.200000
3,22500322,2025-12-02,SAS,1610612759,MEM,1610612763,0.327308,0,2.0,0.0,...,-0.108333,0.108333,6.200000,-0.466667,3.800000,3.200000,-3.733333,-0.266667,-4.800000,-6.266667
4,22500320,2025-12-02,NYK,1610612752,BOS,1610612738,0.355065,1,3.0,1.0,...,0.042133,-0.042133,-0.400000,0.666667,1.333333,7.466667,2.533333,-1.266667,0.066667,2.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3015,22300007,2023-11-03 00:00:00,POR,1610612757,MEM,1610612763,0.512842,0,1.0,0.0,...,-0.063067,0.063067,-0.733333,-1.200000,-5.400000,-3.733333,0.800000,0.600000,6.133333,8.133333
3016,22300004,2023-11-03 00:00:00,BKN,1610612751,CHI,1610612741,0.350306,0,1.0,0.0,...,0.087733,-0.087800,-6.000000,3.133333,2.000000,-5.466667,3.933333,1.200000,-0.933333,7.866667
3017,22300002,2023-11-03 00:00:00,MIL,1610612749,NYK,1610612752,0.406711,0,4.0,1.0,...,0.003000,-0.003067,-3.600000,-1.066667,-3.666667,-2.133333,0.133333,4.666667,0.200000,6.133333
3018,22300003,2023-11-03 00:00:00,MIA,1610612748,WAS,1610612764,0.086788,0,2.0,0.0,...,0.029533,-0.029467,1.266667,-2.466667,-3.600000,-12.666667,-1.933333,-2.266667,-1.800000,-6.400000


In [10]:
gqs.to_csv("training_set.csv", index=False)

In [24]:
gqs.na()

AttributeError: 'DataFrame' object has no attribute 'na'