In [None]:
import pandas as pd

In [None]:
# Load the games table from CSV, using the file's first column as the row index.
data_frame = pd.read_csv("nba_games.csv", index_col = 0)

In [None]:
# Order the rows by their "date" value (ascending is the sort_values default).
data_frame = data_frame.sort_values(by="date")

In [None]:
# Replace the current index with a fresh 0..n-1 index; the old index is discarded.
data_frame = data_frame.reset_index(drop=True)

In [None]:
# Remove three columns in one call. A single `drop` either removes all of them
# or raises KeyError without touching the frame, whereas sequential `del`
# statements could leave it half-modified if a later column were missing.
data_frame = data_frame.drop(columns=["mp.1", "mp_opp.1", "index_opp"])

In [None]:
def add_target(team):
    """Return a copy of ``team`` with a ``target`` column holding the next row's ``won``.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows to label; must contain a ``won`` column. Row order defines what
        "next" means.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus ``target``. The last row's
        ``target`` is missing because no row follows it.
    """
    # Build a new frame instead of writing into `team`: pandas documents that
    # functions passed to groupby.apply should not mutate the group they receive.
    return team.assign(target=team["won"].shift(-1))

# Label each game with the result of the same team's next game. Selecting the
# "won" column before shifting keeps every original column (including "team")
# in `data_frame`; handing whole groups to `groupby(...).apply` is deprecated
# for grouping columns, and newer pandas would drop "team" from the result.
data_frame["target"] = data_frame.groupby("team")["won"].shift(-1)

In [None]:
# Rows with no following game have a missing target; mark them with the
# placeholder class 2. A single `.loc` assignment is used because chained
# indexing (`frame[col][mask] = ...`) may write to a temporary copy and does
# nothing under pandas copy-on-write.
data_frame.loc[data_frame["target"].isnull(), "target"] = 2

In [None]:
# Convert the labels to integers. The cast is allowed to raise: with
# `errors="ignore"` a failed conversion would silently leave the column
# unchanged and only surface later as a confusing modelling error.
data_frame["target"] = data_frame["target"].astype(int)

In [None]:
# Show how many rows there are for each distinct value of "won".
data_frame["won"].value_counts()

In [None]:
# Show how many rows there are for each distinct value of "target"
# (including the placeholder 2 assigned to rows without a next game).
data_frame["target"].value_counts()

In [None]:
# Per-column count of missing values.
nulls = data_frame.isnull().sum()

In [None]:
# Keep only the columns that have no missing values at all.
columns_with_nulls = nulls[nulls > 0].index
has_nulls = data_frame.columns.isin(columns_with_nulls)
valid_columns = data_frame.columns[~has_nulls]

In [None]:
# Restrict the frame to the fully populated columns, as an independent copy.
data_frame = data_frame.loc[:, valid_columns].copy()

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier


# Estimators and a time-ordered splitter made available to the cells below.
ridge = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
gradient_booster = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

In [None]:
# Columns excluded from the predictor set.
removed_columns = [
    "season",
    "date",
    "won",
    "target",
    "team",
    "team_opp",
]

In [None]:
# Every column that is not explicitly excluded is a candidate predictor.
is_removed = data_frame.columns.isin(removed_columns)
selected_columns = data_frame.columns[~is_removed]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictor columns, overwriting them in `data_frame`.
# NOTE(review): the scaler is fitted on every row, including seasons that
# `backtest` later holds out for testing — confirm this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_frame[selected_columns])
data_frame[selected_columns] = scaled_values

In [None]:
# Use the selected (now scaled) columns as the model's input features.
predictors = selected_columns

In [None]:
def backtest(data, model, predictors, start=2, step=1):
    """Evaluate ``model`` season by season, training only on earlier seasons.

    For each season at position ``start``, ``start + step``, ... in the sorted
    list of distinct seasons, the model is refitted on all rows from strictly
    earlier seasons and then used to predict that season's rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``season`` and ``target`` columns plus every column in
        ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; the same object
        is refitted for every fold.
    predictors : sequence of str
        Feature column names.
    start : int, default 2
        Position (in the sorted seasons) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns ``actual`` and ``prediction``, indexed like the predicted rows.

    Raises
    ------
    ValueError
        If the settings leave no season to predict.
    """
    seasons = sorted(data["season"].unique())
    folds = []
    for position in range(start, len(seasons), step):
        season = seasons[position]
        training = data[data["season"] < season]
        test = data[data["season"] == season]
        model.fit(training[predictors], training["target"])
        # Re-attach the test index so predictions line up with the true labels.
        predicted = pd.Series(model.predict(test[predictors]), index=test.index)
        fold = pd.concat([test["target"], predicted], axis=1)
        fold.columns = ["actual", "prediction"]
        folds.append(fold)
    if not folds:
        # pd.concat([]) would raise an uninformative "No objects to concatenate";
        # keep the exception type but say what is actually wrong.
        raise ValueError(
            f"no season to evaluate (start={start}, step={step}, "
            f"distinct seasons={len(seasons)})"
        )
    return pd.concat(folds)

In [None]:
# Run the season-by-season backtest with the ridge classifier on the current predictors.
predictions = backtest(data_frame, ridge, predictors)

In [None]:
from sklearn.metrics import accuracy_score


# Leave out rows whose label is the placeholder 2, then score the rest.
is_labeled = predictions["actual"] != 2
predictions = predictions.loc[is_labeled]
accuracy_score(predictions["actual"], predictions["prediction"])

In [None]:
# Share of rows with won == 1 for each distinct "home" value. Averaging the
# boolean mask per group gives the same ratios as the per-group lambda, without
# handing whole groups (grouping column included) to `groupby.apply`, which
# recent pandas versions deprecate.
data_frame["won"].eq(1).groupby(data_frame["home"]).mean()

In [None]:
# Take an independent copy of the predictor columns plus "won", "team" and
# "season". Without `.copy()` the result is a slice of `data_frame`, and the
# later assignment to its "won" column triggers SettingWithCopyWarning.
data_frame_rolling = data_frame[list(selected_columns) + ["won", "team", "season"]].copy()

In [None]:
import numpy as np

In [None]:
# Cast "won" to int (presumably so the numeric-only rolling average below
# includes it). `assign` builds a new frame, which avoids writing into what may
# be a slice of `data_frame` (SettingWithCopyWarning).
data_frame_rolling = data_frame_rolling.assign(won=data_frame_rolling["won"].astype(int))
def find_team_averages(team, window=10):
    """Return trailing rolling means of the numeric columns of ``team``.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows in the order the window should move over them. Non-numeric
        columns are ignored.
    window : int, default 10
        Number of consecutive rows (current row included) averaged per output
        row. The default reproduces the previously hard-coded window.

    Returns
    -------
    pandas.DataFrame
        Same index as ``team`` with numeric columns only; the first
        ``window - 1`` rows are missing because their window is incomplete.
    """
    numeric_team = team.select_dtypes(include=[np.number])
    return numeric_team.rolling(window).mean()


# Compute the rolling averages separately for each (team, season) pair so a
# window never spans two teams or two seasons.
team_season_groups = data_frame_rolling.groupby(["team", "season"], group_keys=False)
data_frame_rolling = team_season_groups.apply(find_team_averages)

In [None]:
# Tag every rolling-average column with a "_10" suffix, remember the new names,
# and append those columns side by side to the main frame (aligned on the index).
data_frame_rolling = data_frame_rolling.add_suffix("_10")
rolling_columns = list(data_frame_rolling.columns)

data_frame = pd.concat([data_frame, data_frame_rolling], axis=1)

In [None]:
# Discard every row that has a missing value in any column (for example rows
# whose rolling window was not yet full).
data_frame = data_frame.dropna()

In [None]:
def shift_column(team, column_name):
    """Return ``team[column_name]`` moved up one row.

    Each position receives the value of the row after it; the final row has
    no successor and becomes missing. The index is left unchanged.
    """
    return team[column_name].shift(-1)

def add_column(data_frame, column_name):
    """Return, for every row, the value ``column_name`` takes in the same team's next row.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a ``team`` column and ``column_name``. Row order within
        each team defines what "next" means.
    column_name : str
        Column whose following value is wanted.

    Returns
    -------
    pandas.Series
        Indexed like ``data_frame``; the last row of each team is missing.
    """
    # Shift the selected column inside each team directly. Going through
    # `groupby(...).apply` with a Series-returning function yields a wide
    # DataFrame whenever all groups share one index (e.g. a single team), and
    # it also hands the grouping column to the callback, which is deprecated.
    return data_frame.groupby("team")[column_name].shift(-1)

# Attach, for each row, the same team's next-row value of "home", "team_opp"
# and "date" as "<name>_next" columns.
for source_column in ("home", "team_opp", "date"):
    data_frame[f"{source_column}_next"] = add_column(data_frame, source_column)

In [None]:
# Rebind `data_frame` to a fresh copy of itself after the column additions above.
data_frame = data_frame.copy()

In [None]:
# Pair each row with the opposing team's row for the same upcoming game:
# the left side's team must equal the right side's next opponent, and both
# must share the same next date. Only the rolling columns and the join keys
# are taken from the right side.
opponent_columns = rolling_columns + ["team_opp_next", "date_next", "team"]
opponent_rows = data_frame[opponent_columns]
full = data_frame.merge(
    opponent_rows,
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"],
)

In [None]:
# Inspect the team / next-opponent columns from both sides of the merge
# (suffix _x = left frame, _y = right frame) together with the shared next date.
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

In [None]:
# Exclude every non-numeric column of `full` from the predictors, in addition
# to the columns already excluded. Selecting "not number/bool" instead of
# testing `dtype == "object"` also catches text columns stored with the
# dedicated string dtype that newer pandas versions infer by default.
non_numeric_columns = list(full.select_dtypes(exclude=["number", "bool"]).columns)
removed_columns = non_numeric_columns + removed_columns

In [None]:
# Recompute the candidate predictors for the merged frame.
is_removed = full.columns.isin(removed_columns)
selected_columns = full.columns[~is_removed]

In [None]:
# Use the merged frame's selected columns as the model's input features.
predictors = selected_columns

In [None]:
# Re-run the season-by-season backtest on the merged frame with the new predictors.
predictions = backtest(full, ridge, predictors)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Summarize the backtest: overall accuracy, per-class metrics, confusion matrix.
actual = predictions["actual"]
predicted = predictions["prediction"]

accuracy = accuracy_score(actual, predicted)
print(f"Accuracy: {accuracy*100}%")

print("\n")

print("Classification Report:")
print(classification_report(actual, predicted))

print("\n")

print("Confusion Matrix:")
print(confusion_matrix(actual, predicted))

