In [1]:
from nba_api.stats.endpoints import leaguedashteamstats, playoffpicture
import pandas as pd

# Function to get team stats for a given season
def get_team_stats(season):
    stats = leaguedashteamstats.LeagueDashTeamStats(season=season)
    stats_df = stats.get_data_frames()[0]
    return stats_df

# Collect data for multiple seasons
seasons = ['2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22']
data = pd.concat([get_team_stats(season) for season in seasons], keys=seasons)

# Reset index and clean data
data.reset_index(level=0, inplace=True)
data.rename(columns={'level_0': 'SEASON'}, inplace=True)


In [2]:
def get_playoff_picture(season):
    teams = playoffpicture.PlayoffPicture(season_id=season)
    east_teams_df = teams.get_data_frames()[2]
    # east_teams_df = east_teams_df.loc[east_teams_df['RANK'] <= 8]
    east_teams_df = east_teams_df[['TEAM_ID', 'CLINCHED_PLAYOFFS']]
    west_teams_df = teams.get_data_frames()[3]
    # west_teams_df = west_teams_df.loc[west_teams_df['RANK'] <= 8]
    west_teams_df = west_teams_df[['TEAM_ID', 'CLINCHED_PLAYOFFS']]
    pl_teams_df = pd.concat([east_teams_df, west_teams_df])
    # Extract season id to match with 'data' dataframe
    pl_teams_df['SEASON'] = str(int(season[1:]) - 1 )+ "-" + str(season[-2:])
    return pl_teams_df
seasons_2 = ["22011", "22012", "22013", "22014", "22015", "22016", "22017", "22018", "22019", "22020", "22021", "22022"]
pl_df = pd.concat([get_playoff_picture(season) for season in seasons_2])

# Merge with data
data = data.merge(pl_df, on=["TEAM_ID", "SEASON"])


In [3]:
# Preprocessing steps
data = data.fillna(0)  # Fill missing values
# data = data.drop(columns=['TEAM_ID', 'CFID', 'CFPARAMS'])  # Drop irrelevant columns

# Encode categorical variables
data = pd.get_dummies(data, columns=['SEASON'])

# # Add playoff appearance column (binary 0/1)
# # This is an example, you need to manually label each row based on historical data
# data['PLAYOFF'] = data['TEAM_NAME'].apply(lambda x: 1 if x in ['Golden State Warriors', 'Cleveland Cavaliers'] else 0)


In [4]:
# Feature engineering (example: win-loss ratio)
data['WIN_LOSS_RATIO'] = data['W'] / (data['W'] + data['L'])


In [None]:
data.head()

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split data into features and labels
X = data.drop(columns=['TEAM_NAME', 'CLINCHED_PLAYOFFS'])
y = data['CLINCHED_PLAYOFFS']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6388888888888888
              precision    recall  f1-score   support

           0       0.66      0.58      0.62        36
           1       0.62      0.69      0.66        36

    accuracy                           0.64        72
   macro avg       0.64      0.64      0.64        72
weighted avg       0.64      0.64      0.64        72



In [7]:
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# X, y = make_classification(
#     n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1
# )
# rng = np.random.RandomState(2)
# X += 2 * rng.uniform(size=X.shape)
# linearly_separable = (X, y)


# iterate over classifiers
for name, clf in zip(names, classifiers):
    clf = make_pipeline(StandardScaler(), clf)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    print("Classifier: ", names)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print(classification_report(y_test, y_pred))

    

Classifier:  ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost', 'Naive Bayes', 'QDA']
Accuracy: 0.6666666666666666
Classifier:  ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost', 'Naive Bayes', 'QDA']
Accuracy: 0.6666666666666666
Classifier:  ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost', 'Naive Bayes', 'QDA']
Accuracy: 0.5
Classifier:  ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost', 'Naive Bayes', 'QDA']
Accuracy: 0.5694444444444444
Classifier:  ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 'Gaussian Process', 'Decision Tree', 'Random Forest', 'Neural Net', 'AdaBoost', 'Naive Bayes', 'QDA']
Accuracy: 0.5972222222222222
Classifier:  ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 



In [10]:
import seaborn as sns
import matplotlib.pyplot as plt

name_dict = data[['TEAM_ID', 'TEAM_NAME']]
data.drop(columns='TEAM_NAME')
# Calculate the correlation matrix
corr_matrix = data.corr()

# Plot the heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


ValueError: could not convert string to float: 'Atlanta Hawks'

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF for each feature
X = data.drop(columns=['TEAM_NAME', 'PLAYOFF'])
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

print(vif_data)
