In [64]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import plot_tree

import matplotlib.pyplot as plt

In [65]:
# Read the cleaned teams table and, in the same expression, recode the
# "playoff" target from its "Y"/"N" labels to 1/0. Labels missing from the
# mapping are not translated by `map` (they come out as NaN).
df = pd.read_csv("../data/clean/teams.csv").assign(
    playoff=lambda teams: teams["playoff"].map({"Y": 1, "N": 0})
)

In [66]:
def process_categorical(df, col):
    """Replace the values of ``df[col]`` with integer codes.

    Codes are handed out in order of first appearance in the column
    (first distinct value -> 0, second -> 1, ...).

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: Label of the column to encode.

    Returns:
        The same ``df`` object that was passed in, with ``col`` overwritten
        by its integer codes.
    """
    categories = df[col].unique()
    # Pair each distinct value with its position in `categories`.
    codes = dict(zip(categories, range(len(categories))))
    df[col] = df[col].map(codes)
    return df

# Fill every missing entry with the sentinel -1 (in place), then swap the
# team and conference identifiers for integer codes, one column at a time.
df.fillna(-1, inplace=True)
for categorical_col in ("tmID", "confID"):
    df = process_categorical(df, categorical_col)

In [67]:
# Expanding-window evaluation: for each year, train on every earlier year
# and score on that year alone. The same estimator object is refit on each
# pass through the loop.
model = DecisionTreeClassifier(random_state=42)

# Start at 2 so the first split has earlier rows to train on
# (presumably the data starts at year 1 -- TODO confirm).
min_year = 2
max_year = df['year'].max()

print("Min year:", min_year, "\nMax year:", max_year)

for i in range(min_year, max_year + 1):
    # Chronological split: strictly earlier years train, year `i` tests.
    teams_df_train = df.loc[df['year'] < i]
    teams_df_test = df.loc[df['year'] == i]

    # Features are every column except the target; the target is "playoff".
    X_train, y_train = teams_df_train.drop(columns="playoff"), teams_df_train["playoff"]
    X_test, y_test = teams_df_test.drop(columns="playoff"), teams_df_test["playoff"]

    print(f"\nTrain/Test size for year={i}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # Fit on the training window, then predict the held-out year.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    print(f"\nyear = {i}, Accuracy:", accuracy_score(y_test, y_pred))
    print(f"\nyear = {i}, Classification Report:\n", classification_report(y_test, y_pred))
    print(f"\nyear = {i}, Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Min year: 2 
Max year: 10

Train/Test size for year=2: (16, 5) (16, 5) (16,) (16,)

year = 2, Accuracy: 0.4375

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.50      0.47         8
           1       0.43      0.38      0.40         8

    accuracy                           0.44        16
   macro avg       0.44      0.44      0.44        16
weighted avg       0.44      0.44      0.44        16


year = 2, Confusion Matrix:
 [[4 4]
 [5 3]]

Train/Test size for year=3: (32, 5) (16, 5) (32,) (16,)

year = 3, Accuracy: 0.5

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.25      0.33         8
           1       0.50      0.75      0.60         8

    accuracy                           0.50        16
   macro avg       0.50      0.50      0.47        16
weighted avg       0.50      0.50      0.47        16


year = 3, Confusion Matrix:
 [[2 6]
 