In [6]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [7]:
df = pd.read_csv("../data/clean/teams.csv")

# Convert "playoff" column to binary (Y: 1, N: 0)
df["playoff"] = df["playoff"].map({"Y": 1, "N": 0})

### Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df):
    le = LabelEncoder()
    for col, col_type in df.dtypes.items():
        if col_type == 'object' or col_type == 'datetime64[ns]':
            df[col] = le.fit_transform(df[col])
    return df

df = encode_df(df)

In [9]:
model = DecisionTreeClassifier(random_state=42)

min_year = 2
max_year = df['year'].max()

print("Min year:", min_year, "\nMax year:", max_year)

for i in range(min_year, max_year + 1):
    teams_df_train = df[df['year'] < i]
    teams_df_test = df[df['year'] == i]

    X_train = teams_df_train.drop("playoff", axis=1) # Features
    y_train = teams_df_train["playoff"] # Target variable

    X_test = teams_df_test.drop("playoff", axis=1) # Features
    y_test = teams_df_test["playoff"] # Target variable

    print(f"\nTrain/Test size for year={i}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)


    # Step 1: Calculate class distribution in the original dataset
    original_class_distribution = df['playoff'].value_counts(normalize=True)
    print("Original Class Distribution:")
    print(original_class_distribution)

    # Step 2: Calculate class distribution in the training and test sets
    train_class_distribution = y_train.value_counts(normalize=True)
    test_class_distribution = y_test.value_counts(normalize=True)

    print("\nTraining Set Class Distribution:")
    print(train_class_distribution)
    print("\nTest Set Class Distribution:")
    print(test_class_distribution)

    # Step 3: Compare class distributions
    class_distribution_similarity = (train_class_distribution - test_class_distribution).abs().sum()
    print("\nClass Distribution Similarity Score:", class_distribution_similarity)

    model.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred = model.predict(X_test)

    print(f"\nyear = {i}, Accuracy:", accuracy_score(y_test, y_pred))
    print(f"\nyear = {i}, Classification Report:\n", classification_report(y_test, y_pred))
    print(f"\nyear = {i}, Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Min year: 2 
Max year: 10

Train/Test size for year=2: (777, 110) (882, 110) (777,) (882,)

year = 2, Accuracy: 1.0

year = 2, Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00       882

    accuracy                           1.00       882
   macro avg       1.00      1.00      1.00       882
weighted avg       1.00      1.00      1.00       882


year = 2, Confusion Matrix:
 [[882]]

Train/Test size for year=3: (1659, 110) (742, 110) (1659,) (742,)

year = 3, Accuracy: 1.0

year = 3, Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00       742

    accuracy                           1.00       742
   macro avg       1.00      1.00      1.00       742
weighted avg       1.00      1.00      1.00       742


year = 3, Confusion Matrix:
 [[742]]

Train/Test size for year=4: (2401, 110) (728, 110) (2401,) (728,)

year = 4, Accuracy: 1.0

year = 

