In [179]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [180]:
#df = pd.read_csv("../data/clean/merged.csv")
df = pd.read_csv("../data/clean/teams.csv") # TESTING

# Convert "playoff" column to binary (Y: 1, N: 0)
df["playoff"] = df["playoff"].map({"Y": 1, "N": 0})

### Label Encoding

In [181]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df):
    le = LabelEncoder()
    for col, col_type in df.dtypes.items():
        if col_type == 'object' or col_type == 'datetime64[ns]':
            df[col] = le.fit_transform(df[col])
    return df

df = encode_df(df)
df.head()


Unnamed: 0,year,tmID,confID,rank,playoff,firstRound,semis,finals,o_fga,o_fta,...,d_pts,won,lost,GP,homeW,homeL,awayW,awayL,confW,confL
0,9,0,0,7,0,2,2,2,2258,725,...,2879,4,30,34,1,16,3,14,2,18
1,10,0,0,2,1,0,2,2,2428,755,...,2797,18,16,34,12,5,6,11,10,12
2,1,1,0,8,0,2,2,2,1903,577,...,2429,8,24,32,5,11,3,13,5,16
3,2,1,0,4,1,1,1,0,1780,528,...,2009,18,14,32,11,5,7,9,15,6
4,3,1,0,2,1,0,2,2,1790,663,...,2133,18,14,32,11,5,7,9,12,9


In [182]:
def calculate_class_distribution_similarity(df, y_train, y_test):
    # Step 1: Calculate class distribution in the original dataset
    original_class_distribution = df['playoff'].value_counts(normalize=True)
    print("Original Class Distribution:")
    print(original_class_distribution)

    # Step 2: Calculate class distribution in the training and test sets
    train_class_distribution = y_train.value_counts(normalize=True)
    test_class_distribution = y_test.value_counts(normalize=True)

    print("\nTraining Set Class Distribution:")
    print(train_class_distribution)
    print("\nTest Set Class Distribution:")
    print(test_class_distribution)

    # Step 3: Compare class distributions
    class_distribution_similarity = (train_class_distribution - test_class_distribution).abs().sum()
    print("\nClass Distribution Similarity Score:", class_distribution_similarity)

    return class_distribution_similarity

# Update the train_model function to include imputation
def train_model(df, year):
    # Remove rows with missing values
    df.dropna(inplace=True)

    teams_df_train = df[df['year'] < year]
    teams_df_test = df[df['year'] == year]

    X_train = teams_df_train.drop("playoff", axis=1)  # Features
    y_train = teams_df_train["playoff"]  # Target variable

    X_test = teams_df_test.drop("playoff", axis=1)  # Features
    y_test = teams_df_test["playoff"]  # Target variable

    print(f"\nTrain/Test size for year={year}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    return X_train, y_train, X_test, y_test

# create a final dataset with the predictions: team, playoff, prediction (a probability between 0 and 1)
def create_predictions(df, year, model, distribution_similarity=True):
    # Create a copy of the original dataset
    new_df = df.copy()

    X_train, y_train, X_test, y_test = train_model(new_df, year)

    # Check if the class distribution is similar between the training and test sets
    if distribution_similarity:
        calculate_class_distribution_similarity(new_df, y_train, y_test)    

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test) # Predictions
    y_pred_proba = model.predict_proba(X_test) # Prediction probabilities

    # Calculate the AUC
    auc = roc_auc_score(y_test, y_pred_proba[:,1])
    print(f"\nAUC for year={year}:", auc)
 
    # Only add predictions to the test set
    new_df.loc[new_df['year'] == year, 'prediction'] = y_pred
    new_df.loc[new_df['year'] == year, 'prediction_proba'] = y_pred_proba[:,1]

    # Filter the dataset to only include the year we are interested in
    new_df = new_df[new_df['year'] == year]

    # Convert the predictions to integers
    new_df['prediction'] = new_df['prediction'].astype(int)

    return new_df



model = LogisticRegression(max_iter=1000)

min_year = df['year'].min() + 1
max_year = df['year'].max()

print("Min year:", min_year, "\nMax year:", max_year)


def create_final_predictions(df, year):
    # Select the best 4 teams for each conference (confID)
    final_predictions = df[df['year'] == year].sort_values(by='prediction_proba', ascending=False).groupby('confID').head(4)
    return final_predictions

# Usage of the function within the loop
for i in range(min_year, max_year + 1):
    new_df = create_predictions(df, i, model, distribution_similarity=False)
    new_df = create_final_predictions(new_df, i)
    new_df.drop(new_df.columns.difference(['tmID', 'confID', 'year', 'playoff', 'prediction', 'prediction_proba']), axis=1, inplace=True)
    new_df.to_csv(f"../data/predictions/predictions_{i}.csv", index=False)



Min year: 2 
Max year: 10

Train/Test size for year=2: (16, 36) (16, 36) (16,) (16,)

AUC for year=2: 0.90625

Train/Test size for year=3: (32, 36) (16, 36) (32,) (16,)

AUC for year=3: 0.953125

Train/Test size for year=4: (48, 36) (14, 36) (48,) (14,)

AUC for year=4: 0.7291666666666666

Train/Test size for year=5: (62, 36) (13, 36) (62,) (13,)

AUC for year=5: 0.8

Train/Test size for year=6: (75, 36) (13, 36) (75,) (13,)

AUC for year=6: 0.85

Train/Test size for year=7: (88, 36) (14, 36) (88,) (14,)

AUC for year=7: 1.0

Train/Test size for year=8: (102, 36) (13, 36) (102,) (13,)

AUC for year=8: 1.0

Train/Test size for year=9: (115, 36) (14, 36) (115,) (14,)

AUC for year=9: 0.9375

Train/Test size for year=10: (129, 36) (13, 36) (129,) (13,)

AUC for year=10: 0.925


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt