In [18]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier


# Load the merged dataset and encode the "playoff" flag numerically
# (Y -> 1, N -> 0). Values that are not in the mapping become NaN.
df = pd.read_csv("../data/clean/merged.csv").assign(
    playoff=lambda frame: frame["playoff"].map({"Y": 1, "N": 0})
)
df.head()

Unnamed: 0,year,tmID,confID,playoff,avg_player_height,avg_player_weight,avg_player_age,avg_player_last_3_years_sum_awards,avg_player_all_time_sum_awards,last_3_years_avg_pl_efficiency,all_years_avg_pl_efficiency
0,9,ATL,EA,0.0,72.25,173.333333,24.833333,3.5,3.5,238964.357143,147083.700417
1,10,ATL,EA,1.0,72.0,173.583333,25.25,3.5,3.5,164582.001515,142542.672569
2,11,ATL,EA,,71.769231,162.923077,27.461538,0.0,0.0,213432.089091,181895.149231
3,1,CHA,EA,0.0,72.7,170.6,25.636364,0.0,0.0,198613.747593,186908.067227
4,2,CHA,EA,1.0,73.181818,177.0,25.25,0.0,0.0,232819.203571,153458.716875


In [None]:
def train_test_split_by_year(data, test_year):
    """Split ``data`` chronologically on its ``year`` column.

    Returns a ``(train_set, test_set)`` tuple: the rows whose ``year`` is
    strictly less than ``test_year``, and the rows whose ``year`` equals
    ``test_year``. Rows from later years are in neither part.
    """
    season = data['year']
    earlier_rows = data.loc[season < test_year]
    target_rows = data.loc[season == test_year]
    return earlier_rows, target_rows

def evaluate_model(model, train_set_original, test_set_original,
                   spots_per_conference=4, total_spots=8):
    """Fit ``model`` on the training rows and label the test-season teams.

    Neither input frame is modified. Both must contain the columns ``tmID``,
    ``confID``, ``year`` and ``playoff``; every remaining column is used as a
    feature.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is fitted
        in place on the SMOTE-resampled training data.
    train_set_original : pandas.DataFrame
        Rows used for fitting.
    test_set_original : pandas.DataFrame
        Rows to score. Only the first row per ``tmID`` is kept.
    spots_per_conference : int, default 4
        Number of teams per ``confID`` that are labelled ``playoff = 1``.
    total_spots : int or float, default 8
        Value the returned ``predict_proba`` column is rescaled to sum to.

    Returns
    -------
    pandas.DataFrame
        One row per team with columns ``tmID``, ``confID``, ``playoff`` (1 for
        the ``spots_per_conference`` highest-scoring teams of each conference,
        0 for every other team) and ``predict_proba`` (the rescaled score; it
        is not a calibrated probability and single values may exceed 1).
    """
    id_columns = ['tmID', 'confID', 'year']

    # Keep one row per team *before* predicting, so that the rescaling below
    # is computed over exactly the teams that end up in the result.
    test_set = test_set_original.drop_duplicates(subset='tmID', keep='first')

    # drop() returns new frames, so the caller's data is left untouched.
    X_train = train_set_original.drop(columns=id_columns + ['playoff'])
    y_train = train_set_original['playoff']
    X_test = test_set.drop(columns=id_columns + ['playoff'])

    # Oversample the training data only; the test rows are never resampled.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    model.fit(X_train_resampled, y_train_resampled)

    # Column 1 is the second entry of the model's class list; with 0/1 labels
    # that is the "made the playoffs" class.
    predict_proba = model.predict_proba(X_test)[:, 1]

    # Rescale so the scores sum to total_spots. Skip the rescaling when every
    # score is 0 (or the sum is NaN) to avoid dividing by zero.
    score_sum = predict_proba.sum()
    if score_sum > 0:
        predict_proba = total_spots * (predict_proba / score_sum)

    results = pd.DataFrame({
        'tmID': test_set['tmID'],
        'confID': test_set['confID'],
        'playoff': 0,
        'predict_proba': predict_proba
    })

    # Exactly the best `spots_per_conference` teams of each conference qualify.
    # Every other team is labelled 0, even if the raw classifier output was
    # positive; otherwise more teams than available spots could be flagged.
    qualified = (
        results.sort_values(by='predict_proba', ascending=False)
        .groupby('confID')
        .head(spots_per_conference)
    )
    results['playoff'] = results['tmID'].isin(qualified['tmID']).astype(int)

    return results

In [20]:
def train_model(data, test_year, model):
    """Split ``data`` at ``test_year`` and run ``evaluate_model`` on the parts.

    ``train_test_split_by_year`` supplies the (train, test) pair, which is
    passed with ``model`` to ``evaluate_model``; that call's return value is
    handed back unchanged.
    """
    return evaluate_model(model, *train_test_split_by_year(data, test_year))

In [21]:
# Season to predict; every earlier season is used for training.
# Defined once so the split and the output file name cannot drift apart.
TEST_YEAR = 11

params = {
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 2,
    'learning_rate': 0.1,
    # Seed the booster (SMOTE is already seeded) so a re-run of the notebook
    # reproduces the same predictions.
    'random_state': 42
}

results = train_model(df, TEST_YEAR, GradientBoostingClassifier(**params))

# confID is only needed for the per-conference selection, not in the export.
results = results.drop(columns=['confID'])
results.to_csv(f"../data/predictions/predictions_{TEST_YEAR}.csv", index=False)