In [59]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

# Read the merged CSV into `df` (path is relative to the notebook's working
# directory). The functions defined below access its 'year', 'tmID', 'confID'
# and 'playoff' columns; every remaining column is passed to the model as a
# feature.
df = pd.read_csv("../data/clean/merged.csv")

In [60]:
def train_test_split_by_year(data, test_year):
    """Split ``data`` chronologically using its ``year`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``year`` column.
    test_year
        Value of ``year`` to hold out for testing.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``: rows whose ``year`` is strictly less than
        ``test_year``, and rows whose ``year`` equals ``test_year``. Rows with
        a later ``year`` appear in neither frame.
    """
    years = data['year']
    is_earlier = years < test_year
    is_held_out = years == test_year
    return data[is_earlier], data[is_held_out]

def evaluate_model(model, train_set_original, test_set_original):
    """Fit ``model`` on the training rows and rank the test rows per conference.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)``, ``predict(X)`` and
        ``predict_proba(X)``.
    train_set_original, test_set_original : pandas.DataFrame
        Frames containing ``tmID``, ``confID``, ``year`` and ``playoff``
        columns. All other columns are used as features. Neither frame is
        modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``tmID``, ``confID``, ``playoff`` and ``predict_proba``,
        ordered by descending ``predict_proba`` and limited to at most the
        first 4 rows of each ``confID``. ``playoff`` is the raw output of
        ``model.predict`` for that row; it is not derived from the top-4
        selection. ``predict_proba`` is the second column returned by
        ``model.predict_proba`` (for scikit-learn classifiers that is the
        probability of ``model.classes_[1]``).
    """
    # Identifier columns and the target must not leak into the feature matrix.
    non_feature_columns = ['tmID', 'confID', 'year', 'playoff']

    # drop() returns new frames, so the caller's data stays untouched.
    X_train = train_set_original.drop(columns=non_feature_columns)
    y_train = train_set_original['playoff']
    X_test = test_set_original.drop(columns=non_feature_columns)

    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    results = pd.DataFrame({
        'tmID' : test_set_original['tmID'],
        'confID': test_set_original['confID'],
        'playoff' : predicted_labels,
        'predict_proba' : positive_proba
    })

    # Rank every test row by probability, then keep the first 4 rows of each
    # conference; head() preserves the sorted order in the returned frame.
    ranked = results.sort_values(by='predict_proba', ascending=False)
    return ranked.groupby('confID').head(4)

In [61]:
def train_model(data, test_year, model):
    """Split ``data`` by year, fit ``model`` and return its ranked predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, forwarded to ``train_test_split_by_year``.
    test_year
        Year held out as the test set.
    model
        Estimator forwarded to ``evaluate_model``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``evaluate_model`` returns for the held-out year.
    """
    earlier_rows, held_out_rows = train_test_split_by_year(data, test_year)
    return evaluate_model(model, earlier_rows, held_out_rows)

In [62]:
# Season held out for prediction. Used for both the train/test split and the
# output file name so the two cannot drift apart.
TEST_YEAR = 11

# Fixed seed: gradient boosting permutes features at every split, so without
# it a Restart & Run All may write a different predictions file.
SEED = 42

params = {
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 2,
    'learning_rate': 0.1,
    'random_state': SEED,
}

results = train_model(df, TEST_YEAR, GradientBoostingClassifier(**params))

# confID is only needed for the per-conference selection inside
# evaluate_model; it is left out of the exported file. A non-inplace drop
# keeps this step free of hidden mutation.
results = results.drop(columns=['confID'])
results.to_csv(f"../data/predictions/predictions_{TEST_YEAR}.csv", index=False)