In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score

# Load your datasets
team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
standings_df = pd.read_csv('NBA_Standings_Ranked_Classes.csv')
elo_df = pd.read_csv('elo_ratings_per_year.csv')
four_factors_df = pd.read_csv('team_year_avg_four_factors.csv')

# Merge the Elo ratings into team stats dataset
team_stats_df = pd.merge(team_stats_df, elo_df[['Year', 'Team', 'Elo Rating']], how='left', on=['Year', 'Team'])

# Merge the new features into team stats dataset
team_stats_df = pd.merge(team_stats_df, four_factors_df[['Year', 'Team', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate']], how='left', on=['Year', 'Team'])

# Merge the team stats with the updated standings data
merged_data = pd.merge(team_stats_df, standings_df, how='left', on=['Year', 'Team'])

# Ensure all necessary columns are available before creating new features
required_columns = ['ORtg', 'DRtg', 'W', 'L']
missing_columns = [col for col in required_columns if col not in merged_data.columns]

if missing_columns:
    print(f"Missing columns for feature creation: {missing_columns}")
else:
    # Create new features
    merged_data['Net_Rating'] = merged_data['ORtg'] - merged_data['DRtg']
    merged_data['Win_Loss_Ratio'] = merged_data['W'] / merged_data['L']

# Filter the data by year range
filtered_data = merged_data[merged_data['Year'] >= 2004]

# Select the relevant features for training
features = ['Elo Rating', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate', 'Net_Rating', 'Win_Loss_Ratio']

# Function to train and evaluate for different windows
def train_and_evaluate_yearly(conference_data, start_year, end_year, model):
    train_data = conference_data[(conference_data['Year'] >= start_year) & (conference_data['Year'] < end_year)]
    test_data = conference_data[conference_data['Year'] == end_year]
    
    X_train = train_data[features]
    y_train = train_data['ranking_class']

    X_test = test_data[features]
    y_test = test_data['ranking_class']

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

model = RandomForestClassifier(random_state=42)
results_summary = []

# Iterate over the years using different data windows
for window_size in [1, 2, 3, 4, 5, 7]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Evaluate for East and west
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and calculate the mean accuracy
RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]
print(RF_NetRating)

# perform a final cross-validation on the best window size to validate the findings
best_window = RF_NetRating.mean(axis=1).idxmax()
print(f"The optimal window size is {best_window} years.")

  team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.700483          0.715840
2                    0.766086          0.698286
3                    0.778638          0.724101
4                    0.766300          0.714044
5                    0.774048          0.698719
7                    0.802327          0.728245
The optimal window size is 7 years.


  RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split


param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Store results
results_summary = []

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):  # Training on 7 years (e.g., 2004-2010 to predict 2011)
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]
    test_data = filtered_data[filtered_data['Year'] == year]

    X_train = train_data[features]
    y_train = train_data['ranking_class']
    X_test = test_data[features]
    y_test = test_data['ranking_class']

    # Initialize the model
    model = RandomForestClassifier(random_state=42)

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

    # Fit the GridSearchCV
    grid_search.fit(X_train, y_train)

    # Use the best model
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results_summary.append({
        "Year": year,
        "Best_Params": grid_search.best_params_,
        "Accuracy": accuracy
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)
print(results_df)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
    Year                                        Best_Params  Accuracy
0   2011  {'bootstrap': False, 'max_depth': 10, 'min_sam...  0.767139
1   2012  {'bootstrap': True, 'max_depth': 20, 'min_samp...  0.681091
2   2013  {'bootstrap': True, 'max_depth': 20,

In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Store results for Logistic Regression
results_summary_lr = []

# Define hyperparameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1],
    'penalty': ['l2'],
    'solver': ['liblinear']
}

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]
    test_data = filtered_data[filtered_data['Year'] == year]

    X_train = train_data[features]
    y_train = train_data['ranking_class']
    X_test = test_data[features]
    y_test = test_data['ranking_class']

    # Initialize the model
    model = LogisticRegression(random_state=42)

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid_lr, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

    # Fit the GridSearchCV
    grid_search.fit(X_train, y_train)

    # Use the best model
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results_summary_lr.append({
        "Year": year,
        "Best_Params": grid_search.best_params_,
        "Accuracy": accuracy
    })

# Convert results to a DataFrame
results_df_lr = pd.DataFrame(results_summary_lr)
print(results_df_lr)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting 3 folds for each of 3 candidates, totalling 9 fits
    Year                                        Best_Params  Accuracy
0   2011  {'C': 0.1, 'penalty': 'l2', 'solver': 'libline...  0.714736
1   2012  {'C': 0.1, 'penalty': 'l2', 'solver': 'libline...  0.771402
2   2013  {'C': 0.1, 'penalty': 'l2', 'solver': 'libline...  0.700163
3   2014  {'

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Store results for Decision Tree
results_summary_dt = []

# Define hyperparameter grid for Decision Tree
param_grid_dt = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]
    test_data = filtered_data[filtered_data['Year'] == year]

    X_train = train_data[features]
    y_train = train_data['ranking_class']
    X_test = test_data[features]
    y_test = test_data['ranking_class']

    # Initialize the model
    model = DecisionTreeClassifier(random_state=42)

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid_dt, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

    # Fit the GridSearchCV
    grid_search.fit(X_train, y_train)

    # Use the best model
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results_summary_dt.append({
        "Year": year,
        "Best_Params": grid_search.best_params_,
        "Accuracy": accuracy
    })

# Convert results to a DataFrame
results_df_dt = pd.DataFrame(results_summary_dt)
print(results_df_dt)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
    Year                                        Best_Params  Accuracy
0   2011  {'max_depth': 5, 'min_samples_leaf': 1, 'min_s...  0.833727
1   2012  {'max_depth': None, 'min_samples_leaf': 1, 'mi...  0.634525
2   2013  {'max_depth': 5, 'min_samples_leaf': 1, 'min_s..

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Store results for Gradient Boosting
results_summary_gb = []

# Define hyperparameter grid for Gradient Boosting
param_grid_gb = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]
    test_data = filtered_data[filtered_data['Year'] == year]

    X_train = train_data[features]
    y_train = train_data['ranking_class']
    X_test = test_data[features]
    y_test = test_data['ranking_class']

    # Initialize the model
    model = GradientBoostingClassifier(random_state=42)

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid_gb, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

    # Fit the GridSearchCV
    grid_search.fit(X_train, y_train)

    # Use the best model
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results_summary_gb.append({
        "Year": year,
        "Best_Params": grid_search.best_params_,
        "Accuracy": accuracy
    })

# Convert results to a DataFrame for easy viewing
results_df_gb = pd.DataFrame(results_summary_gb)
print(results_df_gb)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 8 candidates, totalling 24 fits
    Year                                        Best_Params  Accuracy
0   2011  {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...  0.766745
1   2012  {'learning_rate': 0.1, 'max_depth': 3, 'n_esti...  0.667921
2   2013  {'learning_rate': 0.01, 'max_depth': 5, 'n_est...  0.733116


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Store results for K-Nearest Neighbors
results_summary_knn = []

# Define hyperparameter grid for K-Nearest Neighbors
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]
    test_data = filtered_data[filtered_data['Year'] == year]

    X_train = train_data[features]
    y_train = train_data['ranking_class']
    X_test = test_data[features]
    y_test = test_data['ranking_class']

    # Initialize the model
    model = KNeighborsClassifier()

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid_knn, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

    # Fit the GridSearchCV
    grid_search.fit(X_train, y_train)

    # Use the best model
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results_summary_knn.append({
        "Year": year,
        "Best_Params": grid_search.best_params_,
        "Accuracy": accuracy
    })

# Convert results to a DataFrame
results_df_knn = pd.DataFrame(results_summary_knn)
print(results_df_knn)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
    Year                                Best_Params  Accuracy
0   2011  {'n_neighbors': 3, 'weights': 'distance'}  0.731284
1   2012  {'n_neighbors': 3, 'weights': 'distance'}  0.615240
2   2013  {'n_neighbors': 3, 'weights': 'distance'}  0.656225
3   2014  {'n_neighbors': 7, 'we

In [None]:
# Define the 7-year window and the prediction year
start_year = 2004
end_year = 2010
prediction_year = 2011

# Filter the training data for the 7-year window
train_data = filtered_data[(filtered_data['Year'] >= start_year) & (filtered_data['Year'] <= end_year)]
# Filter the test data for the prediction year
test_data = filtered_data[filtered_data['Year'] == prediction_year]

# Ensure that the data is not empty
if train_data.empty or test_data.empty:
    raise ValueError(f"No data available for the specified years: {start_year}-{end_year} or {prediction_year}")

X_train = train_data[features]
y_train = train_data['ranking_class']
X_test = test_data[features]
y_test = test_data['ranking_class']

# Initialize the model
model = SVC(random_state=42)

# Define hyperparameter grid for Support Vector Classifier
param_grid_svc = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf']
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid_svc, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Use the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Store results
results_summary_svc = {
    "Prediction Year": prediction_year,
    "Best_Params": grid_search.best_params_,
    "Accuracy": accuracy
}

# Convert results to a DataFrame
results_df_svc = pd.DataFrame([results_summary_svc])
print(results_df_svc)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
