In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score

team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
standings_df = pd.read_csv('NBA_Standings_Ranked_Classes.csv')
elo_df = pd.read_csv('elo_ratings_per_year.csv')
four_factors_df = pd.read_csv('team_year_avg_four_factors.csv')

# Merge the Elo ratings into team stats dataset
team_stats_df = pd.merge(team_stats_df, elo_df[['Year', 'Team', 'Elo Rating']], how='left', on=['Year', 'Team'])

# Merge the new features into team stats dataset
team_stats_df = pd.merge(team_stats_df, four_factors_df[['Year', 'Team', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate']], how='left', on=['Year', 'Team'])

# Merge the team stats with the updated standings data
merged_data = pd.merge(team_stats_df, standings_df, how='left', on=['Year', 'Team'])

# Ensure all necessary columns are available before creating new features
required_columns = ['ORtg', 'DRtg', 'W', 'L']
missing_columns = [col for col in required_columns if col not in merged_data.columns]

if missing_columns:
    print(f"Missing columns for feature creation: {missing_columns}")
else:
    # Create new features
    merged_data['Net_Rating'] = merged_data['ORtg'] - merged_data['DRtg']
    merged_data['Win_Loss_Ratio'] = merged_data['W'] / merged_data['L']

# Filter the data by year range
filtered_data = merged_data[merged_data['Year'] >= 2004]

# Select the relevant features for training
features = ['Elo Rating', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate', 'Net_Rating', 'Win_Loss_Ratio']

# Function to train and evaluate for different windows
def train_and_evaluate_yearly(conference_data, start_year, end_year, model):
    train_data = conference_data[(conference_data['Year'] >= start_year) & (conference_data['Year'] < end_year)]
    test_data = conference_data[conference_data['Year'] == end_year]
    
    X_train = train_data[features]
    y_train = train_data['ranking_class']

    X_test = test_data[features]
    y_test = test_data['ranking_class']

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

model = RandomForestClassifier(random_state=42)
results_summary = []

# Iterate over the years using different data windows
for window_size in [1, 2, 3, 4, 5, 7]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Evaluate for East and west
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and calculate the mean accuracy
RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]

print(RF_NetRating)

# perform a final cross-validation on the best window size to validate the findings
best_window = RF_NetRating.mean(axis=1).idxmax()
print(f"The optimal window size is {best_window} years.")


  team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.700483          0.715840
2                    0.766086          0.698286
3                    0.778638          0.724101
4                    0.766300          0.714044
5                    0.774048          0.698719
7                    0.802327          0.728245
The optimal window size is 7 years.


  RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=42)
cv_scores = []

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):  # Training on 7 years (e.g., 2004-2010 to predict 2011)
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]

    X_train = train_data[features]
    y_train = train_data['ranking_class']

    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append({
        "Year": year,
        "Mean_CV_Accuracy": scores.mean(),
        "Std_CV_Accuracy": scores.std()
    })

# Convert results to a DataFrame
cv_results_df = pd.DataFrame(cv_scores)
print(cv_results_df)


    Year  Mean_CV_Accuracy  Std_CV_Accuracy
0   2011          0.876989         0.100808
1   2012          0.875484         0.102008
2   2013          0.885015         0.095479
3   2014          0.888866         0.092454
4   2015          0.842055         0.132210
5   2016          0.862157         0.117910
6   2017          0.875721         0.103584
7   2018          0.897123         0.093612
8   2019          0.901441         0.095494
9   2020          0.897958         0.083414
10  2021          0.911072         0.082383
11  2022          0.908439         0.083343


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

model = LogisticRegression(random_state=42, max_iter=1000)
cv_scores = []

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):  # Training on 7 years (e.g., 2004-2010 to predict 2011)
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]

    X_train = train_data[features]
    y_train = train_data['ranking_class']

    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append({
        "Year": year,
        "Mean_CV_Accuracy": scores.mean(),
        "Std_CV_Accuracy": scores.std()
    })

# Convert results to a DataFrame
cv_results_df_lr = pd.DataFrame(cv_scores)
print(cv_results_df_lr)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

    Year  Mean_CV_Accuracy  Std_CV_Accuracy
0   2011          0.759690         0.051561
1   2012          0.772405         0.061421
2   2013          0.754221         0.042859
3   2014          0.763948         0.045292
4   2015          0.754926         0.045642
5   2016          0.769425         0.034535
6   2017          0.780684         0.033023
7   2018          0.765062         0.016688
8   2019          0.787907         0.028150
9   2020          0.788654         0.026172
10  2021          0.792693         0.034038
11  2022          0.815844         0.045578


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

model = GradientBoostingClassifier(random_state=42)
cv_scores = []

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):  # Training on 7 years (e.g., 2004-2010 to predict 2011)
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]

    X_train = train_data[features]
    y_train = train_data['ranking_class']

    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append({
        "Year": year,
        "Mean_CV_Accuracy": scores.mean(),
        "Std_CV_Accuracy": scores.std()
    })

# Convert results to a DataFrame
cv_results_df_gb = pd.DataFrame(cv_scores)
print(cv_results_df_gb)


    Year  Mean_CV_Accuracy  Std_CV_Accuracy
0   2011          0.891185         0.094141
1   2012          0.880740         0.101978
2   2013          0.884490         0.095260
3   2014          0.917873         0.069316
4   2015          0.843945         0.130932
5   2016          0.841761         0.137806
6   2017          0.884034         0.098345
7   2018          0.897299         0.085594
8   2019          0.865015         0.106218
9   2020          0.890584         0.094606
10  2021          0.926059         0.067769
11  2022          0.913384         0.078977


In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

model = KNeighborsClassifier()
cv_scores = []

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):  # Training on 7 years (e.g., 2004-2010 to predict 2011)
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]

    X_train = train_data[features]
    y_train = train_data['ranking_class']

    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append({
        "Year": year,
        "Mean_CV_Accuracy": scores.mean(),
        "Std_CV_Accuracy": scores.std()
    })

# Convert results to a DataFrame
cv_results_df_knn = pd.DataFrame(cv_scores)
print(cv_results_df_knn)


    Year  Mean_CV_Accuracy  Std_CV_Accuracy
0   2011          0.651459         0.051732
1   2012          0.704439         0.038038
2   2013          0.696554         0.030642
3   2014          0.707288         0.054277
4   2015          0.709293         0.044955
5   2016          0.737763         0.031840
6   2017          0.714063         0.031772
7   2018          0.739891         0.033011
8   2019          0.715583         0.056015
9   2020          0.723255         0.041253
10  2021          0.730179         0.044758
11  2022          0.719803         0.066768


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

model = DecisionTreeClassifier(random_state=42)
cv_scores = []

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):  # Training on 7 years (e.g., 2004-2010 to predict 2011)
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]

    X_train = train_data[features]
    y_train = train_data['ranking_class']

    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append({
        "Year": year,
        "Mean_CV_Accuracy": scores.mean(),
        "Std_CV_Accuracy": scores.std()
    })

# Convert results to a DataFrame
cv_results_df_dt = pd.DataFrame(cv_scores)
print(cv_results_df_dt)


    Year  Mean_CV_Accuracy  Std_CV_Accuracy
0   2011          0.843871         0.128351
1   2012          0.881027         0.102476
2   2013          0.873651         0.110108
3   2014          0.854969         0.121340
4   2015          0.834398         0.148779
5   2016          0.836453         0.148391
6   2017          0.864752         0.115973
7   2018          0.862930         0.119305
8   2019          0.875798         0.108432
9   2020          0.900047         0.082992
10  2021          0.906096         0.084852
11  2022          0.868140         0.110687


In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

model = SVC(random_state=42)
cv_scores = []

# Iterate over the years using a 7-year window to predict the next year
for year in range(2011, 2023):  # Training on 7 years (e.g., 2004-2010 to predict 2011)
    train_data = filtered_data[(filtered_data['Year'] >= year - 7) & (filtered_data['Year'] < year)]

    X_train = train_data[features]
    y_train = train_data['ranking_class']

    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append({
        "Year": year,
        "Mean_CV_Accuracy": scores.mean(),
        "Std_CV_Accuracy": scores.std()
    })

# Convert results to a DataFrame
cv_results_df_svm = pd.DataFrame(cv_scores)
print(cv_results_df_svm)


    Year  Mean_CV_Accuracy  Std_CV_Accuracy
0   2011          0.675981         0.050730
1   2012          0.703752         0.050017
2   2013          0.701830         0.021582
3   2014          0.714246         0.037508
4   2015          0.697148         0.052126
5   2016          0.698263         0.050827
6   2017          0.724023         0.048050
7   2018          0.722141         0.052092
8   2019          0.707242         0.062019
9   2020          0.699852         0.060848
10  2021          0.699258         0.063729
11  2022          0.715406         0.036459
