In [None]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

In [None]:
def Data_Organizer(raw_Data):
    """Clean a raw player-stats frame and split it into features and target.

    Processing order:
      1. Copy ``raw_Data`` (the caller's frame is never modified), drop every
         column whose name contains ``'Unnamed:'`` and reset the row index.
      2. Replace NaN, ``''`` and ``'--'`` cells with 0.
      3. Print whether any NaN / empty-string cell is still present.
      4. Take ``'Points_won'`` as the target, keep the fixed list of feature
         columns, and coerce those features to numeric. Cells that cannot be
         parsed as numbers are set to 0 so the returned matrix never
         contains NaN.

    Parameters
    ----------
    raw_Data : pandas.DataFrame
        Must contain ``'Points_won'`` and every name in ``columns_to_keep``.

    Returns
    -------
    X : pandas.DataFrame
        Numeric feature matrix with exactly the ``columns_to_keep`` columns,
        in that order, indexed 0..n-1.
    Y : pandas.Series
        The ``'Points_won'`` column after step 2, indexed 0..n-1.

    Raises
    ------
    KeyError
        If ``'Points_won'`` or any required feature column is missing.
    """
    All_players_dataSet = raw_Data.copy()

    # Drop the index-like "Unnamed: N" columns and renumber the rows.
    unnamed_columns = All_players_dataSet.columns[
        All_players_dataSet.columns.str.contains('Unnamed:')
    ]
    All_players_dataSet = (
        All_players_dataSet
        .drop(columns=unnamed_columns)
        .reset_index(drop=True)
        .fillna(0)
        .replace('', 0)
        .replace('--', 0)
    )

    # Check whether any NaN or empty string exists in the cleaned dataframe.
    any_missing_values = All_players_dataSet.isna().any().any()
    any_empty_values = All_players_dataSet.eq('').any().any()

    if any_missing_values or any_empty_values:
        print("DataFrame contains missing values or empty strings/spaces.")
    else:
        print("DataFrame does not contain missing values or empty strings/spaces.")

    # Separate the target from the features.
    Y = All_players_dataSet['Points_won']  # Target

    # Feature columns fed to the models.
    columns_to_keep = [
        "Dpoy_votes",
        "DWS_advanced",
        "war_total_raptor",
        "predator_defense_raptor",
        "war_reg_season_raptor",
        "WS_advanced",
        "predator_total_raptor",
        "raptor_total_raptor",
        "poss_raptor",
        "VORP_advanced",
        "DEF WS_nba",
        "mp_raptor",
        "raptor_defense_raptor",
        "MP_advanced",
        "MP",
        "GS",
        "MP_100_poss",
        "BPM_advanced",
        "MIN_nba",
        "OPP PTS PAINT_nba",
        "MPG_espn"
    ]

    # Coerce only the columns we keep; everything else is discarded anyway.
    X = All_players_dataSet[columns_to_keep].apply(pd.to_numeric, errors='coerce')

    # errors='coerce' turns unparseable text into NaN *after* the fillna above,
    # so fill again here to keep the "no missing values" guarantee.
    unparseable_cells = int(X.isna().sum().sum())
    if unparseable_cells:
        print(f"{unparseable_cells} non-numeric feature value(s) were replaced with 0.")
        X = X.fillna(0)

    return X, Y

In [None]:
import pandas as pd

# Read the full player table from a CSV in the current working directory.
all_players_w_add_sorted = pd.read_csv(
    filepath_or_buffer="all_players_w_add_sorted.csv",
)

In [None]:
# Hold out the 2023 season for testing and train on 2022 and earlier.
# Rows with any other Year value (later than 2023, or missing) land in neither split.
# .copy() makes each split an independent frame, so adding columns to it later
# (e.g. a predictions column) does not raise SettingWithCopyWarning.
Test_Data_2023 = all_players_w_add_sorted[all_players_w_add_sorted['Year'] == 2023].copy()
Train_Data_2022 = all_players_w_add_sorted[all_players_w_add_sorted['Year'] <= 2022].copy()

In [None]:
# Run the shared cleaning / feature-selection step on each split,
# training split first, then the test split.
(X_train, y_train), (X_test, y_test) = (
    Data_Organizer(split) for split in (Train_Data_2022, Test_Data_2023)
)

In [None]:
# Fit a 300-tree random forest on the training split (fixed seed, all cores).
rf_model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predict the test split and score the predictions.
y_pred_rf = rf_model.predict(X_test)
mse_rf, r2_rf = (
    metric(y_test, y_pred_rf) for metric in (mean_squared_error, r2_score)
)

# Report mean squared error (MSE) and R-squared.
print(f'Mean Squared Error (Random Forest): {mse_rf}')
print(f'R-squared (Random Forest): {r2_rf}')

In [None]:
# Gradient-boosted trees: 200 rounds, depth-5 trees, fixed seed, all cores.
xgb_settings = dict(n_estimators=200, max_depth=5, random_state=42, n_jobs=-1)
xgb_model = XGBRegressor(**xgb_settings)
xgb_model.fit(X_train, y_train)

# Predictions for the test split.
y_pred_xgb = xgb_model.predict(X_test)

# Score against the true targets: mean squared error (MSE) and R-squared.
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'Mean Squared Error (XGBoost): {mse_xgb}')
print(f'R-squared (XGBoost): {r2_xgb}')

In [None]:
# Attach the XGBoost predictions as a new column on the 2023 frame.
# NOTE(review): assumes y_pred_xgb is a numpy array with one value per row of
# Test_Data_2023, in the same row order — confirm.
Test_Data_2023.loc[:, 'Predicted_points'] = y_pred_xgb

# Narrow the frame to player name, actual points and predicted points.
Test_Data_2023_concatenated = Test_Data_2023.loc[
    :, ['Player_name', 'Points_won', 'Predicted_points']
]

# Show the comparison table, highest actual 'Points_won' first.
display(Test_Data_2023_concatenated.sort_values('Points_won', ascending=False))

In [None]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

# LightGBM regressor with 1500 boosting rounds (fixed seed, all cores),
# trained on the training split.
lgbm_model = LGBMRegressor(
    n_estimators=1500,
    random_state=42,
    n_jobs=-1,
)
lgbm_model.fit(X_train, y_train)

# Predict the test split.
y_pred_lgbm = lgbm_model.predict(X_test)

# Mean squared error (MSE) and R-squared on the test split.
mse_lgbm, r2_lgbm = (
    mean_squared_error(y_test, y_pred_lgbm),
    r2_score(y_test, y_pred_lgbm),
)

print(f'Mean Squared Error (LightGBM): {mse_lgbm}')
print(f'R-squared (LightGBM): {r2_lgbm}')

In [None]:
## Do cross-validation
## Do RF incremental learning
## Do SGD incremental learning
## Do leave-one-out cross-validation

In [None]:
import xgboost as xgb

# Features / target built from every row of the full table (all years).
X_train_all, y_train_all = Data_Organizer(all_players_w_add_sorted)

# Base estimator whose hyperparameters are searched.
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

# Exhaustive grid: 4 * 4 * 5 * 4 * 3 * 4 = 3,840 combinations, each scored
# with 5-fold cross-validation (19,200 fits), so this cell is slow.
param_grid = {
    'n_estimators': [100, 200, 500, 800],
    'learning_rate': [0.05, 0.1, 0.3, 0.5],
    'max_depth': [3, 5, 6, 7, 9],
    'subsample': [0.4, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.05, 0.1, 0.2],
}

# NOTE(review): the search is fitted on X_train / y_train, while X_train_all /
# y_train_all (built above) are not used in this cell — confirm which was intended.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Best combination found and its mean cross-validated R-squared.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (R-squared):", best_score)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Sampling distributions for the randomized search.
# scipy conventions: uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 1501),     # 100 .. 1500
    'learning_rate': uniform(0.05, 0.45),   # 0.05 .. 0.50
    'max_depth': randint(3, 10),            # 3 .. 9
    'subsample': uniform(0.4, 0.6),         # 0.4 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4),  # 0.6 .. 1.0
    'gamma': uniform(0, 0.2),               # 0.0 .. 0.2
}

# Evaluate 100 sampled candidates with 5-fold cross-validation, scored by R-squared.
# random_state pins the sampled candidates so reruns give the same search,
# matching the seed (42) used for every other estimator in this notebook.
# NOTE(review): this search is fitted on X_train_all / y_train_all, which are
# built from the full table including the 2023 rows used as the test split — confirm intended.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='r2',
    random_state=42,
)
random_search.fit(X_train_all, y_train_all)

# Best sampled parameters and their mean cross-validated R-squared.
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

print("Best Parameters (Random Search):", best_params_random)
print("Best Score (R-squared) (Random Search):", best_score_random)