In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [3]:
## Data processing and cleaning function
def Process_Data(Data_set):
    
    All_players_dataSet = Data_set.copy()
    
    All_players_dataSet = All_players_dataSet.drop(columns=All_players_dataSet.columns[All_players_dataSet.columns.str.contains('Unnamed:')])
    All_players_dataSet.reset_index(drop=True, inplace=True)
    
    All_players_dataSet.fillna(0, inplace=True)
    All_players_dataSet.replace('', 0, inplace=True)
    All_players_dataSet.replace('--', 0, inplace=True)
    
    # check if any NaN,empty Strings exists in the dataframe
    any_missing_values = All_players_dataSet.isna().any().any()
    any_empty_values = (All_players_dataSet.applymap(lambda x: x == '')).any().any()
    if any_missing_values or any_empty_values:
        print("DataFrame contains missing values or empty strings/spaces.")
    else:
        print("DataFrame does not contain missing values or empty strings/spaces.")
        
    # Drop unwanted columns
    columns_to_drop = ['First_votes','Season', 'Year', 'Player_name',
                   'Max_points', 'Tm', 'Lg', 'Pos', 'Rank', 'Arena_misc',
                   'Attendance_misc','FG', 'FGA', 'FG%', '3P', '3PA',
                   '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'AST',
                   'PTS', 'FG_100_poss', 'FGA_100_poss', 'FG%_100_poss',
                   '3P_100_poss', '3PA_100_poss', '3P%_100_poss', '2P_100_poss', '2PA_100_poss',
                   '2P%_100_poss', 'FT_100_poss', 'FTA_100_poss', 'FT%_100_poss', 'ORB_100_poss', 'AST_100_poss',
                   'PTS_100_poss', 'ORtg_100_poss', '3PAr_advanced', 'FTr_advanced', 'ORB%_advanced',
                   'OWS_advanced', 'OBPM_advanced', 'AST%_advanced', 'USG%_advanced', 'G_team', 'MP_team', 'FG_team', 'FGA_team',
                   'FG%_team', '3P_team', '3PA_team', '3P%_team', '2P_team', '2PA_team', '2P%_team',
                   'FT_team', 'FTA_team', 'FT%_team', 'ORB_team', 'G_opponent', 'MP_opponent',
                   'FG_opponent', 'STL_opponent', 'BLK_opponent','DRB_opponent', 'TOV_opponent', 'PF_opponent',
                   'PTS_opponent', 'ORtg_misc', 'FTr_misc', '3PAr_misc', 'eFG%_misc', 'ORB%_misc',
                   'FT/FGA_misc', 'eFG%.1_misc', 'FT/FGA.1_misc', 'ORPM_espn', 'raptor_offense_raptor',
                   'war_playoffs_raptor', 'predator_offense_raptor','FTr_misc','FT/FGA.1_misc','BLK_opponent',
                   '3P_team','FT/FGA.1_misc', 'PTS_team','TS%_advanced','L_nba','TOV%_advanced']

    All_players_dataSet['won_dpoy_last_season'] = All_players_dataSet['won_dpoy_last_season'].astype(int)
    
    ## Drop the specified columns
    All_players_dataSet.drop(columns=columns_to_drop, inplace=True)
    
    ## seperating our dataFrame
    Y = All_players_dataSet['Points_won']  # Target
    X = All_players_dataSet.drop('Points_won', axis=1)  # Features
    

    X = X.apply(pd.to_numeric, errors='coerce')


    return X,Y
## Applying SelectFromModel with RandomForestRegressor and return feature importances
def Create_model_and_get_importence_RF(X,Y):
    # Create a RandomForestRegressor and fit it to the data
    regressor = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    regressor.fit(X, Y)

    # Print the feature importances
    feature_importances = regressor.feature_importances_
    
    return regressor, feature_importances
## Applying SelectFromModel with XGBRegressor and return feature importance
def Create_model_and_get_importance_XGB(X, Y):
    # Create a GradientBoostingRegressor and fit it to the data
    regressor = XGBRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    regressor.fit(X, Y)

    # Print the feature importances
    feature_importances = regressor.feature_importances_
    
    return regressor, feature_importances
    
## Applying SelectFromModel with LGBMRegressor and return feature importance
def Create_model_and_get_importance_LGBM(X, Y):
    # Create a LGBMRegressor and fit it to the data
    regressor = LGBMRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    regressor.fit(X, Y)

    # Print the feature importances
    feature_importances = regressor.feature_importances_
    
    return regressor, feature_importances

## Display features importance and save the file 
def show_SelectFromModel(Data_set, year_data):

    X, Y = Process_Data(Data_set)

    regressor, feature_importances = Create_model_and_get_importance_LGBM(X,Y)
    
    # Create a DataFrame for all features and their importances
    all_features_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    
    # Sort the DataFrame by the "Importance" column in descending order
    all_features_df = all_features_df.sort_values(by='Importance', ascending=False)

    display(all_features_df)
    
    # Save selected features to a CSV file
    all_features_df.to_csv('top_N_features_SelectFromModel_LGBM{}.csv'.format(year_data), index=False)


In [4]:
## All players dataFrame
all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")

  all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")


In [5]:
show_SelectFromModel(all_players_w_add_sorted, "All")

DataFrame does not contain missing values or empty strings/spaces.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17071
[LightGBM] [Info] Number of data points in the train set: 16284, number of used features: 98
[LightGBM] [Info] Start training from score 1.426185


Unnamed: 0,Feature,Importance
96,DEF WS_nba,1373
97,D-LEBRON_bball,928
28,DWS_advanced,791
11,BLK,462
1,Dpoy_votes,444
...,...,...
83,MIN_nba,22
3,teamates_with_dpoy,19
81,GP_nba,6
22,MP_advanced,0


In [6]:
all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")

# Covert to intger
all_players_w_add_sorted['Year'] = pd.to_numeric(all_players_w_add_sorted['Year'], errors='coerce')

# Create subsets for each 5-year interval
subset_1993_1997 = all_players_w_add_sorted[(all_players_w_add_sorted['Year'] >= 1993) & (all_players_w_add_sorted['Year'] <= 1997)]
subset_1998_2002 = all_players_w_add_sorted[(all_players_w_add_sorted['Year'] >= 1998) & (all_players_w_add_sorted['Year'] <= 2002)]
subset_2003_2007 = all_players_w_add_sorted[(all_players_w_add_sorted['Year'] >= 2003) & (all_players_w_add_sorted['Year'] <= 2007)]
subset_2008_2012 = all_players_w_add_sorted[(all_players_w_add_sorted['Year'] >= 2008) & (all_players_w_add_sorted['Year'] <= 2012)]
subset_2013_2017 = all_players_w_add_sorted[(all_players_w_add_sorted['Year'] >= 2013) & (all_players_w_add_sorted['Year'] <= 2017)]
subset_2018_2023 = all_players_w_add_sorted[(all_players_w_add_sorted['Year'] >= 2018) & (all_players_w_add_sorted['Year'] <= 2023)]

  all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")


In [7]:
show_SelectFromModel(subset_1993_1997,"1993_1997")

DataFrame does not contain missing values or empty strings/spaces.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11242
[LightGBM] [Info] Number of data points in the train set: 2295, number of used features: 95
[LightGBM] [Info] Start training from score 0.231808


Unnamed: 0,Feature,Importance
28,DWS_advanced,724
14,MP_100_poss,663
18,BLK_100_poss,628
27,BLK%_advanced,512
29,WS_advanced,450
...,...,...
56,L_misc,0
26,STL%_advanced,0
22,MP_advanced,0
2,won_dpoy_last_season,0


In [8]:
show_SelectFromModel(subset_1998_2002,"1998_2002")

DataFrame does not contain missing values or empty strings/spaces.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12955
[LightGBM] [Info] Number of data points in the train set: 2396, number of used features: 96
[LightGBM] [Info] Start training from score 0.270033


Unnamed: 0,Feature,Importance
1,Dpoy_votes,628
28,DWS_advanced,543
74,raptor_defense_raptor,538
80,pace_impact_raptor,518
78,predator_defense_raptor,483
...,...,...
3,teamates_with_dpoy,7
26,STL%_advanced,0
22,MP_advanced,0
2,won_dpoy_last_season,0


In [9]:
show_SelectFromModel(subset_2003_2007,"2003_2007")

DataFrame does not contain missing values or empty strings/spaces.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005488 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12786
[LightGBM] [Info] Number of data points in the train set: 2498, number of used features: 96
[LightGBM] [Info] Start training from score 2.288231


Unnamed: 0,Feature,Importance
28,DWS_advanced,997
96,DEF WS_nba,835
1,Dpoy_votes,832
31,DBPM_advanced,599
78,predator_defense_raptor,482
...,...,...
56,L_misc,0
26,STL%_advanced,0
22,MP_advanced,0
2,won_dpoy_last_season,0


In [10]:
show_SelectFromModel(subset_2008_2012,"2008_2012")

DataFrame does not contain missing values or empty strings/spaces.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13100
[LightGBM] [Info] Number of data points in the train set: 2611, number of used features: 97
[LightGBM] [Info] Start training from score 2.094983


Unnamed: 0,Feature,Importance
97,D-LEBRON_bball,924
28,DWS_advanced,617
74,raptor_defense_raptor,605
1,Dpoy_votes,554
30,WS/48_advanced,523
...,...,...
81,GP_nba,3
83,MIN_nba,2
26,STL%_advanced,0
2,won_dpoy_last_season,0


In [11]:
show_SelectFromModel(subset_2013_2017,"2013_2017")

DataFrame does not contain missing values or empty strings/spaces.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13123
[LightGBM] [Info] Number of data points in the train set: 2716, number of used features: 97
[LightGBM] [Info] Start training from score 2.004786


Unnamed: 0,Feature,Importance
96,DEF WS_nba,822
28,DWS_advanced,797
78,predator_defense_raptor,749
97,D-LEBRON_bball,721
69,DRPM_espn,634
...,...,...
56,L_misc,2
81,GP_nba,0
22,MP_advanced,0
26,STL%_advanced,0


In [12]:
show_SelectFromModel(subset_2018_2023,"2018_2023")

DataFrame does not contain missing values or empty strings/spaces.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004592 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13808
[LightGBM] [Info] Number of data points in the train set: 3768, number of used features: 97
[LightGBM] [Info] Start training from score 1.436837


Unnamed: 0,Feature,Importance
96,DEF WS_nba,1370
97,D-LEBRON_bball,1148
28,DWS_advanced,839
1,Dpoy_votes,715
11,BLK,450
...,...,...
83,MIN_nba,12
81,GP_nba,3
22,MP_advanced,0
2,won_dpoy_last_season,0
