In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import pearsonr

In [4]:
def show_PersonsR(Data_set, year_data, top_n=100):
    """Rank features by their Pearson correlation with ``Points_won``.

    The input frame is copied and cleaned (``Unnamed:`` columns removed; NaN,
    ``''`` and ``'--'`` replaced by 0), the columns named in
    ``columns_to_drop`` are removed when present, and everything that is left
    is coerced to numeric.  Each remaining column is correlated with
    ``Points_won``.  The ranking (largest absolute coefficient first,
    undefined coefficients last) is printed, and its first ``top_n`` rows are
    written to ``top_N_features_PersonsR<year_data>.csv`` in the current
    working directory.

    Pearson's r: 1 indicates a perfect positive linear relationship, -1 a
    perfect negative linear relationship and 0 no linear correlation.  The
    p-value quantifies the evidence against the null hypothesis of no
    correlation; a lower p-value is stronger evidence to reject it.

    Parameters
    ----------
    Data_set : pandas.DataFrame
        Table containing a ``Points_won`` column.  It is not modified.
    year_data : str
        Label inserted into the name of the CSV file.
    top_n : int, default 100
        Number of top-ranked features saved to the CSV file.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``Points_won`` is absent from ``Data_set``.
    """
    players = Data_set.copy()

    # Discard index columns left behind by earlier CSV round-trips.
    unnamed_mask = players.columns.astype(str).str.contains('Unnamed:')
    players = players.loc[:, ~unnamed_mask].reset_index(drop=True)

    # Missing markers are all treated as zero.
    players = players.fillna(0).replace(['', '--'], 0)

    # Sanity check that the cleaning above left no NaN / empty strings behind.
    # (Vectorised comparison instead of the deprecated DataFrame.applymap.)
    any_missing_values = players.isna().any().any()
    any_empty_values = players.eq('').any().any()
    if any_missing_values or any_empty_values:
        print("DataFrame contains missing values or empty strings/spaces.")
    else:
        print("DataFrame does not contain missing values or empty strings/spaces.")

    # Columns excluded from the analysis (each name listed once).
    columns_to_drop = ['First_votes', 'Season', 'Year', 'Player_name', 'won_dpoy_last_season',
                       'Max_points', 'Tm', 'Lg', 'Pos', 'Rank', 'Arena_misc',
                       'Attendance_misc', 'FG', 'FGA', 'FG%', '3P', '3PA',
                       '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'AST',
                       'PTS', 'FG_100_poss', 'FGA_100_poss', 'FG%_100_poss',
                       '3P_100_poss', '3PA_100_poss', '3P%_100_poss', '2P_100_poss', '2PA_100_poss',
                       '2P%_100_poss', 'FT_100_poss', 'FTA_100_poss', 'FT%_100_poss', 'ORB_100_poss', 'AST_100_poss',
                       'PTS_100_poss', 'ORtg_100_poss', '3PAr_advanced', 'FTr_advanced', 'ORB%_advanced',
                       'OWS_advanced', 'OBPM_advanced', 'AST%_advanced', 'USG%_advanced', 'G_team', 'MP_team', 'FG_team', 'FGA_team',
                       'FG%_team', '3P_team', '3PA_team', '3P%_team', '2P_team', '2PA_team', '2P%_team',
                       'FT_team', 'FTA_team', 'FT%_team', 'ORB_team', 'G_opponent', 'MP_opponent',
                       'FG_opponent', 'STL_opponent', 'BLK_opponent', 'DRB_opponent', 'TOV_opponent', 'PF_opponent',
                       'PTS_opponent', 'ORtg_misc', 'FTr_misc', '3PAr_misc', 'eFG%_misc', 'ORB%_misc',
                       'FT/FGA_misc', 'eFG%.1_misc', 'FT/FGA.1_misc', 'ORPM_espn', 'raptor_offense_raptor',
                       'war_playoffs_raptor', 'predator_offense_raptor',
                       'PTS_team', 'TS%_advanced', 'L_nba', 'TOV%_advanced']

    # errors='ignore': a frame that lacks some of these optional columns is
    # still analysed instead of failing with a KeyError.
    players = players.drop(columns=columns_to_drop, errors='ignore')

    if 'Points_won' not in players.columns:
        raise KeyError("Points_won")

    # Target and features, both coerced to numeric (unparseable -> NaN).
    target = pd.to_numeric(players['Points_won'], errors='coerce').to_numpy(dtype=float)
    features = players.drop(columns='Points_won').apply(pd.to_numeric, errors='coerce')

    # Pearson's r is undefined for fewer than two rows, for NaN input and for
    # constant input; those cases are reported as NaN without calling
    # pearsonr (which would raise or emit a warning).
    target_usable = len(target) >= 2 and not np.isnan(target).any() and np.ptp(target) > 0

    records = []
    for column, series in features.items():
        values = series.to_numpy(dtype=float)
        if target_usable and not np.isnan(values).any() and np.ptp(values) > 0:
            corr_coefficient, p_value = pearsonr(values, target)
        else:
            corr_coefficient, p_value = float('nan'), float('nan')
        records.append((column, corr_coefficient, p_value))

    correlation_df = (
        pd.DataFrame(records, columns=['Feature', 'correlation', 'p-value'])
        .astype({'correlation': float, 'p-value': float})
        .set_index('Feature')
    )

    # One ranking shared by the CSV and the printout: descending |r|, ties in
    # column order, NaN last (argsort places NaN at the end).
    ranking = np.argsort(-correlation_df['correlation'].abs().to_numpy(), kind='stable')
    correlation_df = correlation_df.iloc[ranking]

    # Save the top of the ranking to a CSV file
    correlation_df.head(top_n).to_csv('top_N_features_PersonsR{}.csv'.format(year_data))

    for feature, corr_coefficient, p_value in correlation_df.itertuples(name=None):
        print(f"Pearson correlation coefficient for {feature}: {corr_coefficient:.3f}, p-value: {p_value:.3f}")

In [5]:
# Load the full player table and rank features over every season at once.
# NOTE(review): the recorded output shows a pandas warning raised on this
# read_csv line — presumably a mixed-dtype warning; confirm and consider
# passing explicit dtypes.
all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")
show_PersonsR(Data_set=all_players_w_add_sorted, year_data="All")

  all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")


DataFrame does not contain missing values or empty strings/spaces.
Pearson correlation coefficient for Dpoy_titles: 0.290, p-value: 0.000
Pearson correlation coefficient for DWS_advanced: 0.248, p-value: 0.000
Pearson correlation coefficient for Dpoy_votes: 0.235, p-value: 0.000
Pearson correlation coefficient for BLK_nba: 0.234, p-value: 0.000
Pearson correlation coefficient for war_total_raptor: 0.226, p-value: 0.000
Pearson correlation coefficient for BLK: 0.217, p-value: 0.000
Pearson correlation coefficient for WINS_espn: 0.212, p-value: 0.000
Pearson correlation coefficient for war_reg_season_raptor: 0.210, p-value: 0.000
Pearson correlation coefficient for VORP_advanced: 0.204, p-value: 0.000
Pearson correlation coefficient for WS_advanced: 0.196, p-value: 0.000
Pearson correlation coefficient for D-LEBRON_bball: 0.194, p-value: 0.000
Pearson correlation coefficient for DREB_nba: 0.186, p-value: 0.000
Pearson correlation coefficient for DRB: 0.183, p-value: 0.000
Pearson correla

In [6]:
# Reload the full table so this cell does not depend on earlier cells.
all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")

# Convert Year to a numeric dtype; unparseable entries become NaN and
# therefore fall outside every window below.
all_players_w_add_sorted['Year'] = pd.to_numeric(all_players_w_add_sorted['Year'], errors='coerce')
season_year = all_players_w_add_sorted['Year']

# One subset per era. Bounds are inclusive on both ends; the windows cover
# five seasons each, except the last (2018-2023), which covers six.
subset_1993_1997 = all_players_w_add_sorted[season_year.between(1993, 1997)]
subset_1998_2002 = all_players_w_add_sorted[season_year.between(1998, 2002)]
subset_2003_2007 = all_players_w_add_sorted[season_year.between(2003, 2007)]
subset_2008_2012 = all_players_w_add_sorted[season_year.between(2008, 2012)]
subset_2013_2017 = all_players_w_add_sorted[season_year.between(2013, 2017)]
subset_2018_2023 = all_players_w_add_sorted[season_year.between(2018, 2023)]

  all_players_w_add_sorted = pd.read_csv("all_players_w_add_sorted.csv")


In [7]:
# Feature ranking for the 1993-1997 seasons.
show_PersonsR(Data_set=subset_1993_1997, year_data="1993_1997")

DataFrame does not contain missing values or empty strings/spaces.
Pearson correlation coefficient for Dpoy_titles: 0.618, p-value: 0.000
Pearson correlation coefficient for Dpoy_votes: 0.511, p-value: 0.000
Pearson correlation coefficient for DWS_advanced: 0.300, p-value: 0.000
Pearson correlation coefficient for VORP_advanced: 0.294, p-value: 0.000
Pearson correlation coefficient for war_total_raptor: 0.294, p-value: 0.000
Pearson correlation coefficient for BLK: 0.288, p-value: 0.000
Pearson correlation coefficient for war_reg_season_raptor: 0.277, p-value: 0.000
Pearson correlation coefficient for WS_advanced: 0.258, p-value: 0.000
Pearson correlation coefficient for DRB: 0.208, p-value: 0.000
Pearson correlation coefficient for TRB: 0.194, p-value: 0.000
Pearson correlation coefficient for mp_raptor: 0.168, p-value: 0.000
Pearson correlation coefficient for raptor_defense_raptor: 0.165, p-value: 0.000
Pearson correlation coefficient for poss_raptor: 0.164, p-value: 0.000
Pearson c



In [8]:
# Feature ranking for the 1998-2002 seasons.
show_PersonsR(Data_set=subset_1998_2002, year_data="1998_2002")

DataFrame does not contain missing values or empty strings/spaces.
Pearson correlation coefficient for Dpoy_titles: 0.461, p-value: 0.000
Pearson correlation coefficient for Dpoy_votes: 0.320, p-value: 0.000
Pearson correlation coefficient for BLK_nba: 0.309, p-value: 0.000
Pearson correlation coefficient for BLK: 0.307, p-value: 0.000
Pearson correlation coefficient for WINS_espn: 0.227, p-value: 0.000
Pearson correlation coefficient for DWS_advanced: 0.214, p-value: 0.000
Pearson correlation coefficient for DREB_nba: 0.205, p-value: 0.000
Pearson correlation coefficient for war_total_raptor: 0.204, p-value: 0.000
Pearson correlation coefficient for DRB: 0.203, p-value: 0.000
Pearson correlation coefficient for TRB: 0.202, p-value: 0.000
Pearson correlation coefficient for DRPM_espn: 0.200, p-value: 0.000
Pearson correlation coefficient for VORP_advanced: 0.190, p-value: 0.000
Pearson correlation coefficient for war_reg_season_raptor: 0.187, p-value: 0.000
Pearson correlation coeffici



In [9]:
# Feature ranking for the 2003-2007 seasons.
show_PersonsR(Data_set=subset_2003_2007, year_data="2003_2007")

DataFrame does not contain missing values or empty strings/spaces.
Pearson correlation coefficient for Dpoy_titles: 0.321, p-value: 0.000
Pearson correlation coefficient for Dpoy_votes: 0.318, p-value: 0.000
Pearson correlation coefficient for DWS_advanced: 0.314, p-value: 0.000
Pearson correlation coefficient for BLK_nba: 0.292, p-value: 0.000
Pearson correlation coefficient for BLK: 0.282, p-value: 0.000
Pearson correlation coefficient for raptor_defense_raptor: 0.266, p-value: 0.000
Pearson correlation coefficient for war_total_raptor: 0.247, p-value: 0.000
Pearson correlation coefficient for predator_defense_raptor: 0.239, p-value: 0.000
Pearson correlation coefficient for war_reg_season_raptor: 0.223, p-value: 0.000
Pearson correlation coefficient for DRB: 0.222, p-value: 0.000
Pearson correlation coefficient for DREB_nba: 0.222, p-value: 0.000
Pearson correlation coefficient for DRPM_espn: 0.216, p-value: 0.000
Pearson correlation coefficient for TRB: 0.213, p-value: 0.000
Pearso



In [10]:
# Feature ranking for the 2008-2012 seasons.
show_PersonsR(Data_set=subset_2008_2012, year_data="2008_2012")

DataFrame does not contain missing values or empty strings/spaces.
Pearson correlation coefficient for DWS_advanced: 0.311, p-value: 0.000
Pearson correlation coefficient for Dpoy_votes: 0.295, p-value: 0.000
Pearson correlation coefficient for Dpoy_titles: 0.293, p-value: 0.000
Pearson correlation coefficient for war_total_raptor: 0.277, p-value: 0.000
Pearson correlation coefficient for WINS_espn: 0.275, p-value: 0.000
Pearson correlation coefficient for BLK_nba: 0.269, p-value: 0.000
Pearson correlation coefficient for VORP_advanced: 0.264, p-value: 0.000
Pearson correlation coefficient for BLK: 0.263, p-value: 0.000
Pearson correlation coefficient for war_reg_season_raptor: 0.259, p-value: 0.000
Pearson correlation coefficient for WS_advanced: 0.254, p-value: 0.000
Pearson correlation coefficient for raptor_defense_raptor: 0.225, p-value: 0.000
Pearson correlation coefficient for DREB_nba: 0.223, p-value: 0.000
Pearson correlation coefficient for DRB: 0.222, p-value: 0.000
Pearson 

In [11]:
# Feature ranking for the 2013-2017 seasons.
show_PersonsR(Data_set=subset_2013_2017, year_data="2013_2017")

DataFrame does not contain missing values or empty strings/spaces.
Pearson correlation coefficient for war_total_raptor: 0.310, p-value: 0.000
Pearson correlation coefficient for DWS_advanced: 0.297, p-value: 0.000
Pearson correlation coefficient for war_reg_season_raptor: 0.291, p-value: 0.000
Pearson correlation coefficient for Dpoy_titles: 0.275, p-value: 0.000
Pearson correlation coefficient for VORP_advanced: 0.261, p-value: 0.000
Pearson correlation coefficient for D-LEBRON_bball: 0.248, p-value: 0.000
Pearson correlation coefficient for WS_advanced: 0.244, p-value: 0.000
Pearson correlation coefficient for BLK_nba: 0.236, p-value: 0.000
Pearson correlation coefficient for WINS_espn: 0.232, p-value: 0.000
Pearson correlation coefficient for BLK: 0.225, p-value: 0.000
Pearson correlation coefficient for DEF WS_nba: 0.202, p-value: 0.000
Pearson correlation coefficient for DREB_nba: 0.201, p-value: 0.000
Pearson correlation coefficient for DRB: 0.194, p-value: 0.000
Pearson correla

In [12]:
# Feature ranking for the 2018-2023 seasons.
show_PersonsR(Data_set=subset_2018_2023, year_data="2018_2023")

DataFrame does not contain missing values or empty strings/spaces.
Pearson correlation coefficient for Dpoy_titles: 0.341, p-value: 0.000
Pearson correlation coefficient for BLK_nba: 0.259, p-value: 0.000
Pearson correlation coefficient for DWS_advanced: 0.251, p-value: 0.000
Pearson correlation coefficient for BLK: 0.247, p-value: 0.000
Pearson correlation coefficient for D-LEBRON_bball: 0.234, p-value: 0.000
Pearson correlation coefficient for WS_advanced: 0.224, p-value: 0.000
Pearson correlation coefficient for VORP_advanced: 0.214, p-value: 0.000
Pearson correlation coefficient for war_total_raptor: 0.211, p-value: 0.000
Pearson correlation coefficient for WINS_espn: 0.209, p-value: 0.000
Pearson correlation coefficient for Dpoy_votes: 0.209, p-value: 0.000
Pearson correlation coefficient for war_reg_season_raptor: 0.204, p-value: 0.000
Pearson correlation coefficient for DREB_nba: 0.199, p-value: 0.000
Pearson correlation coefficient for DRB: 0.191, p-value: 0.000
Pearson correla