### Package and Master Spreadsheet Import

In [1]:
import pandas as pd
import seaborn as sns
import scipy as sc
import numpy as np

In [None]:
# Empty accumulator for the per-season player-ranking rows.
# The adjusted-percentage columns are spelled 'FT%_Adj' and '3P%_Adj' so they
# match the column names the ranking code actually produces.
master_pr = pd.DataFrame(columns=[
    'Player', 'FG%', 'FGA', 'FT%', 'FTA', '3P', '3P%', '3PA', 'ORB', 'TRB',
    'AST', 'STL', 'BLK', 'STK', 'TOV', 'PTS',
    'FG%_Adj', 'FT%_Adj', '3P%_Adj',
    'PR_sum_7cat', 'PR_sum_8cat', 'Year',
])

In [None]:
# Load the master per-game player statistics table.
df = pd.read_csv(
    'Player_Stats_per_Game_by_Year_Library/Master_Player_Stats.csv'
)

# Combine steals and blocks into one "Defensive Stats" column (STK).
df = df.assign(STK=df['STL'] + df['BLK'])
df

### Yearly Sub-Division Cleaning and Player Ranking Mapping

In [None]:
def _volume_weighted_pct(z, pct_col, att_col):
    """Return z[pct_col] * z[att_col], negated where both z-scores are negative.

    Two negative z-scores multiply to a positive number; flipping the sign in
    that case keeps the product negative for those rows.
    """
    product = z[pct_col] * z[att_col]
    both_below = (z[pct_col] < 0) & (z[att_col] < 0)
    return product.where(~both_below, -product)


def scrape_data(df, year1, year2, *, three_pt_threshold=120,
                games_percentile=2.3,
                out_dir='Player_Stats_per_Game_by_Year_Library'):
    """Build per-season z-score player rankings and write them to one CSV.

    For every season from ``year1`` through ``year2`` (inclusive) the rows of
    ``df`` with that ``Year`` are processed as follows:

    1. The ``Tm`` values of each player are joined with ``/`` and the
       ``'TOT/'`` marker is removed, giving one team string per player.
    2. Only the first row per player is kept (the original comment says this
       is meant to keep the ``TOT`` row, which assumes it is listed first).
    3. Rows with null ``FGA`` or ``FTA`` are dropped, remaining nulls become 0.
    4. Players whose ``G`` is not above the ``games_percentile``-th percentile
       of ``G`` are dropped.
    5. Every stat column is replaced by its population z-score (ddof=0)
       within the season, and ``TOV`` is negated.
    6. ``FG%_Adj``, ``FT%_Adj`` and ``3P%_Adj`` are the percentage z-score
       times the attempts z-score (see ``_volume_weighted_pct``).
    7. If fewer than ``three_pt_threshold`` remaining players have
       ``3P >= 1``, ``PR_sum_7cat`` is filled (no three-point term);
       otherwise ``PR_sum_8cat`` is filled. The other column is left NaN,
       which is written to the CSV as an empty field.

    Args:
        df: Frame with columns ``Player``, ``Year``, ``Tm``, ``G`` and the
            stat columns listed in ``stat_cols`` below.
        year1: First season to process.
        year2: Last season to process (inclusive).
        three_pt_threshold: Minimum number of players with ``3P >= 1`` for a
            season to be scored with eight categories.
        games_percentile: Percentile of ``G`` used as the exclusive lower
            cutoff on games played.
        out_dir: Existing directory the CSV is written into.

    Returns:
        The concatenated rankings for all processed seasons. The same frame
        is written to ``<out_dir>/Master_PR_Stats_<year1>_<year2>.csv``.
    """
    stat_cols = ['FG%', 'FGA', 'FT%', 'FTA', '3P', '3P%', '3PA', 'ORB', 'TRB',
                 'AST', 'STL', 'BLK', 'STK', 'TOV', 'PTS']
    seven_cats = ['FG%_Adj', 'FT%_Adj', 'AST', 'TRB', 'STK', 'TOV', 'PTS']
    eight_cats = ['FG%_Adj', 'FT%_Adj', '3P%_Adj', 'AST', 'TRB', 'STK', 'TOV',
                  'PTS']
    out_cols = (['Player'] + stat_cols
                + ['FG%_Adj', 'FT%_Adj', '3P%_Adj',
                   'PR_sum_7cat', 'PR_sum_8cat', 'Year', 'Tm'])

    yearly_frames = []
    for year in range(year1, year2 + 1):
        season = df[df['Year'] == year]
        if season.empty:
            continue

        # One team string per player, built before duplicate rows are dropped.
        teams = season.groupby('Player')['Tm'].apply('/'.join)
        teams = teams.str.replace('TOT/', '', regex=False)

        season = season.drop_duplicates(subset=['Player'], keep='first')
        season = season.dropna(subset=['FGA', 'FTA']).fillna(0)
        if season.empty:
            continue

        # Percentile-based games cutoff (not a fixed number of games).
        cutoff = np.percentile(season['G'], games_percentile)
        season = season[season['G'] > cutoff]
        if season.empty:
            continue

        z = season.set_index('Player')[stat_cols].astype(float)
        z = (z - z.mean()) / z.std(ddof=0)
        z['TOV'] = -z['TOV']  # fewer turnovers should score higher

        z['FG%_Adj'] = _volume_weighted_pct(z, 'FG%', 'FGA')
        z['FT%_Adj'] = _volume_weighted_pct(z, 'FT%', 'FTA')
        z['3P%_Adj'] = _volume_weighted_pct(z, '3P%', '3PA')

        z['PR_sum_7cat'] = np.nan
        z['PR_sum_8cat'] = np.nan
        # The shooter count uses the raw (pre z-score) 3P values.
        if (season['3P'] >= 1).sum() < three_pt_threshold:
            z['PR_sum_7cat'] = z[seven_cats].sum(axis=1, skipna=False)
        else:
            z['PR_sum_8cat'] = z[eight_cats].sum(axis=1, skipna=False)

        z['Year'] = str(year)

        ranked = z.reset_index()
        ranked['Tm'] = ranked['Player'].map(teams)
        yearly_frames.append(ranked[out_cols])

    # Concatenate once at the end; earlier seasons are never merged against a
    # later season's team list, so no rows or team strings are lost.
    if yearly_frames:
        master_pr = pd.concat(yearly_frames, ignore_index=True)
    else:
        master_pr = pd.DataFrame(columns=out_cols)

    master_pr.to_csv(
        f'{out_dir}/Master_PR_Stats_{year1}_{year2}.csv', index=False
    )
    return master_pr

In [None]:
#original copy
# Script form of the ranking pipeline: processes every season from 2015 to the
# latest year in df and leaves the combined result in master_pr.
# master_pr is rebuilt from scratch here, so re-running the cell does not
# duplicate rows, and earlier seasons are never merged against a later
# season's team list (which used to drop players and their team strings).
stat_cols = ['FG%', 'FGA', 'FT%', 'FTA', '3P', '3P%', '3PA', 'ORB', 'TRB',
             'AST', 'STL', 'BLK', 'STK', 'TOV', 'PTS']
yearly_frames = []

for j in range(2015, max(df['Year']) + 1):

    df_subset = df[df["Year"] == j]
    if df_subset.empty:
        continue

    # One '/'-joined team string per player, with the 'TOT/' marker removed.
    df_teams = df_subset.groupby(['Player'])['Tm'].apply('/'.join).reset_index()
    df_teams['Tm'] = df_teams['Tm'].str.replace('TOT/', '', regex=False)

    df_subset = pd.merge(df_subset, df_teams, how='left', on='Player')
    df_subset = df_subset.rename(columns={'Tm_y': 'Team'}).drop(columns=['Tm_x'])

    # Keep the first row per player (intended to be the TOT row; this assumes
    # TOT is listed first for traded players).
    df_subset = df_subset.drop_duplicates(subset=['Player'], keep='first')

    # Drop rows with null FGA/FTA, then treat the remaining nulls as 0.
    df_subset = df_subset.dropna(subset=['FGA', 'FTA']).fillna(0)
    if df_subset.empty:
        continue

    # Keep players above the 2.3rd percentile of games played.
    df_subset = df_subset[df_subset['G'] > np.percentile(df_subset['G'], 2.3)]
    if df_subset.empty:
        continue

    # Population z-score (ddof=0) of every stat within the season.
    pr = df_subset.set_index('Player')[stat_cols].astype(float)
    pr = (pr - pr.mean()) / pr.std(ddof=0)
    pr['TOV'] = -pr['TOV']  # fewer turnovers should score higher

    # Percentage z-score times attempts z-score; when both are negative the
    # product is negated so it stays negative.
    for pct_col, att_col in (('FG%', 'FGA'), ('FT%', 'FTA'), ('3P%', '3PA')):
        product = pr[pct_col] * pr[att_col]
        both_below = (pr[pct_col] < 0) & (pr[att_col] < 0)
        pr[pct_col + '_Adj'] = product.where(~both_below, -product)

    # Only one of the two totals is filled per season; the other stays NaN.
    pr['PR_sum_7cat'] = np.nan
    pr['PR_sum_8cat'] = np.nan
    if (df_subset['3P'] >= 1).sum() < 120:
        pr['PR_sum_7cat'] = pr[
            ['FG%_Adj', 'FT%_Adj', 'AST', 'TRB', 'STK', 'TOV', 'PTS']
        ].sum(axis=1, skipna=False)
    else:
        pr['PR_sum_8cat'] = pr[
            ['FG%_Adj', 'FT%_Adj', '3P%_Adj', 'AST', 'TRB', 'STK', 'TOV', 'PTS']
        ].sum(axis=1, skipna=False)

    pr['Year'] = str(j)

    pr = pr.reset_index()
    pr = pd.merge(pr, df_teams, on="Player")

    yearly_frames.append(pr)

if yearly_frames:
    master_pr = pd.concat(yearly_frames, ignore_index=True)
else:
    master_pr = pd.DataFrame(
        columns=['Player'] + stat_cols
        + ['FG%_Adj', 'FT%_Adj', '3P%_Adj',
           'PR_sum_7cat', 'PR_sum_8cat', 'Year', 'Tm']
    )