# NBA Age Decline Project Part 2 - Data Cleaning

### Imports

In [1]:
import nba_api

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import requests
import time
import json

In [4]:
from nba_api.stats.static import players
from nba_api.stats.library.parameters import SeasonAll
from nba_api.stats.library.parameters import SeasonType
from nba_api.stats.endpoints import playercareerstats
from nba_api.stats.endpoints import playerprofilev2

In [5]:
valid_seasons = ['2022-23',
                 '2021-22', 
                 '2020-21', 
                 '2019-20', 
                 '2018-19', 
                 '2017-18', 
                 '2016-17', 
                 '2015-16',
                 '2014-15',
                 '2013-14',
                 '2012-13']

# Going back 10 years not including this year

In [6]:
df = pd.read_csv('C:/Users/kevin/Downloads/player dictionary.csv')
df

Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,False
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,False
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,False
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,False
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,False
...,...,...,...,...,...
4826,1627790,Ante Zizic,Ante,Zizic,False
4827,78647,Jim Zoet,Jim,Zoet,False
4828,78648,Bill Zopf,Bill,Zopf,False
4829,1627826,Ivica Zubac,Ivica,Zubac,True


**Bringing back the three dataframes from the endpoints that we scraped in part 1.**

save_players is each season's stats with all the players included in this analysis.

In [7]:
save_players = pd.read_csv('C:/Users/kevin/Downloads/old nba players last 10 years v3.csv')
save_players

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,FGA,FG_PCT,FG3M,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,200746,2006-07,POR,21.0,63,22.1,3.8,7.6,0.503,0.0,...,0.722,2.3,2.7,5.0,0.4,0.3,1.2,0.7,3.0,9.0
1,200746,2007-08,POR,22.0,76,34.9,7.4,15.3,0.484,0.0,...,0.762,2.9,4.7,7.6,1.6,0.7,1.2,1.7,3.2,17.8
2,200746,2008-09,POR,23.0,81,37.1,7.4,15.3,0.484,0.1,...,0.781,2.9,4.6,7.5,1.9,1.0,1.0,1.5,2.6,18.1
3,200746,2009-10,POR,24.0,78,37.5,7.4,15.0,0.495,0.1,...,0.757,2.5,5.6,8.0,2.1,0.9,0.6,1.3,3.0,17.9
4,200746,2010-11,POR,25.0,81,39.6,8.7,17.5,0.500,0.0,...,0.791,3.4,5.3,8.8,2.1,1.0,1.2,1.9,2.7,21.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3374,201152,2020-21,CHI,33.0,68,24.3,5.4,9.7,0.559,0.2,...,0.628,2.5,3.8,6.2,4.3,1.1,0.6,2.0,2.2,12.1
3375,201152,2021-22,SAS,34.0,26,14.2,2.8,4.9,0.578,0.0,...,0.455,1.5,2.0,3.6,2.3,0.9,0.3,1.2,1.5,6.1
3376,201152,2021-22,TOR,34.0,26,18.3,2.6,5.5,0.465,0.7,...,0.481,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3
3377,201152,2021-22,TOT,34.0,52,16.3,2.7,5.2,0.518,0.3,...,0.469,1.5,2.5,4.0,2.0,1.0,0.3,1.0,1.6,6.2


player_eff is the efficiency rank for each season of every player in our analysis. As you can see some players have NaN, it is possible they did not play enough games to qualify (or some other reason).

In [8]:
player_eff = pd.read_csv('C:/Users/kevin/Downloads/player efficiency ranks updated.csv')
player_eff

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF
0,200746,2006-07,
1,200746,2007-08,53.0
2,200746,2008-09,33.0
3,200746,2009-10,35.0
4,200746,2010-11,13.0
...,...,...,...
2784,201152,2018-19,69.0
2785,201152,2019-20,148.0
2786,201152,2020-21,56.0
2787,201152,2021-22,


career_stats has each player in our analysis taking up one row with their career averages.

In [9]:
career_stats = pd.read_csv('C:/Users/kevin/Downloads/player career stats.csv')
career_stats

Unnamed: 0,PLAYER_ID,LEAGUE_ID,Team_ID,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,200746,0,0,1076,997,33.7,7.7,15.7,0.493,0.2,...,0.813,2.6,5.5,8.1,1.9,0.7,1.1,1.5,2.4,19.1
1,951,0,0,1300,1149,35.7,6.6,14.6,0.452,2.3,...,0.894,0.9,3.1,4.1,3.4,1.1,0.2,2.1,2.2,18.9
2,2754,0,0,820,441,22.0,3.2,6.6,0.475,0.2,...,0.709,1.2,2.3,3.5,1.3,1.4,0.4,1.4,2.4,8.1
3,200811,0,0,428,42,12.9,1.5,3.2,0.474,0.0,...,0.444,1.5,2.1,3.6,0.4,0.4,0.7,0.7,2.0,3.7
4,2365,0,0,695,45,17.7,1.9,3.6,0.532,0.0,...,0.654,1.7,3.3,5.0,0.5,0.4,1.4,0.7,1.9,5.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,101107,0,0,1072,828,28.1,3.7,8.3,0.443,1.0,...,0.808,1.2,4.0,5.2,1.3,0.8,0.5,1.0,2.0,10.2
198,2590,0,0,818,525,29.2,5.0,11.4,0.434,1.3,...,0.871,0.5,2.3,2.8,4.9,0.9,0.1,2.3,2.5,13.2
199,1897,0,0,991,840,31.7,4.7,11.4,0.414,1.2,...,0.715,1.2,3.3,4.5,2.7,1.7,0.5,1.8,2.6,13.2
200,201156,0,0,720,201,22.8,4.0,9.6,0.418,1.4,...,0.836,0.4,1.6,2.0,1.0,0.5,0.2,1.0,1.8,11.4


In [12]:
df[df['full_name'] == 'Dirk Nowitzki']

Unnamed: 0,id,full_name,first_name,last_name,is_active
3184,1717,Dirk Nowitzki,Dirk,Nowitzki,False


### Cleaning!

We first edit the player_eff dataframe to add a column called EFF_CHANGE. This takes the rank of the player for that season and subtract from it the rank of the player for the following season. This column serves as the drop off that is to happen for that  player following the season.

**MISTAKE: I made a mistake of not creating a groupby statement for this column. As a result, a player's final season will have an EFF_CHANGE computed partially from the next player's next season. This does make the data inaccurate for some players. However, for the most part this simply makes the largest drop off age for some players a year or two later than it really is. And for many players, the drop off in one of their previous seasons is greater than this faulty number anyways.**

In [10]:
player_eff['EFF_CHANGE'] = player_eff['RANK_PG_EFF'].diff(-1)

In [11]:
player_eff
#positive change is good --> means that your ranking is higher in the following year

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE
0,200746,2006-07,,
1,200746,2007-08,53.0,20.0
2,200746,2008-09,33.0,-2.0
3,200746,2009-10,35.0,22.0
4,200746,2010-11,13.0,3.0
...,...,...,...,...
2784,201152,2018-19,69.0,-79.0
2785,201152,2019-20,148.0,92.0
2786,201152,2020-21,56.0,
2787,201152,2021-22,,


In [15]:
player_eff.isna().sum()

PLAYER_ID         0
SEASON_ID         0
RANK_PG_EFF     951
EFF_CHANGE     1425
dtype: int64

In [12]:
player_eff.dropna(inplace= True)

In [13]:
player_eff

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE
1,200746,2007-08,53.0,20.0
2,200746,2008-09,33.0,-2.0
3,200746,2009-10,35.0,22.0
4,200746,2010-11,13.0,3.0
5,200746,2011-12,10.0,0.0
...,...,...,...,...
2781,201152,2015-16,37.0,-48.0
2782,201152,2016-17,85.0,14.0
2783,201152,2017-18,71.0,2.0
2784,201152,2018-19,69.0,-79.0


In [18]:
df[df['id'] == 200746]
#Lamarcus Aldridge dropped off the most from the 2014-15 season to the 2015-16 season

Unnamed: 0,id,full_name,first_name,last_name,is_active
44,200746,LaMarcus Aldridge,LaMarcus,Aldridge,False


In [14]:
def remove_players(df):
    grouped = df.groupby('PLAYER_ID')
    to_drop = []
    for player, group in grouped:
        if (group['EFF_CHANGE'] < 0).any() == False:
            to_drop.append(player)
    df = df[~df['PLAYER_ID'].isin(to_drop)]
    return df

# Removes players that exhibit no negative changes --> usually players like 201202 that are not in the league long

In [15]:
player_eff = remove_players(player_eff)

In [16]:
player_eff[player_eff['PLAYER_ID'] == 201202]

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE


In [22]:
player_eff.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1335 entries, 1 to 2785
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PLAYER_ID    1335 non-null   int64  
 1   SEASON_ID    1335 non-null   object 
 2   RANK_PG_EFF  1335 non-null   float64
 3   EFF_CHANGE   1335 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 52.1+ KB


In [23]:
player_eff['PLAYER_ID'].nunique()

177

This creates the target column we will use to column in part 3. For each of the EFF_CHANGE for a certain player, the largest drop will be True while all other columns will be False.

In [17]:
def create_target_column(df):
    df['TARGET'] = False
    grouped = df.groupby('PLAYER_ID')
    for player, group in grouped:
        last_half = group.iloc[int(len(group)/2):]
        idx = last_half['EFF_CHANGE'].idxmin()
        df.loc[idx, 'TARGET'] = True
    return df

player_eff = create_target_column(player_eff)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TARGET'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [18]:
player_eff
#everything looks good

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET
1,200746,2007-08,53.0,20.0,False
2,200746,2008-09,33.0,-2.0,False
3,200746,2009-10,35.0,22.0,False
4,200746,2010-11,13.0,3.0,False
5,200746,2011-12,10.0,0.0,False
...,...,...,...,...,...
2781,201152,2015-16,37.0,-48.0,False
2782,201152,2016-17,85.0,14.0,False
2783,201152,2017-18,71.0,2.0,False
2784,201152,2018-19,69.0,-79.0,True


We will now use this player_eff to merge with the save_players df, which if you recall has each player's stats for every season. Therefore, this merged_df will have each player's stats for every season as well as their rank that season and the drop off about to happen.

In [19]:
merged_df = pd.merge(player_eff, save_players, on=['PLAYER_ID', 'SEASON_ID'])

In [20]:
def check_target(df):
    grouped = df.groupby('PLAYER_ID')
    for player, group in grouped:
        if not group['TARGET'].any():
            print(f"Player {player} does not have a row where TARGET is True")
    print("All players have a row where TARGET is True.")

check_target(merged_df)

All players have a row where TARGET is True.


In [21]:
merged_df

Unnamed: 0,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,200746,2007-08,53.0,20.0,False,POR,22.0,76,34.9,7.4,...,0.762,2.9,4.7,7.6,1.6,0.7,1.2,1.7,3.2,17.8
1,200746,2008-09,33.0,-2.0,False,POR,23.0,81,37.1,7.4,...,0.781,2.9,4.6,7.5,1.9,1.0,1.0,1.5,2.6,18.1
2,200746,2009-10,35.0,22.0,False,POR,24.0,78,37.5,7.4,...,0.757,2.5,5.6,8.0,2.1,0.9,0.6,1.3,3.0,17.9
3,200746,2010-11,13.0,3.0,False,POR,25.0,81,39.6,8.7,...,0.791,3.4,5.3,8.8,2.1,1.0,1.2,1.9,2.7,21.8
4,200746,2011-12,10.0,0.0,False,POR,26.0,55,36.3,8.8,...,0.814,2.7,5.3,8.0,2.4,0.9,0.8,2.0,2.8,21.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1546,201152,2015-16,37.0,-48.0,False,BKN,28.0,73,33.0,6.8,...,0.644,2.4,6.6,9.0,1.9,1.5,0.5,1.9,2.5,15.1
1547,201152,2016-17,85.0,14.0,False,IND,29.0,74,30.2,4.9,...,0.523,1.8,4.3,6.1,1.6,1.5,0.4,1.3,1.8,11.0
1548,201152,2017-18,71.0,2.0,False,IND,30.0,81,32.2,5.2,...,0.598,2.3,4.0,6.3,1.9,1.7,0.4,1.3,2.2,11.8
1549,201152,2018-19,69.0,-79.0,True,IND,31.0,81,30.7,5.5,...,0.644,2.4,4.1,6.5,2.5,1.5,0.4,1.5,2.4,12.6


We will then take that result and merge to the original player dictionary to retrieve player names and make it easier to reference and call.

In [None]:
merged_df = pd.merge(merged_df, df[['id', 'full_name']], left_on='PLAYER_ID', right_on='id')
merged_df = merged_df.drop(columns=['id'])
merged_df.insert(0, 'full_name', merged_df.pop('full_name'))

In [None]:
merged_df

I found that some players had multiple rows for the same season. This is because they played for multiple teams, so NBA stats creates a row for that season for every team they played for, as well as the combined season stats. The below code only keeps the combined season stats for any season with multiple rows.

In [None]:
merged_df = merged_df.groupby(['PLAYER_ID','SEASON_ID']).apply(lambda x: x[x['TEAM_ABBREVIATION'] == 'TOT'] if x['TEAM_ABBREVIATION'].nunique()>1 else x)
merged_df = merged_df.reset_index(drop=True)

In [None]:
merged_df[merged_df['FULL_NAME'] == 'Wesley Matthews']
#Only keeps one row for his 2017-18 season

After this, I took a scan through the data just to do any contextual cleaning. I decided to remove the following players because they either had some data missing that made their target drop off age inaccurate to any NBA fan or they have yet to clearly drop off. For Blake Griffin, his data stopped at 2014, Steph Curry's drop off was supposedly following the 2016-17 season, Kobe's drop off was following 2009-10, James Harden's drop off was following 2017-18, LeBron's drop off was following 2014-15, and Isaiah Thomas's data stopped at 2016.

In [None]:
not_included = ['Blake Griffin', 'Stephen Curry', 'Kobe Bryant', 'James Harden', 'LeBron James', 'Isaiah Thomas']
rows_to_drop = merged_df[merged_df['FULL_NAME'].isin(not_included)].index
merged_df = merged_df.drop(rows_to_drop)

In [None]:
merged_df.to_csv('C:/Users/kevin/Downloads/cleaned player df v2.csv', index=False)

In [22]:
classification_df = pd.read_csv('C:/Users/kevin/Downloads/cleaned player df v2.csv')

**I have two main output dataframes for part 2. The first, for classification, does not include career averages as I want to be able to tell if a player is in his final season before the drop off with my model.**

In [23]:
classification_df
#manually uploaded Position

Unnamed: 0,FULL_NAME,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,POSITION
0,Grant Hill,255,1994-95,22,15,False,DET,22,70,38.3,...,1.8,4.6,6.4,5.0,1.8,0.9,2.9,2.9,19.9,SF
1,Grant Hill,255,1995-96,7,3,False,DET,23,80,40.8,...,1.6,8.2,9.8,6.9,1.3,0.6,3.3,3.0,20.2,SF
2,Grant Hill,255,1996-97,4,-4,False,DET,24,80,39.3,...,1.5,7.5,9.0,7.3,1.8,0.6,3.2,2.3,21.4,SF
3,Grant Hill,255,1997-98,8,-4,False,DET,25,81,40.7,...,1.1,6.5,7.7,6.8,1.8,0.7,3.5,2.4,21.1,SF
4,Grant Hill,255,1998-99,12,4,False,DET,26,50,37.0,...,1.3,5.8,7.1,6.0,1.6,0.5,3.7,2.3,21.1,SF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1274,Joe Ingles,204060,2015-16,258,93,False,UTA,28,81,15.3,...,0.2,1.6,1.9,1.2,0.7,0.0,0.8,1.2,4.2,SF
1275,Joe Ingles,204060,2016-17,165,96,False,UTA,29,82,24.1,...,0.3,2.9,3.2,2.7,1.2,0.1,1.3,2.0,7.1,SF
1276,Joe Ingles,204060,2017-18,69,-12,False,UTA,30,82,31.4,...,0.3,3.9,4.2,4.8,1.1,0.2,1.9,2.2,11.5,SF
1277,Joe Ingles,204060,2018-19,81,-29,True,UTA,31,82,31.3,...,0.4,3.6,4.0,5.7,1.2,0.2,2.4,2.2,12.1,SF


In [24]:
copy_df = classification_df.copy()
copy_df.head()

Unnamed: 0,FULL_NAME,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,POSITION
0,Grant Hill,255,1994-95,22,15,False,DET,22,70,38.3,...,1.8,4.6,6.4,5.0,1.8,0.9,2.9,2.9,19.9,SF
1,Grant Hill,255,1995-96,7,3,False,DET,23,80,40.8,...,1.6,8.2,9.8,6.9,1.3,0.6,3.3,3.0,20.2,SF
2,Grant Hill,255,1996-97,4,-4,False,DET,24,80,39.3,...,1.5,7.5,9.0,7.3,1.8,0.6,3.2,2.3,21.4,SF
3,Grant Hill,255,1997-98,8,-4,False,DET,25,81,40.7,...,1.1,6.5,7.7,6.8,1.8,0.7,3.5,2.4,21.1,SF
4,Grant Hill,255,1998-99,12,4,False,DET,26,50,37.0,...,1.3,5.8,7.1,6.0,1.6,0.5,3.7,2.3,21.1,SF


**The second dataframe will only include rows where the target is True, meaning the drop off is looming. I then merge career averages to this dataframe as each player now only has one row. Later in part 3, I remove the individual season stats from this dataframe to allow the scope of any model using this dataframe to extend to all players, even those who have yet to hit the year before their drop off. For now this is saved as regression_df**

In [25]:
copy_df = classification_df[classification_df['TARGET'] != False]
copy_df

Unnamed: 0,FULL_NAME,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,POSITION
6,Grant Hill,255,2007-08,71,-11,True,PHX,35,70,31.7,...,1.1,4.0,5.0,2.9,0.9,0.8,1.4,2.2,13.1,SF
18,Juwan Howard,436,2005-06,90,-26,True,HOU,33,80,31.7,...,2.1,4.6,6.7,1.4,0.6,0.1,1.7,2.9,11.8,C
32,Jason Kidd,467,2009-10,33,-39,True,DAL,37,80,36.0,...,0.6,5.0,5.6,9.1,1.8,0.4,2.4,1.8,10.3,PG
40,Kurt Thomas,703,2007-08,109,-47,True,TOT,35,70,22.6,...,1.9,5.4,7.2,1.0,0.8,0.8,0.7,2.7,6.3,C
56,Kevin Garnett,708,2011-12,20,-16,True,BOS,36,60,31.1,...,1.1,7.1,8.2,2.9,0.9,1.0,1.8,2.4,15.8,PF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259,Bojan Bogdanovic,202711,2019-20,75,-16,True,UTA,31,63,33.1,...,0.6,3.5,4.1,2.1,0.5,0.1,2.5,1.7,20.2,SF
1264,Pablo Prigioni,203143,2013-14,195,-39,True,NYK,37,66,19.4,...,0.5,1.4,2.0,3.5,1.0,0.0,0.9,2.0,3.8,PG
1267,Aron Baynes,203382,2015-16,182,-27,True,DET,29,81,15.2,...,1.7,3.0,4.7,0.6,0.3,0.6,0.8,1.9,6.3,C
1272,Dewayne Dedmon,203473,2017-18,64,-10,True,ATL,28,62,24.9,...,1.6,6.3,7.9,1.5,0.6,0.8,1.4,2.6,10.0,C


In [26]:
career_stats = career_stats.add_suffix('_career')

In [45]:
career_stats.columns

Index(['PLAYER_ID_career', 'LEAGUE_ID_career', 'Team_ID_career', 'GP_career',
       'GS_career', 'MIN_career', 'FGM_career', 'FGA_career', 'FG_PCT_career',
       'FG3M_career', 'FG3A_career', 'FG3_PCT_career', 'FTM_career',
       'FTA_career', 'FT_PCT_career', 'OREB_career', 'DREB_career',
       'REB_career', 'AST_career', 'STL_career', 'BLK_career', 'TOV_career',
       'PF_career', 'PTS_career'],
      dtype='object')

In [27]:
copy_df = pd.merge(copy_df, career_stats, left_on='PLAYER_ID', right_on='PLAYER_ID_career')
copy_df

Unnamed: 0,FULL_NAME,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,...,FT_PCT_career,OREB_career,DREB_career,REB_career,AST_career,STL_career,BLK_career,TOV_career,PF_career,PTS_career
0,Grant Hill,255,2007-08,71,-11,True,PHX,35,70,31.7,...,0.769,1.1,4.9,6.0,4.1,1.2,0.6,2.4,2.3,16.7
1,Juwan Howard,436,2005-06,90,-26,True,HOU,33,80,31.7,...,0.764,1.9,4.2,6.1,2.2,0.7,0.3,2.0,2.9,13.4
2,Jason Kidd,467,2009-10,33,-39,True,DAL,37,80,36.0,...,0.785,1.3,5.0,6.3,8.7,1.9,0.3,2.9,1.8,12.6
3,Kurt Thomas,703,2007-08,109,-47,True,TOT,35,70,22.6,...,0.760,1.7,4.9,6.6,1.1,0.7,0.8,1.1,3.2,8.1
4,Kevin Garnett,708,2011-12,20,-16,True,BOS,36,60,31.1,...,0.789,2.2,7.8,10.0,3.7,1.3,1.4,2.2,2.4,17.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Bojan Bogdanovic,202711,2019-20,75,-16,True,UTA,31,63,33.1,...,0.864,0.5,3.1,3.6,1.6,0.6,0.1,1.7,1.6,15.4
165,Pablo Prigioni,203143,2013-14,195,-39,True,NYK,37,66,19.4,...,0.872,0.5,1.3,1.9,2.8,1.0,0.0,1.0,1.6,3.5
166,Aron Baynes,203382,2015-16,182,-27,True,DET,29,81,15.2,...,0.794,1.5,3.0,4.6,0.8,0.2,0.5,0.9,2.2,6.0
167,Dewayne Dedmon,203473,2017-18,64,-10,True,ATL,28,62,24.9,...,0.735,1.5,4.3,5.8,0.7,0.5,0.8,1.0,2.5,6.4


In [28]:
copy_df.drop(['PLAYER_ID_career', 'LEAGUE_ID_career', 'Team_ID_career'], axis=1, inplace=True)

In [29]:
copy_df

Unnamed: 0,FULL_NAME,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,...,FT_PCT_career,OREB_career,DREB_career,REB_career,AST_career,STL_career,BLK_career,TOV_career,PF_career,PTS_career
0,Grant Hill,255,2007-08,71,-11,True,PHX,35,70,31.7,...,0.769,1.1,4.9,6.0,4.1,1.2,0.6,2.4,2.3,16.7
1,Juwan Howard,436,2005-06,90,-26,True,HOU,33,80,31.7,...,0.764,1.9,4.2,6.1,2.2,0.7,0.3,2.0,2.9,13.4
2,Jason Kidd,467,2009-10,33,-39,True,DAL,37,80,36.0,...,0.785,1.3,5.0,6.3,8.7,1.9,0.3,2.9,1.8,12.6
3,Kurt Thomas,703,2007-08,109,-47,True,TOT,35,70,22.6,...,0.760,1.7,4.9,6.6,1.1,0.7,0.8,1.1,3.2,8.1
4,Kevin Garnett,708,2011-12,20,-16,True,BOS,36,60,31.1,...,0.789,2.2,7.8,10.0,3.7,1.3,1.4,2.2,2.4,17.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Bojan Bogdanovic,202711,2019-20,75,-16,True,UTA,31,63,33.1,...,0.864,0.5,3.1,3.6,1.6,0.6,0.1,1.7,1.6,15.4
165,Pablo Prigioni,203143,2013-14,195,-39,True,NYK,37,66,19.4,...,0.872,0.5,1.3,1.9,2.8,1.0,0.0,1.0,1.6,3.5
166,Aron Baynes,203382,2015-16,182,-27,True,DET,29,81,15.2,...,0.794,1.5,3.0,4.6,0.8,0.2,0.5,0.9,2.2,6.0
167,Dewayne Dedmon,203473,2017-18,64,-10,True,ATL,28,62,24.9,...,0.735,1.5,4.3,5.8,0.7,0.5,0.8,1.0,2.5,6.4


In [50]:
copy_df.to_csv('C:/Users/kevin/Downloads/regression inputs.csv', index=False)

In [30]:
regression_df = pd.read_csv('C:/Users/kevin/Downloads/regression inputs.csv')
regression_df

Unnamed: 0,FULL_NAME,PLAYER_ID,SEASON_ID,RANK_PG_EFF,EFF_CHANGE,TARGET,TEAM_ABBREVIATION,PLAYER_AGE,GP,MIN,...,FT_PCT_career,OREB_career,DREB_career,REB_career,AST_career,STL_career,BLK_career,TOV_career,PF_career,PTS_career
0,Grant Hill,255,2007-08,71,-11,True,PHX,35,70,31.7,...,0.769,1.1,4.9,6.0,4.1,1.2,0.6,2.4,2.3,16.7
1,Juwan Howard,436,2005-06,90,-26,True,HOU,33,80,31.7,...,0.764,1.9,4.2,6.1,2.2,0.7,0.3,2.0,2.9,13.4
2,Jason Kidd,467,2009-10,33,-39,True,DAL,37,80,36.0,...,0.785,1.3,5.0,6.3,8.7,1.9,0.3,2.9,1.8,12.6
3,Kurt Thomas,703,2007-08,109,-47,True,TOT,35,70,22.6,...,0.760,1.7,4.9,6.6,1.1,0.7,0.8,1.1,3.2,8.1
4,Kevin Garnett,708,2011-12,20,-16,True,BOS,36,60,31.1,...,0.789,2.2,7.8,10.0,3.7,1.3,1.4,2.2,2.4,17.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,Bojan Bogdanovic,202711,2019-20,75,-16,True,UTA,31,63,33.1,...,0.864,0.5,3.1,3.6,1.6,0.6,0.1,1.7,1.6,15.4
165,Pablo Prigioni,203143,2013-14,195,-39,True,NYK,37,66,19.4,...,0.872,0.5,1.3,1.9,2.8,1.0,0.0,1.0,1.6,3.5
166,Aron Baynes,203382,2015-16,182,-27,True,DET,29,81,15.2,...,0.794,1.5,3.0,4.6,0.8,0.2,0.5,0.9,2.2,6.0
167,Dewayne Dedmon,203473,2017-18,64,-10,True,ATL,28,62,24.9,...,0.735,1.5,4.3,5.8,0.7,0.5,0.8,1.0,2.5,6.4
