In [12]:
import pandas as pd
# Let pandas display up to 75 columns before truncating wide frames in cell output.
pd.set_option("display.max_columns", 75)

## Cleaning Data

In [13]:
nbaData = "NBAdata.csv"

# Load the raw table. `temp` is left untouched; `data` is the working frame
# that every later step reshapes.
temp = pd.read_csv(nbaData)

# Discard the columns we have no use for.
# NOTE(review): 'blanl' looks like a typo but is presumably the literal header
# in the CSV -- confirm before "correcting" it.
data = temp.drop(columns=['#', 'blank2', 'blanl'])

# Give the remaining columns shorter, friendlier names.
data = data.rename(columns={
    "Season Start": "Season",
    "Tm": "Team",
    "Player Name": "Name",
    "Player Salary in $": "Salary",
    "G": "GP",
})

# Encode positions numerically (PG=1 ... C=5); any label that is not a key of
# this table becomes NaN after the map.
positions = {'PG': 1, 'SG': 2, 'SF': 3, 'PF': 4, 'C': 5}
data['Pos'] = data['Pos'].map(positions)

# Fraction of games played in which the player was a starter.
data["GS/GP"] = data["GS"].div(data["GP"])

# The final row of the file is null, so cut it off by position.
data = data.iloc[:-1]

In [14]:
# The stats in the data are cumulative season totals. Dividing each one by
# games played puts every player on a per-game basis, which removes the effect
# of how many games they appeared in.
# Maps each new per-game column to the total column it is derived from.
per_game_sources = {
    "MPG": "MP",
    "ORPG": "ORB",
    "DRPG": "DRB",
    "RPG": "TRB",
    "APG": "AST",
    "SPG": "STL",
    "BPG": "BLK",
    "TPG": "TOV",
    "PFPG": "PF",
    "PPG": "PTS",
    "FGPG": "FG",
    "FGAPG": "FGA",
    "3PPG": "3P",
    "3PAPG": "3PA",
    "2PPG": "2P",
    "2PAPG": "2PA",
    "FTPG": "FT",
    "FTAPG": "FTA",
}
for rate_col, total_col in per_game_sources.items():
    data[rate_col] = data[total_col] / data["GP"]

# The totals have served their purpose -- remove every one of them.
drop_column = list(per_game_sources.values())
data = data.drop(columns=drop_column)

In [15]:
# Column groupings reused later: advanced metrics and regular per-game stats.
# Both lists start with '% of Cap'.
# NOTE(review): '% of Cap' is not created in the code shown here -- presumably
# it comes straight from the CSV or is added elsewhere; confirm.
adv_data = ['% of Cap', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
            'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
reg_data = ['% of Cap', 'ORPG', 'DRPG', 'RPG', 'APG', 'SPG', 'BPG', 'TPG', 'PFPG',
            'PPG', 'FGPG', 'FGAPG', '3PPG', '3PAPG', '2PPG', '2PAPG', 'FTPG', 'FTAPG']

# Make sure salaries are numeric before any arithmetic is done on them.
data['Salary'] = data['Salary'].astype(float)

# Strip trailing '*' markers from player names. The vectorised .str accessor
# passes missing names through as NaN, whereas calling x.rstrip on a NaN
# (a float) inside map(lambda ...) raises AttributeError.
data['Name'] = data['Name'].str.rstrip('*')


In [16]:
# handling invalid values and special cases
# Drop rows whose team code contains "TOT"; rows with a missing team are kept.
# NOTE(review): "TOT" is presumably the combined-season row for players who
# changed teams -- confirm against the data source.
# `~` is the boolean-mask negation operator; unary `-` on a boolean Series only
# works because pandas special-cases it.
is_total_row = data["Team"].str.contains("TOT", na=False)
data = data[~is_total_row]

# get players who have played at least 20 games
data = data[data['GP'] >= 20]

# missing salary values are a no-go
# .copy() gives an independent frame, so the column assignment below does not
# hit pandas' chained-assignment / SettingWithCopy machinery.
data = data.dropna(subset=['Salary']).copy()

# Remaining NaNs in these percentage/rate columns are replaced with 0.
# The filled columns are assigned back to the frame: calling
# data[col].fillna(0, inplace=True) operates on a temporary column selection,
# which warns in recent pandas and leaves `data` unchanged under Copy-on-Write.
null_value = ['TS%', '3PAr', 'FTr', 'TOV%', 'FG%', '3P%', '2P%', 'eFG%', 'FT%']
data[null_value] = data[null_value].fillna(0)


In [17]:
# Distinct team codes before normalisation.
print(len(data['Team'].unique()))
# are there 38 teams in the NBA?
# we must handle the name changes of franchises since 1995 such as the Charlotte Hornets, New Orleans Pelicans, and others
# Historical team code -> the code that franchise is tracked under here.
# No key is also a value, so a single replace pass gives the same result as
# applying the replacements one after another.
team_renames = {
    "CHH": "NOP",
    "NOH": "NOP",
    "NOK": "NOP",
    "NJN": "BRK",
    "WSB": "WAS",
    "SEA": "OKC",
    "VAN": "MEM",
    "CHA": "CHO",
}
# Build a new frame with the replaced column instead of calling
# data["Team"].replace(..., inplace=True): the in-place form mutates a
# temporary column selection, which warns in recent pandas and does not
# update `data` under Copy-on-Write.
data = data.assign(Team=data["Team"].replace(team_renames))

# we now have the correct number of teams represented in the dataset
print(len(data['Team'].unique()))


38
30
