# Clean Team Data
The nba_teams.csv dataset is a hand-crafted dataset containing all primary NBA team information. The teams.csv dataset is an outsourced dataset that contains an ID (team_id), which facilitates joining data for game-based statistical information. This cleansing process prepares the teams dataset for insertion into a PostgreSQL instance hosted on AWS RDS.

In [1]:
import pandas as pd
import numpy as np

# Setup prettier printing: do not wrap wide DataFrame reprs across several
# blocks of lines. The fully qualified option key is used because abbreviated
# keys are resolved by pattern matching and can become ambiguous (and raise)
# if pandas adds another option with a similar name.
pd.set_option('display.expand_frame_repr', False)

In [2]:
# Read in both datasets. The two reads share the same parsing options, so
# they are declared once and reused.
csv_options = {
    'sep': ',',
    'header': 0,
    'encoding': 'utf-8',
    'skip_blank_lines': True,
}

# Hand-crafted team attributes
teams_df = pd.read_csv('../src/data/raw/nba_teams.csv', **csv_options)

# Outsourced team listing; dtypes are pinned for the join key (ABBREVIATION)
# and for the ID column (TEAM_ID) that the merge brings across
teams_ids = pd.read_csv(
    '../src/data/raw/teams.csv',
    dtype={
        'ABBREVIATION': pd.StringDtype(),
        'TEAM_ID': pd.Int64Dtype(),
    },
    **csv_options,
)

# teams_df.head(5)
# teams_ids.head(5)

In [3]:
# Verify the merge key is consistent between the two frames: the sorted,
# de-duplicated abbreviations must be identical on both sides.
# NOTE: this compares unique values only, so it does not detect an
# abbreviation that is repeated within one of the files.
left_abbr = teams_df['shortname'].sort_values().unique()
right_abbr = teams_ids['ABBREVIATION'].sort_values().unique()

abbreviations_match = np.array_equal(left_abbr, right_abbr)
print(abbreviations_match)

True


In [4]:
# Merge with the teams_ids dataframe to obtain the stock ID (TEAM_ID),
# matching teams_df.shortname against teams_ids.ABBREVIATION.
# - how='left' keeps every row of teams_df, matched or not.
# - Overlapping non-key column names get the _left/_right suffixes.
# - validate='many_to_one' makes pandas raise MergeError if an abbreviation
#   appears more than once in teams_ids; without it such a duplicate would
#   silently multiply the team rows.
teams_df = pd.merge(
    teams_df,
    teams_ids,
    left_on='shortname',
    right_on='ABBREVIATION',
    how='left',
    suffixes=('_left', '_right'),
    validate='many_to_one',
)

In [5]:
# Keep only the listed columns; every other column is dropped in place.
# The surviving columns retain their existing left-to-right order.
keep_columns = ['TEAM_ID', 'name', 'shortname', 'city', 'state', 'conference', 'division']
extra_columns = teams_df.columns.difference(keep_columns)
teams_df.drop(columns=extra_columns, inplace=True)

# Show the remaining column labels as the cell output
teams_df.columns

Index(['name', 'shortname', 'city', 'state', 'conference', 'division',
       'TEAM_ID'],
      dtype='object')

In [6]:
# Convert headings to lowercase (e.g. TEAM_ID -> team_id) by applying
# str.lower to every column label in place
teams_df.rename(columns=str.lower, inplace=True)

In [7]:
# Promote team_id from a regular column to the row index (in place)
teams_df.set_index(keys='team_id', inplace=True)

# Preview the first ten rows as the cell output
teams_df.head(n=10)

Unnamed: 0_level_0,name,shortname,city,state,conference,division
team_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1610612738,Celtics,BOS,Boston,MA,East,Atlantic
1610612751,Nets,BKN,Brooklyn,NY,East,Atlantic
1610612752,Knicks,NYK,New York City,NY,East,Atlantic
1610612755,76ers,PHI,Philadelphia,PA,East,Atlantic
1610612761,Raptors,TOR,Toronto,CA-ON,East,Atlantic
1610612741,Bulls,CHI,Chicago,IL,East,Central
1610612739,Cavaliers,CLE,Cleveland,OH,East,Central
1610612765,Pistons,DET,Detroit,MI,East,Central
1610612754,Pacers,IND,Indianapolis,IN,East,Central
1610612749,Bucks,MIL,Milwaukee,WI,East,Central


In [40]:
# Export intermediate data; the team_id index is written as the first column
teams_df.to_csv('../src/data/intermediate/teams_intermediate.csv', index=True)