# Data Cleaning
**Golden State Warriors NBA Data | 2010 to 2022**

In [25]:
# import libraries
import pandas as pd
import numpy as np

In [26]:
# load the raw game log from disk and preview the first rows
raw_csv_path = 'data/golden_state_nba_raw.csv'
nba = pd.read_csv(raw_csv_path)
nba.head()

Unnamed: 0,DATE,OPPONENT,RESULT,W-L,Hi Points,Hi Rebounds,Hi Assists,season,post_season_qualification
0,"Thu, 28 Oct",vs Houston,W132-128,1-0,Ellis 46,Lee 15,Curry 11,2010/2011,not qualified
1,"Sat, 30 Oct",vs LA,W109-91,2-0,Wright 24,Lee 12,Ellis 11,2010/2011,not qualified
2,"Mon, 1 Nov",@ Los Angeles,L107-83,2-1,Ellis 20,Carney 8,Biedrins 4,2010/2011,not qualified
3,"Thu, 4 Nov",vs Memphis,W115-109,3-1,Ellis 39,Lee 16,Ellis 8,2010/2011,not qualified
4,"Sat, 6 Nov",vs Utah,W85-78,4-1,Ellis 23,Biedrins 20,Curry 6,2010/2011,not qualified


## Assessing Data

**Detect Issues**

In [27]:
# clean an independent (deep) copy so the raw frame `nba` is left untouched
nba_copy = nba.copy(deep=True)

## Document Issues

#### Quality Issues:
- transform column names to lowercase and replace spaces with underscores
- strip the `vs` / `@` prefix and leading spaces from the opponent column, and expand `LA` to `Los Angeles`

#### Tidiness Issues:
- split Hi Points into two columns (player name and points scored)
- drop unnecessary columns (Hi Points and post_season_qualification)

## Data Cleaning

### Quality Issue #1

#### Define:

- transform column names to lowercase and replace spaces with underscores

#### Code:

In [29]:
# normalise every header in a single pass: lowercase, then spaces -> underscores
nba_copy.columns = [label.lower().replace(' ', '_') for label in nba_copy.columns]

#### Test

In [30]:
# check the result programmatically instead of by eye:
# every header must be lowercase and must not contain a space
assert all(col == col.lower() and ' ' not in col for col in nba_copy.columns), \
    'column names are not fully normalised'
nba_copy.columns

Index(['date', 'opponent', 'result', 'w-l', 'hi_points', 'hi_rebounds',
       'hi_assists', 'season', 'post_season_qualification'],
      dtype='object')

### Quality Issue #2

#### Define:

- strip the `vs` / `@` prefix and leading spaces from `opponent`, and expand `LA` to `Los Angeles`

#### Code:

In [34]:
# strip the home/away marker ('vs' or '@') and any whitespace in front of the
# opponent name. an anchored regex removes only that prefix, whereas
# str.lstrip('vs@ ') treats its argument as a *set of characters* and would
# also eat the start of any opponent name beginning with 'v' or 's'.
opponent_clean = nba_copy['opponent'].str.replace(r'^\s*(?:vs|@)?\s*', '', regex=True)

# expand the 'LA' abbreviation and assign the result back to the frame.
# calling Series.replace(..., inplace=True) on `nba_copy.opponent` is a chained
# in-place operation, which does not update the frame when pandas
# copy-on-write is enabled.
nba_copy['opponent'] = opponent_clean.replace({'LA': 'Los Angeles'})

#### Test

In [35]:
# no opponent may still start with whitespace or a home/away marker,
# and the 'LA' abbreviation must have been expanded
opponents = nba_copy.opponent.dropna()
assert not opponents.str.match(r'\s|vs\b|@').any(), 'home/away prefix still present'
assert not opponents.eq('LA').any(), "'LA' was not expanded to 'Los Angeles'"
nba_copy.opponent.unique()

array(['Houston', 'Los Angeles', 'Memphis', 'Utah', 'Detroit', 'Toronto',
       'New York', 'Chicago', 'Milwaukee', 'Denver', 'Minnesota',
       'San Antonio', 'Phoenix', 'Oklahoma City', 'Dallas', 'Miami',
       'Portland', 'Sacramento', 'Philadelphia', 'Atlanta', 'Charlotte',
       'Orlando', 'New Orleans', 'Cleveland', 'New Jersey', 'Indiana',
       'Boston', 'Washington', 'Brooklyn'], dtype=object)

### Tidiness Issue #1

#### Define:

- split `hi_points` into two columns: `hi_points_player` and `hi_points_scored`

#### Code:

In [36]:
# split 'hi_points' (player name followed by points, e.g. 'Ellis 46') into
# a player column and a score column.
# rsplit on any whitespace with n=1 cuts only at the last gap, so each row
# yields at most two parts no matter how many spaces separate the fields or
# whether the player name itself contains spaces. the score is kept as text.
nba_copy[['hi_points_player', 'hi_points_scored']] = nba_copy['hi_points'].str.rsplit(n=1, expand=True)

#### Test

In [37]:
# both derived columns must exist, and every game that has a 'hi_points'
# entry must have received a player name
assert {'hi_points_player', 'hi_points_scored'} <= set(nba_copy.columns)
assert nba_copy['hi_points_player'].notna().equals(nba_copy['hi_points'].notna())
nba_copy.head(2)

Unnamed: 0,date,opponent,result,w-l,hi_points,hi_rebounds,hi_assists,season,post_season_qualification,hi_points_player,hi_points_scored
0,"Thu, 28 Oct",Houston,W132-128,1-0,Ellis 46,Lee 15,Curry 11,2010/2011,not qualified,Ellis,46
1,"Sat, 30 Oct",Los Angeles,W109-91,2-0,Wright 24,Lee 12,Ellis 11,2010/2011,not qualified,Wright,24


### Tidiness Issue #2

#### Define:

- drop unnecessary columns: `hi_points` and `post_season_qualification`

#### Code:

In [38]:
# remove the columns that are not carried into the cleaned dataset
obsolete_columns = ['hi_points', 'post_season_qualification']
nba_copy.drop(columns=obsolete_columns, inplace=True)

#### Test

In [39]:
# the dropped columns must no longer be part of the frame
assert not {'hi_points', 'post_season_qualification'} & set(nba_copy.columns), \
    'dropped columns are still present'
nba_copy.columns

Index(['date', 'opponent', 'result', 'w-l', 'hi_rebounds', 'hi_assists',
       'season', 'hi_points_player', 'hi_points_scored'],
      dtype='object')

## Storing Dataset

In [40]:
# write the cleaned frame to a new csv file, leaving the index out
cleaned_csv_path = 'data/nba_gsw_cleaned.csv'
nba_copy.to_csv(cleaned_csv_path, index=False)