# Data Cleaning: CT Income

In [5]:
# Imports
import pandas as pd 

In [6]:
# Reading in the CT income data
df = pd.read_csv('../data/ct_income_raw.csv')

In [7]:
# Taking a look at the data
df.head()

Unnamed: 0,city,median_household_income,mean_household_income,per_capita_income
0,Andover,100321,111230,40182
1,Ansonia,43305,62858,24359
2,Ashford,77870,95339,39139
3,Avon,123894,172245,66822
4,Barkhamsted,95735,102210,40156


In [8]:
# Checking the data types
df.dtypes

city                       object
median_household_income    object
mean_household_income      object
per_capita_income          object
dtype: object

**Note:** the data types are strings when they should be integers. The commas are likely what is causing this.

In [9]:
# Getting rid of commas and casting the values to integers
df['median_household_income'] = df['median_household_income'].str.replace(',', '').astype(int)
df['mean_household_income'] = df['mean_household_income'].str.replace(',', '').astype(int)
df['per_capita_income'] = df['per_capita_income'].str.replace(',', '').astype(int)

## Median Household Income Buckets

In [11]:
# 1: $1 under $25,000
# 2: $25,000 under $50,000
# 3: $50,000 under $75,000
# 4: $75,000 under $100,000
# 5: $100,000 under $200,000
# 6: $200,000 or more

mask0_25k = (df['median_household_income'] > 0) & (df['median_household_income'] <= 25_000)
mask25k_50k = (df['median_household_income'] >= 25_001) & (df['median_household_income'] <= 50_000)
mask50k_75k = (df['median_household_income'] >= 50_001) & (df['median_household_income'] <= 75_000)
mask75k_100k = (df['median_household_income'] >= 75_001) & (df['median_household_income'] <= 100_000)
mask100k_200k = (df['median_household_income'] >= 100_001) & (df['median_household_income'] <= 200_000)
mask200k_more = (df['median_household_income'] >= 200_001) 

In [12]:
bucket1 = df.loc[mask0_25k]
bucket2 = df.loc[mask25k_50k]
bucket3 = df.loc[mask50k_75k]
bucket4 = df.loc[mask75k_100k]
bucket5 = df.loc[mask100k_200k]
bucket6 = df.loc[mask200k_more]

bucket1['median_bucket'] = 1
bucket2['median_bucket'] = 2
bucket3['median_bucket'] = 3
bucket4['median_bucket'] = 4
bucket5['median_bucket'] = 5
bucket6['median_bucket'] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [13]:
df_bucket = pd.concat([bucket1,bucket2,bucket3,bucket4,bucket5,bucket6,], axis = 0)

In [14]:
df_bucket.shape

(169, 5)

In [15]:
# Sort by city 
df_bucket.sort_values(by = 'city', inplace = True)

In [16]:
df_bucket.head()

Unnamed: 0,city,median_household_income,mean_household_income,per_capita_income,median_bucket
0,Andover,100321,111230,40182,5
1,Ansonia,43305,62858,24359,2
2,Ashford,77870,95339,39139,4
3,Avon,123894,172245,66822,5
4,Barkhamsted,95735,102210,40156,4


## Mean Household Income Buckets

In [17]:
# 1: $1 under $25,000
# 2: $25,000 under $50,000
# 3: $50,000 under $75,000
# 4: $75,000 under $100,000
# 5: $100,000 under $200,000
# 6: $200,000 or more

mask0_25k = (df_bucket['mean_household_income'] > 0) & (df_bucket['mean_household_income'] <= 25_000)
mask25k_50k = (df_bucket['mean_household_income'] >= 25_001) & (df_bucket['mean_household_income'] <= 50_000)
mask50k_75k = (df_bucket['mean_household_income'] >= 50_001) & (df_bucket['mean_household_income'] <= 75_000)
mask75k_100k = (df_bucket['mean_household_income'] >= 75_001) & (df_bucket['mean_household_income'] <= 100_000)
mask100k_200k = (df_bucket['mean_household_income'] >= 100_001) & (df_bucket['mean_household_income'] <= 200_000)
mask200k_more = (df_bucket['mean_household_income'] >= 200_001) 

In [18]:
bucket1 = df_bucket.loc[mask0_25k]
bucket2 = df_bucket.loc[mask25k_50k]
bucket3 = df_bucket.loc[mask50k_75k]
bucket4 = df_bucket.loc[mask75k_100k]
bucket5 = df_bucket.loc[mask100k_200k]
bucket6 = df_bucket.loc[mask200k_more]

bucket1['mean_bucket'] = 1
bucket2['mean_bucket'] = 2
bucket3['mean_bucket'] = 3
bucket4['mean_bucket'] = 4
bucket5['mean_bucket'] = 5
bucket6['mean_bucket'] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [19]:
df_bucket = pd.concat([bucket1,bucket2,bucket3,bucket4,bucket5,bucket6,], axis = 0)

In [20]:
df_bucket.head()

Unnamed: 0,city,median_household_income,mean_household_income,per_capita_income,median_bucket,mean_bucket
63,Hartford,30630,44434,17311,2,2
1,Ansonia,43305,62858,24359,2,3
14,Bridgeport,41801,56565,21002,2,3
16,Bristol,61478,74762,31709,3,3
18,Brooklyn,60694,74756,27244,3,3


In [21]:
df_bucket.sort_values(by = 'city', inplace = True)

In [22]:
df_bucket.head()

Unnamed: 0,city,median_household_income,mean_household_income,per_capita_income,median_bucket,mean_bucket
0,Andover,100321,111230,40182,5,5
1,Ansonia,43305,62858,24359,2,3
2,Ashford,77870,95339,39139,4,4
3,Avon,123894,172245,66822,5,5
4,Barkhamsted,95735,102210,40156,4,5


#### Export

In [23]:
df_bucket.to_csv('../data/ct_income_clean.csv', index=False)

## Per Capita Income Buckets

In [None]:
df_bucket['per_capita_income'].describe()