# Cleaning and preparing the raw socioeconomic census data

In [2]:
import pandas as pd

In [3]:
# Read the raw ACS 2017 census-tract table and preview its first rows
raw_census_csv = "../CensusData/acs2017_census_tract_data.csv"
data = pd.read_csv(raw_census_csv)
data.head()

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.2,0.0,...,0.5,0.0,2.1,24.5,881,74.2,21.2,4.5,0.0,4.6
1,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,...,0.0,0.5,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4
2,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,...,1.0,0.8,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7
3,1001020400,Alabama,Autauga County,4267,2001,2266,9.6,80.3,7.1,0.5,...,1.5,2.9,2.1,25.9,1849,75.8,19.7,4.5,0.0,6.1
4,1001020500,Alabama,Autauga County,9965,5054,4911,0.9,77.5,16.4,0.0,...,0.8,0.3,0.7,21.0,4787,71.4,24.1,4.5,0.0,2.3


In [4]:
# Check for null values: info() lists each column's non-null count and dtype,
# so any column whose count is below the total number of entries has nulls
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74001 entries, 0 to 74000
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TractId           74001 non-null  int64  
 1   State             74001 non-null  object 
 2   County            74001 non-null  object 
 3   TotalPop          74001 non-null  int64  
 4   Men               74001 non-null  int64  
 5   Women             74001 non-null  int64  
 6   Hispanic          73305 non-null  float64
 7   White             73305 non-null  float64
 8   Black             73305 non-null  float64
 9   Native            73305 non-null  float64
 10  Asian             73305 non-null  float64
 11  Pacific           73305 non-null  float64
 12  VotingAgeCitizen  74001 non-null  int64  
 13  Income            72885 non-null  float64
 14  IncomeErr         72885 non-null  float64
 15  IncomePerCap      73256 non-null  float64
 16  IncomePerCapErr   73256 non-null  float6

There are quite a few null values, but the dataset has so many rows (74,001) that dropping every row containing a null costs very little data.  



In [5]:
# Discard every row (axis=0) that has a null in any column, then re-check the counts
data = data.dropna(axis=0, how="any")
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72718 entries, 0 to 74000
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TractId           72718 non-null  int64  
 1   State             72718 non-null  object 
 2   County            72718 non-null  object 
 3   TotalPop          72718 non-null  int64  
 4   Men               72718 non-null  int64  
 5   Women             72718 non-null  int64  
 6   Hispanic          72718 non-null  float64
 7   White             72718 non-null  float64
 8   Black             72718 non-null  float64
 9   Native            72718 non-null  float64
 10  Asian             72718 non-null  float64
 11  Pacific           72718 non-null  float64
 12  VotingAgeCitizen  72718 non-null  int64  
 13  Income            72718 non-null  float64
 14  IncomeErr         72718 non-null  float64
 15  IncomePerCap      72718 non-null  float64
 16  IncomePerCapErr   72718 non-null  float6

Now, there are no more null values and we still have 72718 rows.  



Next, rename every column from PascalCase to camelCase (e.g. `TotalPop` becomes `totalPop`).  

In [9]:
def pascal_to_camel(pascal):
    """Return ``pascal`` with its first character lower-cased.

    Only the first character is changed (e.g. ``"TotalPop"`` -> ``"totalPop"``);
    the remainder of the string is returned untouched.

    Parameters
    ----------
    pascal : str
        The name to convert. May be empty.

    Returns
    -------
    str
        The converted name; an empty string is returned unchanged.
    """
    # Slice instead of indexing so an empty name yields "" rather than IndexError.
    return pascal[:1].lower() + pascal[1:]

# Map every existing column name to its camelCase spelling in one pass
name_changes = {
    column_name: pascal_to_camel(column_name) for column_name in data.columns
}

data = data.rename(columns=name_changes)
# Preview the first 18 columns (the frame is too wide to display at once)
data.iloc[:, :18].head()


Unnamed: 0,tractID,state,county,totalPop,men,women,hispanic,white,black,native,asian,pacific,votingAgeCitizen,income,incomeErr,incomePerCap,incomePerCapErr,poverty
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.2,0.0,1.2,0.0,1407,67826.0,14560.0,33018.0,6294.0,10.7
1,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,1.0,0.0,1652,41287.0,3819.0,18996.0,2453.0,22.4
2,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,0.7,0.4,2480,46806.0,9496.0,21236.0,2562.0,14.7
3,1001020400,Alabama,Autauga County,4267,2001,2266,9.6,80.3,7.1,0.5,0.2,0.0,3257,55895.0,4369.0,28068.0,3190.0,2.3
4,1001020500,Alabama,Autauga County,9965,5054,4911,0.9,77.5,16.4,0.0,3.1,0.0,7229,68143.0,14424.0,36905.0,10706.0,12.2


In [10]:
# Preview the remaining columns, from position 18 onward
data.iloc[:, 18:].head()

Unnamed: 0,childPoverty,professional,service,office,construction,production,drive,carpool,transit,walk,otherTransp,workAtHome,meanCommute,employed,privateWork,publicWork,selfEmployed,familyWork,unemployment
0,20.8,38.5,15.6,22.8,10.8,12.4,94.2,3.3,0.0,0.5,0.0,2.1,24.5,881,74.2,21.2,4.5,0.0,4.6
1,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4
2,21.1,27.9,19.4,33.3,9.9,9.6,88.3,8.4,0.0,1.0,0.8,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7
3,1.7,29.0,16.6,25.8,9.1,19.5,82.3,11.2,0.0,1.5,2.9,2.1,25.9,1849,75.8,19.7,4.5,0.0,6.1
4,17.9,48.8,13.8,20.5,3.5,13.4,86.9,11.2,0.0,0.8,0.3,0.7,21.0,4787,71.4,24.1,4.5,0.0,2.3


In [12]:
# Convert the head-count columns into a fraction (0-1) of each tract's total population.
# NOTE: this overwrites the columns, so running the cell a second time would
# divide by totalPop again.
total_population = data['totalPop']

for count_column in ('employed', 'women', 'men'):
    data[count_column] = data[count_column].div(total_population)

In [13]:
# "tractId" is the one column whose name gets a hand-picked spelling: "tractID"
special_case_names = {"tractId": "tractID"}
data = data.rename(columns=special_case_names)

In [14]:
# Reset indices so they make sense: dropping rows left gaps in the row labels
# (72718 rows labelled 0 to 74000), so renumber them consecutively.
# drop=True discards the old labels instead of keeping them as a new column.
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,tractID,state,county,totalPop,men,women,hispanic,white,black,native,...,walk,otherTransp,workAtHome,meanCommute,employed,privateWork,publicWork,selfEmployed,familyWork,unemployment
0,1001020100,Alabama,Autauga County,1845,0.487263,0.512737,2.4,86.3,5.2,0.0,...,0.5,0.0,2.1,24.5,0.477507,74.2,21.2,4.5,0.0,4.6
1,1001020200,Alabama,Autauga County,2172,0.537293,0.462707,1.1,41.6,54.5,0.0,...,0.0,0.5,0.0,22.2,0.392265,75.9,15.0,9.0,0.0,3.4
2,1001020300,Alabama,Autauga County,3385,0.45288,0.54712,8.0,61.4,26.5,0.6,...,1.0,0.8,1.5,23.1,0.437814,73.3,21.1,4.8,0.7,4.7
3,1001020400,Alabama,Autauga County,4267,0.468948,0.531052,9.6,80.3,7.1,0.5,...,1.5,2.9,2.1,25.9,0.433326,75.8,19.7,4.5,0.0,6.1
4,1001020500,Alabama,Autauga County,9965,0.507175,0.492825,0.9,77.5,16.4,0.0,...,0.8,0.3,0.7,21.0,0.480381,71.4,24.1,4.5,0.0,2.3


In [15]:
# Write the cleaned table to disk; index=False keeps the row index out of the file
clean_csv_path = "../Clean Data/socioeconomicData.csv"
data.to_csv(clean_csv_path, index=False)