# Census Data

This notebook imports four CSV files containing census data for the 207 tracts in Suffolk County, MA. The data has been normalized (credit to Brown University) to reflect the tract boundaries of the 2010 census. After a number of pre-processing steps, I calculate the percentage change between the 2000 and 2010 censuses, with the end goal of running k-means clustering on the tracts.

In [1]:
import pandas as pd

In [2]:
# Lift pandas' display truncation so every row and column of a frame is shown.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

## Data pre-processing (2000 census, full count)

In [153]:
# Read the 2000 full-count file; it is narrowed to Suffolk County, MA in a later cell.
df_2000 = pd.read_csv(
    'LTDB_Std_2000_fullcount.csv',
    engine='python',
    sep=',',
)

In [155]:
# Keep only the Suffolk County, MA rows. The explicit .copy() makes this an
# independent frame, so the in-place drop/rename cells that follow do not
# operate on a slice of df_2000 (which triggers SettingWithCopyWarning).
df_MA_full_2000 = df_2000[(df_2000['state']=='MA') & (df_2000['county']=='Suffolk County')].copy()

In [156]:
# Drop four columns that are not carried into the 2000/2010 comparison.
# The result is rebound instead of using inplace=True, so a filtered slice of
# df_2000 is never mutated; the redundant axis=1 is removed because columns=
# already names the axis.
df_MA_full_2000 = df_MA_full_2000.drop(columns=['placefp10','cbsa10','metdiv10','ccflag10'])

In [158]:
# Renumber the rows from 0 after filtering; the previous index is discarded.
df_MA_full_2000 = df_MA_full_2000.reset_index(drop=True)

In [159]:
# Rename the tract id column to match the 2010 frame, which the later merge
# joins on as 'tractid'. The result is rebound rather than renamed in place so
# a filtered slice is never mutated.
df_MA_full_2000 = df_MA_full_2000.rename(columns={'TRTID10':'tractid'})

In [160]:
# Column order for the 2000 frame, arranged to mirror the 2010 file so that
# matching 2000/2010 columns can be paired by position later on.
ordered_2000_columns = [
    'tractid', 'state', 'county', 'tract',
    'POP00', 'NHWHT00', 'NHBLK00', 'NTV00', 'ASIAN00', 'HISP00',
    'HAW00', 'INDIA00', 'CHINA00', 'FILIP00', 'JAPAN00', 'KOREA00', 'VIET00',
    'MEX00', 'PR00', 'CUBAN00',
    'FAMILY00', 'FHH00',
    'HU00', 'VAC00', 'OHU00', 'OWN00', 'RENT00',
    'A18UND00', 'A60UP00', 'A75UP00',
    'AGEWHT00', 'A15WHT00', 'A60WHT00',
    'AGEBLK00', 'A15BLK00', 'A60BLK00',
    'AGEHSP00', 'A15HSP00', 'A60HSP00',
    'AGEASN00', 'A15ASN00', 'A60ASN00',
    'AGENTV00', 'A15NTV00', 'A60NTV00',
    'GlobD00', 'GlobG00',
]
df_MA_full_2000 = df_MA_full_2000[ordered_2000_columns]

## Data pre-processing (2010 census, full count)

In [161]:
# Read the 2010 full-count file; it is narrowed to Suffolk County, MA in a later cell.
df_2010 = pd.read_csv(
    'LTDB_Std_2010_fullcount.csv',
    engine='python',
    sep=',',
)

In [163]:
# Keep only the rows whose state is 'MA' and whose county is 'Suffolk County'.
df_MA_full_2010 = df_2010[
    df_2010['state'].eq('MA') & df_2010['county'].eq('Suffolk County')
]

In [165]:
# Renumber the rows from 0 after filtering; the previous index is discarded.
df_MA_full_2010 = df_MA_full_2010.reset_index(drop=True)

## Calculate change b/w censuses (full count)

In [166]:
## New df for change in values between 2000 and 2010

In [167]:
# Inner-join the two censuses on tract id: one row per tract id found in both
# frames, with clashing column names suffixed _x (2000) and _y (2010).
df_joined = pd.merge(df_MA_full_2000, df_MA_full_2010, how='inner', on='tractid')

In [168]:
# Remove the 2010-side (_y) state/county/tract labels; the 2000-side (_x) ones are kept.
labels_from_2010 = ['state_y', 'county_y', 'tract_y']
df_joined = df_joined.drop(columns=labels_from_2010)

In [170]:
# Remove the four Glob* columns (both census years) before computing changes.
df_joined = df_joined.drop(columns=['GlobD00', 'GlobG00', 'GlobD10', 'GlobG10'])

In [171]:
# Preview the first five rows of the joined 2000/2010 frame.
df_joined.head(n=5)

Unnamed: 0,tractid,state_x,county_x,tract_x,POP00,NHWHT00,NHBLK00,NTV00,ASIAN00,HISP00,HAW00,INDIA00,CHINA00,FILIP00,JAPAN00,KOREA00,VIET00,MEX00,PR00,CUBAN00,FAMILY00,FHH00,HU00,VAC00,OHU00,OWN00,RENT00,A18UND00,A60UP00,A75UP00,AGEWHT00,A15WHT00,A60WHT00,AGEBLK00,A15BLK00,A60BLK00,AGEHSP00,A15HSP00,A60HSP00,AGEASN00,A15ASN00,A60ASN00,AGENTV00,A15NTV00,A60NTV00,pop10,nhwht10,nhblk10,ntv10,asian10,hisp10,haw10,india10,china10,filip10,japan10,korea10,viet10,mex10,pr10,cuban10,family10,fhh10,hu10,vac10,ohu10,own10,rent10,a18und10,a60up10,a75up10,agewht10,a15wht10,a60wht10,ageblk10,a15blk10,a60blk10,agehsp10,a15hsp10,a60hsp10,ageasn10,a15asn10,a60asn10,agentv10,a15ntv10,a60ntv10
0,25025000100,MA,Suffolk County,Census Tract 1,3968.0,2810.0,198.0,22.0,434.0,349.0,0.0,40.0,245.0,12.0,12.0,15.0,98.0,23.0,39.0,6.0,697.0,62.0,1614.0,51.0,1563.0,441.0,1122.0,545.0,547.0,211.0,2810.0,231.0,467.0,145.0,30.0,5.0,349.0,84.0,18.0,403.0,92.0,40.0,8.0,2.0,0.0,4254,2508,288,16,524,621,0,38,284,16,18,38,49,44,91,11,698,100,1707,91,1616,408,1208,579,481,182,2508,179,369,249,57,16,621,144,24,468,55,60,3,0,0
1,25025000201,MA,Suffolk County,Census Tract 2.01,3887.0,3017.0,137.0,11.0,349.0,289.0,0.0,33.0,223.0,26.0,11.0,14.0,18.0,25.0,34.0,25.0,655.0,45.0,1574.0,31.0,1543.0,463.0,1080.0,450.0,426.0,167.0,3017.0,222.0,369.0,109.0,28.0,7.0,289.0,70.0,19.0,328.0,52.0,26.0,6.0,0.0,1.0,3854,2728,275,13,352,407,0,42,198,22,15,22,14,18,52,23,571,71,1618,77,1541,447,1094,439,383,137,2728,151,293,244,64,13,407,121,32,324,44,41,5,1,1
2,25025000202,MA,Suffolk County,Census Tract 2.02,3925.0,2523.0,299.0,10.0,606.0,442.0,0.0,45.0,377.0,4.0,7.0,22.0,107.0,19.0,130.0,15.0,766.0,148.0,1516.0,22.0,1494.0,511.0,983.0,747.0,546.0,185.0,2523.0,248.0,444.0,251.0,95.0,22.0,442.0,165.0,19.0,593.0,136.0,56.0,8.0,5.0,0.0,3885,2370,289,16,549,596,0,42,346,13,4,18,69,23,122,19,711,149,1522,42,1480,474,1006,656,478,179,2370,200,339,273,74,25,596,197,28,525,82,79,12,4,1
3,25025000301,MA,Suffolk County,Census Tract 3.01,2923.0,2363.0,99.0,2.0,235.0,162.0,0.0,19.0,135.0,6.0,3.0,25.0,8.0,18.0,17.0,6.0,529.0,48.0,1194.0,35.0,1159.0,445.0,714.0,353.0,534.0,242.0,2363.0,197.0,484.0,86.0,20.0,5.0,162.0,44.0,11.0,209.0,32.0,23.0,2.0,0.0,1.0,2740,2050,161,4,307,190,0,37,212,9,6,24,7,13,26,7,463,37,1219,60,1159,476,683,273,418,169,2050,134,335,124,15,24,190,31,23,295,41,30,0,0,0
4,25025000302,MA,Suffolk County,Census Tract 3.02,2909.483505,2356.817139,66.784554,8.711028,375.542114,92.917633,0.0,30.004654,259.395081,11.614704,21.293625,32.908329,10.646812,3.871568,13.550488,1.935784,587.510498,30.004654,1305.686401,20.325733,1285.360596,634.937195,650.423462,268.10611,449.101898,179.060028,2356.817139,165.509537,404.578888,55.169846,5.807352,1.935784,92.917633,13.550488,4.83946,359.087952,35.812004,37.747791,3.871568,0.967892,0.0,3142,2295,103,4,480,229,0,51,322,14,20,38,9,30,19,7,545,29,1346,70,1276,631,645,310,490,164,2295,142,410,89,14,6,229,25,8,449,47,63,2,1,0


In [None]:
# Percent change from 2000 to 2010 for every measure, stored as '<2000 column>_POC'.
# The first four columns are identifiers (tractid, state_x, county_x, tract_x);
# after them come the 2000 measures followed by the 2010 measures in the same order.
N_ID_COLS = 4

# Snapshot the measure columns BEFORE adding anything. The previous version
# tested `j < len(df_joined.columns)` while appending one column per iteration,
# so the bound grew as fast as j and the loop never terminated (and, past the
# real pairs, started comparing 2010 columns against generated _POC columns).
# Skipping existing '_POC' columns also makes this cell safe to re-run.
measure_cols = [
    col for col in df_joined.columns[N_ID_COLS:] if not str(col).endswith('_POC')
]
n_measures, leftover = divmod(len(measure_cols), 2)
if leftover:
    raise ValueError(
        f'Expected matching 2000/2010 column pairs, got {len(measure_cols)} measure columns'
    )
cols_2000 = measure_cols[:n_measures]
cols_2010 = measure_cols[n_measures:]

for col_2000, col_2010 in zip(cols_2000, cols_2010):
    # Column-wise arithmetic replaces the row-wise apply. A 2000 value of 0
    # yields inf (or NaN for 0/0), as it did in the original formula.
    df_joined[f'{col_2000}_POC'] = 100 * (
        (df_joined[col_2010] - df_joined[col_2000]) / df_joined[col_2000]
    )


  import sys
  import sys
  


In [None]:
# Preview the first five rows, now including the percent-change columns.
df_joined.head(n=5)

## Data pre-processing (2000 Census, Sample count)

In [154]:
# Read the 2000 sample-count file; it is narrowed to Suffolk County, MA in a later cell.
df_2000_2 = pd.read_csv(
    'LTDB_Std_2000_Sample.csv',
    engine='python',
    sep=',',
)

In [157]:
# Keep only the rows whose state is 'MA' and whose county is 'Suffolk County'.
df_MA_sample_2000 = df_2000_2[
    df_2000_2['state'].eq('MA') & df_2000_2['county'].eq('Suffolk County')
]

## Data pre-processing (2010 Census, Sample count)

In [162]:
# Read the 2010 sample-count file; it is filtered by state/county code in a later cell.
df_2010_2 = pd.read_csv(
    'LTDB_Std_2010_Sample.csv',
    engine='python',
    sep=',',
)

In [164]:
# This file is filtered on numeric codes rather than names: statea == 25 and
# countya == 25. NOTE(review): presumably these are the codes for Suffolk
# County, MA (the tract ids shown earlier start with 25025) — confirm.
df_MA_sample_2010 = df_2010_2[
    df_2010_2['statea'].eq(25) & df_2010_2['countya'].eq(25)
]

## Calculate change b/w censuses (sample count)