# Census Data

This notebook imports 4 csv files from the Longitudinal Tract Data Base (LTDB) which contain census data for 262 census tracts in the Boston area. After a number of pre-processing steps, I calculate the percentage change between the two censuses with an end goal of using k-means clustering on the tracts.

In [1]:
import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

## File 1: 2000 Census, Long-form questionnaire

### Pre-processing

In [952]:
# import file
df_2000 = pd.read_csv('CSV_files/LTDB_Std_2000_fullcount.csv',sep=',', engine='python')

# subset the file to Massachusetts
df_2000_MA = df_2000[(df_2000['state']=='MA')]

In [953]:
# collect tracts for Cambridge and Brookline
Cambridge_tracts = list((range(3522,3531)))+list(((range(3532,3551))))
Cambridge_tracts = Cambridge_tracts + [3521.01,3521.02,3531.02,3531.01]
Cambridge_tracts = [f'Census Tract {str(x)}' for x in Cambridge_tracts]

Brookline_tracts = list(range(4001,4013))
Brookline_tracts = [f'Census Tract {str(x)}' for x in Brookline_tracts]

Somerville_tracts = [3507,3508,3505,3504,3506,3503,3502,3501.04,3501.03,
                     3514.03,3512.03,3512.04,3513,3511,3510,3509,3515]
Somerville_tracts = [f'Census Tract {str(x)}' for x in Somerville_tracts]

In [954]:
# get Suffolk County
df_2000_MA_1 = df_2000_MA[df_2000_MA['county']=='Suffolk County']

# get Cambridge
df_2000_MA_2 =df_2000_MA[df_2000_MA['tract'].apply(lambda x: any(tract for tract in Cambridge_tracts if tract in x))]

# get Brookline
df_2000_MA_3=df_2000_MA[df_2000_MA['tract'].apply(lambda x: any(tract for tract in Brookline_tracts if tract in x))]

# get Somerville
df_2000_MA_4=df_2000_MA[df_2000_MA['tract'].apply(lambda x: any(tract for tract in Somerville_tracts if tract in x))]

In [955]:
print(df_2000_MA_1.shape)
print(df_2000_MA_2.shape)
print(df_2000_MA_3.shape)
print(df_2000_MA_4.shape)

(203, 51)
(32, 51)
(12, 51)
(17, 51)


In [956]:
# merge the 3
df_MA_full_2000= pd.concat([df_2000_MA_1,df_2000_MA_2,df_2000_MA_3,df_2000_MA_4],ignore_index=True)

# drop irrelevant columns
df_MA_full_2000.drop(columns=['placefp10','cbsa10','metdiv10','ccflag10'],axis=1,inplace=True)

# reset the index
df_MA_full_2000.reset_index(inplace=True, drop=True)

In [957]:
# rename tract id column to match 2010
df_MA_full_2000.rename(columns={'TRTID10':'tractid'},inplace=True)

### Feature engineering

In [958]:
# calculate percentages of population for each category 
df_MA_full_2000['percent_white00']=100*df_MA_full_2000['NHWHT00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_black00']=100*df_MA_full_2000['NHBLK00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_asian00']=100*df_MA_full_2000['ASIAN00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_hispanic00']=100*df_MA_full_2000['HISP00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_indian00']=100*df_MA_full_2000['INDIA00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_chinese00']=100*df_MA_full_2000['CHINA00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_filip00']=100*df_MA_full_2000['FILIP00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_japan00']=100*df_MA_full_2000['JAPAN00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_korean00']=100*df_MA_full_2000['KOREA00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_viet00']=100*df_MA_full_2000['VIET00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_mex00']=100*df_MA_full_2000['MEX00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_pr00']=100*df_MA_full_2000['PR00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_cuban00']=100*df_MA_full_2000['CUBAN00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_vacant_housing00']=100*df_MA_full_2000['VAC00']/df_MA_full_2000['HU00']
df_MA_full_2000['percent_occupied_housing00']=100*df_MA_full_2000['OHU00']/df_MA_full_2000['HU00']
df_MA_full_2000['percent_under18_00']=100*df_MA_full_2000['A18UND00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_60andup_00']=100*df_MA_full_2000['A60UP00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_75andup_00']=100*df_MA_full_2000['A75UP00']/df_MA_full_2000['POP00']
df_MA_full_2000['percent_owneroccupied_00']=100*df_MA_full_2000['OWN00']/df_MA_full_2000['HU00']
df_MA_full_2000['percent_renteroccupied_00']=100*df_MA_full_2000['RENT00']/df_MA_full_2000['HU00']

In [959]:
# let's recreate the race buckets into "percent white" and "percent non-white"
df_MA_full_2000['percent_non-white00'] = 100*(df_MA_full_2000.iloc[:,6:20].sum(1)/df_MA_full_2000['POP00'])

In [960]:
# now lets drop all other columns
df_MA_full_2000.drop(df_MA_full_2000.columns[5:47],axis=1,inplace=True)

In [961]:
## save to csv for classification
df_MA_full_2000.to_csv('Final_2000_data.csv')

## File 1: 2010 Census, Long-form questionnaire

### Pre-processing

In [962]:
df_2010 = pd.read_csv('CSV_files/LTDB_Std_2010_fullcount.csv',sep=',', engine='python')

In [963]:
df_2010_MA = df_2010[(df_2010['state']=='MA')]

In [964]:
# same steps for 2010 

# get Suffolk County
df_2010_MA_1 = df_2010_MA[df_2010_MA['county']=='Suffolk County']

# get Cambridge
df_2010_MA_2 =df_2010_MA[df_2010_MA['tract'].apply(lambda x: any(tract for tract in Cambridge_tracts if tract in x))]

# get Brookline
df_2010_MA_3=df_2010_MA[df_2010_MA['tract'].apply(lambda x: any(tract for tract in Brookline_tracts if tract in x))]

# get Somerville
df_2010_MA_4=df_2010_MA[df_2010_MA['tract'].apply(lambda x: any(tract for tract in Somerville_tracts if tract in x))]

In [965]:
print(df_2010_MA_1.shape)
print(df_2010_MA_2.shape)
print(df_2010_MA_3.shape)
print(df_2010_MA_4.shape)

(204, 47)
(32, 47)
(12, 47)
(17, 47)


In [966]:
# merge the 3
df_MA_full_2010= pd.concat([df_2010_MA_1,df_2010_MA_2,df_2010_MA_3,df_2010_MA_4],ignore_index=True)

# reset the index
df_MA_full_2010.reset_index(inplace=True, drop=True)

### Feature engineering

In [967]:
# calculate percentages of population for each category 
df_MA_full_2010['percent_white10']=100*df_MA_full_2010['nhwht10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_black10']=100*df_MA_full_2010['nhblk10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_asian10']=100*df_MA_full_2010['asian10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_hispanic10']=100*df_MA_full_2010['hisp10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_indian10']=100*df_MA_full_2010['india10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_chinese10']=100*df_MA_full_2010['china10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_filip10']=100*df_MA_full_2010['filip10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_japan10']=100*df_MA_full_2010['japan10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_korean10']=100*df_MA_full_2010['korea10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_viet10']=100*df_MA_full_2010['viet10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_mex10']=100*df_MA_full_2010['mex10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_pr10']=100*df_MA_full_2010['pr10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_cuban10']=100*df_MA_full_2010['cuban10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_vacant_housing10']=100*df_MA_full_2010['vac10']/df_MA_full_2010['hu10']
df_MA_full_2010['percent_occupied_housing10']=100*df_MA_full_2010['ohu10']/df_MA_full_2010['hu10']
df_MA_full_2010['percent_under18_10']=100*df_MA_full_2010['a18und10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_60andup_10']=100*df_MA_full_2010['a60up10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_75andup_10']=100*df_MA_full_2010['a75up10']/df_MA_full_2010['pop10']
df_MA_full_2010['percent_owneroccupied_10']=100*df_MA_full_2010['own10']/df_MA_full_2010['hu10']
df_MA_full_2010['percent_renteroccupied_10']=100*df_MA_full_2010['rent10']/df_MA_full_2010['hu10']

In [968]:
# let's recreate the race buckets into "white" and "non-white"
df_MA_full_2010['non-white10'] = 100*(df_MA_full_2010.iloc[:,6:20].sum(1)/df_MA_full_2010['pop10'])

In [969]:
# now lets drop all other columns
df_MA_full_2010.drop(df_MA_full_2010.columns[5:47],axis=1,inplace=True)

In [970]:
# save to csv to predict
df_MA_full_2010.to_csv('Final_2010_data.csv')

## Percentage change 2000-2000, Long-form questionnaire

In [941]:
## New df for change in values between 2000 and 2010
df_joined = df_MA_full_2000.merge(df_MA_full_2010, on='tractid')

In [942]:
df_joined.drop(columns=['state_y','county_y','tract_y'],axis=1,inplace=True)

In [943]:
df_joined.columns

Index(['tractid', 'state_x', 'county_x', 'tract_x', 'POP00', 'percent_white00',
       'percent_black00', 'percent_asian00', 'percent_hispanic00',
       'percent_indian00', 'percent_chinese00', 'percent_filip00',
       'percent_japan00', 'percent_korean00', 'percent_viet00',
       'percent_mex00', 'percent_pr00', 'percent_cuban00',
       'percent_vacant_housing00', 'percent_occupied_housing00',
       'percent_under18_00', 'percent_60andup_00', 'percent_75andup_00',
       'percent_owneroccupied_00', 'percent_renteroccupied_00',
       'percent_non-white00', 'pop10', 'percent_white10', 'percent_black10',
       'percent_asian10', 'percent_hispanic10', 'percent_indian10',
       'percent_chinese10', 'percent_filip10', 'percent_japan10',
       'percent_korean10', 'percent_viet10', 'percent_mex10', 'percent_pr10',
       'percent_cuban10', 'percent_vacant_housing10',
       'percent_occupied_housing10', 'percent_under18_10',
       'percent_60andup_10', 'percent_75andup_10', 'percent

In [944]:
# 2000 starts at column 4
i = 4
# 2010 starts at column
j = 26
# stop once you get to final column
while j < 48:
    try:
        df_joined[f'{df_joined.columns[i]}_PC'] = df_joined.iloc[:,4:].apply(lambda x: 100*
                                    ((x[df_joined.columns[j]]-x[df_joined.columns[i]])
                                   /x[df_joined.columns[i]]),axis=1)
    except ZeroDivisionError:
        0
        
    i +=1
    j +=1

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


In [945]:
df_joined.to_csv('CSV_files/Full_count_census_changes.csv')

## File 3: 2000 Census, Short-form questionnaire

### Pre-processing

In [1064]:
# import the file
df_2000_2 = pd.read_csv('CSV_files/LTDB_Std_2000_Sample.csv',sep=',', engine='python')

# subset to MA
df_2000_MA_2 = df_2000_2[(df_2000_2['state']=='MA')]

In [1065]:
# get Suffolk County
df_MA_sample_2000_1 = df_2000_MA_2[df_2000_MA_2['county']=='Suffolk County']

# get Cambridge
df_MA_sample_2000_2=df_2000_MA_2[df_2000_MA_2['tract'].apply(lambda x: any(tract for tract in Cambridge_tracts if tract in x))]

# get Brookline
df_MA_sample_2000_3=df_2000_MA_2[df_2000_MA_2['tract'].apply(lambda x: any(tract for tract in Brookline_tracts if tract in x))]

# get Somerville
df_MA_sample_2000_4=df_2000_MA_2[df_2000_MA_2['tract'].apply(lambda x: any(tract for tract in Somerville_tracts if tract in x))]

In [1066]:
print(df_MA_sample_2000_1.shape)
print(df_MA_sample_2000_2.shape)
print(df_MA_sample_2000_3.shape)
print(df_MA_sample_2000_4.shape)

(203, 77)
(32, 77)
(12, 77)
(17, 77)


In [1067]:
# merge the 3
df_MA_sample_2000= pd.concat([df_MA_sample_2000_1,df_MA_sample_2000_2,df_MA_sample_2000_3,df_MA_sample_2000_4],ignore_index=True)

# reset the index
df_MA_sample_2000.reset_index(inplace=True, drop=True)

# drop irrelevant columns
df_MA_sample_2000.drop(columns=['placefp10','cbsa10','metdiv10','ccflag10'],axis=1,inplace=True)

In [1068]:
df_MA_sample_2000.shape

(264, 73)

In [1069]:
df_MA_sample_2000.drop(columns=['state','county','tract'],axis=1,inplace=True)

In [1070]:
# drop ethnic groups
df_MA_sample_2000.drop(df_MA_sample_2000.columns[2:12],axis=1,inplace=True)

In [1071]:
# drop columns that are not variables in 2010
df_MA_sample_2000.drop(columns=['DIS00','CNI16U00'],axis=1,inplace=True)

### Feature engineering

In [1072]:
# percentage
df_MA_sample_2000['percent_foreign_born']=100*df_MA_sample_2000['FB00']/df_MA_sample_2000['POP00SF3']
df_MA_sample_2000['percent_naturalized']=100*df_MA_sample_2000['NAT00']/df_MA_sample_2000['POP00SF3']
df_MA_sample_2000['percent_recent_immigrants(10)']=100*df_MA_sample_2000['N10IMM00']/df_MA_sample_2000['POP00SF3']
df_MA_sample_2000['percent_other_languages']=100*df_MA_sample_2000['OLANG00']/df_MA_sample_2000['AG5UP00']
df_MA_sample_2000['percent_hs_degree_orless']=100*df_MA_sample_2000['HS00']/df_MA_sample_2000['AG25UP00']
df_MA_sample_2000['percent_4yrcollege_degree_ormore']=100*df_MA_sample_2000['COL00']/df_MA_sample_2000['AG25UP00']
df_MA_sample_2000['percent_married']=100*df_MA_sample_2000['Mar-00']/df_MA_sample_2000['AG15UP00']
df_MA_sample_2000['percent_unemployed']=100*df_MA_sample_2000['UNEMP00']/df_MA_sample_2000['CLF00']
df_MA_sample_2000['percent_employed']=100*df_MA_sample_2000['EMPCLF00']/df_MA_sample_2000['CLF00']
df_MA_sample_2000['percent_professional']=100*df_MA_sample_2000['PROF00']/df_MA_sample_2000['EMPCLF00']
df_MA_sample_2000['percent_manufacturing']=100*df_MA_sample_2000['MANUF00']/df_MA_sample_2000['EMPCLF00']
df_MA_sample_2000['percent_self_employed']=100*df_MA_sample_2000['SEMP00']/df_MA_sample_2000['EMPCLF00']
df_MA_sample_2000['percent_poverty']=100*df_MA_sample_2000['NPOV00']/df_MA_sample_2000['DPOV00']
df_MA_sample_2000['percent_houses_30yrsago']=100*df_MA_sample_2000['H30OLD00']/df_MA_sample_2000['DMULTI00']
df_MA_sample_2000['percent_multiunit_houses']=100*df_MA_sample_2000['MULTI00']/df_MA_sample_2000['DMULTI00']

In [1073]:
# drop the remaining columns
# only keep: per capita income , median household rent, median home value, andmedian household income
df_MA_sample_2000.drop(df_MA_sample_2000.columns[1:39],axis=1,inplace=True)

In [1074]:
df_MA_sample_2000.drop(df_MA_sample_2000.columns[2:8],axis=1,inplace=True)

In [1075]:
df_MA_sample_2000.drop(df_MA_sample_2000.columns[3:7],axis=1,inplace=True)

In [1076]:
df_MA_sample_2000.drop(df_MA_sample_2000.columns[5:11],axis=1,inplace=True)

In [1077]:
### convert dtypes to floats so we can do math
df_MA_sample_2000['MHMVAL00'] = pd.to_numeric(df_MA_sample_2000['MHMVAL00'],errors='coerce')
df_MA_sample_2000['HINC00'] = pd.to_numeric(df_MA_sample_2000['MHMVAL00'],errors='coerce')
df_MA_sample_2000['MRENT00'] = pd.to_numeric(df_MA_sample_2000['MRENT00'],errors='coerce')

In [1078]:
# match id column to 2010
df_MA_sample_2000.rename(columns={'TRTID10':'tractid'},inplace=True)

## File 4: 2010 Census, Short-form questionnaire

In [1028]:
# import the file
df_2010_2 = pd.read_csv('CSV_files/LTDB_Std_2010_Sample.csv',sep=',', engine='python')

# subset to MA
df_2010_2_MA= df_2010_2[(df_2010_2['statea']==25)]

In [1029]:
# formatting is slightly different because this data is from the American Community Survey
# so we need to amend our lists
Cambridge_tracts_ACS = list((range(3522,3531)))+list(((range(3532,3551))))
Cambridge_tracts_ACS = [str(x)+'00' for x in Cambridge_tracts_ACS]
Cambridge_tracts_ACS = Cambridge_tracts_ACS + ['352101','352102','353102','353101']

Brookline_tracts_ACS = list(range(4001,4013))
Brookline_tracts_ACS = [str(x)+'00' for x in Brookline_tracts_ACS]

Somerville_tracts_ACS = [3507,3508,3505,3504,3506,3503,3502,3513,3511,3510,3509,3515,3501.04,3501.03,3514.03,3512.03,3512.04]
Somerville_tracts_ACS = [str(x).replace(".","") if str(x).find('.') != -1 else str(x)+'00' for x in Somerville_tracts_ACS]

In [1030]:
df_2010_2_MA['tracta'] = df_2010_2_MA['tracta'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [1031]:
# get Suffolk County
df_MA_sample_2010_1 = df_2010_2_MA[df_2010_2_MA['countya']==25]

# get Cambridge
df_MA_sample_2010_2=df_2010_2_MA[df_2010_2_MA['tracta'].apply(lambda x: any(tract for tract in Cambridge_tracts_ACS if tract in x))]

# get Brookline
df_MA_sample_2010_3=df_2010_2_MA[df_2010_2_MA['tracta'].apply(lambda x: any(tract for tract in Brookline_tracts_ACS if tract in x))]

# get Somervile
df_MA_sample_2010_4=df_2010_2_MA[df_2010_2_MA['tracta'].apply(lambda x: any(tract for tract in Somerville_tracts_ACS if tract in x))]

In [1032]:
print(df_MA_sample_2010_1.shape)
print(df_MA_sample_2010_2.shape)
print(df_MA_sample_2010_3.shape)
print(df_MA_sample_2010_4.shape)

(204, 176)
(32, 176)
(12, 176)
(17, 176)


In [1033]:
# merge the 3
df_MA_sample_2010= pd.concat([df_MA_sample_2010_1,df_MA_sample_2010_2,df_MA_sample_2010_3,df_MA_sample_2010_4],ignore_index=True)

# reset the index
df_MA_sample_2010.reset_index(inplace=True, drop=True)

In [1034]:
df_MA_sample_2010.shape

(265, 176)

### Feature engineering

In [1035]:
# feature engineer the one column that is missing
df_MA_sample_2010['percent_employed12']=100*(df_MA_sample_2010['empclf12']/df_MA_sample_2010['clf12'])

In [1081]:
df_MA_sample_2010.head()

Unnamed: 0,tractid,statea,countya,tracta,pnhwht12,pnhblk12,phisp12,pntv12,pasian12,phaw12,pindia12,pchina12,pfilip12,pjapan12,pkorea12,pviet12,p15wht12,p65wht12,p15blk12,p65blk12,p15hsp12,p65hsp12,p15ntv12,p65ntv12,p15asn12,p65asn12,pmex12,pcuban12,ppr12,pruanc12,pitanc12,pgeanc12,piranc12,pscanc12,pfb12,pnat12,p10imm12,prufb12,pitfb12,pgefb12,pirfb12,pscfb12,polang12,plep12,phs12,pcol12,punemp12,pflabf12,pprof12,pmanuf12,psemp12,pvet12,p65pov12,ppov12,pwpov12,pnapov12,pfmpov12,pbpov12,phpov12,papov12,pvac12,pown12,pmulti12,p30old12,p18und12,p60up12,p75up12,pmar12,pwds12,pfhh12,p10yrs12,ageblk12,agentv12,agewht12,agehsp12,india12,filip12,japan12,korea12,viet12,pop12,nhwht12,nhblk12,ntv12,hisp12,asian12,haw12,china12,a15wht12,a65wht12,a15blk12,a65blk12,a15hsp12,a65hsp12,a15ntv12,a65ntv12,ageasn12,a15asn12,a65asn12,mex12,pr12,cuban12,geanc12,iranc12,itanc12,ruanc12,fb12,nat12,itfb12,rufb12,ag5up12,irfb12,gefb12,scanc12,n10imm12,olang12,lep12,scfb12,ag25up12,dfmpov12,hh12,hinc12,hincb12,hincw12,hinch12,incpc12,ag18cv12,vet12,empclf12,dpov12,npov12,dbpov12,nbpov12,dnapov12,nnapov12,dwpov12,nwpov12,dhpov12,nhpov12,hhb12,hhw12,hhh12,hs12,col12,clf12,unemp12,dflabf12,flabf12,prof12,manuf12,semp12,hha12,hinca12,n65pov12,nfmpov12,napov12,dapov12,family12,hu12,vac12,ohu12,own12,rent12,dmulti12,mrent12,mhmval12,multi12,h30old12,h10yrs12,a18und12,a60up12,a75up12,ag15up12,12-Mar,wds12,fhh12,percent_employed12
0,25025000100,25,25,100,67.260002,3.8,17.58,0.0,9.79,0.0,1.69,6.35,0.0,0.64,0.33,0.0,5.32,16.030001,25.77,0.0,20.66,0.0,-999.0,-999.0,9.35,0.0,0.25,0.67,5.24,1.69,10.65,3.05,13.06,2.99,29.030001,15.3,9.48,0.8,0.5,0.0,0.42,0.39,38.450001,15.54,54.939999,30.780001,6.06,62.259998,22.5,7.83,7.13,5.51,1.37,14.66,18.549999,-999.0,3.41,6.38,2.07,14.73,10.17,24.57,78.639999,94.529999,10.81,16.219999,5.77,27.690001,17.1,10.24,76.889999,163,0,2426,634,61,0,23,12,0,3607,2426,137,0,634,353,0,229,129,389,42,0,131,0,0,0,353,33,0,9,189,24,110,471,384,61,1047,552,18,29,3358,15,0,108,342,1291,522,14,2563,674,1510,53571,31759,54777,46106,27564,3192,176,2133,3574,524,141,9,0,0,2415,448,629,13,38,1184,143,1408,789,2260,137,1587,988,480,167,152,136,62115.0,49,23,52,353,674,1681,171,1510,371,1139,1681,1348,348200,1322,1589,1161,390,585,208,3258,902,557,69,94.380531
1,25025000201,25,25,201,67.959999,3.85,16.879999,0.0,9.24,0.0,0.76,6.56,1.02,0.89,0.0,0.0,4.64,7.97,0.0,16.530001,13.21,14.72,-999.0,-999.0,0.0,10.69,1.02,3.6,5.99,0.32,10.41,0.64,24.65,1.11,25.129999,14.55,7.07,0.0,1.18,0.86,2.1,0.0,29.030001,6.06,32.290001,53.32,1.89,79.230003,45.75,5.98,8.02,3.54,0.64,10.13,6.42,-999.0,5.09,49.59,10.38,6.55,13.34,29.379999,83.279999,95.980003,6.37,14.78,5.35,27.059999,9.05,10.18,70.400002,121,0,2134,530,24,32,28,0,0,3140,2134,121,0,530,290,0,206,99,170,0,20,70,78,0,0,290,0,31,32,188,113,20,774,327,10,789,457,37,0,3038,66,27,35,222,882,184,0,2273,570,1358,65506,9500,76218,65000,38128,2940,104,2343,3140,318,121,60,0,0,2134,137,530,55,52,996,184,734,1212,2381,45,1671,1324,1072,140,188,73,58393.0,20,29,19,290,570,1567,209,1358,399,959,1567,1405,532000,1305,1504,956,200,464,168,2971,804,269,58,98.404032
2,25025000202,25,25,202,53.66,8.54,17.190001,0.0,17.280001,0.0,0.0,9.42,0.35,0.0,0.0,1.91,6.87,10.73,22.370001,0.0,17.139999,0.79,-999.0,-999.0,7.74,27.33,1.26,0.0,0.74,2.37,4.26,1.36,24.18,0.93,36.240002,17.66,12.23,0.49,0.66,0.52,1.42,0.0,41.509998,12.42,32.82,43.119999,9.6,55.040001,45.950001,1.62,8.85,2.96,0.35,18.200001,13.22,-999.0,9.33,28.030001,35.869999,12.32,9.55,32.919998,78.57,96.260002,13.43,16.18,7.34,33.09,13.3,17.870001,62.799999,371,0,1966,630,0,13,0,0,70,3664,1966,313,0,630,633,0,345,135,211,83,0,108,5,0,0,633,49,173,46,27,0,50,886,156,87,1328,647,24,18,3438,52,19,34,448,1427,427,0,2544,761,1355,54607,30000,65184,32891,29292,3172,94,1854,3664,667,371,104,0,0,1966,260,630,226,72,867,185,835,1097,2031,195,1628,896,852,30,164,192,24803.0,13,71,78,633,761,1498,143,1355,446,909,1498,1299,432800,1177,1442,851,492,593,269,3309,1095,440,136,91.285081
3,25025000301,25,25,301,82.220001,1.8,5.28,0.0,10.71,0.0,5.5,5.2,0.0,0.0,0.0,0.0,9.56,12.52,0.0,29.17,9.22,17.73,-999.0,-999.0,13.99,15.03,0.0,0.56,2.4,1.35,11.72,3.97,29.690001,2.4,15.2,6.1,6.1,0.3,0.82,1.42,1.24,0.0,19.83,1.66,26.52,61.369999,4.36,77.099998,63.82,5.92,7.74,5.6,0.66,8.99,8.54,-999.0,12.03,26.469999,33.330002,0.0,13.68,43.709999,76.589996,93.970001,10.48,16.77,8.42,34.970001,11.38,10.47,68.410004,48,0,2196,141,147,0,0,0,0,2671,2196,48,0,141,286,0,139,210,275,0,14,13,25,0,0,286,40,43,0,64,15,106,793,313,36,406,163,22,8,2592,33,38,64,163,514,43,0,2115,449,1073,89688,100781,84925,103015,41955,2391,134,1874,2570,231,34,9,0,0,2130,182,120,40,33,881,43,561,1298,1925,84,1288,993,1196,111,145,116,90556.0,17,54,0,286,449,1243,170,1073,469,604,1243,1469,409700,952,1168,734,280,448,225,2408,842,274,47,97.350649
4,25025000302,25,25,302,77.519997,2.31,3.28,0.0,14.72,0.0,0.33,11.44,0.0,1.17,0.5,0.0,9.24,13.72,0.0,0.0,23.469999,0.0,-999.0,-999.0,13.64,3.64,0.7,0.0,0.0,6.69,6.66,4.42,18.74,1.37,34.490002,13.92,17.9,2.81,1.94,0.23,0.0,0.0,42.18,8.73,31.459999,56.759998,3.02,68.370003,56.02,5.47,2.68,4.41,3.02,16.74,14.98,-999.0,2.6,17.389999,100.0,19.629999,6.17,53.57,75.019997,94.82,11.21,15.19,4.92,33.619999,12.96,1.8,69.400002,69,0,2317,98,10,0,35,15,0,2989,2317,69,0,98,440,0,342,214,318,0,0,23,0,0,0,440,60,16,21,0,0,132,560,199,200,1031,416,58,84,2807,0,7,41,535,1184,245,0,2114,500,1232,69438,112500,70074,-999,35219,2654,117,1903,2916,488,69,12,0,0,2303,345,46,46,68,960,13,665,1200,1954,59,1366,934,1066,104,51,180,48182.0,88,13,85,433,500,1313,81,1232,660,572,1313,1436,322200,985,1245,855,335,454,147,2692,905,349,9,97.389969


In [1082]:
# force these to be in the same order as the 2000 sample file
df_MA_sample_2010 = df_MA_sample_2010[['tractid','incpc12','hinc12','mhmval12','mrent12','pfb12','pnat12','p10imm12','polang12','phs12','pcol12','pmar12',
                 'punemp12','percent_employed12','pprof12','pmanuf12','psemp12','ppov12','p30old12','pmulti12']]

## Percentage change 2000-2000, short-form questionnaire

In [1083]:
# merge so we can calculate % change
df_joined_sample = df_MA_sample_2000.merge(df_MA_sample_2010, on='tractid')

In [1084]:
df_joined_sample.shape

(264, 38)

In [1085]:
# this is where 2000 data starts 
i = 1
# this is where 2010 data starts
j = 19
# stop once you get to final column
while j < 37:

    df_joined_sample[f'{df_joined_sample.columns[i]}_PC'] = df_joined_sample.iloc[:,1:].apply(lambda x: 100*
                                    ((x[df_joined_sample.columns[j]]-x[df_joined_sample.columns[i]])
                                     /x[df_joined_sample.columns[i]]),axis=1)
        
    i +=1
    j +=1

  # Remove the CWD from sys.path while we load stuff.


In [1086]:
df_joined_sample.head()

Unnamed: 0,tractid,INCPC00,HINC00,MHMVAL00,MRENT00,percent_naturalized,percent_recent_immigrants(10),percent_other_languages,percent_hs_degree_orless,percent_4yrcollege_degree_ormore,percent_married,percent_unemployed,percent_employed,percent_professional,percent_manufacturing,percent_self_employed,percent_poverty,percent_houses_30yrsago,percent_multiunit_houses,incpc12,hinc12,mhmval12,mrent12,pfb12,pnat12,p10imm12,polang12,phs12,pcol12,pmar12,punemp12,percent_employed12,pprof12,pmanuf12,psemp12,ppov12,p30old12,pmulti12,INCPC00_PC,HINC00_PC,MHMVAL00_PC,MRENT00_PC,percent_naturalized_PC,percent_recent_immigrants(10)_PC,percent_other_languages_PC,percent_hs_degree_orless_PC,percent_4yrcollege_degree_ormore_PC,percent_married_PC,percent_unemployed_PC,percent_employed_PC,percent_professional_PC,percent_manufacturing_PC,percent_self_employed_PC,percent_poverty_PC,percent_houses_30yrsago_PC,percent_multiunit_houses_PC
0,25025000100,21999.0,222100.0,222100.0,898.0,11.265121,10.509073,34.381663,43.822674,40.988372,32.633698,4.170074,95.829926,41.894198,5.588737,11.348123,13.463986,81.412639,85.439901,27564,53571,348200,1348,29.030001,15.3,9.48,38.450001,54.939999,30.780001,27.690001,6.06,94.380531,22.5,7.83,7.13,14.66,94.529999,78.639999,25.296604,-75.879784,56.776227,50.111359,157.698082,45.588491,-72.427164,-12.260031,34.038011,-5.680315,564.017072,-93.676297,125.28306,302.59542,-31.001805,-47.043912,-81.992968,10.639172
1,25025000201,27675.0,265600.0,265600.0,991.0,11.757139,11.474145,26.682565,29.159148,47.347528,29.92059,1.925134,98.074866,49.109415,8.106143,6.288622,8.696774,92.630241,80.813215,38128,65506,532000,1405,25.129999,14.55,7.07,29.030001,32.290001,53.32,27.059999,1.89,98.404032,45.75,5.98,8.02,10.13,95.980003,83.279999,37.770551,-75.336596,100.301205,41.775984,113.742466,26.80684,-73.503296,-0.442906,-31.80214,78.205042,1305.616639,-98.072901,100.377122,464.386771,-4.90763,-7.781894,-89.064047,18.767709
2,25025000202,23602.0,276700.0,276700.0,820.0,15.312102,14.394904,36.802373,43.3161,38.810931,39.981813,2.715725,97.284275,47.996398,7.744259,5.718145,12.193878,96.503958,79.94723,29292,54607,432800,1299,36.240002,17.66,12.23,41.509998,32.82,43.119999,33.09,9.6,91.285081,45.950001,1.62,8.85,18.200001,96.260002,78.57,24.108126,-80.264908,56.41489,58.414634,136.675552,22.6823,-66.768448,-4.169586,-15.436195,7.849034,1118.459199,-90.132012,90.191525,493.342742,-71.669134,-27.422591,-81.140669,20.404425
3,25025000301,24920.0,214400.0,214400.0,971.0,12.213202,11.43664,21.726079,36.535797,44.618938,33.253205,1.617735,98.382265,53.349574,3.958587,7.003654,8.985399,86.30742,79.063604,41955,89688,409700,1469,15.2,6.1,6.1,19.83,26.52,61.369999,34.970001,4.36,97.350649,63.82,5.92,7.74,8.99,93.970001,76.589996,68.358748,-58.16791,91.091418,51.287333,24.45549,-46.662655,-71.923144,-45.724463,-40.563353,84.553635,2061.664149,-95.568307,82.476902,1512.191377,-15.472695,-13.860253,-89.583746,18.853678
4,25025000302,35207.00128,259499.9961,259499.9961,1240.999993,16.633729,7.312253,21.2938,25.886993,57.424443,33.655568,3.50213,96.49787,59.097599,9.612556,5.738107,5.566535,79.821962,81.379821,35219,69438,322200,1436,34.490002,13.92,17.9,42.18,31.459999,56.759998,33.619999,3.02,97.389969,56.02,5.47,2.68,16.74,94.82,75.019997,0.03408,-73.241618,24.161852,15.713135,107.34979,90.365402,-15.937975,62.938975,-45.214969,68.649653,859.987306,-96.870397,64.795138,482.779466,-4.672398,-51.855148,-79.028328,16.51537


In [1087]:
df_joined_sample.to_csv('CSV_files/Sample_count_census_changes.csv')