In [1]:
import glob
import os
import pandas as pd
import re

In [2]:
import csv

# Survey years handled by this notebook, as strings.
years = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']

# dict_list[year] maps that year's column name -> the final (shared) column name.
dict_list = {year: {} for year in years}

# Each CSV row lists one column name per year (in the order of `years`),
# followed by the final name in the last field.
# newline='' is the csv-module convention so quoted fields that contain line
# breaks are parsed correctly.
with open('data/derived/colnames_final.csv', 'r', newline='') as csvfile:
    colname_reader = csv.reader(csvfile)
    for row in colname_reader:
        # csv.reader yields [] for blank lines; row[-1] would raise IndexError.
        if not row:
            continue
        final_name = row[-1]
        for year, colname in zip(years, row[:-1]):
            dict_list[year][colname] = final_name

In [3]:
from functools import reduce

dem_files_byYear = []

# Identifier columns: used as the join keys between a year's files and exempt
# from the "percent columns only" filter below.
merge_columns = ['MSA', 'year', 'MSA_type', 'Id', 'Id2', 'Geography']

raw_tables_path = 'data/raw/ACS_2007_2016'

# For each year folder, join that year's data-profile files side by side on
# the identifier columns and keep one resulting frame per year in
# dem_files_byYear (same order as `years`).
for year in years:
    # The trailing * also matches gzipped CSVs. sorted() makes the merge order,
    # and therefore the resulting column order, independent of the order in
    # which the filesystem happens to list the files.
    dem_files = sorted(glob.glob(os.path.join(raw_tables_path, year, 'ACS_*_1YR_*with_ann.csv*')))
    if not dem_files:
        # Without this guard reduce() below fails with an opaque TypeError.
        raise FileNotFoundError(
            'No ACS files matching ACS_*_1YR_*with_ann.csv* found in {}'.format(
                os.path.join(raw_tables_path, year)
            )
        )

    dem_df_list = []
    for fname in dem_files:
        # skiprows=1 discards the first line of the file; the next line is the header.
        dem_yr_df = pd.read_csv(fname, skiprows=1, header=0)
        dem_yr_df['year'] = int(year)
        # Split e.g. "Akron, OH Metro Area" into MSA ("Akron, OH") and
        # MSA_type ("Metro Area").
        dem_yr_df[['MSA', 'MSA_type']] = dem_yr_df['Geography'].str.extract(
            r'(^.*)\ (M.*)$', expand=True
        )
        # Keep only the identifier columns and the percent columns, and drop
        # any margin-of-error columns.
        keep_cols = [
            col for col in dem_yr_df.columns
            if (col in merge_columns or 'Percent;' in col) and 'Margin of Error' not in col
        ]
        dem_yr_df = dem_yr_df[keep_cols]
        # Rename all columns to the standard naming format. rename(columns=...)
        # relabels columns from a mapping; rename_axis only sets the axis *name*
        # and must not be given a mapping.
        dem_yr_df = dem_yr_df.rename(columns=dict_list[year])

        dem_df_list.append(dem_yr_df)

    # Fold the year's frames together pairwise, joining on the identifier columns.
    dem_year = reduce(lambda x, y: pd.merge(x, y, on=merge_columns), dem_df_list)
    dem_files_byYear.append(dem_year)

# Shape of the last year processed.
dem_year.shape



(511, 522)

In [4]:
# Stack the per-year frames into one long table.
# pd.concat replaces the chain of DataFrame.append calls (append was removed in
# pandas 2.0) and builds the result in one pass instead of re-copying the
# accumulated frame on every step. The original row index of each year's frame
# is kept, as append did by default.
# sort=True orders the columns alphabetically when the years' columns differ,
# which is the column order shown in the recorded output of the next cell.
# NOTE(review): only the first seven entries of dem_files_byYear are combined,
# exactly as the original append chain did; the remaining years are left out.
# Confirm whether that is intentional.
will = pd.concat(dem_files_byYear[:7], sort=True)
will.shape

(3579, 524)

In [5]:
# The per-year frames have already been stacked into `will`; from here on the
# combined ACS frame is referred to as dem_df (same object, not a copy).
dem_df = will

# Print the (rows, columns) shape, then preview the first 10 rows.
print(dem_df.shape)
dem_df.head(10)

(3579, 524)


Unnamed: 0,Geography,Id,Id2,MSA,MSA_type,Percent; ANCESTRY - Total population,Percent; ANCESTRY - Total population - American,Percent; ANCESTRY - Total population - Arab,Percent; ANCESTRY - Total population - Czech,Percent; ANCESTRY - Total population - Danish,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later,year
0,"Aberdeen, WA Micro Area",3100000US10140,10140,"Aberdeen, WA",Micro Area,100,3.6,0.0,0.1,1.0,...,27.4,4.8,6.7,13.3,20.1,12.7,9.6,3.6,1.9,2007
1,"Abilene, TX Metro Area",3100000US10180,10180,"Abilene, TX",Metro Area,100,6.9,0.0,0.1,0.2,...,9.5,7.0,19.8,11.8,15.5,16.8,9.6,6.5,3.4,2007
2,"Adrian, MI Micro Area",3100000US10300,10300,"Adrian, MI",Micro Area,100,7.0,0.1,0.3,0.4,...,25.6,5.5,12.7,8.0,14.9,9.1,13.8,8.2,2.2,2007
3,"Akron, OH Metro Area",3100000US10420,10420,"Akron, OH",Metro Area,100,5.4,0.7,0.9,0.2,...,19.4,8.0,16.5,12.4,14.4,8.6,11.7,6.7,2.2,2007
4,"Albany, GA Metro Area",3100000US10500,10500,"Albany, GA",Metro Area,100,8.3,0.0,0.0,0.2,...,4.8,4.9,14.1,11.4,18.7,17.0,17.0,10.0,2.2,2007
5,"Albany-Lebanon, OR Micro Area",3100000US10540,10540,"Albany-Lebanon, OR",Micro Area,100,5.9,0.0,0.5,1.1,...,12.1,5.8,10.9,9.9,26.0,9.2,12.6,7.9,5.7,2007
6,"Albany-Schenectady-Troy, NY Metro Area",3100000US10580,10580,"Albany-Schenectady-Troy, NY",Metro Area,100,6.9,0.4,0.6,0.5,...,32.3,6.0,12.4,10.0,12.9,11.0,9.1,4.5,1.8,2007
7,"Albertville, AL Micro Area",3100000US10700,10700,"Albertville, AL",Micro Area,N,N,N,N,N,...,3.7,6.6,9.8,14.3,19.1,15.0,18.2,8.6,4.8,2007
8,"Albuquerque, NM Metro Area",3100000US10740,10740,"Albuquerque, NM",Metro Area,100,3.1,0.3,0.3,0.4,...,3.0,4.0,10.7,10.2,20.7,17.2,19.0,10.3,5.0,2007
9,"Alexandria, LA Metro Area",3100000US10780,10780,"Alexandria, LA",Metro Area,100,24.4,1.0,0.3,0.1,...,6.4,7.7,11.8,14.6,20.7,16.4,12.3,7.0,3.2,2007


In [6]:
# Print the (rows, columns) shape of the combined ACS frame and preview its
# first 10 rows.
print(dem_df.shape)
dem_df.head(10)

(3579, 524)


Unnamed: 0,Geography,Id,Id2,MSA,MSA_type,Percent; ANCESTRY - Total population,Percent; ANCESTRY - Total population - American,Percent; ANCESTRY - Total population - Arab,Percent; ANCESTRY - Total population - Czech,Percent; ANCESTRY - Total population - Danish,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later,year
0,"Aberdeen, WA Micro Area",3100000US10140,10140,"Aberdeen, WA",Micro Area,100,3.6,0.0,0.1,1.0,...,27.4,4.8,6.7,13.3,20.1,12.7,9.6,3.6,1.9,2007
1,"Abilene, TX Metro Area",3100000US10180,10180,"Abilene, TX",Metro Area,100,6.9,0.0,0.1,0.2,...,9.5,7.0,19.8,11.8,15.5,16.8,9.6,6.5,3.4,2007
2,"Adrian, MI Micro Area",3100000US10300,10300,"Adrian, MI",Micro Area,100,7.0,0.1,0.3,0.4,...,25.6,5.5,12.7,8.0,14.9,9.1,13.8,8.2,2.2,2007
3,"Akron, OH Metro Area",3100000US10420,10420,"Akron, OH",Metro Area,100,5.4,0.7,0.9,0.2,...,19.4,8.0,16.5,12.4,14.4,8.6,11.7,6.7,2.2,2007
4,"Albany, GA Metro Area",3100000US10500,10500,"Albany, GA",Metro Area,100,8.3,0.0,0.0,0.2,...,4.8,4.9,14.1,11.4,18.7,17.0,17.0,10.0,2.2,2007
5,"Albany-Lebanon, OR Micro Area",3100000US10540,10540,"Albany-Lebanon, OR",Micro Area,100,5.9,0.0,0.5,1.1,...,12.1,5.8,10.9,9.9,26.0,9.2,12.6,7.9,5.7,2007
6,"Albany-Schenectady-Troy, NY Metro Area",3100000US10580,10580,"Albany-Schenectady-Troy, NY",Metro Area,100,6.9,0.4,0.6,0.5,...,32.3,6.0,12.4,10.0,12.9,11.0,9.1,4.5,1.8,2007
7,"Albertville, AL Micro Area",3100000US10700,10700,"Albertville, AL",Micro Area,N,N,N,N,N,...,3.7,6.6,9.8,14.3,19.1,15.0,18.2,8.6,4.8,2007
8,"Albuquerque, NM Metro Area",3100000US10740,10740,"Albuquerque, NM",Metro Area,100,3.1,0.3,0.3,0.4,...,3.0,4.0,10.7,10.2,20.7,17.2,19.0,10.3,5.0,2007
9,"Alexandria, LA Metro Area",3100000US10780,10780,"Alexandria, LA",Metro Area,100,24.4,1.0,0.3,0.1,...,6.4,7.7,11.8,14.6,20.7,16.4,12.3,7.0,3.2,2007


In [7]:
# Save the combined ACS frame to CSV; index=False leaves the row index out of
# the file.
dem_df.to_csv('data/derived/allDem_df.csv', index=False)

# Load and merge with crime data

In [None]:
# Load the crime data from its gzipped CSV dump.
all_year_dfs = pd.read_csv('data/derived/AC209_RawDump-2.csv.gz')
# Print the (rows, columns) shape, then preview the first rows.
print(all_year_dfs.shape)
all_year_dfs.head()

In [None]:
# Filter the crime data: keep only the rows whose 'Counties/principal cities'
# value is 'Rate per 100,000 inhabitants'.
norm_rows = all_year_dfs['Counties/principal cities'] == 'Rate per 100,000 inhabitants'
# .copy() makes all_year_msa_norm an independent frame, so the columns added to
# it in later cells are not written into a slice of all_year_dfs (which is what
# triggers pandas' SettingWithCopyWarning).
all_year_msa_norm = all_year_dfs[norm_rows].copy()
all_year_msa_norm.head()

In [None]:
# merge crime and dem
#all_year_msa_norm.merge(dem_df, right_on=['MSA', 'year'], left_on=['msa_label', 'year'])

# The merge above won't work: every msa_label in the crime df carries an 'M.S.A.' suffix,
# so the labels don't match the MSA names in the demographic df.

In [None]:
# Build msa_label_strip: msa_label without its ' M.S.A.' suffix and without any
# digits, commas or spaces that follow it, so crime labels can be matched
# against the demographic MSA names.
#  - raw string: \d is a regex escape, not a (deprecated) Python string escape.
#  - escaped dots: 'M.S.A.' must match literally rather than "any character".
#  - `*` instead of `+`: a label ending in exactly ' M.S.A.' is stripped too.
#  - regex=True: pandas >= 2.0 treats the pattern as a literal string by default,
#    in which case nothing would be replaced.
# assign() returns a new frame instead of writing into the filtered frame in place.
all_year_msa_norm = all_year_msa_norm.assign(
    msa_label_strip=all_year_msa_norm['msa_label'].str.replace(
        r' M\.S\.A\.[\d, ]*$', '', regex=True
    )
)
# all_year_msa_norm[['msa_label_strip', 'year']].head()

In [None]:
# Preview the two demographic-side columns used as merge keys.
dem_df[['MSA', 'year']].head()

Some of the MSA labels contain a number after the state abbreviation, e.g. 'Akron, OH2'. This is fixed with a regex replacement, like the one used for the 'M.S.A.' suffix above.


In [None]:
# Some of the MSA labels carry a number after the state abbreviation
# (e.g. 'Akron, OH2'), which would break the merge. Remove any trailing run of
# digits, commas and spaces, then trim surrounding whitespace.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, in which case nothing would be replaced.
# assign() returns a new frame instead of writing into the filtered frame in place.
all_year_msa_norm = all_year_msa_norm.assign(
    msa_label_strip=all_year_msa_norm['msa_label_strip']
    .str.replace(r'[\d, ]+$', '', regex=True)
    .str.strip()
)
all_year_msa_norm['msa_label_strip'].head(15)

In [None]:
# Preview the two crime-side columns used as merge keys.
all_year_msa_norm[['msa_label_strip', 'year']].head()

In [None]:
# NOTE: only a cell's last expression is displayed, so the result of this
# comparison is computed and then discarded.
all_year_msa_norm['msa_label_strip'] == 'Abilene, TX'
# Column dtypes of the crime-side merge keys.
all_year_msa_norm[['msa_label_strip', 'year']].dtypes

In [None]:
# Trim leading/trailing whitespace from the demographic MSA names, writing the
# result back into dem_df.
# NOTE(review): when cells run top to bottom this happens after dem_df was
# written to allDem_df.csv, so that file holds the untrimmed names.
dem_df.loc[:, 'MSA'] = dem_df['MSA'].str.strip()

In [None]:
# Column dtypes of the demographic-side merge keys.
dem_df[['MSA', 'year']].dtypes

In [None]:
# Inner join of the crime frame (left) with the demographic frame (right):
# only rows whose (MSA name, year) pair appears on both sides are kept.
dem_df_merged = all_year_msa_norm.merge(
    dem_df,
    how='inner',
    left_on=['msa_label_strip', 'year'],
    right_on=['MSA', 'year'],
)
dem_df_merged.head(n=10)

In [None]:
# (rows, columns) of the merged frame.
dem_df_merged.shape

In [None]:
# Preview the first 10 rows of the merged frame.
dem_df_merged.head(10)

In [None]:
# Save the merged crime + ACS frame to CSV; index=False leaves the row index
# out of the file.
dem_df_merged.to_csv('data/derived/crime_ACS_merged.csv', index=False)