In [1]:
import glob
import os
import pandas as pd
import re

In [2]:
from functools import reduce

years = ['2007', '2008', '2009','2010', '2011', '2012', '2013', '2014', '2015', '2016']
dem_files_byYear = []

raw_tables_path = 'data/raw/ACS_2007_2016'

# for each year folder, concatenate across the 4 data profile files, 
# save one resulting df per year in the dem_files_byYear list

for year in years:
    # * allows for gziped csv's 
    dem_files = glob.glob(os.path.join(raw_tables_path, year, 'ACS_*_1YR_*with_ann.csv*'))  
    year_pat = re.compile(r'(\d{2})')
    dem_df_list = []
    for fname in dem_files:
        dem_yr_df = pd.read_csv(fname, skiprows=1, header=0)
        dem_yr_df['year'] = int(year)
        dem_yr_df[['MSA', 'MSA_type']] = dem_yr_df['Geography'].str.extract(r'(^.*)\ (M.*)$')
        dem_df_list.append(dem_yr_df)
    dem_year = reduce(lambda x, y: pd.merge(x, y, on = ['MSA', 'year','MSA_type','Id', 'Id2','Geography']), dem_df_list)
    #dem_year.to_csv((folder + 'dataprof.csv'), index=False)
    dem_files_byYear.append(dem_year)
dem_year.shape



(511, 2070)

In [3]:
dem_files_byYear[-1].shape

(511, 2070)

In [4]:
len(dem_files_byYear)

10

In [5]:
#concatenate each resulting year file so we have one final ACS data file
dem_df = pd.concat(dem_files_byYear, ignore_index=True)

print(dem_df.shape)
dem_df.head(10)

(5107, 7718)


Unnamed: 0,Estimate Margin of Error; ANCESTRY - American,Estimate Margin of Error; ANCESTRY - Arab,Estimate Margin of Error; ANCESTRY - Czech,Estimate Margin of Error; ANCESTRY - Danish,Estimate Margin of Error; ANCESTRY - Dutch,Estimate Margin of Error; ANCESTRY - English,Estimate Margin of Error; ANCESTRY - French (except Basque),Estimate Margin of Error; ANCESTRY - French Canadian,Estimate Margin of Error; ANCESTRY - German,Estimate Margin of Error; ANCESTRY - Greek,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 or later,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later,year
0,,,,,,,,,,,...,,,,,,,,,,2007
1,,,,,,,,,,,...,,,,,,,,,,2007
2,,,,,,,,,,,...,,,,,,,,,,2007
3,,,,,,,,,,,...,,,,,,,,,,2007
4,,,,,,,,,,,...,,,,,,,,,,2007
5,,,,,,,,,,,...,,,,,,,,,,2007
6,,,,,,,,,,,...,,,,,,,,,,2007
7,,,,,,,,,,,...,,,,,,,,,,2007
8,,,,,,,,,,,...,,,,,,,,,,2007
9,,,,,,,,,,,...,,,,,,,,,,2007


In [6]:
#drop columns specifying margin of error
dem_df = dem_df[dem_df.columns.drop(list(dem_df.filter(regex='Margin of Error')))]

In [7]:
print(dem_df.shape)
dem_df.head(10)

(5107, 3630)


Unnamed: 0,Estimate; ANCESTRY - American,Estimate; ANCESTRY - Arab,Estimate; ANCESTRY - Czech,Estimate; ANCESTRY - Danish,Estimate; ANCESTRY - Dutch,Estimate; ANCESTRY - English,Estimate; ANCESTRY - French (except Basque),Estimate; ANCESTRY - French Canadian,Estimate; ANCESTRY - German,Estimate; ANCESTRY - Greek,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 or later,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later,year
0,,,,,,,,,,,...,,,,,,,,,,2007
1,,,,,,,,,,,...,,,,,,,,,,2007
2,,,,,,,,,,,...,,,,,,,,,,2007
3,,,,,,,,,,,...,,,,,,,,,,2007
4,,,,,,,,,,,...,,,,,,,,,,2007
5,,,,,,,,,,,...,,,,,,,,,,2007
6,,,,,,,,,,,...,,,,,,,,,,2007
7,,,,,,,,,,,...,,,,,,,,,,2007
8,,,,,,,,,,,...,,,,,,,,,,2007
9,,,,,,,,,,,...,,,,,,,,,,2007


In [8]:
##saving the ACS df
#dem_df.to_csv('allDem_df.csv', index=False)

### The variable names in the ACS survey are not consistently named across years -- our concatenated ACS df (2007-2016) contains many missing values for some years because the column names are slightly different per year

## How do we fix this?

# Load and merge with crime data

In [9]:
# load crime data
all_year_dfs = pd.read_csv('data/derived/AC209_RawDump-2.csv.gz')
print(all_year_dfs.shape)
all_year_dfs.head()

(16262, 18)


Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,Property crime,Rape 1,Robbery,Violent crime,counties,msa_label,msa_pop,year
0,335.0,905.0,City of Abilene,,3113.0,,,251.0,8.0,122523.0,4269.0,70.0,133.0,546.0,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
1,387.0,1104.0,Total area actually reporting,,3530.0,,,298.0,11.0,100.0,4932.0,79.0,137.0,614.0,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
2,227.8,649.9,"Rate per 100,000 inhabitants",,2077.9,,,175.4,6.5,,2903.1,46.5,80.6,361.4,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
3,555.0,2483.0,City of Akron,,5483.0,,,680.0,34.0,197257.0,8646.0,212.0,415.0,1216.0,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016
4,833.0,3865.0,Total area actually reporting,,12599.0,,,989.0,41.0,96.2,17453.0,335.0,530.0,1739.0,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016


In [10]:
# Filter crime data
norm_rows = all_year_dfs['Counties/principal cities'] == 'Rate per 100,000 inhabitants'
all_year_msa_norm = all_year_dfs[norm_rows]
all_year_msa_norm.head()

Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,Property crime,Rape 1,Robbery,Violent crime,counties,msa_label,msa_pop,year
2,227.8,649.9,"Rate per 100,000 inhabitants",,2077.9,,,175.4,6.5,,2903.1,46.5,80.6,361.4,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
6,121.0,563.3,"Rate per 100,000 inhabitants",,1857.3,,,144.0,6.0,,2564.5,48.9,77.3,253.1,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016
10,534.9,949.8,"Rate per 100,000 inhabitants",,2676.9,,,144.2,11.8,,3770.8,28.2,159.9,734.8,"Includes Baker, Dougherty, Lee, Terrell, and W...","Albany, GA M.S.A.",152566,2016
13,59.0,378.6,"Rate per 100,000 inhabitants",,2101.1,,,210.6,0.0,,2690.3,27.9,27.0,113.9,Includes Linn County,"Albany, OR M.S.A.",122030,2016
18,180.0,281.5,"Rate per 100,000 inhabitants",,1646.0,,,82.0,2.0,,2009.5,38.9,68.7,289.7,"Includes Albany, Rensselaer, Saratoga, Schenec...","Albany-Schenectady-Troy, NY M.S.A.",878166,2016


In [11]:
# merge crime and dem
#all_year_msa_norm.merge(dem_df, right_on=['MSA', 'year'], left_on=['msa_label', 'year'])

#line above won't work because msa_label in crime df has 'M.S.A.' suffix on every entry, 
 #so won't match up with demographic df

In [12]:
all_year_msa_norm.loc[:, 'msa_label_strip'] = all_year_msa_norm['msa_label'].str.replace(' M.S.A.[\d\,\ ]+$', '')
# all_year_msa_norm[['msa_label_strip', 'year']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [13]:
dem_df[['MSA', 'year']].head()

Unnamed: 0,MSA,year
0,"Aberdeen, WA",2007
1,"Abilene, TX",2007
2,"Adrian, MI",2007
3,"Akron, OH",2007
4,"Albany, GA",2007


Some the MSA labels contain a number after the state abbreviation: e.g. 'Akron, OH2'. This is fixed with a regex replacement like the M.S.A. above.


In [14]:
#some the MSA labels contain a number after the state abbreviation: e.g. 'Akron, OH2'. NEED TO FIX THIS FOR THE MERGE!
all_year_msa_norm.loc[:, 'msa_label_strip'] = all_year_msa_norm['msa_label_strip'].str.replace(r'[\d\,\ ]+$', '').str.strip()
all_year_msa_norm['msa_label_strip'].head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


2                          Abilene, TX M.S.A.
6                                   Akron, OH
10                          Albany, GA M.S.A.
13                          Albany, OR M.S.A.
18         Albany-Schenectady-Troy, NY M.S.A.
21                     Albuquerque, NM M.S.A.
25                      Alexandria, LA M.S.A.
30          Allentown-Bethlehem-Easton, PA-NJ
33                         Altoona, PA M.S.A.
36                               Amarillo, TX
40                            Ames, IA M.S.A.
43                       Anchorage, AK M.S.A.
46                       Ann Arbor, MI M.S.A.
52    Anniston-Oxford-Jacksonville, AL M.S.A.
55                        Appleton, WI M.S.A.
Name: msa_label_strip, dtype: object

In [15]:
all_year_msa_norm[['msa_label_strip', 'year']].head()

Unnamed: 0,msa_label_strip,year
2,"Abilene, TX M.S.A.",2016
6,"Akron, OH",2016
10,"Albany, GA M.S.A.",2016
13,"Albany, OR M.S.A.",2016
18,"Albany-Schenectady-Troy, NY M.S.A.",2016


In [16]:
all_year_msa_norm['msa_label_strip'] == 'Abilene, TX'
all_year_msa_norm[['msa_label_strip', 'year']].dtypes

msa_label_strip    object
year                int64
dtype: object

In [17]:
dem_df.loc[:, 'MSA'] = dem_df['MSA'].str.strip()

Unnamed: 0,MSA,year
0,"Aberdeen, WA",2007
1,"Abilene, TX",2007
2,"Adrian, MI",2007
3,"Akron, OH",2007
4,"Albany, GA",2007


In [18]:
dem_df[['MSA', 'year']].dtypes

MSA     object
year     int64
dtype: object

In [20]:
dem_df_merged = pd.merge(left=all_year_msa_norm, left_on=['msa_label_strip', 'year'],
                         right=dem_df, right_on=['MSA', 'year'], how='inner')
dem_df_merged.head(10)

Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 or later,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later
0,121.0,563.3,"Rate per 100,000 inhabitants",,1857.3,,,144.0,6.0,,...,7.2,14.9,11.6,13.9,9.1,12.9,9.4,,1.5,0.6
1,,264.7,"Rate per 100,000 inhabitants",,,,,89.1,2.6,,...,5.8,11.6,10.2,12.1,10.4,10.7,11.6,,2.0,0.9
2,413.7,761.3,"Rate per 100,000 inhabitants",,2781.3,,,390.2,5.3,,...,6.5,17.4,18.2,14.4,11.6,7.9,12.7,,4.7,1.0
3,158.9,,"Rate per 100,000 inhabitants",,1532.9,,,160.6,5.1,,...,4.0,6.6,9.4,11.6,16.9,18.2,19.6,,3.6,1.4
4,,398.9,"Rate per 100,000 inhabitants",,1101.0,,,62.4,0.5,,...,3.7,11.4,11.6,19.3,22.4,9.4,7.7,,1.5,0.6
5,382.9,755.3,"Rate per 100,000 inhabitants",,1775.3,,,209.7,6.6,,...,4.4,13.5,13.1,17.7,10.9,12.5,15.6,,5.5,1.9
6,,202.7,"Rate per 100,000 inhabitants",,1088.9,,,107.7,1.9,,...,5.4,10.9,10.2,11.4,10.4,7.3,7.3,,2.0,1.2
7,138.1,,"Rate per 100,000 inhabitants",,1716.8,,,175.2,1.9,,...,4.1,10.0,12.3,13.0,8.4,15.9,12.5,,3.6,2.3
8,232.8,330.7,"Rate per 100,000 inhabitants",,1563.4,,,187.4,10.2,,...,6.1,13.4,12.2,14.2,9.1,10.8,11.4,,1.2,0.7
9,94.9,594.4,"Rate per 100,000 inhabitants",,2100.6,,,216.1,5.2,,...,4.2,10.4,11.5,14.6,11.5,16.5,14.8,,3.1,1.6


In [21]:
dem_df_merged.shape

(350, 3648)

In [22]:
dem_df_merged.head(10)

Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 or later,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later
0,121.0,563.3,"Rate per 100,000 inhabitants",,1857.3,,,144.0,6.0,,...,7.2,14.9,11.6,13.9,9.1,12.9,9.4,,1.5,0.6
1,,264.7,"Rate per 100,000 inhabitants",,,,,89.1,2.6,,...,5.8,11.6,10.2,12.1,10.4,10.7,11.6,,2.0,0.9
2,413.7,761.3,"Rate per 100,000 inhabitants",,2781.3,,,390.2,5.3,,...,6.5,17.4,18.2,14.4,11.6,7.9,12.7,,4.7,1.0
3,158.9,,"Rate per 100,000 inhabitants",,1532.9,,,160.6,5.1,,...,4.0,6.6,9.4,11.6,16.9,18.2,19.6,,3.6,1.4
4,,398.9,"Rate per 100,000 inhabitants",,1101.0,,,62.4,0.5,,...,3.7,11.4,11.6,19.3,22.4,9.4,7.7,,1.5,0.6
5,382.9,755.3,"Rate per 100,000 inhabitants",,1775.3,,,209.7,6.6,,...,4.4,13.5,13.1,17.7,10.9,12.5,15.6,,5.5,1.9
6,,202.7,"Rate per 100,000 inhabitants",,1088.9,,,107.7,1.9,,...,5.4,10.9,10.2,11.4,10.4,7.3,7.3,,2.0,1.2
7,138.1,,"Rate per 100,000 inhabitants",,1716.8,,,175.2,1.9,,...,4.1,10.0,12.3,13.0,8.4,15.9,12.5,,3.6,2.3
8,232.8,330.7,"Rate per 100,000 inhabitants",,1563.4,,,187.4,10.2,,...,6.1,13.4,12.2,14.2,9.1,10.8,11.4,,1.2,0.7
9,94.9,594.4,"Rate per 100,000 inhabitants",,2100.6,,,216.1,5.2,,...,4.2,10.4,11.5,14.6,11.5,16.5,14.8,,3.1,1.6


In [23]:
dem_df_merged.to_csv('data/derived/crime_ACS_merged.csv', index=False)