In [4]:
import glob
import os
import pandas as pd
import re

In [5]:
import csv

years = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']

# dict_list[year] maps a raw column name for that year -> the standardized name.
# Each row of colnames_final.csv is read as: one raw column name per entry of
# `years` (paired positionally), followed by the standardized name in the last field.
dict_list = {year: {} for year in years}
# newline='' is what the csv module asks for, so quoted fields containing
# line breaks are parsed as a single field.
with open('data/derived/colnames_final.csv', 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if not row:
            # csv.reader yields [] for blank lines; row[-1] would raise IndexError
            continue
        final_name = row[-1]
        for year, colname in zip(years, row[:-1]):
            dict_list[year][colname] = final_name

In [6]:
from functools import reduce

# Key columns shared by every data-profile file; also the join keys below.
merge_columns = ['MSA', 'year', 'MSA_type', 'Id', 'Id2', 'Geography']

raw_tables_path = 'data/raw/ACS_2007_2016'

# For each year folder, join that year's data-profile files on merge_columns and
# keep one resulting frame per year in dem_files_byYear (same order as `years`).
dem_files_byYear = []

for year in years:
    # The trailing * also matches gzipped CSVs. glob returns files in arbitrary
    # order, so sort to keep the merged column order reproducible.
    dem_files = sorted(glob.glob(os.path.join(raw_tables_path, year, 'ACS_*_1YR_*with_ann.csv*')))
    if not dem_files:
        # Without this, reduce() below fails with an unhelpful
        # "reduce() of empty sequence" TypeError.
        raise FileNotFoundError(
            'No ACS files found for year {} under {}'.format(year, raw_tables_path)
        )

    dem_df_list = []
    for fname in dem_files:
        # The first line of each file is skipped; the second line is the header.
        dem_yr_df = pd.read_csv(fname, skiprows=1, header=0)
        dem_yr_df['year'] = int(year)
        # Split e.g. "Akron, OH Metro Area" into the name and the part starting at " M".
        dem_yr_df[['MSA', 'MSA_type']] = dem_yr_df['Geography'].str.extract(r'(^.*)\ (M.*)$')

        # Keep the key columns plus the percent columns, minus any margin-of-error columns.
        keep_cols = [
            col for col in dem_yr_df.columns
            if col in merge_columns
            or ('Percent;' in col and 'Margin of Error' not in col)
        ]
        # Relabel to the standardized names. rename(columns=...) is the supported
        # way to relabel; rename_axis with a mapping no longer does this.
        dem_yr_df = dem_yr_df[keep_cols].rename(columns=dict_list[year])

        dem_df_list.append(dem_yr_df)

    dem_year = reduce(lambda x, y: pd.merge(x, y, on=merge_columns), dem_df_list)
    dem_files_byYear.append(dem_year)

# Shape of the last year processed
dem_year.shape



(511, 522)

In [7]:
# Stack every per-year frame into one long frame in a single call.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each
# iteration; pd.concat also handles however many years are in the list.
# sort=True keeps append's old behaviour of sorting columns when the yearly
# frames do not share identical columns. The original row labels are kept
# (no ignore_index), so the index repeats across years.
will = pd.concat(dem_files_byYear, sort=True)

will.shape

(5107, 545)

In [8]:
# `will` already holds the per-year frames stacked together; expose it under the
# name used for the final ACS data in the rest of the notebook.
# NOTE: this is an alias, not a copy, so later changes to dem_df also change `will`.
dem_df = will

# Sanity check: print the dimensions, then preview the first 10 rows
print(dem_df.shape)
dem_df.head(10)

(5107, 545)


Unnamed: 0,Geography,Id,Id2,MSA,MSA_type,Percent; ANCESTRY - Total population,Percent; ANCESTRY - Total population - American,Percent; ANCESTRY - Total population - Arab,Percent; ANCESTRY - Total population - Czech,Percent; ANCESTRY - Total population - Danish,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later,year
0,"Aberdeen, WA Micro Area",3100000US10140,10140,"Aberdeen, WA",Micro Area,100,3.6,0.0,0.1,1.0,...,27.4,4.8,6.7,13.3,20.1,12.7,9.6,3.6,1.9,2007
1,"Abilene, TX Metro Area",3100000US10180,10180,"Abilene, TX",Metro Area,100,6.9,0.0,0.1,0.2,...,9.5,7.0,19.8,11.8,15.5,16.8,9.6,6.5,3.4,2007
2,"Adrian, MI Micro Area",3100000US10300,10300,"Adrian, MI",Micro Area,100,7.0,0.1,0.3,0.4,...,25.6,5.5,12.7,8.0,14.9,9.1,13.8,8.2,2.2,2007
3,"Akron, OH Metro Area",3100000US10420,10420,"Akron, OH",Metro Area,100,5.4,0.7,0.9,0.2,...,19.4,8.0,16.5,12.4,14.4,8.6,11.7,6.7,2.2,2007
4,"Albany, GA Metro Area",3100000US10500,10500,"Albany, GA",Metro Area,100,8.3,0.0,0.0,0.2,...,4.8,4.9,14.1,11.4,18.7,17.0,17.0,10.0,2.2,2007
5,"Albany-Lebanon, OR Micro Area",3100000US10540,10540,"Albany-Lebanon, OR",Micro Area,100,5.9,0.0,0.5,1.1,...,12.1,5.8,10.9,9.9,26.0,9.2,12.6,7.9,5.7,2007
6,"Albany-Schenectady-Troy, NY Metro Area",3100000US10580,10580,"Albany-Schenectady-Troy, NY",Metro Area,100,6.9,0.4,0.6,0.5,...,32.3,6.0,12.4,10.0,12.9,11.0,9.1,4.5,1.8,2007
7,"Albertville, AL Micro Area",3100000US10700,10700,"Albertville, AL",Micro Area,N,N,N,N,N,...,3.7,6.6,9.8,14.3,19.1,15.0,18.2,8.6,4.8,2007
8,"Albuquerque, NM Metro Area",3100000US10740,10740,"Albuquerque, NM",Metro Area,100,3.1,0.3,0.3,0.4,...,3.0,4.0,10.7,10.2,20.7,17.2,19.0,10.3,5.0,2007
9,"Alexandria, LA Metro Area",3100000US10780,10780,"Alexandria, LA",Metro Area,100,24.4,1.0,0.3,0.1,...,6.4,7.7,11.8,14.6,20.7,16.4,12.3,7.0,3.2,2007


In [9]:
# Print the dimensions of the combined ACS frame and preview its first 10 rows
print(dem_df.shape)
dem_df.head(10)

(5107, 545)


Unnamed: 0,Geography,Id,Id2,MSA,MSA_type,Percent; ANCESTRY - Total population,Percent; ANCESTRY - Total population - American,Percent; ANCESTRY - Total population - Arab,Percent; ANCESTRY - Total population - Czech,Percent; ANCESTRY - Total population - Danish,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later,year
0,"Aberdeen, WA Micro Area",3100000US10140,10140,"Aberdeen, WA",Micro Area,100,3.6,0.0,0.1,1.0,...,27.4,4.8,6.7,13.3,20.1,12.7,9.6,3.6,1.9,2007
1,"Abilene, TX Metro Area",3100000US10180,10180,"Abilene, TX",Metro Area,100,6.9,0.0,0.1,0.2,...,9.5,7.0,19.8,11.8,15.5,16.8,9.6,6.5,3.4,2007
2,"Adrian, MI Micro Area",3100000US10300,10300,"Adrian, MI",Micro Area,100,7.0,0.1,0.3,0.4,...,25.6,5.5,12.7,8.0,14.9,9.1,13.8,8.2,2.2,2007
3,"Akron, OH Metro Area",3100000US10420,10420,"Akron, OH",Metro Area,100,5.4,0.7,0.9,0.2,...,19.4,8.0,16.5,12.4,14.4,8.6,11.7,6.7,2.2,2007
4,"Albany, GA Metro Area",3100000US10500,10500,"Albany, GA",Metro Area,100,8.3,0.0,0.0,0.2,...,4.8,4.9,14.1,11.4,18.7,17.0,17.0,10.0,2.2,2007
5,"Albany-Lebanon, OR Micro Area",3100000US10540,10540,"Albany-Lebanon, OR",Micro Area,100,5.9,0.0,0.5,1.1,...,12.1,5.8,10.9,9.9,26.0,9.2,12.6,7.9,5.7,2007
6,"Albany-Schenectady-Troy, NY Metro Area",3100000US10580,10580,"Albany-Schenectady-Troy, NY",Metro Area,100,6.9,0.4,0.6,0.5,...,32.3,6.0,12.4,10.0,12.9,11.0,9.1,4.5,1.8,2007
7,"Albertville, AL Micro Area",3100000US10700,10700,"Albertville, AL",Micro Area,N,N,N,N,N,...,3.7,6.6,9.8,14.3,19.1,15.0,18.2,8.6,4.8,2007
8,"Albuquerque, NM Metro Area",3100000US10740,10740,"Albuquerque, NM",Metro Area,100,3.1,0.3,0.3,0.4,...,3.0,4.0,10.7,10.2,20.7,17.2,19.0,10.3,5.0,2007
9,"Alexandria, LA Metro Area",3100000US10780,10780,"Alexandria, LA",Metro Area,100,24.4,1.0,0.3,0.1,...,6.4,7.7,11.8,14.6,20.7,16.4,12.3,7.0,3.2,2007


In [10]:
# Save the combined ACS frame; index=False leaves the row index out of the file
dem_df.to_csv('data/derived/allDem_df.csv', index=False)

# Load and merge with crime data

In [12]:
# Load the crime data from the gzipped CSV dump, then print its dimensions
# and preview the first rows
all_year_dfs = pd.read_csv('data/derived/AC209_RawDump-2.csv.gz')
print(all_year_dfs.shape)
all_year_dfs.head()

(16262, 18)


Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,Property crime,Rape 1,Robbery,Violent crime,counties,msa_label,msa_pop,year
0,335.0,905.0,City of Abilene,,3113.0,,,251.0,8.0,122523.0,4269.0,70.0,133.0,546.0,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
1,387.0,1104.0,Total area actually reporting,,3530.0,,,298.0,11.0,100.0,4932.0,79.0,137.0,614.0,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
2,227.8,649.9,"Rate per 100,000 inhabitants",,2077.9,,,175.4,6.5,,2903.1,46.5,80.6,361.4,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
3,555.0,2483.0,City of Akron,,5483.0,,,680.0,34.0,197257.0,8646.0,212.0,415.0,1216.0,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016
4,833.0,3865.0,Total area actually reporting,,12599.0,,,989.0,41.0,96.2,17453.0,335.0,530.0,1739.0,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016


In [13]:
# Filter crime data: keep only the rows labelled 'Rate per 100,000 inhabitants'
norm_rows = all_year_dfs['Counties/principal cities'] == 'Rate per 100,000 inhabitants'
# .copy() makes this an independent frame, so the column assignments made on
# all_year_msa_norm further down no longer raise SettingWithCopyWarning.
all_year_msa_norm = all_year_dfs[norm_rows].copy()
all_year_msa_norm.head()

Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,Property crime,Rape 1,Robbery,Violent crime,counties,msa_label,msa_pop,year
2,227.8,649.9,"Rate per 100,000 inhabitants",,2077.9,,,175.4,6.5,,2903.1,46.5,80.6,361.4,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
6,121.0,563.3,"Rate per 100,000 inhabitants",,1857.3,,,144.0,6.0,,2564.5,48.9,77.3,253.1,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016
10,534.9,949.8,"Rate per 100,000 inhabitants",,2676.9,,,144.2,11.8,,3770.8,28.2,159.9,734.8,"Includes Baker, Dougherty, Lee, Terrell, and W...","Albany, GA M.S.A.",152566,2016
13,59.0,378.6,"Rate per 100,000 inhabitants",,2101.1,,,210.6,0.0,,2690.3,27.9,27.0,113.9,Includes Linn County,"Albany, OR M.S.A.",122030,2016
18,180.0,281.5,"Rate per 100,000 inhabitants",,1646.0,,,82.0,2.0,,2009.5,38.9,68.7,289.7,"Includes Albany, Rensselaer, Saratoga, Schenec...","Albany-Schenectady-Troy, NY M.S.A.",878166,2016


In [14]:
# merge crime and dem
#all_year_msa_norm.merge(dem_df, right_on=['MSA', 'year'], left_on=['msa_label', 'year'])

# The line above won't work because every msa_label in the crime df carries an 'M.S.A.' suffix,
# so it won't match the MSA names in the demographic df.

In [35]:
# Strip the trailing ' M.S.A.' marker, plus any digits, commas or spaces after it,
# from the crime-side labels so they can be compared with the ACS 'MSA' names.
# The pattern is a raw string (the escapes are for the regex engine, not Python)
# and regex=True is explicit, because pandas 2.0 treats the pattern as a literal
# string by default.
all_year_msa_norm.loc[:, 'msa_label_strip'] = all_year_msa_norm['msa_label'].str.replace(
    r' M\.S\.A\.[\d\,\ ]*$', '', regex=True
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [36]:
# Preview the demographic-side merge keys
dem_df.loc[:, ['MSA', 'year']].head()

Unnamed: 0,MSA,year
0,"Aberdeen, WA",2007
1,"Abilene, TX",2007
2,"Adrian, MI",2007
3,"Akron, OH",2007
4,"Albany, GA",2007


Some of the MSA labels contain a number after the state abbreviation, e.g. 'Akron, OH2'. This is fixed with a regex replacement like the one used for the M.S.A. suffix above.


In [37]:
# Some of the MSA labels contain a number after the state abbreviation, e.g. 'Akron, OH2'.
# Remove any trailing run of digits, commas and spaces so the labels line up for the merge.
# regex=True is explicit because pandas 2.0 treats the pattern as a literal string by default.
all_year_msa_norm.loc[:, 'msa_label_strip'] = (
    all_year_msa_norm['msa_label_strip']
    .str.replace(r'[\d\,\ ]+$', '', regex=True)
    .str.strip()
)
all_year_msa_norm['msa_label_strip'].head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


2                           Abilene, TX
6                             Akron, OH
10                           Albany, GA
13                           Albany, OR
18          Albany-Schenectady-Troy, NY
21                      Albuquerque, NM
25                       Alexandria, LA
30    Allentown-Bethlehem-Easton, PA-NJ
33                          Altoona, PA
36                         Amarillo, TX
40                             Ames, IA
43                        Anchorage, AK
46                        Ann Arbor, MI
52     Anniston-Oxford-Jacksonville, AL
55                         Appleton, WI
Name: msa_label_strip, dtype: object

In [38]:
# Preview the crime-side merge keys after cleaning
all_year_msa_norm.loc[:, ['msa_label_strip', 'year']].head()

Unnamed: 0,msa_label_strip,year
2,"Abilene, TX",2016
6,"Akron, OH",2016
10,"Albany, GA",2016
13,"Albany, OR",2016
18,"Albany-Schenectady-Troy, NY",2016


In [39]:
# Check the dtypes of the crime-side merge keys.
# (The earlier bare comparison against 'Abilene, TX' was dropped: its result was
# computed and then discarded, so it had no effect.)
all_year_msa_norm[['msa_label_strip', 'year']].dtypes

msa_label_strip    object
year                int64
dtype: object

In [40]:
# Trim leading/trailing whitespace from the ACS MSA names before merging.
# This overwrites the column in place on dem_df.
dem_df.loc[:, 'MSA'] = dem_df['MSA'].str.strip()

In [41]:
# Check the dtypes of the demographic-side merge keys
dem_df.dtypes[['MSA', 'year']]

MSA     object
year     int64
dtype: object

In [42]:
# Inner-join the crime rows to the ACS rows on (MSA name, year); rows without a
# match on both sides are dropped.
dem_df_merged = all_year_msa_norm.merge(
    dem_df,
    how='inner',
    left_on=['msa_label_strip', 'year'],
    right_on=['MSA', 'year'],
)
dem_df_merged.head(10)

Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1939 or earlier,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later
0,227.8,649.9,"Rate per 100,000 inhabitants",,2077.9,,,175.4,6.5,,...,9.5,8.4,18.6,11.1,15.0,15.5,8.6,9.3,2.6,1.5
1,121.0,563.3,"Rate per 100,000 inhabitants",,1857.3,,,144.0,6.0,,...,19.0,7.2,14.9,11.6,13.9,9.1,12.9,9.4,1.5,0.6
2,534.9,949.8,"Rate per 100,000 inhabitants",,2676.9,,,144.2,11.8,,...,6.2,3.2,9.7,14.0,15.2,15.4,17.0,16.9,1.9,0.3
3,59.0,378.6,"Rate per 100,000 inhabitants",,2101.1,,,210.6,0.0,,...,10.9,8.6,8.7,9.7,20.9,7.9,16.8,12.9,2.5,1.1
4,180.0,281.5,"Rate per 100,000 inhabitants",,1646.0,,,82.0,2.0,,...,29.8,5.9,9.8,8.9,11.3,12.1,9.6,8.9,2.8,0.9
5,596.7,951.1,"Rate per 100,000 inhabitants",,3319.6,,,1018.0,8.3,,...,2.8,3.7,9.2,10.0,18.1,16.4,19.0,17.3,2.1,1.3
6,748.5,1271.2,"Rate per 100,000 inhabitants",,3111.7,,,314.9,7.8,,...,5.2,8.0,15.0,12.4,16.8,12.3,12.6,11.3,5.0,1.3
7,,264.7,"Rate per 100,000 inhabitants",,,,,89.1,2.6,,...,24.8,5.8,11.6,10.2,12.1,10.4,10.7,11.6,2.0,0.9
8,143.3,190.5,"Rate per 100,000 inhabitants",,1124.5,,,44.0,0.8,,...,31.1,7.0,11.4,11.3,14.9,6.8,8.3,7.5,1.4,0.2
9,413.7,761.3,"Rate per 100,000 inhabitants",,2781.3,,,390.2,5.3,,...,5.7,6.5,17.4,18.2,14.4,11.6,7.9,12.7,4.7,1.0


In [43]:
# Dimensions of the merged crime + ACS frame (rows, columns)
dem_df_merged.shape

(3283, 563)

In [44]:
# Preview the first 10 rows of the merged frame
dem_df_merged.head(10)

Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Forcible rape,Larceny- theft,Larceny-theft,Larceny‑ theft,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,...,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1939 or earlier,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1940 to 1949,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1950 to 1959,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1960 to 1969,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1970 to 1979,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1980 to 1989,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 1990 to 1999,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2000 to 2009,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2010 to 2013,Percent; YEAR STRUCTURE BUILT - Total housing units - Built 2014 or later
0,227.8,649.9,"Rate per 100,000 inhabitants",,2077.9,,,175.4,6.5,,...,9.5,8.4,18.6,11.1,15.0,15.5,8.6,9.3,2.6,1.5
1,121.0,563.3,"Rate per 100,000 inhabitants",,1857.3,,,144.0,6.0,,...,19.0,7.2,14.9,11.6,13.9,9.1,12.9,9.4,1.5,0.6
2,534.9,949.8,"Rate per 100,000 inhabitants",,2676.9,,,144.2,11.8,,...,6.2,3.2,9.7,14.0,15.2,15.4,17.0,16.9,1.9,0.3
3,59.0,378.6,"Rate per 100,000 inhabitants",,2101.1,,,210.6,0.0,,...,10.9,8.6,8.7,9.7,20.9,7.9,16.8,12.9,2.5,1.1
4,180.0,281.5,"Rate per 100,000 inhabitants",,1646.0,,,82.0,2.0,,...,29.8,5.9,9.8,8.9,11.3,12.1,9.6,8.9,2.8,0.9
5,596.7,951.1,"Rate per 100,000 inhabitants",,3319.6,,,1018.0,8.3,,...,2.8,3.7,9.2,10.0,18.1,16.4,19.0,17.3,2.1,1.3
6,748.5,1271.2,"Rate per 100,000 inhabitants",,3111.7,,,314.9,7.8,,...,5.2,8.0,15.0,12.4,16.8,12.3,12.6,11.3,5.0,1.3
7,,264.7,"Rate per 100,000 inhabitants",,,,,89.1,2.6,,...,24.8,5.8,11.6,10.2,12.1,10.4,10.7,11.6,2.0,0.9
8,143.3,190.5,"Rate per 100,000 inhabitants",,1124.5,,,44.0,0.8,,...,31.1,7.0,11.4,11.3,14.9,6.8,8.3,7.5,1.4,0.2
9,413.7,761.3,"Rate per 100,000 inhabitants",,2781.3,,,390.2,5.3,,...,5.7,6.5,17.4,18.2,14.4,11.6,7.9,12.7,4.7,1.0


In [45]:
# Save the merged crime + ACS frame; index=False leaves the row index out of the file
dem_df_merged.to_csv('data/derived/crime_ACS_merged.csv', index=False)