In [None]:
import pandas as pd

## Load in the "rosetta stone" file

I made this file using QGIS, the open-source mapping software. I loaded in the US Census 2010 block-level shapefile for Cook and DuPage counties in IL and the Chicago police boundaries shapefile [from here](https://data.cityofchicago.org/Public-Safety/Boundaries-Police-Districts-current-/fthy-xz3r). I then used the block centroids, provided by the census, to colect them within each zone. Since the centroids, by nature, are a "half a block" from the nearest street, this is more reliable than a polygon-in-polygon calculation. I then inspected the map visually for outliers.

I'll write up my steps for that soonest.

In [4]:
rosetta_df = pd.read_csv('../data/chicago/initial_rosetta.csv')

In [5]:
rosetta_df

Unnamed: 0,GEOID10,dist_num
0,170318105015005,31
1,170318105015000,31
2,170318105023016,31
3,170317709014009,31
4,170318105012006,31
...,...,...
46707,170310810002000,18
46708,170310812012008,18
46709,170310814033004,18
46710,170313201001006,1


## Adding in blocks that also fall into district 16

There were several blocks that didn't fall neatly into police districts out toward O'Hare airport when I did my initial mapping. So I exported those block lists from the mapping software and am adding them to the collection here.

In [6]:
dupage16_df = pd.read_csv('../data/chicago/to_16_dupage.csv')

In [7]:
cook16_df = pd.read_csv('../data/chicago/to_16.csv')

In [9]:
both_df = pd.concat([dupage16_df,cook16_df])

In [10]:
both_df

Unnamed: 0,STATEFP10,COUNTYFP10,TRACTCE10,BLOCKCE10,GEOID10,NAME10,MTFCC10,UR10,UACE10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10
0,17,43,840000,2000,170438400002000,Block 2000,G5040,,,S,4011895,46078,41.979738,-87.928430
1,17,43,840000,2041,170438400002041,Block 2041,G5040,,,S,54498,1233,41.960233,-87.921280
2,17,43,840801,1018,170438408011018,Block 1018,G5040,,,S,0,3737,41.956262,-87.922710
3,17,43,840000,2009,170438400002009,Block 2009,G5040,,,S,30821,0,41.984599,-87.938467
4,17,43,840000,2004,170438400002004,Block 2004,G5040,,,S,25011,0,41.965593,-87.935880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,17,31,760802,1042,170317608021042,Block 1042,G5040,,,S,187895,0,41.957310,-87.855825
22,17,31,760801,1011,170317608011011,Block 1011,G5040,,,S,3786,0,41.983793,-87.848128
23,17,31,760801,1010,170317608011010,Block 1010,G5040,,,S,24881,0,41.984008,-87.854674
24,17,31,770700,1066,170317707001066,Block 1066,G5040,,,S,2331,0,41.973635,-87.862159


In [14]:
both_small = pd.DataFrame(both_df['GEOID10'])

In [15]:
both_small

Unnamed: 0,GEOID10
0,170438400002000
1,170438400002041
2,170438408011018
3,170438400002009
4,170438400002004
...,...
21,170317608021042
22,170317608011011
23,170317608011010
24,170317707001066


In [16]:
both_small['dist_num'] = 16

In [17]:
both_small

Unnamed: 0,GEOID10,dist_num
0,170438400002000,16
1,170438400002041,16
2,170438408011018,16
3,170438400002009,16
4,170438400002004,16
...,...,...
21,170317608021042,16
22,170317608011011,16
23,170317608011010,16
24,170317707001066,16


In [18]:
rosetta_df2 = pd.concat([rosetta_df, both_small])

In [20]:
rosetta_df2.shape

(46785, 2)

## Make some fixes

```
170310814031007
=> 1

to_16.csv => 16

170318104003050
northern half is in 31st
southern half is in 16th
but the southern half seems to be mostly commercial. So putting it in 31st
=> 31

170438400002041
This block is in dupage county south of O’Hare
The southern half of hangs outside the 16th District … but that part of the block is a rail yard. So leaving it all in 16.
```


In [25]:
# let's see the current status of 170310814031007
rosetta_df2.loc[rosetta_df2['GEOID10'] == 170310814031007]

Unnamed: 0,GEOID10,dist_num


In [32]:
# adding a row
quick_row = pd.DataFrame([[170310814031007, 1]], columns=['GEOID10', 'dist_num'])
quick_row

Unnamed: 0,GEOID10,dist_num
0,170310814031007,1


In [34]:
rosetta_df3 = pd.concat([rosetta_df2,quick_row ])

In [35]:
rosetta_df3.shape

(46786, 2)

In [38]:
# let's see the current status of 170318104003050
rosetta_df3.loc[rosetta_df3['GEOID10'] == 170318104003050]

Unnamed: 0,GEOID10,dist_num
5512,170318104003050,16


In [39]:
to_change = [170318104003050]
for item in to_change:
    rosetta_df3.loc[rosetta_df3['GEOID10'] == item, ['dist_num']] = 31

In [40]:
rosetta_df3.loc[rosetta_df3['GEOID10'] == 170318104003050]

Unnamed: 0,GEOID10,dist_num
5512,170318104003050,31


In [41]:
# let's see the current status of 170438400002041
rosetta_df3.loc[rosetta_df3['GEOID10'] == 170438400002041]

Unnamed: 0,GEOID10,dist_num
1,170438400002041,16


Leeaving that at 16

In [45]:
rosetta_df3.to_csv('../data/chicago/chicago_2010blocks_2020policedistricts_key.csv', index=False)

## Load in the population data

I downloaded the population files from [census.data.gov](https://census.data.gov). 

Here are the [P3 and P5 census table files for Cook County](https://s3.amazonaws.com/media.johnkeefe.net/census-by-precinct/17031_Cook_County.zip). And here is the ["productDownload_2020-06-07T173132" zip file](https://s3.amazonaws.com/media.johnkeefe.net/census-by-precinct/productDownload_2020-06-07T173132.zip). It's a little messy, and the census doesn't label the files well, but I'm providing them as I got them. The CSVs you need are in there! Adjust your paths accordingly.

In [None]:
# census P3 for cook county by block
cook_df_p3 = pd.read_csv('/Volumes/JK_Smarts_Data/precinct_project/IL/17031_Cook_County/DECENNIALSF12010.P3_2020-06-07T150142/DECENNIALSF12010.P3_data_with_overlays_2020-06-07T150129.csv')

In [112]:
cook_df_p3.reset_index()
cook_df_p3.drop(0, inplace=True)

In [113]:
# census P3 for cook county by block
cook_df_p5 = pd.read_csv('/Volumes/JK_Smarts_Data/precinct_project/IL/17031_Cook_County/DECENNIALSF12010.P5_2020-06-07T145711/DECENNIALSF12010.P5_data_with_overlays_2020-06-07T145658.csv', low_memory=False)

In [114]:
cook_df_p5.reset_index()
cook_df_p5.drop(0, inplace=True)

In [115]:
cook_df_p3.shape, cook_df_p5.shape

((99042, 10), (99042, 19))

In [116]:
cook_df = cook_df_p3.merge(cook_df_p5, on='GEO_ID')

In [117]:
cook_df.shape

(99042, 28)

In [118]:
cook_df

Unnamed: 0,GEO_ID,NAME_x,P003001,P003002,P003003,P003004,P003005,P003006,P003007,P003008,...,P005008,P005009,P005010,P005011,P005012,P005013,P005014,P005015,P005016,P005017
0,1000000US170310101001000,"Block 1000, Block Group 1, Census Tract 101, C...",128,92,24,0,9,0,0,3,...,0,3,5,5,0,0,0,0,0,0
1,1000000US170310101001001,"Block 1001, Block Group 1, Census Tract 101, C...",71,23,34,2,2,0,5,5,...,4,5,7,4,0,2,0,0,1,0
2,1000000US170310101001002,"Block 1002, Block Group 1, Census Tract 101, C...",45,37,3,0,5,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,1000000US170310101001003,"Block 1003, Block Group 1, Census Tract 101, C...",335,171,110,6,5,0,29,14,...,1,10,108,68,3,5,0,0,28,4
4,1000000US170310101002013,"Block 2013, Block Group 2, Census Tract 101, C...",302,31,240,3,4,0,10,14,...,0,14,20,7,1,2,0,0,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99037,1000000US170319900000012,"Block 0012, Block Group 0, Census Tract 9900, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99038,1000000US170319900000013,"Block 0013, Block Group 0, Census Tract 9900, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99039,1000000US170319900000014,"Block 0014, Block Group 0, Census Tract 9900, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99040,1000000US170319900000015,"Block 0015, Block Group 0, Census Tract 9900, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


See note a few cells up about where to get thse files.

In [119]:
dupage_p3 = pd.read_csv('/Volumes/JK_Smarts_Data/precinct_project/IL/DuPage County/productDownload_2020-06-07T173132/DECENNIALSF12010.P3_data_with_overlays_2020-06-07T173122.csv')
dupage_p3.reset_index()
dupage_p3.drop(0, inplace=True)

In [120]:
dupage_p5 = pd.read_csv('/Volumes/JK_Smarts_Data/precinct_project/IL/DuPage County/productDownload_2020-06-07T173132/DECENNIALSF12010.P5_data_with_overlays_2020-06-07T173122.csv')
dupage_p5.reset_index()
dupage_p5.drop(0, inplace=True)

In [121]:
dupage_p3.shape,dupage_p5.shape

((17374, 10), (17374, 19))

In [122]:
dupage_df = dupage_p3.merge(dupage_p5, on="GEO_ID")

In [124]:
chicago_df = pd.concat([cook_df,dupage_df])

In [125]:
chicago_df.shape

(116416, 28)

In [127]:
chicago_df.columns

Index(['GEO_ID', 'NAME_x', 'P003001', 'P003002', 'P003003', 'P003004',
       'P003005', 'P003006', 'P003007', 'P003008', 'NAME_y', 'P005001',
       'P005002', 'P005003', 'P005004', 'P005005', 'P005006', 'P005007',
       'P005008', 'P005009', 'P005010', 'P005011', 'P005012', 'P005013',
       'P005014', 'P005015', 'P005016', 'P005017'],
      dtype='object')

In [144]:
chicago_df['GEOID10'] = chicago_df['GEO_ID'].str[9:].astype(int)

In [130]:
chicago_df.drop(columns=['NAME_y'], inplace = True)

In [131]:
chicago_df.columns

Index(['GEO_ID', 'NAME_x', 'P003001', 'P003002', 'P003003', 'P003004',
       'P003005', 'P003006', 'P003007', 'P003008', 'P005001', 'P005002',
       'P005003', 'P005004', 'P005005', 'P005006', 'P005007', 'P005008',
       'P005009', 'P005010', 'P005011', 'P005012', 'P005013', 'P005014',
       'P005015', 'P005016', 'P005017', 'GEOID10'],
      dtype='object')

In [141]:
chicago_df.columns

Index(['GEO_ID', 'NAME_x', 'P003001', 'P003002', 'P003003', 'P003004',
       'P003005', 'P003006', 'P003007', 'P003008', 'P005001', 'P005002',
       'P005003', 'P005004', 'P005005', 'P005006', 'P005007', 'P005008',
       'P005009', 'P005010', 'P005011', 'P005012', 'P005013', 'P005014',
       'P005015', 'P005016', 'P005017', 'GEOID10'],
      dtype='object')

In [133]:
rosetta_df3.shape

(46786, 2)

In [145]:
rosetta_df3.dtypes

GEOID10     int64
dist_num    int64
dtype: object

In [146]:
chicago_df.dtypes

GEO_ID     object
NAME_x     object
P003001    object
P003002    object
P003003    object
P003004    object
P003005    object
P003006    object
P003007    object
P003008    object
P005001    object
P005002    object
P005003    object
P005004    object
P005005    object
P005006    object
P005007    object
P005008    object
P005009    object
P005010    object
P005011    object
P005012    object
P005013    object
P005014    object
P005015    object
P005016    object
P005017    object
GEOID10     int64
dtype: object

In [147]:
## Add demographic data to each chicago PD district block
block_data = rosetta_df3.merge(chicago_df, on="GEOID10", how="left")

In [148]:
block_data.shape

(46786, 29)

In [149]:
block_data

Unnamed: 0,GEOID10,dist_num,GEO_ID,NAME_x,P003001,P003002,P003003,P003004,P003005,P003006,...,P005008,P005009,P005010,P005011,P005012,P005013,P005014,P005015,P005016,P005017
0,170318105015005,31,1000000US170318105015005,"Block 5005, Block Group 5, Census Tract 8105.0...",2,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,170318105015000,31,1000000US170318105015000,"Block 5000, Block Group 5, Census Tract 8105.0...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,170318105023016,31,1000000US170318105023016,"Block 3016, Block Group 3, Census Tract 8105.0...",466,403,16,1,29,0,...,0,1,38,21,0,1,0,0,13,3
3,170317709014009,31,1000000US170317709014009,"Block 4009, Block Group 4, Census Tract 7709.0...",82,70,0,0,0,0,...,0,0,19,7,0,0,0,0,6,6
4,170318105012006,31,1000000US170318105012006,"Block 2006, Block Group 2, Census Tract 8105.0...",189,169,0,0,13,0,...,0,0,38,31,0,0,0,0,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46781,170317608011011,16,1000000US170317608011011,"Block 1011, Block Group 1, Census Tract 7608.0...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46782,170317608011010,16,1000000US170317608011010,"Block 1010, Block Group 1, Census Tract 7608.0...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46783,170317707001066,16,1000000US170317707001066,"Block 1066, Block Group 1, Census Tract 7707, ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46784,170317608011003,16,1000000US170317608011003,"Block 1003, Block Group 1, Census Tract 7608.0...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [154]:
block_data.to_csv('./temp_data/chicago_2010blocks_2020policedistricts_population.csv', index=False)

In [156]:
# need to make all those columns numeric
block_data[['P003001', 'P003002', 'P003003', 'P003004',
       'P003005', 'P003006', 'P003007', 'P003008', 'P005001', 'P005002',
       'P005003', 'P005004', 'P005005', 'P005006', 'P005007', 'P005008',
       'P005009', 'P005010', 'P005011', 'P005012', 'P005013', 'P005014',
       'P005015', 'P005016', 'P005017']] = block_data[['P003001', 'P003002', 'P003003', 'P003004',
       'P003005', 'P003006', 'P003007', 'P003008', 'P005001', 'P005002',
       'P005003', 'P005004', 'P005005', 'P005006', 'P005007', 'P005008',
       'P005009', 'P005010', 'P005011', 'P005012', 'P005013', 'P005014',
       'P005015', 'P005016', 'P005017']].apply(pd.to_numeric)

In [171]:
## Check for duplicates
block_data.duplicated(subset='GEOID10', keep='first').sum()

0

In [157]:
import numpy as np
pivot = pd.pivot_table(block_data, index="dist_num", aggfunc=np.sum)

In [158]:
pivot

Unnamed: 0_level_0,GEOID10,P003001,P003002,P003003,P003004,P003005,P003006,P003007,P003008,P005001,...,P005008,P005009,P005010,P005011,P005012,P005013,P005014,P005015,P005016,P005017
dist_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,146812115328730304,62781,35208,13657,138,10835,29,1088,1826,62781,...,157,1547,3766,2256,205,43,45,7,931,279
2,215789978266006464,95439,19189,66577,187,5867,13,963,2643,95439,...,202,2267,3242,1442,584,49,30,0,761,376
3,203356890907365568,75235,1654,71508,182,318,4,332,1237,75235,...,98,1047,1123,182,498,13,6,0,234,190
4,577199272725154944,123575,26117,77303,689,311,31,16378,2746,123575,...,138,1222,35381,16192,904,466,53,2,16240,1524
5,386274777052580352,74396,1629,70429,166,41,10,1202,919,74396,...,55,746,2524,786,365,48,3,2,1147,173
6,333990462626651776,90841,446,88938,182,61,7,269,938,90841,...,60,799,914,134,413,18,0,1,209,139
7,310487988079624064,71071,511,69202,149,62,9,435,703,71071,...,49,606,1167,249,415,13,6,1,386,97
8,792312243417101312,247373,118778,53462,1632,2168,61,64904,6368,247373,...,231,1312,139854,67287,1243,1385,167,43,64673,5056
9,516741376640528384,165201,65820,19860,1153,26106,35,47743,4484,165201,...,146,1011,94610,41513,816,980,212,19,47597,3473
10,267906356651607328,118093,38171,40080,1099,321,16,35543,2863,118093,...,121,405,73441,33871,640,955,82,13,35422,2458


In [161]:
pivot.reset_index(inplace=True)

In [163]:
pivot.drop(columns=['GEOID10'], inplace=True)

In [164]:
pivot

Unnamed: 0,dist_num,P003001,P003002,P003003,P003004,P003005,P003006,P003007,P003008,P005001,...,P005008,P005009,P005010,P005011,P005012,P005013,P005014,P005015,P005016,P005017
0,1,62781,35208,13657,138,10835,29,1088,1826,62781,...,157,1547,3766,2256,205,43,45,7,931,279
1,2,95439,19189,66577,187,5867,13,963,2643,95439,...,202,2267,3242,1442,584,49,30,0,761,376
2,3,75235,1654,71508,182,318,4,332,1237,75235,...,98,1047,1123,182,498,13,6,0,234,190
3,4,123575,26117,77303,689,311,31,16378,2746,123575,...,138,1222,35381,16192,904,466,53,2,16240,1524
4,5,74396,1629,70429,166,41,10,1202,919,74396,...,55,746,2524,786,365,48,3,2,1147,173
5,6,90841,446,88938,182,61,7,269,938,90841,...,60,799,914,134,413,18,0,1,209,139
6,7,71071,511,69202,149,62,9,435,703,71071,...,49,606,1167,249,415,13,6,1,386,97
7,8,247373,118778,53462,1632,2168,61,64904,6368,247373,...,231,1312,139854,67287,1243,1385,167,43,64673,5056
8,9,165201,65820,19860,1153,26106,35,47743,4484,165201,...,146,1011,94610,41513,816,980,212,19,47597,3473
9,10,118093,38171,40080,1099,321,16,35543,2863,118093,...,121,405,73441,33871,640,955,82,13,35422,2458


In [166]:
pivot.to_csv('../data/chicago/chicago_2010pop_by_2020policedistricts.csv', index=False)

In [170]:
pivot['P003001'].sum()

2719858

Done!