In [1]:
import os

import pandas as pd

## Load in the "rosetta stone" file

I made this file using mapshaper, as executed in the `makefile` one directory up from this one.

I then viewed the confidence-check files in QGIS to visually inspect the boundaries, and deal with shapes that didn't quite fit.

In [3]:
# This is where the Makefile put my files. Your path will vary: set the
# PRECINCT_DATA_DIR environment variable to point at your own copy instead of
# editing this cell. The original location is kept as the fallback.
mypath = os.environ.get('PRECINCT_DATA_DIR', '/Volumes/JK_Smarts_Data/precinct_project/NY')

In [83]:
# Read the block-to-precinct crosswalk. GEOID20 is kept as text because it is
# later matched against the string GEO_ID column of the census tables;
# precinct uses the nullable integer dtype.
rosetta_column_types = {'GEOID20': 'str', 'precinct': 'Int64'}
rosetta_df = pd.read_csv(
    f'{mypath}/nyc_blocks_precincts_prelim.csv',
    dtype=rosetta_column_types,
)

In [84]:
# list every column that came in from the CSV
rosetta_df.columns

Index(['STATEFP20', 'COUNTYFP20', 'TRACTCE20', 'BLOCKCE20', 'GEOID20',
       'NAME20', 'MTFCC20', 'UR20', 'UACE20', 'UATYPE20', 'FUNCSTAT20',
       'ALAND20', 'AWATER20', 'precinct', 'shape_area', 'shape_leng'],
      dtype='object')

In [85]:
# Keep only the block identifier and its precinct; every other column from the
# CSV is discarded. Selecting the two columns to keep (rather than dropping the
# fourteen others in place) gives the same frame and lets this cell be re-run
# without a KeyError. .copy() makes the result an independent frame, so the
# .loc assignments further down modify it directly.
rosetta_df = rosetta_df[['GEOID20', 'precinct']].copy()

In [94]:
# inspect the trimmed frame: only GEOID20 and precinct should remain
rosetta_df

Unnamed: 0,GEOID20,precinct
0,360050054003000,43
1,360050442003002,47
2,360050462083001,47
3,360050296001007,49
4,360050169001005,42
...,...,...
37375,360850208053004,123
37376,360850277042003,121
37377,360850244013006,123
37378,360850020011014,120


## Adding fixes

In the analysis of the data files, the following block centroids a) fell outside every precinct and b) had a population greater than zero:
```
GEOID20,precinct
360471018001000,69
360610007008002,1
360050274023006,45
360050090002018,45
360810945001000,109
360470628002029,61
360470628003014,61
360850097011000,120
360850223001000,121
```

Using [Mr. Data Converter](https://shancarter.github.io/mr-data-converter/)...


In [98]:
# Manual precinct assignments for the populated blocks listed above, written as
# (GEOID20, precinct) pairs and expanded into one dict per block.
_manual_assignments = [
    ('360471018001000', 69),
    ('360610007008002', 1),
    ('360050274023006', 45),
    ('360050090002018', 45),
    ('360810945001000', 109),
    ('360470628002029', 61),
    ('360470628003014', 61),
    ('360850097011000', 120),
    ('360850223001000', 121),
]
to_change = [
    {"GEOID20": block_id, "precinct": precinct_number}
    for block_id, precinct_number in _manual_assignments
]

In [99]:
# Overwrite the precinct of each manually fixed block, one fix at a time.
# A GEOID20 that matches no row leaves the frame untouched.
for fix in to_change:
    is_fixed_block = rosetta_df['GEOID20'].eq(fix['GEOID20'])
    rosetta_df.loc[is_fixed_block, ['precinct']] = fix['precinct']

In [101]:
# let's see the current status of 360050274023006
# (one of the manual fixes above; it should now show precinct 45)
rosetta_df.loc[rosetta_df['GEOID20'] == '360050274023006']

Unnamed: 0,GEOID20,precinct
334,360050274023006,45


## Load in the population data

I downloaded the population files from [data.census.gov](https://data.census.gov/cedsci/advanced). 

I'm using the 2020 Census [P1 Race](https://data.census.gov/cedsci/table?q=United%20States%20Race%20and%20Ethnicity&g=0500000US36005%241000000,36047%241000000,36061%241000000,36081%241000000,36085%241000000&y=2020&tid=DECENNIALPL2020.P1) and [P2 Hispanic or Latino](https://data.census.gov/cedsci/table?q=United%20States%20Race%20and%20Ethnicity&g=0500000US36005%241000000,36047%241000000,36061%241000000,36081%241000000,36085%241000000&y=2020&tid=DECENNIALPL2020.P2) files.

### P1 Race

Note that the column numbers are translated into column descriptions [in this metadata file](../data/nyc/DECENNIALPL2020.P1_metadata_2022-02-06T162722.csv).

In [112]:
# Census P1 (race) table for NYC.
# skiprows=[1] skips the line directly after the header row
# (presumably the human-readable column labels -- confirm against the file).
p1_csv_path = (
    f'{mypath}/DECENNIALPL2020.P1_2022-02-06T162735/'
    'DECENNIALPL2020.P1_data_with_overlays_2022-02-06T162722.csv'
)
p1_df = pd.read_csv(p1_csv_path, skiprows=[1])


In [113]:
# inspect the P1 table as loaded
p1_df

Unnamed: 0,GEO_ID,NAME,P1_001N,P1_002N,P1_003N,P1_004N,P1_005N,P1_006N,P1_007N,P1_008N,...,P1_062N,P1_063N,P1_064N,P1_065N,P1_066N,P1_067N,P1_068N,P1_069N,P1_070N,P1_071N
0,1000000US360050001000001,"Block 0001, Block Group 0, Census Tract 1, Bro...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000000US360050001000002,"Block 0002, Block Group 0, Census Tract 1, Bro...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1000000US360050001001000,"Block 1000, Block Group 1, Census Tract 1, Bro...",171,170,24,100,1,0,0,45,...,0,0,0,0,0,0,0,0,0,0
3,1000000US360050001001001,"Block 1001, Block Group 1, Census Tract 1, Bro...",1344,1318,293,734,11,32,0,248,...,0,0,0,0,0,0,0,0,0,0
4,1000000US360050001001002,"Block 1002, Block Group 1, Census Tract 1, Bro...",367,366,44,253,0,1,0,68,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37980,1000000US360859901000007,"Block 0007, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37981,1000000US360859901000008,"Block 0008, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37982,1000000US360859901000009,"Block 0009, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37983,1000000US360859901000010,"Block 0010, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


^^ Accidentally picked up the entire US. But that will fall away on the merge.

In [114]:
# Strip the '1000000US' prefix so GEO_ID has the same 15-digit form as the
# GEOID20 key in rosetta_df. regex=False makes this a plain literal
# replacement whichever default for `regex` the installed pandas uses.
p1_df['GEO_ID'] = p1_df['GEO_ID'].str.replace('1000000US', '', regex=False)

In [115]:
# confirm GEO_ID no longer carries the '1000000US' prefix
p1_df

Unnamed: 0,GEO_ID,NAME,P1_001N,P1_002N,P1_003N,P1_004N,P1_005N,P1_006N,P1_007N,P1_008N,...,P1_062N,P1_063N,P1_064N,P1_065N,P1_066N,P1_067N,P1_068N,P1_069N,P1_070N,P1_071N
0,360050001000001,"Block 0001, Block Group 0, Census Tract 1, Bro...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,360050001000002,"Block 0002, Block Group 0, Census Tract 1, Bro...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,360050001001000,"Block 1000, Block Group 1, Census Tract 1, Bro...",171,170,24,100,1,0,0,45,...,0,0,0,0,0,0,0,0,0,0
3,360050001001001,"Block 1001, Block Group 1, Census Tract 1, Bro...",1344,1318,293,734,11,32,0,248,...,0,0,0,0,0,0,0,0,0,0
4,360050001001002,"Block 1002, Block Group 1, Census Tract 1, Bro...",367,366,44,253,0,1,0,68,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37980,360859901000007,"Block 0007, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37981,360859901000008,"Block 0008, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37982,360859901000009,"Block 0009, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37983,360859901000010,"Block 0010, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [116]:
## Attach the P1 columns to the rosetta file.
## A left join keeps every rosetta row, whether or not it has a P1 match.
rosetta_p1 = pd.merge(
    rosetta_df,
    p1_df,
    how='left',
    left_on='GEOID20',
    right_on='GEO_ID',
)

In [117]:
# inspect the merged frame: precinct plus the P1 columns for each block
rosetta_p1

Unnamed: 0,GEOID20,precinct,GEO_ID,NAME,P1_001N,P1_002N,P1_003N,P1_004N,P1_005N,P1_006N,...,P1_062N,P1_063N,P1_064N,P1_065N,P1_066N,P1_067N,P1_068N,P1_069N,P1_070N,P1_071N
0,360050054003000,43,360050054003000,"Block 3000, Block Group 3, Census Tract 54, Br...",538,473,32,92,19,23,...,0,0,0,0,0,0,0,0,0,0
1,360050442003002,47,360050442003002,"Block 3002, Block Group 3, Census Tract 442, B...",155,129,3,117,2,4,...,0,0,0,0,0,0,0,0,0,0
2,360050462083001,47,360050462083001,"Block 3001, Block Group 3, Census Tract 462.08...",264,250,10,215,0,0,...,0,0,0,0,0,0,0,0,0,0
3,360050296001007,49,360050296001007,"Block 1007, Block Group 1, Census Tract 296, B...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,360050169001005,42,360050169001005,"Block 1005, Block Group 1, Census Tract 169, B...",370,347,19,221,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37375,360850208053004,123,360850208053004,"Block 3004, Block Group 3, Census Tract 208.05...",124,113,111,0,0,2,...,0,0,0,0,0,0,0,0,0,0
37376,360850277042003,121,360850277042003,"Block 2003, Block Group 2, Census Tract 277.04...",246,243,131,7,0,75,...,0,0,0,0,0,0,0,0,0,0
37377,360850244013006,123,360850244013006,"Block 3006, Block Group 3, Census Tract 244.01...",96,90,84,1,1,1,...,0,0,0,0,0,0,0,0,0,0
37378,360850020011014,120,360850020011014,"Block 1014, Block Group 1, Census Tract 20.01,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### P2 Hispanic & Latino

In [118]:
# Census P2 (Hispanic or Latino) table for NYC.
# skiprows=[1] skips the line directly after the header row
# (presumably the human-readable column labels -- confirm against the file).
p2_csv_path = (
    f'{mypath}/DECENNIALPL2020.P2_2022-02-06T162813/'
    'DECENNIALPL2020.P2_data_with_overlays_2022-02-06T162757.csv'
)
p2_df = pd.read_csv(p2_csv_path, skiprows=[1])


In [119]:
# Strip the '1000000US' prefix so GEO_ID has the same 15-digit form as the
# GEOID20 key in the rosetta file. regex=False makes this a plain literal
# replacement whichever default for `regex` the installed pandas uses.
p2_df['GEO_ID'] = p2_df['GEO_ID'].str.replace('1000000US', '', regex=False)

In [120]:
# inspect the P2 table after stripping the GEO_ID prefix
p2_df

Unnamed: 0,GEO_ID,NAME,P2_001N,P2_002N,P2_003N,P2_004N,P2_005N,P2_006N,P2_007N,P2_008N,...,P2_064N,P2_065N,P2_066N,P2_067N,P2_068N,P2_069N,P2_070N,P2_071N,P2_072N,P2_073N
0,360050001000001,"Block 0001, Block Group 0, Census Tract 1, Bro...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,360050001000002,"Block 0002, Block Group 0, Census Tract 1, Bro...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,360050001001000,"Block 1000, Block Group 1, Census Tract 1, Bro...",171,75,96,95,8,87,0,0,...,0,0,0,0,0,0,0,0,0,0
3,360050001001001,"Block 1001, Block Group 1, Census Tract 1, Bro...",1344,475,869,861,145,676,6,32,...,0,0,0,0,0,0,0,0,0,0
4,360050001001002,"Block 1002, Block Group 1, Census Tract 1, Bro...",367,110,257,256,17,234,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37980,360859901000007,"Block 0007, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37981,360859901000008,"Block 0008, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37982,360859901000009,"Block 0009, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37983,360859901000010,"Block 0010, Block Group 0, Census Tract 9901, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
# NAME already arrives with the P1 table, so drop it here to avoid a
# duplicate (suffixed) NAME column after the second merge.
# Rebinding instead of inplace=True, together with errors='ignore', lets this
# cell be re-run without raising a KeyError once NAME is gone.
p2_df = p2_df.drop(columns=['NAME'], errors='ignore')

In [122]:
## Attach the P2 columns to the rosetta + P1 frame.
## A left join keeps every existing row, whether or not it has a P2 match.
rosetta_p1_p2 = pd.merge(
    rosetta_p1,
    p2_df,
    how='left',
    left_on='GEOID20',
    right_on='GEO_ID',
)

In [123]:
# inspect the combined frame; GEO_ID came in from both merges, so it now
# appears with _x / _y suffixes
rosetta_p1_p2

Unnamed: 0,GEOID20,precinct,GEO_ID_x,NAME,P1_001N,P1_002N,P1_003N,P1_004N,P1_005N,P1_006N,...,P2_064N,P2_065N,P2_066N,P2_067N,P2_068N,P2_069N,P2_070N,P2_071N,P2_072N,P2_073N
0,360050054003000,43,360050054003000,"Block 3000, Block Group 3, Census Tract 54, Br...",538,473,32,92,19,23,...,0,0,0,0,0,0,0,0,0,0
1,360050442003002,47,360050442003002,"Block 3002, Block Group 3, Census Tract 442, B...",155,129,3,117,2,4,...,0,0,0,0,0,0,0,0,0,0
2,360050462083001,47,360050462083001,"Block 3001, Block Group 3, Census Tract 462.08...",264,250,10,215,0,0,...,0,0,0,0,0,0,0,0,0,0
3,360050296001007,49,360050296001007,"Block 1007, Block Group 1, Census Tract 296, B...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,360050169001005,42,360050169001005,"Block 1005, Block Group 1, Census Tract 169, B...",370,347,19,221,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37375,360850208053004,123,360850208053004,"Block 3004, Block Group 3, Census Tract 208.05...",124,113,111,0,0,2,...,0,0,0,0,0,0,0,0,0,0
37376,360850277042003,121,360850277042003,"Block 2003, Block Group 2, Census Tract 277.04...",246,243,131,7,0,75,...,0,0,0,0,0,0,0,0,0,0
37377,360850244013006,123,360850244013006,"Block 3006, Block Group 3, Census Tract 244.01...",96,90,84,1,1,1,...,0,0,0,0,0,0,0,0,0,0
37378,360850020011014,120,360850020011014,"Block 1014, Block Group 1, Census Tract 20.01,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
# Both merges brought along a GEO_ID column (suffixed _x and _y by pandas);
# GEOID20 already holds the same key, so remove them.
# Rebinding instead of inplace=True, together with errors='ignore', lets this
# cell be re-run without raising a KeyError once the columns are gone.
rosetta_p1_p2 = rosetta_p1_p2.drop(columns=['GEO_ID_x', 'GEO_ID_y'], errors='ignore')

In [125]:
# confirm the GEO_ID_x / GEO_ID_y columns are gone
rosetta_p1_p2

Unnamed: 0,GEOID20,precinct,NAME,P1_001N,P1_002N,P1_003N,P1_004N,P1_005N,P1_006N,P1_007N,...,P2_064N,P2_065N,P2_066N,P2_067N,P2_068N,P2_069N,P2_070N,P2_071N,P2_072N,P2_073N
0,360050054003000,43,"Block 3000, Block Group 3, Census Tract 54, Br...",538,473,32,92,19,23,0,...,0,0,0,0,0,0,0,0,0,0
1,360050442003002,47,"Block 3002, Block Group 3, Census Tract 442, B...",155,129,3,117,2,4,0,...,0,0,0,0,0,0,0,0,0,0
2,360050462083001,47,"Block 3001, Block Group 3, Census Tract 462.08...",264,250,10,215,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,360050296001007,49,"Block 1007, Block Group 1, Census Tract 296, B...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,360050169001005,42,"Block 1005, Block Group 1, Census Tract 169, B...",370,347,19,221,0,2,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37375,360850208053004,123,"Block 3004, Block Group 3, Census Tract 208.05...",124,113,111,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
37376,360850277042003,121,"Block 2003, Block Group 2, Census Tract 277.04...",246,243,131,7,0,75,0,...,0,0,0,0,0,0,0,0,0,0
37377,360850244013006,123,"Block 3006, Block Group 3, Census Tract 244.01...",96,90,84,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
37378,360850020011014,120,"Block 1014, Block Group 1, Census Tract 20.01,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [126]:
## Count rows whose GEOID20 repeats an earlier row (0 means each block appears once)
rosetta_p1_p2['GEOID20'].duplicated().sum()

0

In [133]:
## Write the block-level table (precinct plus P1/P2 columns) to disk, without the index
block_level_csv = '../data/nyc/nyc_block_precinct_2020pop.csv'
rosetta_p1_p2.to_csv(block_level_csv, index=False)

In [128]:
import numpy as np

In [129]:
# Sum every numeric census column within each precinct.
# The text columns (GEOID20, NAME) are excluded explicitly instead of relying
# on pandas to discard them during aggregation, which newer pandas versions no
# longer do silently. precinct is excluded because it is the grouping key.
# aggfunc="sum" replaces np.sum, which newer pandas flags as deprecated when
# passed as a callable.
count_columns = list(
    rosetta_p1_p2.select_dtypes(include='number').columns.difference(['precinct'])
)
pivot = pd.pivot_table(rosetta_p1_p2, index="precinct", values=count_columns, aggfunc="sum")

In [130]:
# inspect the per-precinct totals (precinct is the index at this point)
pivot

Unnamed: 0_level_0,P1_001N,P1_002N,P1_003N,P1_004N,P1_005N,P1_006N,P1_007N,P1_008N,P1_009N,P1_010N,...,P2_064N,P2_065N,P2_066N,P2_067N,P2_068N,P2_069N,P2_070N,P2_071N,P2_072N,P2_073N
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,84799,76937,57501,2655,96,14624,55,2006,7862,7409,...,1,0,0,0,0,0,0,0,0,0
5,50598,47663,13363,3556,109,27507,69,3059,2935,2749,...,0,1,0,1,0,0,0,0,0,0
6,64643,59481,50320,1782,72,5635,34,1638,5162,4867,...,0,0,0,0,0,0,0,0,0,0
7,57985,51746,18416,6934,396,17779,46,8175,6239,5672,...,0,0,0,0,0,0,0,0,0,0
9,75951,68112,41843,6610,480,11322,84,7773,7839,7193,...,0,3,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,179134,154691,26261,10054,4894,34726,119,78637,24443,23033,...,1,2,0,2,0,0,0,0,2,2
120,122308,109715,46574,28782,1410,14002,63,18884,12593,11604,...,0,1,0,1,0,0,0,0,0,0
121,128149,117516,61832,17664,1517,19670,71,16762,10633,9892,...,0,2,0,2,0,0,0,0,0,0
122,144552,135413,104075,4494,464,18771,66,7543,9139,8581,...,0,2,0,2,0,0,0,0,0,0


In [131]:
# turn the precinct index back into an ordinary column
pivot = pivot.reset_index()

In [132]:
# confirm precinct is now a regular column alongside the totals
pivot

Unnamed: 0,precinct,P1_001N,P1_002N,P1_003N,P1_004N,P1_005N,P1_006N,P1_007N,P1_008N,P1_009N,...,P2_064N,P2_065N,P2_066N,P2_067N,P2_068N,P2_069N,P2_070N,P2_071N,P2_072N,P2_073N
0,1,84799,76937,57501,2655,96,14624,55,2006,7862,...,1,0,0,0,0,0,0,0,0,0
1,5,50598,47663,13363,3556,109,27507,69,3059,2935,...,0,1,0,1,0,0,0,0,0,0
2,6,64643,59481,50320,1782,72,5635,34,1638,5162,...,0,0,0,0,0,0,0,0,0,0
3,7,57985,51746,18416,6934,396,17779,46,8175,6239,...,0,0,0,0,0,0,0,0,0,0
4,9,75951,68112,41843,6610,480,11322,84,7773,7839,...,0,3,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,115,179134,154691,26261,10054,4894,34726,119,78637,24443,...,1,2,0,2,0,0,0,0,2,2
73,120,122308,109715,46574,28782,1410,14002,63,18884,12593,...,0,1,0,1,0,0,0,0,0,0
74,121,128149,117516,61832,17664,1517,19670,71,16762,10633,...,0,2,0,2,0,0,0,0,0,0
75,122,144552,135413,104075,4494,464,18771,66,7543,9139,...,0,2,0,2,0,0,0,0,0,0


In [134]:
# write the precinct-level totals to disk, without the index
precinct_level_csv = '../data/nyc/nyc_precinct_2020pop.csv'
pivot.to_csv(precinct_level_csv, index=False)

Done!