In [1]:
% matplotlib inline
import pandas as pd
import math
import matplotlib.pyplot as plt
import os
from datetime import datetime as dtdt

In [3]:
# List the available metadata files. os.listdir returns entries in arbitrary
# order, so sort them to keep the cell output stable between runs.
sorted(os.listdir('./metadata/'))

['ANC1P.txt', 'LANP.txt', 'PUMA.txt', 'POB.txt', 'POBP.txt']

In [4]:
# Build lookup tables that translate PUMS codes into human-readable labels.
# The LANP/POBP/ANC1P metadata files are parsed as fixed-width text: a code,
# a two-character gap, then the label (offsets: 4 -> 6 for LANP, 3 -> 5 for
# POBP and ANC1P).


def _read_lines(path):
    """Return every line of the text file at `path`, newlines included."""
    with open(path, 'r') as handle:
        return handle.readlines()


def _code_map(lines, code_width):
    """Build a {code: label} dict from fixed-width metadata lines.

    The code is the first `code_width` characters of a line; the label starts
    two characters after the code and is stripped of surrounding whitespace.
    Blank lines are skipped so they do not add junk keys to the mapping.
    """
    return dict((ln[:code_width], ln[code_width + 2:].strip())
                for ln in lines if ln.strip())


def _lookup_frame(code_map, column, index_name):
    """Turn a {code: label} dict into a one-column DataFrame indexed by code."""
    frame = pd.Series(code_map, name=column).to_frame()
    frame.index.name = index_name
    return frame


lanp = _read_lines('./metadata/LANP.txt')
LanpMap = _code_map(lanp, 4)

pobp = _read_lines('./metadata/POBP.txt')
PobpMap = _code_map(pobp, 3)

ancp = _read_lines('./metadata/ANC1P.txt')
AncpMap = _code_map(ancp, 3)

LanpMapDf = _lookup_frame(LanpMap, 'language', 'LANP')
PobpMapDf = _lookup_frame(PobpMap, 'birthplace', 'POBP')
# The index carries the name of the column it is joined against (ANC1P, as in
# the metadata file name) rather than the previous, inconsistent 'ANCP'.
AncpMapDf = _lookup_frame(AncpMap, 'ancestry', 'ANC1P')

# PUMA.txt layout per the slices below: state code in columns 0-1, PUMA code
# in columns 3-7, area name from column 9 onwards.
puma = _read_lines('./metadata/PUMA.txt')
PumaMapDf = pd.DataFrame(
    [{'state': ln[:2], 'puma': ln[3:8], 'name': ln[9:].strip()}
     for ln in puma if ln.strip()],
    # Passing columns= fixes the column order and keeps an empty file from
    # raising a KeyError on column selection.
    columns=['state', 'puma', 'name'],
)

# Per-state PUMA-code -> name tables (state codes '36' and '06').
NyPumaMapDf = PumaMapDf[PumaMapDf['state'] == '36'].set_index('puma')[['name']]
CaPumaMapDf = PumaMapDf[PumaMapDf['state'] == '06'].set_index('puma')[['name']]

# PUMA codes whose area name starts with the given metro prefix.
pumasNYC = NyPumaMapDf[NyPumaMapDf['name'].str.startswith('NYC')].index
pumasLA = CaPumaMapDf[CaPumaMapDf['name'].str.startswith('Los Angeles County')].index

In [5]:
# Load the state-06 person-level PUMS extract and convert the code columns to
# fixed-width strings so they match the keys of the metadata lookup tables.
# Progress lines use a single pre-formatted string so the printed text is the
# same under Python 2 and Python 3.
print('%s read file' % dtdt.now())
CaPums = pd.read_csv('data/csv_pca/psam_p06.csv')
print('%s format columns' % dtdt.now())
# Missing LANP values map to the 'bbbb' key. Present values go through int()
# instead of slicing the float's repr; already-formatted strings pass through
# unchanged, so the cell can be re-run safely.
# NOTE(review): codes are zero-padded to 4 digits; every code shown in the
# recorded output is already 4 digits wide - confirm for other vintages.
CaPums['LANP'] = CaPums['LANP'].apply(
    lambda n: 'bbbb' if pd.isnull(n) or n == 'bbbb' else '%04d' % int(float(n)))
# int(n) accepts both the raw integers and previously formatted strings, which
# keeps these conversions idempotent on re-run.
CaPums['POBP'] = CaPums['POBP'].apply(lambda n: '%03d' % int(n))
CaPums['ANC1P'] = CaPums['ANC1P'].apply(lambda n: '%03d' % int(n))
CaPums['PUMA'] = CaPums['PUMA'].apply(lambda n: '%05d' % int(n))
print('%s done' % dtdt.now())

CaPums

2021-08-12 18:47:53.955952 read file
2021-08-12 18:48:24.383988 format columns
2021-08-12 18:48:28.587865 done


Unnamed: 0,RT,SERIALNO,DIVISION,SPORDER,PUMA,REGION,ST,ADJINC,PWGTP,AGEP,...,PWGTP71,PWGTP72,PWGTP73,PWGTP74,PWGTP75,PWGTP76,PWGTP77,PWGTP78,PWGTP79,PWGTP80
0,P,2019GQ0000003,9,1,03704,4,6,1010145,21,58,...,0,40,0,20,41,40,41,21,0,2
1,P,2019GQ0000009,9,1,07322,4,6,1010145,34,66,...,62,58,33,7,59,7,34,34,32,34
2,P,2019GQ0000013,9,1,05904,4,6,1010145,28,18,...,27,27,49,49,7,27,27,29,6,5
3,P,2019GQ0000023,9,1,07107,4,6,1010145,127,58,...,117,234,226,135,210,217,128,116,125,21
4,P,2019GQ0000024,9,1,08900,4,6,1010145,103,18,...,180,22,104,104,100,182,24,103,103,23
5,P,2019GQ0000048,9,1,03729,4,6,1010145,6,65,...,0,2,7,7,0,15,8,14,14,6
6,P,2019GQ0000055,9,1,11105,4,6,1010145,4,21,...,0,3,7,0,0,4,5,9,3,4
7,P,2019GQ0000064,9,1,03703,4,6,1010145,81,54,...,7,154,80,7,5,81,83,83,82,85
8,P,2019GQ0000070,9,1,11104,4,6,1010145,16,82,...,0,15,0,16,34,17,33,0,15,31
9,P,2019GQ0000074,9,1,02901,4,6,1010145,58,38,...,57,101,60,104,12,13,59,58,56,106


In [None]:
# Load the state-36 person-level PUMS extract and convert the code columns to
# fixed-width strings so they match the keys of the metadata lookup tables.
# Progress lines use a single pre-formatted string so the printed text is the
# same under Python 2 and Python 3.
print('%s read file' % dtdt.now())
NyPums = pd.read_csv('data/csv_pny/psam_p36.csv')
print('%s format columns' % dtdt.now())
# Missing LANP values map to the 'bbbb' key. Present values go through int()
# instead of slicing the float's repr; already-formatted strings pass through
# unchanged, so the cell can be re-run safely.
# NOTE(review): codes are zero-padded to 4 digits - confirm that every LANP
# code in this extract is 4 digits wide.
NyPums['LANP'] = NyPums['LANP'].apply(
    lambda n: 'bbbb' if pd.isnull(n) or n == 'bbbb' else '%04d' % int(float(n)))
# int(n) accepts both the raw integers and previously formatted strings, which
# keeps these conversions idempotent on re-run.
NyPums['POBP'] = NyPums['POBP'].apply(lambda n: '%03d' % int(n))
NyPums['ANC1P'] = NyPums['ANC1P'].apply(lambda n: '%03d' % int(n))
NyPums['PUMA'] = NyPums['PUMA'].apply(lambda n: '%05d' % int(n))
print('%s done' % dtdt.now())

NyPums

2021-08-12 18:48:49.954534 read file


In [None]:
# Restrict each state frame to the rows whose PUMA belongs to the metro area,
# keeping only the code columns and the PWGTP column.
subsetCols = ['LANP', 'POBP', 'ANC1P', 'PUMA', 'PWGTP']
inNYC = NyPums['PUMA'].isin(pumasNYC)
inLA = CaPums['PUMA'].isin(pumasLA)
nycSub = NyPums.loc[inNYC, subsetCols]
laSub = CaPums.loc[inLA, subsetCols]

In [180]:
# Attach the language, birthplace, ancestry and PUMA-name labels to the LA
# subset, then write it out ordered by PWGTP, largest first.
# NOTE(review): each set_index replaces the previous index, so the LANP, POBP
# and ANC1P code values are not carried into the CSV - only their joined
# labels, plus PUMA (restored by reset_index). Confirm this is intended.
laLabeled = (
    laSub
    .set_index('LANP').join(LanpMapDf)
    .set_index('POBP').join(PobpMapDf)
    .set_index('ANC1P').join(AncpMapDf)
    .set_index('PUMA').join(CaPumaMapDf)
)
laLabeled.sort_values('PWGTP', ascending=False).reset_index().to_csv(
    'data/pums_micro_LA.csv', index=False)

In [185]:
# Attach the language, birthplace, ancestry and PUMA-name labels to the NYC
# subset, then write it out ordered by PWGTP, largest first.
# NOTE(review): each set_index replaces the previous index, so the LANP, POBP
# and ANC1P code values are not carried into the CSV - only their joined
# labels, plus PUMA (restored by reset_index). Confirm this is intended.
nycLabeled = (
    nycSub
    .set_index('LANP').join(LanpMapDf)
    .set_index('POBP').join(PobpMapDf)
    .set_index('ANC1P').join(AncpMapDf)
    .set_index('PUMA').join(NyPumaMapDf)
)
nycLabeled.sort_values('PWGTP', ascending=False).reset_index().to_csv(
    'data/pums_micro_NYC.csv', index=False)

In [129]:
# Sum PWGTP per LANP code, order from largest to smallest, and attach the
# language label for each code.
lanpTotals = CaPums.groupby('LANP')[['PWGTP']].sum()
lanpTotals.sort_values('PWGTP', ascending=False).join(LanpMapDf)

Unnamed: 0_level_0,PWGTP,language
LANP,Unnamed: 1_level_1,Unnamed: 2_level_1
bbbb,22856501,N/A (GQ/vacant)
1200,10525320,Spanish
2920,661574,Tagalog
1970,604235,Chinese
1960,551489,Vietnamese
2575,372142,Korean
2050,279079,Cantonese
2000,265349,Mandarin
1290,194171,Farsi
1288,192922,Armenian


In [127]:
# Sum PWGTP per POBP code, order from largest to smallest, and attach the
# birthplace label for each code.
pobpTotals = CaPums.groupby('POBP')[['PWGTP']].sum()
pobpTotals.sort_values('PWGTP', ascending=False).join(PobpMapDf)

Unnamed: 0_level_0,PWGTP,birthplace
POBP,Unnamed: 1_level_1,Unnamed: 2_level_1
006,21468960,California/CA
303,4306098,Mexico
233,884983,Philippines
207,624326,China
036,615544,New York/NY
247,529559,Vietnam
048,481798,Texas/TX
210,474849,India
017,453858,Illinois/IL
312,437385,El Salvador


In [126]:
# Sum PWGTP per ANC1P code, order from largest to smallest, and attach the
# ancestry label for each code.
ancTotals = CaPums.groupby('ANC1P')[['PWGTP']].sum()
ancTotals.sort_values('PWGTP', ascending=False).join(AncpMapDf)

Unnamed: 0_level_0,PWGTP,ancestry
ANC1P,Unnamed: 1_level_1,Unnamed: 2_level_1
210,8311824,Mexican
999,4285350,Not reported
032,1916228,German
902,1465658,African American
050,1458112,Irish
211,1405198,Mexican American
022,1346902,English
706,1314693,Chinese
924,1254234,White
720,1205589,Filipino


In [131]:
# Sum PWGTP per PUMA code, order from largest to smallest, and attach the
# area name for each code.
pumaTotals = CaPums.groupby('PUMA')[['PWGTP']].sum()
pumaTotals.sort_values('PWGTP', ascending=False).join(CaPumaMapDf)

Unnamed: 0_level_0,PWGTP,name
PUMA,Unnamed: 1_level_1,Unnamed: 2_level_1
05904,235455,Orange County (Central)--Irvine City (Central)
00110,229118,"Alameda County (East)--Livermore, Pleasanton &..."
02902,216503,Kern County (Central)--Bakersfield City (West)
07703,216196,"San Joaquin County (South)--Tracy, Manteca & L..."
05905,213206,"Orange County (Northeast)--Lake Forest, Irvine..."
11300,212251,"Yolo County--Davis, Woodland & West Sacramento..."
08303,209226,Santa Barbara County--South Coast Region
05903,208559,"Orange County (West Central)--Newport Beach, A..."
08501,207446,"Santa Clara County (Northwest)--Mountain View,..."
03729,206730,Los Angeles County (West Central)--LA City (We...
