In [1]:
import pandas as pd

### Load datasets

In [54]:
# LEA-to-tract table; the preview below shows one row per (LEAID, TRACT) pair.
df_lea_tract = pd.read_csv(filepath_or_buffer="../data/lea_tract.csv")

In [55]:
df_lea_tract.head()

Unnamed: 0,LEAID,NAME_LEA19,TRACT
0,100001,Fort Rucker School District (AL),1031010300
1,100001,Fort Rucker School District (AL),1045020000
2,100003,Maxwell AFB School District (AL),1101000900
3,100003,Maxwell AFB School District (AL),1101001000
4,100003,Maxwell AFB School District (AL),1101006000


In [56]:
df_lea_tract.shape

(113520, 3)

#### Centers of Population by Census Tract

https://www.census.gov/geographies/reference-files/2010/geo/2010-centers-population.html

In [4]:
# 2010 Census centers-of-population table by tract (source linked above).
df_tx_pop = pd.read_csv(filepath_or_buffer="../data/CenPop2010_Mean_TR48")

In [5]:
df_tx_pop.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,POPULATION,LATITUDE,LONGITUDE
0,48,1,950100,4685,31.999364,-95.531821
1,48,1,950401,5422,31.755614,-95.823901
2,48,1,950402,7535,31.784637,-95.902841
3,48,1,950500,4377,31.776938,-95.631712
4,48,1,950600,6405,31.747594,-95.666055


In [23]:
df_tx_pop.shape

(5265, 6)

### Preprocessing lea_tract

In [57]:
# derive tract, state and county code columns from the TRACT GEOID in df_lea_tract
# (Texas rows, state code 48, are selected on STATEFP1 in the next cell)

def select_tract_only(row):
    """Return the tract portion of a tract GEOID as an int.

    The tract portion is the last six characters of ``str(row)``; converting
    to int drops any leading zeros (e.g. ``000400`` becomes ``400``).
    """
    geoid_text = str(row)
    tract_digits = geoid_text[-6:]
    return int(tract_digits)

def select_tx_only(row):
    """Return the state FIPS code of a tract GEOID as an int.

    A full tract GEOID is 11 characters: 2 (state) + 3 (county) + 6 (tract).
    When the GEOID has been parsed as an integer, states whose FIPS code is
    below 10 lose their leading zero and the value is only 10 digits long
    (e.g. ``1031010300``). The value is therefore left-padded with zeros to
    11 characters before slicing, so the state is read from the right place.

    Parameters
    ----------
    row : int or str
        A single tract GEOID value.

    Returns
    -------
    int
        The state FIPS code (e.g. 48 for ``48107950100``, 1 for ``1031010300``).
    """
    geoid = str(row).zfill(11)  # restore a leading zero dropped by integer parsing
    return int(geoid[:2])

def select_county(row):
    """Return the county FIPS code of a tract GEOID as an int.

    A full tract GEOID is 11 characters: 2 (state) + 3 (county) + 6 (tract).
    When the GEOID has been parsed as an integer, states whose FIPS code is
    below 10 lose their leading zero and the value is only 10 digits long
    (e.g. ``1031010300``). The value is therefore left-padded with zeros to
    11 characters before slicing, so the county is read from positions 2-4.

    Parameters
    ----------
    row : int or str
        A single tract GEOID value.

    Returns
    -------
    int
        The county FIPS code (e.g. 107 for ``48107950100``, 31 for
        ``1031010300``).
    """
    geoid = str(row).zfill(11)  # restore a leading zero dropped by integer parsing
    return int(geoid[2:5])

# Add one derived column per GEOID component, each computed element-wise
# from the TRACT column by the matching extractor function.
tract_geoids = df_lea_tract['TRACT']
for column, extractor in (
    ('TRACT_ONLY', select_tract_only),
    ('STATEFP1', select_tx_only),
    ('COUNTYFP1', select_county),
):
    df_lea_tract[column] = tract_geoids.apply(extractor)

In [58]:
# Keep only the rows whose derived state code is 48 (Texas).
is_texas = df_lea_tract['STATEFP1'].eq(48)
df_lea_tract_tx = df_lea_tract.loc[is_texas]
df_lea_tract_tx.shape

(8660, 6)

In [59]:
df_lea_tract_tx.head()

Unnamed: 0,LEAID,NAME_LEA19,TRACT,TRACT_ONLY,STATEFP1,COUNTYFP1
94939,4800001,Crosbyton Consolidated Independent School Dist...,48107950100,950100,48,107
94940,4800001,Crosbyton Consolidated Independent School Dist...,48107950200,950200,48,107
94941,4800001,Crosbyton Consolidated Independent School Dist...,48169950100,950100,48,169
94942,4800002,Spur Independent School District (TX),48125950300,950300,48,125
94943,4800002,Spur Independent School District (TX),48263950100,950100,48,263


In [65]:
# Attach each Texas tract's population and centre-of-population coordinates.
# A tract code is only unique within a county (e.g. 950100 appears in both
# county 107 and county 169 above), so the join key is the full
# (state, county, tract) triple, not the tract code alone.
df_result_practice = pd.merge(
    # Use the Texas-only frame built above rather than the national one; the
    # population table appears to hold only state 48 (per its preview), so the
    # inner join would otherwise be acting as an implicit state filter.
    df_lea_tract_tx,
    df_tx_pop,
    how='inner',
    left_on=['STATEFP1', 'COUNTYFP1', 'TRACT_ONLY'],
    right_on=['STATEFP', 'COUNTYFP', 'TRACTCE'],
    # Each tract must occur at most once in the population table; a duplicate
    # would silently double-count population in any later sum, so fail loudly.
    validate='many_to_one',
)

# clean up, drop the helper key columns that duplicate STATEFP/COUNTYFP/TRACTCE
df_result_practice = df_result_practice.drop(columns=['STATEFP1', 'COUNTYFP1', 'TRACT_ONLY'])

In [66]:
df_result_practice.shape

(8660, 9)

In [68]:
df_result_practice.sort_values('LEAID')

Unnamed: 0,LEAID,NAME_LEA19,TRACT,STATEFP,COUNTYFP,TRACTCE,POPULATION,LATITUDE,LONGITUDE
0,4800001,Crosbyton Consolidated Independent School Dist...,48107950100,48,107,950100,2234,33.648090,-101.227149
2,4800001,Crosbyton Consolidated Independent School Dist...,48107950200,48,107,950200,2346,33.678198,-101.381889
5,4800001,Crosbyton Consolidated Independent School Dist...,48169950100,48,169,950100,6461,33.198856,-101.385674
10,4800002,Spur Independent School District (TX),48263950100,48,263,950100,808,33.247856,-100.629728
8,4800002,Spur Independent School District (TX),48125950300,48,125,950300,2444,33.534982,-100.852578
...,...,...,...,...,...,...,...,...,...
1844,4846770,Zephyr Independent School District (TX),48049950300,48,49,950300,3143,31.768371,-98.837187
8286,4848285,Hallettsville Independent School District (9-1...,48285000400,48,285,400,3586,29.428260,-97.164082
5704,4848285,Hallettsville Independent School District (9-1...,48285000200,48,285,200,3544,29.450155,-96.966021
5212,4899130,Benavides Independent School District (TX),48131950500,48,131,950500,3178,27.527459,-98.397895


In [69]:
df_result_practice.LEAID.nunique()

1022

In [73]:
# Total the tract populations per school district (LEAID).
# NOTE: each figure is the combined population of ALL census tracts linked to
# the district, NOT the population of the school district itself.
df_result_pop_tx = (
    df_result_practice
    .groupby('LEAID')['POPULATION']
    .sum()
    .reset_index()
)

In [74]:
df_result_pop_tx.head()

Unnamed: 0,LEAID,POPULATION
0,4800001,11041
1,4800002,3252
2,4800003,7607
3,4800005,19119
4,4800006,1490


In [75]:
# Write the per-district totals to a CSV in the working directory, without the index.
df_result_pop_tx.to_csv(path_or_buf='tx_pop_by_leaid.csv', index=False)