# Total Population Change
- https://censo.estadisticas.pr/EstimadosPoblacionales

In [110]:
import numpy as np
import pandas as pd

In [111]:
# Import the pop change factors Excel and clean useless formatting rows.
pob_total_df = pd.read_excel("../../data/raw/po-pr-change-factors.xlsx", sheet_name="Pob. Total", skiprows=[0, 1, 2])
pob_total_df = pob_total_df[:-5]
pob_total_df = pob_total_df.drop(56)
pob_total_df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,2020,2021,2022
0,United States,331449520.0,331511512.0,332031554.0,333287557.0
1,Northeast,57609156.0,57448898.0,57259257.0,57040406.0
2,Midwest,68985537.0,68961043.0,68836505.0,68787595.0
3,South,126266262.0,126450613.0,127346029.0,128716192.0
4,West,78588565.0,78650958.0,78589763.0,78743364.0
5,.Alabama,5024356.0,5031362.0,5049846.0,5074296.0
6,.Alaska,733378.0,732923.0,734182.0,733583.0
7,.Arizona,7151507.0,7179943.0,7264877.0,7359197.0
8,.Arkansas,3011555.0,3014195.0,3028122.0,3045637.0
9,.California,39538245.0,39501653.0,39142991.0,39029342.0


In [112]:
# Drop all rows in pob_total_df except row 57 (Puerto Rico).
pob_total_df = pob_total_df.drop(pob_total_df.index[0:56]) if len(pob_total_df.index) > 1 else pob_total_df
pob_total_df

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,2020,2021,2022
57,Puerto Rico,3285874.0,3281557.0,3262693.0,3221789.0


In [113]:
# Clean up column names.
pob_total_df.columns = ["State", "2021", "2020", "2021", "2022"]
pob_total_df.reset_index(drop=True, inplace=True)
pob_total_df

Unnamed: 0,State,2021,2020,2021.1,2022
0,Puerto Rico,3285874.0,3281557.0,3262693.0,3221789.0


# Population Change Factors

In [114]:
# Import the pop change factors Excel and clean useless formatting rows.
pob_change_factors = pd.read_excel("../../data/raw/po-pr-change-factors.xlsx", sheet_name="Componentes de cambio", skiprows=4)
pob_change_factors = pob_change_factors[:-8]
pob_change_factors

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Natural\nChange,Births,Deaths,Total,International2,Domestic,Unnamed: 8,Natural\nChange.1,Births.1,Deaths.1,Total.1,International2.1,Domestic.1
0,United States,1256003.0,245080.0,3688179.0,3443099.0,1010923.0,1010923.0,(X),1838037.0,431192.0,8164738.0,7733546.0,1406845.0,1406845.0,(X)
1,Northeast,-218851.0,29215.0,593222.0,564007.0,-246272.0,214464.0,-460736,-568750.0,-5705.0,1312124.0,1317829.0,-553194.0,299344.0,-852538
2,Midwest,-48910.0,-8638.0,759934.0,768572.0,-28781.0,145268.0,-174049,-197942.0,-19096.0,1697765.0,1716861.0,-192192.0,203644.0,-395836
3,South,1370163.0,70098.0,1476296.0,1406198.0,1282675.0,414740.0,867935,2449930.0,125506.0,3250442.0,3124936.0,2316894.0,578919.0,1737975
4,West,153601.0,154405.0,858727.0,704322.0,3301.0,236451.0,-233150,154799.0,330487.0,1904407.0,1573920.0,-164663.0,324938.0,-489601
5,.Alabama,24450.0,-8590.0,58280.0,66870.0,33206.0,4597.0,28609,49940.0,-21832.0,129259.0,151091.0,71884.0,6529.0,65355
6,.Alaska,-599.0,3092.0,9488.0,6396.0,-3755.0,2371.0,-6126,205.0,8432.0,21355.0,12923.0,-8106.0,3306.0,-11412
7,.Arizona,94320.0,-566.0,78763.0,79329.0,93203.0,22219.0,70984,207690.0,-5532.0,172257.0,177789.0,212851.0,30489.0,182362
8,.Arkansas,17515.0,-3936.0,36251.0,40187.0,21523.0,3314.0,18209,34082.0,-8980.0,79874.0,88854.0,42809.0,4754.0,38055
9,.California,-113649.0,106155.0,424652.0,318497.0,-217515.0,125715.0,-343230,-508903.0,201459.0,939969.0,738510.0,-699904.0,171223.0,-871127


In [115]:
# Drop all rows in pob_total_df except row 57 (Puerto Rico).
pob_change_factors = pob_change_factors.drop(pob_change_factors.index[0:57]) if len(pob_change_factors.index) > 1 else pob_change_factors
pob_total_df.reset_index(drop=True, inplace=True)
pob_change_factors

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Natural\nChange,Births,Deaths,Total,International2,Domestic,Unnamed: 8,Natural\nChange.1,Births.1,Deaths.1,Total.1,International2.1,Domestic.1
57,Puerto Rico,-40904.0,-14457.0,19645.0,34102.0,-26447.0,-26447.0,(X),-64085.0,-31760.0,42984.0,74744.0,-32325.0,-32325.0,(X)


In [116]:
# Turn this into a MultiIndex DataFrame.
level1 = ['Annual Change', 'Cumulative Change']
level2 = ['Total Change', 'Natural Change', 'Births', 'Deaths', 'Net Migration (Total)', 'Net Migration (International)', 'Net Migration (Domestic)']
labels = [(l1, l2) for l1 in level1 for l2 in level2]
columns = pd.MultiIndex.from_tuples(labels)
pob_change_factors.columns = columns.insert(0, 'State')
pob_change_factors.reset_index(drop=True, inplace=True)
pob_change_factors

Unnamed: 0_level_0,State,Annual Change,Annual Change,Annual Change,Annual Change,Annual Change,Annual Change,Annual Change,Cumulative Change,Cumulative Change,Cumulative Change,Cumulative Change,Cumulative Change,Cumulative Change,Cumulative Change
Unnamed: 0_level_1,Unnamed: 1_level_1,Total Change,Natural Change,Births,Deaths,Net Migration (Total),Net Migration (International),Net Migration (Domestic),Total Change,Natural Change,Births,Deaths,Net Migration (Total),Net Migration (International),Net Migration (Domestic)
0,Puerto Rico,-40904.0,-14457.0,19645.0,34102.0,-26447.0,-26447.0,(X),-64085.0,-31760.0,42984.0,74744.0,-32325.0,-32325.0,(X)


In [117]:
# Puerto Rico doesn't have domestic migration, so we'll drop that column.
pob_change_factors = pob_change_factors.drop(columns=[('Annual Change', 'Net Migration (Domestic)'), ('Cumulative Change', 'Net Migration (Domestic)')])
pob_change_factors

Unnamed: 0_level_0,State,Annual Change,Annual Change,Annual Change,Annual Change,Annual Change,Annual Change,Cumulative Change,Cumulative Change,Cumulative Change,Cumulative Change,Cumulative Change,Cumulative Change
Unnamed: 0_level_1,Unnamed: 1_level_1,Total Change,Natural Change,Births,Deaths,Net Migration (Total),Net Migration (International),Total Change,Natural Change,Births,Deaths,Net Migration (Total),Net Migration (International)
0,Puerto Rico,-40904.0,-14457.0,19645.0,34102.0,-26447.0,-26447.0,-64085.0,-31760.0,42984.0,74744.0,-32325.0,-32325.0


# Final Results
Two datasets are extracted from multiple Excel sheets.

In [119]:
# Output the DataFrame to CSV, and overwrite if the file already exists.
pob_total_df.to_csv("../../data/clean/po-pr-total-change-clean.csv", index=False)
pob_change_factors.to_csv("../../data/clean/po-pr-change-factors-clean.csv", index=False)