<a href="https://colab.research.google.com/github/linyuehzzz/census_privacy/blob/main/franklin_microsim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive so the cells below can read and write
# files under "My Drive".
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


#### **Select variables**
Constraint variables:
- sex (SEX)
- race (RAC1P)
- age band (AGEP)  

Target variables: 
- disability (DIS)
- married, spouse present/spouse absent (MSP)
- mobility status (MIG)
- military service (MIL)
- educational attainment (SCHL)
- recoded field of degree - first entry (FOD1P)
- health insurance coverage (HICOV)
- private health insurance coverage recode (PRIVCOV)
- public health coverage recode (PUBCOV)
- total person's income (PINCP)
- income-to-poverty ratio recode (POVPIP)
- class of worker (COW)
- employment status recode (ESR)
- Standard Occupational Classification (SOC) codes for 2018 and later based on 2018 SOC codes (SOCP)
- travel time to work (JWMNP)
- vehicle occupancy (JWRIP)
- means of transportation to work (JWTRNS)

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
import pandas as pd

# Read the full PUMS extract.
filename_pums = 'franklin/pums/franklin_pums10.csv'
data_pums = pd.read_csv(filename_pums)

# Columns to keep: the PUMA identifier, the constraint variables, and the
# target variables.
geography_columns = ["PUMAID"]
constraint_columns = ["SEX", "RAC1P", "AGEP"]
target_columns = [
    "DIS", "MSP", "MIG", "MIL", "SCHL", "FOD1P",
    "HICOV", "PRIVCOV", "PUBCOV", "PINCP", "POVPIP",
    "COW", "ESR", "SOCP", "JWMNP", "JWRIP", "JWTRNS",
]
selected_columns = geography_columns + constraint_columns + target_columns
pums_subset = data_pums.loc[:, selected_columns]

# Save the reduced table (without the row index) and display it.
pums_subset.to_csv("franklin/pums/franklin_pums10v2.csv", index=False)
pums_subset

/content/gdrive/My Drive/Colab Notebooks/census_privacy


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,PUMAID,SEX,RAC1P,AGEP,DIS,MSP,MIG,MIL,SCHL,FOD1P,HICOV,PRIVCOV,PUBCOV,PINCP,POVPIP,COW,ESR,SOCP,JWMNP,JWRIP,JWTRNS
0,3904103,2,1,48,2,1.0,1.0,4.0,21.0,6203.0,1,1,2,15000.0,501.0,1.0,1.0,132052,20.0,1.0,1.0
1,3904103,1,1,48,2,1.0,1.0,4.0,21.0,6207.0,1,1,2,155000.0,501.0,1.0,1.0,1191XX,15.0,1.0,1.0
2,3904103,1,1,18,2,6.0,1.0,4.0,14.0,,1,1,2,0.0,501.0,,6.0,,,,
3,3904103,1,1,16,2,6.0,1.0,,12.0,,1,1,2,0.0,501.0,,6.0,,,,
4,3904103,1,1,14,2,,1.0,,10.0,,1,1,2,,501.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52846,3904102,2,1,63,2,4.0,1.0,4.0,19.0,,1,1,2,37000.0,281.0,1.0,1.0,311131,20.0,1.0,1.0
52847,3904102,2,1,36,2,4.0,3.0,4.0,19.0,,1,1,2,25800.0,197.0,1.0,1.0,439061,10.0,1.0,1.0
52848,3904106,1,1,94,1,6.0,1.0,2.0,19.0,,1,1,1,152100.0,501.0,,6.0,,,,
52849,3904105,2,1,46,2,6.0,1.0,4.0,21.0,5502.0,1,1,2,83000.0,501.0,4.0,1.0,291141,15.0,1.0,1.0


### **Microsimulation**

First, we read the microdata (1163414 records) with only the constraint variables (race/sex/age band). This data set is an enumeration of the population in Franklin County, which is partitioned into 22,826 blocks.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
import pandas as pd

# Load the enumeration microdata into `data_c` (used by the matching and
# sampling cells below) and display it.
filename_c = 'franklin/microdata/franklin_people_all.csv'
data_c = pd.read_csv(filename_c)
data_c

/content/gdrive/My Drive/Colab Notebooks/census_privacy


Unnamed: 0,GEOID10,race,sex,age,age_v,PUMA
0,390490001101001,1,1,9,28,3904102
1,390490001101001,7,2,9,27,3904102
2,390490001101002,1,2,1,0,3904102
3,390490001101002,1,2,1,3,3904102
4,390490001101002,1,2,2,7,3904102
...,...,...,...,...,...,...
1163409,390490107001020,1,2,21,75,3904102
1163410,390490107001020,1,1,23,95,3904102
1163411,390490107001020,1,2,23,97,3904102
1163412,390499800001027,1,1,20,70,3904106


We then read the PUMS, which covers approximately 5% of the population (52851 records) in Franklin County. This data set is at the PUMA level, which partitions Franklin County into 11 PUMAs. Compared to the previous data set, it contains many additional variables of interest.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
# Reload the reduced PUMS file (franklin_pums10v2.csv) into `data_pums`,
# replacing any earlier `data_pums`, and display it.
filename_pums = 'franklin/pums/franklin_pums10v2.csv'
data_pums = pd.read_csv(filename_pums)
data_pums

/content/gdrive/My Drive/Colab Notebooks/census_privacy


Unnamed: 0,PUMAID,SEX,RAC1P,AGEP,DIS,MSP,MIG,MIL,SCHL,FOD1P,HICOV,PRIVCOV,PUBCOV,PINCP,POVPIP,COW,ESR,SOCP,JWMNP,JWRIP,JWTRNS
0,3904103,2,1,48,2,1.0,1.0,4.0,21.0,6203.0,1,1,2,15000.0,501.0,1.0,1.0,132052,20.0,1.0,1.0
1,3904103,1,1,48,2,1.0,1.0,4.0,21.0,6207.0,1,1,2,155000.0,501.0,1.0,1.0,1191XX,15.0,1.0,1.0
2,3904103,1,1,18,2,6.0,1.0,4.0,14.0,,1,1,2,0.0,501.0,,6.0,,,,
3,3904103,1,1,16,2,6.0,1.0,,12.0,,1,1,2,0.0,501.0,,6.0,,,,
4,3904103,1,1,14,2,,1.0,,10.0,,1,1,2,,501.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52846,3904102,2,1,63,2,4.0,1.0,4.0,19.0,,1,1,2,37000.0,281.0,1.0,1.0,311131,20.0,1.0,1.0
52847,3904102,2,1,36,2,4.0,3.0,4.0,19.0,,1,1,2,25800.0,197.0,1.0,1.0,439061,10.0,1.0,1.0
52848,3904106,1,1,94,1,6.0,1.0,2.0,19.0,,1,1,1,152100.0,501.0,,6.0,,,,
52849,3904105,2,1,46,2,6.0,1.0,4.0,21.0,5502.0,1,1,2,83000.0,501.0,4.0,1.0,291141,15.0,1.0,1.0


#### **Traditional method**
- Reweighting existing survey data (not creating new data)
- Treating individuals with the same constraint variables as identical

For each PUMA, we randomly sample the records in PUMS that match the constraint variables in the enumeration data set.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
import numpy as np 
import pandas as pd
import csv
from lib.target_constraints import convert_race, convert_sex, convert_age

# Output of the traditional (reweighting) method: one sampled PUMS record per
# enumerated person, prefixed with that person's block id (GEOID10).
output = 'franklin/microdata/franklin_peoplev0.csv'
a = ["GEOID10"]
head = list(a)
head.extend(data_pums.columns.tolist())

# Matching survey records for each (PUMA, race, sex, age) combination seen in
# the enumeration. Many people share the same combination, so the PUMS table
# is filtered once per combination instead of once per person.
# NOTE(review): assumes convert_race/convert_sex/convert_age are pure
# functions of their argument -- confirm in lib.target_constraints.
match_cache = {}

with open(output, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(head)
    for index, row in data_c.iterrows():
        # constraint variables (raw enumeration codes)
        puma = row["PUMA"]
        key = (puma, row["race"], row["sex"], row["age"])

        # match
        if key not in match_cache:
            race = convert_race(row["race"])
            sex = convert_sex(row["sex"])
            age_band = convert_age(row["age"])
            match_cache[key] = data_pums[
                (data_pums["PUMAID"] == puma)
                & (data_pums["SEX"] == sex)
                & (data_pums["RAC1P"] == race)
                & (data_pums["AGEP"] >= age_band[0])
                & (data_pums["AGEP"] <= age_band[1])
            ]
        sub_pums = match_cache[key]

        # People without any matching survey record are left out of the output.
        if len(sub_pums) > 0:
            # Draw one matching survey record at random for this person.
            picked = sub_pums.sample(n=1)
            GEOID10 = [row["GEOID10"]]
            var = GEOID10 + picked.values.tolist()[0]
            writer.writerow(var)

The traditional method is designed to "weight" each individual in the sample survey to match the aggregated statistics of the constraint variables in the census. However, one limitation of such a method is that if the survey does not cover some minority groups in the population (e.g., an elderly man who is still working), they cannot appear in the final microdata product.

Here, we calculate how many people are not able to find a match in the survey and thus are excluded in the final product.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
# Enumerated population.
filename_c = 'franklin/microdata/franklin_people_all.csv'
data_c = pd.read_csv(filename_c)

# Output of the traditional method, which is written to franklin_peoplev0.csv
# (franklin_peoplev1.csv is the file written by the SDV sampling cell).
filename_o = 'franklin/microdata/franklin_peoplev0.csv'
data_o = pd.read_csv(filename_o)

# The traditional method writes one row per person that found a match, so the
# difference in row counts is the number of people without a match.
m_cnt = len(data_c) - len(data_o)

print(f"{m_cnt} records do not have matched survey data.")

10810 records do not have matched survey data.


#### **SDV**


The `sdv` library is used here. We load the PUMS data as the training data. We will approximate the joint probability distribution of all the variables based on this data set, from which we will sample our synthetic individuals.

In [None]:
# NOTE(review): the version is not pinned. The cells below import
# `sdv.tabular` and `sdv.constraints.CustomConstraint`; confirm the installed
# SDV release still provides them, and pin the version if it does not.
!pip install sdv

##### **Define custom constraints**

Define some custom constraints based on rejection sampling using the `is_valid` function. Constraints to be considered based on the 2015-2019 ACS PUMS Data Dictionary:
- People aged below 15 should not be married (MSP = b).
- People aged below 1 should not move a year ago (MIG = b).
- People aged below 17 should not serve in military (MIL = b).
- People aged below 3 do not have educational attainment (SCHL = b).
- People with less than bachelor's degree (SCHL = 01 to 20) do not have field of degree (FOD1P = bbbb).
- People aged below 15 do not have income (PINCP = bbbbbbb).
- People aged below 16 should not be working (COW = ESR = SOCP = b).
- People aged below 16 or unemployed/not in labor force (COW = 9 or ESR = 3/6 or SOCP = 999920) do not travel to work (JWMNP = JWTRNS = b).
- People not working (COW = 9 or ESR = 3/6 or SOCP = 999920) or not driving to work (JWTRNS != 01) do not occupy a car (JWRIP = bb).
- Any code not in the dictionary.

Pack everything together in `CustomConstraint`.

In [None]:
from sdv.constraints import CustomConstraint
from lib.custom_constraints import is_valid

# Wrap the project's `is_valid` check in an SDV constraint object; the same
# `constraint` is passed to each model fitted in the cells below.
constraint = CustomConstraint(is_valid=is_valid)

##### **Fit models**

Gaussian Copula, CTGAN, and TVAE

Fit the `GaussianCopula` model.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
from sdv.tabular import GaussianCopula

# Fit and save one Gaussian copula model per PUMA, each trained only on the
# survey records of that PUMA.
puma_ids = data_pums['PUMAID'].unique()
for puma in puma_ids:
    print(puma)
    in_puma = data_pums['PUMAID'] == puma
    sub_pums = data_pums.loc[in_puma]
    model_gau = GaussianCopula(constraints=[constraint])
    model_gau.fit(sub_pums)
    model_gau.save(f'models/model_gau_{puma}.pkl')

Fit the `CTGAN` model.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
from sdv.tabular import CTGAN

# Fit and save one CTGAN model per PUMA, each trained only on the survey
# records of that PUMA.
puma_ids = data_pums['PUMAID'].unique()
for puma in puma_ids:
    print(puma)
    in_puma = data_pums['PUMAID'] == puma
    sub_pums = data_pums.loc[in_puma]
    model_ctgan = CTGAN(constraints=[constraint])
    model_ctgan.fit(sub_pums)
    model_ctgan.save(f'models/model_ctgan_{puma}.pkl')

Fit the `TVAE` model.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
from sdv.tabular import TVAE

# Fit and save one TVAE model per PUMA, each trained only on the survey
# records of that PUMA.
puma_ids = data_pums['PUMAID'].unique()
for puma in puma_ids:
    print(puma)
    in_puma = data_pums['PUMAID'] == puma
    sub_pums = data_pums.loc[in_puma]
    model_tvae = TVAE(constraints=[constraint])
    model_tvae.fit(sub_pums)
    model_tvae.save(f'models/model_tvae_{puma}.pkl')

##### **Conditional sampling**

Now that we have the model that allows us to approximate the joint distribution of the variables, we can create a sample from that distribution that matches our constraint variables (sex/race/age) in the enumeration data set. We use conditional sampling to generate the synthetic population.

In [None]:
%cd "/content/gdrive/My Drive/Colab Notebooks/census_privacy"
from sdv.tabular import GaussianCopula
import numpy as np
from numpy import nan 
import pandas as pd
import csv
from lib.target_constraints import convert_race, convert_sex, convert_age
from lib.custom_constraints import is_valid

# Output of the SDV-based method: one synthetic record per enumerated person,
# prefixed with that person's block id (GEOID10).
output = 'franklin/microdata/franklin_peoplev1.csv'
a = ["GEOID10"]
head = list(a)
head.extend(data_pums.columns.tolist())

# Number of synthetic candidates drawn from the model per sampling round.
N = 1000

# Fitted models keyed by PUMA, so each pickle is loaded from disk once rather
# than once per enumerated person.
loaded_models = {}

with open(output, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(head)
    # Only the first 10 enumerated people are processed here.
    for index, row in data_c[:10].iterrows():
        # load model
        puma = row["PUMA"]
        if puma not in loaded_models:
            loaded_models[puma] = GaussianCopula.load('models/model_gau_' + str(puma) + '.pkl')
        model_gau = loaded_models[puma]

        # constraint variables
        race = convert_race(row["race"])
        sex = convert_sex(row["sex"])
        age_band = convert_age(row["age"])

        # rejection sampling: draw N synthetic people per round and keep the
        # ones whose sex, race and age band match the enumerated person.
        flag = 0
        while True:
            candidates = model_gau.sample(N)
            # Every condition is evaluated on the sampled frame itself, so the
            # boolean mask lines up with the rows being filtered.
            sub_pums = candidates[
                (candidates["SEX"] == sex)
                & (candidates["RAC1P"] == race)
                & (candidates["AGEP"] >= age_band[0])
                & (candidates["AGEP"] <= age_band[1])
            ]
            flag += 1
            if len(sub_pums) > 0:
                break
            if flag > 50:
                # NOTE(review): this only warns and keeps retrying, so the
                # loop does not end if the model never produces a match.
                print("Consider increasing sampling size.")

        # Keep one accepted candidate and stamp it with the person's PUMA.
        sub_pums = sub_pums.sample(n=1).assign(PUMAID=puma)
        print(sub_pums)
        GEOID10 = [row["GEOID10"]]
        var = GEOID10 + sub_pums.values.tolist()[0]
        writer.writerow(var)