In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math

import requests
import json

In [20]:
#constants
weight = "PWGTP"
housing_identifier = "SERIALNO"
person_identifier = "SPORDER"

In [21]:
def import_census_data(variables, target, year, limiters): #acs data
  '''
  limiters: specified values to limit data requested
  '''

  vars = variables[:]

  base_request = "https://api.census.gov/data/" + year + "/acs/acs1/pums?get="
  base_request += weight + "," + target
  base_request += "," + housing_identifier + "," + person_identifier
  request = base_request

  #limit request
  limiter = ""
  for l in limiters:
    if l == "ST":
      limiter += "&ucgid=" + limiters[l]
    else:
      limiter += "&" + l + "=" + limiters[l]
      if l in vars:
        vars.remove(l)
  request += limiter

  #remove base vars
  vars.remove(target)
  vars.remove(weight)
  vars.remove(housing_identifier)
  vars.remove(person_identifier)

  #get target/weight/ids
  response = requests.get(request)
  json_data = json.dumps(response.json())
  df = pd.read_json(json_data)
  df = df.rename(columns=df.iloc[0]).loc[1:].reset_index()
  df = df[[target, weight, housing_identifier, person_identifier]]

  target_col = df[target]
  weight_col = df[weight]
  housing_identifier_col = df[housing_identifier]
  person_identifier_col = df[person_identifier]

  #requesting all data
  curr = 0
  while curr < len(vars):
    df, curr = request_vars_and_merge(vars, df, base_request, limiter, target,
                                      4, curr, target_col, weight_col,
                                      housing_identifier_col, person_identifier_col)

  #todo:delete
  df.drop([housing_identifier, person_identifier],axis=1,inplace=True)
  print(target_corr(df, target, weight))

  df.to_csv('/content/drive/My Drive/' + target + "_" + limiter + ".csv", index=False) #save to csv

In [22]:
def request_vars_and_merge(vars, df, base_request, limiter, target, num_vars, curr,
                           target_col, weight_col, housing_identifier_col,
                           person_identifier_col):
  '''
  df: existing df
  base_request: request for weight, target, & identifiers
  limiter: any limiters on the data requested (e.g. state)
  target: target variable
  num_vars: vars in base_request
  target_col: target var data
  weight_col: weights
  housing_identifier_col: housing ids
  person_identifier_col: person ids within a household
  '''

  #get new vars
  new_df, curr = request_vars(vars, base_request, limiter, target, num_vars,
                              curr, weight_col)
  new_df.loc[:,target] = target_col
  new_df.loc[:,weight] = weight_col
  new_df.loc[:,housing_identifier] = housing_identifier_col
  new_df.loc[:,person_identifier] = person_identifier_col

  new_cols = new_df.columns

  #merge with existing df
  df = pd.merge(df, new_df, on=[housing_identifier, person_identifier, weight, target])
  df.drop_duplicates(keep='first', inplace=True, ignore_index=True)

  #pca on whole df (todo)

  return df, curr

In [23]:
def request_vars(vars, base_request, limiter, target, num_vars, curr, weight_col):
  #request data
  request = base_request

  #create request
  while num_vars < 50 and curr < len(vars):
    request += ','+vars[curr]
    curr += 1
    num_vars += 1
  request += limiter

  #request
  response = requests.get(request)
  json_data = json.dumps(response.json())
  new_df = pd.read_json(json_data)
  new_df = new_df.rename(columns=new_df.iloc[0]).loc[1:].reset_index(drop=True)

  #drop non-numeric (can't check correlation & don't need to)
  new_df.drop([housing_identifier, person_identifier],axis=1,inplace=True)

  #converting N (meaning N/A) to 0, have individually checked each var to confirm this is true
  new_df.replace('N', 0, inplace = True)

  #indp = based on industry codes, naicsp = based on NAICS codes (indp is
  #derived from naicsp and is less detailed to protect individual respondents)
  #indp also has higher correlation with income so choosing to keep indp over naicsp
  if "NAICSP" in new_df.columns:
    new_df.drop("NAICSP", axis=1, inplace=True)

  #only one unique value, don't need
  if "ADJINC" in new_df.columns:
    new_df.drop("ADJINC", axis=1, inplace=True)

  #recode SOCP
  if "SOCP" in new_df.columns:
    new_df["SOCP"] = recode(new_df, "SOCP")

  new_df = new_df.astype(float)

  #pearson correlation coefficient analysis w target (PINCP)
  threshold = 0.2 #todo: change?
  correlations = target_corr(new_df, target, weight)
  features = correlations[correlations>=threshold]
  print("selected features:") #todo:delete
  print(features)

  #pca (todo)

  #fod1p/fod2p have same info but fod2p doesn't have a high enough correlation so already dropped (todo:2x check)

  #filter new_df
  new_df = new_df[features.index]
  return new_df, curr

In [24]:
def recode(df, col):
  '''
  returns recoded col
  '''
  unique = dict(enumerate(df[col].unique()))
  unique = dict([(value, key) for key, value in unique.items()])
  return df[col].replace(unique)

In [25]:
def mean(x, w):
  '''weighted mean'''
  return np.sum(x * w) / np.sum(w)

def cov(x, y, w):
  '''weighted covariance'''
  return np.sum(w * (x - mean(x, w)) * (y - mean(y, w))) / np.sum(w)

def corr(x, y, w):
  '''weighted correlation'''
  #return cov(x, y, w) / np.sqrt(cov(x, x, w) * cov(y, y, w))

  cov_xy = cov(x, y, w)
  cov_xx = cov(x, x, w)
  cov_yy = cov(y, y, w)

  if np.isclose(cov_xx, 0.0) or np.isclose(cov_yy, 0.0):
      return 0.0  # or handle the zero division case appropriately

  if np.any(np.isnan([cov_xy, cov_xx, cov_yy])):
      return np.nan  # or handle the case with NaN values

  result = cov_xy / np.sqrt(cov_xx * cov_yy)
  return result

def target_corr(df, target, weight):
  '''
  returns new df with index as col names and value as correlation with target
  '''
  cols = df.columns
  cols = cols.drop([target, weight])

  l = len(cols)-1
  results = []
  for col in cols:
    try:
        results.append(corr(df[col], df[target], df[weight]))
    except TypeError:
        print(col)
        print(type(col))
        try:
            df[col].astype('int32')
        except ValueError:
            print(col + " couldn't be casted")

  results = pd.DataFrame(results, index = cols, columns=["corr"])

  results = results["corr"].sort_values(ascending=False)[1:]
  return results

In [26]:
curr_vars = ["PINCP", "PWGTP", "SERIALNO", "HHLDRAGEP", "SSIP", "ELEP", "RAC2P",
             "RAC3P", "RAC1P", "RACNUM", "WATP", "MHP", "RETP", "SSP", "HINCP",
             "RMSP", "INTP", "SEMP", "SMP", "PERNP", "PAP", "GASP", "WKWN",
             "WAGP", "FULP", "SMOCP", "FINCP", "OIP", "TAXAMT", "CONP",
             "INSP", "OCPIP", "GRNTP", "MRGP", "VALP", "BDSP", "NOC", "NP",
             "NRC", "SPORDER", "NPF", "RNTP", "WKHP", "POVPIP", "GRPIP",
             "JWMNP", "AGEP", "ADJHSG", "ADJINC", "MV", "FPARC", "DRIVESP",
             "RACSOR", "NATIVITY", "JWAP", "HICOV", "PRIVCOV", "R60",
             "RELSHIPP", "VACDUR", "MLPIK", "PLM", "VPS", "DEAR", "R18", "MLPJ",
             "GCL", "STOV", "TEL", "ELEFP", "WATFP", "YOEP", "SMX", "OTHSVCEX",
             "MLPCD", "ANC2P", "FHINS4C", "WRK", "POBP", "RACAIAN", "LAPTOP",
             "HHT2", "MLPFG", "FOD1P", "FOD2P", "SMARTPHONE", "NAICSP", "INDP",
             "WAOB", "SOCP", "GASFP", "HIMRKS", "FHINS3C", "FHINS5C",
             "ACCESSINET", "HOTWAT", "NWLA", "CITWP", "JWTRNS", "REFR", "PSF",
             "DECADE", "PUBCOV", "FULFP", "MRGT", "VACOTH", "BROADBND", "LANP",
             "ANC1P", "TEN", "POWPUMA", "HISPEED", "PLMPRP", "CPLT", "YRBLT",
             "DRAT", "NR", "MRGX", "HINS7", "MARHYP", "COMPOTHX", "SINK",
             "MARHT", "SATELLITE", "WIF", "HISP", "MAR", "SCHL", "NWLK", "DPHY",
             "DEYE", "MIGSP", "HHLANP", "PARTNER", "RACNH", "WKL", "VEH",
             "DDRS", "MIGPUMA", "LNGI", "HINS2", "QTRBIR", "SFN", "RACBLK",
             "MLPH", "ESR", "NPP", "DIS", "DIALUP", "HHLDRRAC1P", "TABLET",
             "MLPB", "DOUT", "SCH", "RACPI", "POWSP", "ANC", "MIL", "OC",
             "HUGCL", "RWAT", "HHLDRHISP", "HINS3", "RESMODE", "MARHW", "SFR",
             "ESP", "RACASN", "HINS5", "MLPE", "OCCP", "MARHD", "SCHG", "MRGI",
             "MIG", "HINS1", "MSP", "FER", "MULTG", "WORKSTAT", "MARHM", "KIT",
             "GCR", "HUPARC", "HINS6", "GCM", "ACR", "HINS4", "PAOC", "RNTM",
             "DRATX", "FS", "SVAL", "RACWHT", "NWAB", "HUPAOC", "R65", "RC",
             "BATH", "SEX", "HFL", "WKEXREL", "VACS", "HHL", "SRNT", "NWAV",
             "NWRE", "BLD", "LANX", "MLPA", "HHT", "DREM", "COW", "HUPAC",
             "CIT", "AGS", "ENG", "JWRIP", "JWDP", "NOP"]

In [27]:
#aapi

#do aapi, then asian + pi, then by regions, then by ethnicity

#limiters = {"RACASN":"1", "RACPI":"1", "ST":"0400000US06"} #limting to CA
#limiters = {"RAC3P":"004", "RAC3P":"006", "RAC3P":"013", "ST":"0400000US06"} #limting to CA, just indian, filipino & samoan
limiters = {"RACASN":"1", "RACPI":"1"} #entire U.S.
#limiters = {"ST":"0400000US06"} #all races in CA #todo:check correlation w race
import_census_data(curr_vars, "PINCP", "2021", limiters)

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
selected features:
WAGP      0.849112
WKHP      0.540183
WKWN      0.526363
INTP      0.515548
AGEP      0.457396
SEMP      0.400029
HINCP     0.373228
POVPIP    0.354100
JWMNP     0.276409
RETP      0.238917
FINCP     0.209862
Name: corr, dtype: float64
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<cl

In [None]:
#checking relationship between specific RAC3P (ethnicity) values + income

limiters = {"RAC3P":"006", "RAC3P":"013", "ST":"0400000US06"} #limting to CA, just indian, filipino & samoan
request = "https://api.census.gov/data/2021/acs/acs1/pums?get=PWGTP,PINCP,SERIALNO,SPORDER&RAC3P=004&RAC3P=006&RAC3P=013"

response = requests.get(request)
json_data = json.dumps(response.json())
df = pd.read_json(json_data)
df = df.rename(columns=df.iloc[0]).loc[1:].reset_index()
df.drop([housing_identifier, person_identifier],axis=1,inplace=True)
df = df.astype('int32')

In [None]:
#histogram
unique = df['RAC3P'].unique()
ethnicities = {6:"filipino",4:"indian",13:"samoan"}

for val in unique:
  new_df = df[df["RAC3P"] == val]
  pyplot.hist(new_df["PINCP"], weights=new_df["PWGTP"], label=ethnicities[val],alpha=.2)

pyplot.legend(loc='upper right')
pyplot.show()

In [None]:
#predict race
limiters = {"ST":"0400000US06"} #limting to CA
import_census_data(curr_vars, "RAC1P", "2021", limiters)