In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json

def _fetch_census_frame(url):
  '''
  GET `url` and return the JSON table it yields as a DataFrame.

  The first row of the payload supplies the column names; every later row is
  data, indexed 0..n-1. Raises requests.HTTPError on a non-2xx response.
  '''
  response = requests.get(url, timeout=300) #generous timeout so a stalled connection cannot hang the notebook forever
  response.raise_for_status()
  rows = response.json()
  frame = pd.DataFrame(rows[1:], columns=rows[0])
  #keep only the first occurrence of a repeated column name so frame[col] is always a Series
  return frame.loc[:, ~frame.columns.duplicated()]


def import_census_data(vars, target, year, limiter):
  '''
  Download ACS 1-year PUMS variables in batches, keep those whose absolute
  Pearson correlation with `target` is at least 0.5, and save the result to
  "<target>_<limiter>.csv" in the working directory.

  vars: list of variable names to consider. The list is NOT modified; the
        target, weight (PWGTP) and identifier (SERIALNO, SPORDER) names and
        any repeated names are skipped because every request already carries
        those four columns.
  target: name of the variable the other variables are correlated against
  year: survey year as a string (it is concatenated into the request URL)
  limiter: geographies/years/certain values to be appended to base request

  Returns None. Progress (request URLs and intermediate frames) is printed.

  Raises requests.HTTPError if a request fails, and ValueError if `target`
  cannot be cast to a number in a batch. Columns other than the target that
  cannot be cast are reported and left out of the correlation.

  Note: recode() must already be defined when this function is called.
  '''
  weight = "PWGTP"
  housing_identifier = "SERIALNO"
  person_identifier = "SPORDER"
  key_cols = [target, weight, housing_identifier, person_identifier]
  max_vars_per_request = 50 #cap on variables per request, presumably the API's limit (todo: confirm)
  threshold = 0.5 #todo: change?

  base_request = "https://api.census.gov/data/" + year + "/acs/acs1/pums?get=" + weight + "," + target + "," + housing_identifier + "," + person_identifier

  #work on a filtered copy so the caller's list survives for later calls
  remaining = [name for name in dict.fromkeys(vars) if name not in key_cols]
  batch_size = max_vars_per_request - len(key_cols)

  #get target/weight
  request = base_request + limiter
  print(request)
  df = _fetch_census_frame(request)[key_cols]
  print(df) #todo:delete

  #requesting all data, one batch of at most batch_size extra variables at a time
  for start in range(0, len(remaining), batch_size):
    batch = remaining[start:start + batch_size]
    request = base_request + "," + ",".join(batch) + limiter

    #request data
    print(request) #todo:delete
    new_df = _fetch_census_frame(request)
    print("requested data:")
    print(new_df)

    #remember this batch's own key columns (untouched by the cleaning below) so the
    #merge does not depend on every request returning rows in the same order
    batch_keys = new_df[key_cols]

    #pearson correlation coefficient analysis w target

    #deal with problem vars
    new_df = new_df.drop(columns=[housing_identifier, person_identifier])
    for col in ("FOD1P", "FOD2P", "INDP"):
      if col in new_df.columns:
        new_df[col] = new_df[col].replace('N', 0)
    if "NAICSP" in new_df.columns: #duplicate of indp
      new_df = new_df.drop(columns="NAICSP")
    if "SOCP" in new_df.columns:
      new_df["SOCP"] = recode(new_df, "SOCP")

    #cast what can be cast; anything else is reported and excluded from the correlation
    castable = {}
    for col in new_df.columns:
      try:
        castable[col] = pd.to_numeric(new_df[col])
      except (ValueError, TypeError):
        print(col + " couldn't be casted")
    if target not in castable:
      raise ValueError(target + " couldn't be casted, so correlations with it cannot be computed")

    corr = pd.DataFrame(castable).corr(method='pearson').abs() #absolute val

    #drop the target's self-correlation by name rather than assuming it sorts first
    target_corr = corr[target].drop(target).sort_values(ascending=False)
    features = target_corr[target_corr >= threshold]

    #pca (todo)

    #filter new_df: selected features plus the key columns used for the merge
    selected_names = [name for name in features.index if name not in key_cols]
    new_df = pd.concat([new_df[selected_names], batch_keys], axis=1)

    print("with selected features:")
    print(new_df)

    #join new_df with existing df
    df = pd.merge(df, new_df, on=[housing_identifier, person_identifier, weight, target])
    df = df.drop_duplicates(keep='first', ignore_index=True)
    print("merged")
    print(df)

    #todo:compare FOD1P and FOD2P if they're the exact same just drop but if diff, figure out how to merge

    #pca on whole df (todo)

  df.to_csv(target + "_" + limiter + ".csv", sep=',', index=False, encoding='utf-8') #save to csv

In [None]:
def recode(df, col):
  '''
  Return df[col] with every distinct value replaced by an integer code.

  Codes are assigned 0, 1, 2, ... in order of first appearance within
  df[col]. The input frame is not modified; the returned Series keeps the
  original index.

  df: DataFrame holding the column
  col: name of the column to recode
  '''
  #build the code table from the requested column (not a hard-coded one)
  codes = {value: code for code, value in enumerate(df[col].unique())}
  return df[col].map(codes)

In [None]:
#variable names passed as `vars` to import_census_data; kept as one
#whitespace-separated block and split into a list of strings
curr_vars = """
PINCP PWGTP SERIALNO HHLDRAGEP SSIP ELEP RACNUM WATP MHP RETP
SSP HINCP RMSP INTP SEMP SMP PERNP PAP GASP
WKWN WAGP FULP SMOCP FINCP OIP TAXAMT CONP
INSP OCPIP GRNTP MRGP VALP BDSP NOC NP NRC
SPORDER NPF RNTP WKHP POVPIP GRPIP JWMNP AGEP
ADJHSG ADJINC MV FPARC DRIVESP RACSOR
NATIVITY JWAP HICOV PRIVCOV R60 RELSHIPP VACDUR
MLPIK PLM VPS DEAR R18 MLPJ GCL STOV TEL
ELEFP WATFP YOEP SMX OTHSVCEX MLPCD ANC2P
FHINS4C WRK POBP RACAIAN LAPTOP HHT2 MLPFG
FOD1P SMARTPHONE NAICSP WAOB SOCP GASFP HIMRKS
FHINS3C FHINS5C ACCESSINET HOTWAT NWLA CITWP
JWTRNS REFR PSF DECADE PUBCOV FULFP MRGT INDP
VACOTH FOD2P BROADBND LANP ANC1P TEN POWPUMA
HISPEED PLMPRP CPLT YRBLT DRAT NR MRGX HINS7
MARHYP COMPOTHX SINK RAC3P MARHT SATELLITE WIF
HISP MAR SCHL NWLK DPHY DEYE MIGSP RAC1P
HHLANP PARTNER RACNH WKL VEH DDRS MIGPUMA LNGI
HINS2 QTRBIR SFN RACBLK MLPH ESR NPP DIS
DIALUP HHLDRRAC1P TABLET RAC2P MLPB DOUT SCH
RACPI POWSP ANC MIL OC HUGCL RWAT HHLDRHISP
HINS3 RESMODE MARHW SFR ESP RACASN HINS5 MLPE
OCCP MARHD SCHG MRGI MIG HINS1 MSP FER
MULTG WORKSTAT MARHM KIT GCR HUPARC HINS6
GCM ACR HINS4 PAOC RNTM DRATX FS SVAL
RACWHT NWAB HUPAOC R65 RC BATH SEX HFL
WKEXREL VACS HHL SRNT NWAV NWRE BLD LANX
MLPA HHT DREM COW HUPAC CIT AGS ENG JWRIP
JWDP NOP
""".split()

In [None]:
#aapi: PINCP as target, 2021, rows filtered with RACASN=1 and RACPI=1
#a copy of curr_vars is passed so the shared list stays complete for later cells and re-runs
import_census_data(list(curr_vars), "PINCP", "2021", "&RACASN=1&RACPI=1&ucgid=0400000US06") #todo: try limiting not just to CA

In [None]:
#predict race: RAC1P as target, 2021
#a copy of curr_vars is passed so re-running this cell always starts from the full list
import_census_data(list(curr_vars), "RAC1P", "2021", "&ucgid=0400000US06")

In [None]:
#income across race
#disabled sketch: the loop below sits inside a string literal, so it never runs
'''years = ["2005", "2007", "2009", "2011", "2013", "2015", "2017", "2019", "2021"]

for y in years:
  import_census_data(curr_vars, "PINCP", y, )''' #todo: the call still needs the geographies as its limiter argument, and probably a different vars list because not all vars exist every year