# Population Fluidity

To Do: 
Create connection to data source and perform basic EDA

[Variable information for ACS data](https://www.census.gov/data/developers/data-sets/acs-1year/notes-on-acs-api-variable-types.html)

[Guidance for Developers](https://www.census.gov/data/developers/guidance.html)

[Example of building API Call From US Census](https://www.census.gov/library/video/2020/using-api-all-results-for-acs-table.html)

[Query String Wiki](https://en.wikipedia.org/wiki/Query_string)

[Examples for 2020 ACS flow data](https://api.census.gov/data/2020/acs/flows/examples.html)

# Libraries

In [22]:
# Libraries

## Base -------
import pandas as pd
import numpy as np
import requests
import glob
import datetime
import h5py
import time
import timeit

## Handling requests -------
from requests.exceptions import HTTPError

## Timeouts if the server takes too long -------
from requests.exceptions import Timeout
import json

## Graphing --------
import seaborn as sns

## Google drive connections ---------
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

In [23]:
def get_data(url, timeout = 1): 
  """ 
    Used to get the data from a url
    Add in limits
  """
  try:
      response = requests.get(url)

      # If the response was successful, no Exception will be raised
      response.raise_for_status()
      
  except HTTPError as http_err:
      print(f'HTTP error occurred: {http_err}')  # Python 3.6
  except Timeout:
    print('The request timed out')
  except Exception as err:
      print(f'Other error occurred: {err}')  # Python 3.6
  else:
      print('Success!')
      return response

In [24]:
import pickle
import os

def pickler(ob, pklFileName):
  """ 
    Create pickle object to load later or in different environments for use later

    ob: object, python object intended to pickle
    pklFileName: string, name of the pickle file e.g. "User/content/drive/209/ThisDataFrame.pkl"
  """
  
  with open(pklFileName, "wb+") as f:
    pickle.dump(ob, f)

def load_pickles(pklFilePath):
  """
    path: string, path to object
    pklFileName: string, pickle file name with extension .pkl
  """

  return pd.read_pickle(pklFilePath)

In [27]:
def json_to_df(response):
  """ 

  """

  df = pd.DataFrame(response.json())
  df = df.rename(columns=df.iloc[0]).drop(df.index[0]) # replace column names with the first row

  return df

# Import Data

In [25]:
# get state and country codes
stateCodes = pd.read_csv("https://www2.census.gov/geo/docs/reference/state.txt", sep="|")
stateCodes


Unnamed: 0,STATE,STUSAB,STATE_NAME,STATENS
0,1,AL,Alabama,1779775
1,2,AK,Alaska,1785533
2,4,AZ,Arizona,1779777
3,5,AR,Arkansas,68085
4,6,CA,California,1779778
5,8,CO,Colorado,1779779
6,9,CT,Connecticut,1779780
7,10,DE,Delaware,1779781
8,11,DC,District of Columbia,1702382
9,12,FL,Florida,294478


In [None]:
# getting all the info! 
# !wget "https://geonames.usgs.gov/docs/stategaz/NationalFile.zip" &> dev/null

In [None]:
# county codes: 
# https://geonames.usgs.gov/docs/federalcodes/AllStatesFedCodes.zip
# comprehensive file for counties, states, coordinates
# https://geonames.usgs.gov/docs/stategaz/NationalFile.zip

# Demographic Data

In [None]:
"https://api.census.gov/data/2010/acs/flows?get=MOVEDIN,GEOID1,GEOID2,MOVEDOUT,FULL1_NAME,FULL2_NAME,MOVEDNET&for=county:*&in=state:36" # state and county are required for 2010

Buidling out the parameters to pass to requests

In [None]:
def json_to_df(response):
  """ 

  """

  df = pd.DataFrame(response.json())
  df = df.rename(columns=df.iloc[0]).drop(df.index[0]) # replace column names with the first row

  return df

In [20]:
from time import sleep
from tqdm.notebook import tqdm # progress bar library

In [None]:
data = {}

In [None]:
variables = ["MOVEDIN",
    "GEOID1",
    "GEOID2",
    "MOVEDOUT",
    "FULL1_NAME",
    "FULL2_NAME",
    "MOVEDNET"
  ]

variables = ",".join(variables)
variables

'MOVEDIN,GEOID1,GEOID2,MOVEDOUT,FULL1_NAME,FULL2_NAME,MOVEDNET'

In [None]:
for y in tqdm(range(2010, 2022)):
  print("Adding year: ", y)

  if y in data.keys(): # skip over if the data is already there. 
    continue
  
  # create year url
  url = f"https://api.census.gov/data/{y}/acs/flows?" # can add back MOVEDIN,MOVEDOUT

  # change payload based on year
  if y < 2016: # loop over states since I can only  call one at a time apparently before 2016
    
    y_df = pd.DataFrame() # empty dataframe to store year data

    for s in tqdm(stateCodes["STATE"]):

      if s < 10: 
        s = "0"+str(s) # need two digit code
      payload = {"get": variables,"for": "county:*", "in":"state:"+str(s)}
      r = requests.get(url, params = payload)
      if r.status_code == 200:
        y_df = pd.concat([y_df, json_to_df(r)])

    # add year
    y_df["YEAR"] = y

    print(f"{y} dimensions {y_df.shape}")
    
    # write out data to larger dictionary
    data[y] = y_df
    
    del y_df

  else: 
    payload = {"get": variables,"for": "county:*"}
    r = requests.get(url, params = payload)
    
    if r.status_code == 200: # then add to the dataframe else skip
      y_df = json_to_df(r)
      y_df['YEAR'] = y
      print(f"{y} dimensions {y_df.shape}")
      data[y] = y_df
      del y_df
    
    data[y] = ""
      


  0%|          | 0/12 [00:00<?, ?it/s]

Adding year:  2010
Adding year:  2011
Adding year:  2012


  0%|          | 0/57 [00:00<?, ?it/s]

In [None]:
sum(data[2010].duplicated())

0

In [None]:
data = {}

for y,url in url_years.items():
  start_time = timeit.default_timer()

  key = "migration_" + str(y)   
  print("Adding year: ", y)

  try:
    r = get_data(url)

    # in case it fails
    data[key] = pd.DataFrame(r.json())

  except: 
    pass

  elapsed = timeit.default_timer() - start_time
  print("Elapsed time: ", round(elapsed/60, 2))


In [None]:
# for s in stateCodes["STATE"]: 

In [None]:
url1 = "https://api.census.gov/data/2011/acs/flows?get=MOVEDIN,GEOID1,GEOID2,MOVEDOUT,FULL1_NAME,FULL2_NAME,MOVEDNET&for=county:*&in=state:50"
url2 = "https://api.census.gov/data/2011/acs/flows?get=MOVEDIN,GEOID1,GEOID2,MOVEDOUT,FULL1_NAME,FULL2_NAME,MOVEDNET&for=county:*&in=state:49"

r1 = requests.get(url1)
r2 = requests.get(url2)

In [None]:
d1 = json_to_df(r1)

In [None]:
d2 = json_to_df(r2)

In [None]:
df = pd.concat([d1, d2])

In [None]:
df['YEAR'] = 2010

In [None]:
df

Unnamed: 0,MOVEDIN,GEOID1,GEOID2,MOVEDOUT,FULL1_NAME,FULL2_NAME,MOVEDNET,state,county,YEAR
1,58,50001,,,"Addison County, Vermont",Asia,,50,001,2010
2,69,50001,,,"Addison County, Vermont",Central America,,50,001,2010
3,169,50001,,,"Addison County, Vermont",Europe,,50,001,2010
4,50,50001,,,"Addison County, Vermont",Northern America,,50,001,2010
5,17,50001,,,"Addison County, Vermont",Oceania and At Sea,,50,001,2010
...,...,...,...,...,...,...,...,...,...,...
4423,0,49049,5508100475,9,"Utah County, Utah","Adrian town, Monroe County, Wisconsin",-9,49,049,2010
4424,8,49049,5508702375,0,"Utah County, Utah","Appleton city, Outagamie County, Wisconsin",8,49,049,2010
4425,6,49049,5508782400,0,"Utah County, Utah","Vandenbroek town, Outagamie County, Wisconsin",6,49,049,2010
4426,0,49049,5511773050,3,"Utah County, Utah","Sheboygan Falls town, Sheboygan County, Wisconsin",-3,49,049,2010


# Demographics

In [9]:
import requests
import pandas as pd

In [4]:
# YEAR VARIABLES
r = requests.get("https://api.census.gov/data/2010/acs/flows/variables")
r.status_code

200

In [15]:
variables = pd.DataFrame(r.json())
variables = variables.rename(columns=variables.iloc[0]).drop(variables.index[0])

In [17]:
variables

Unnamed: 0,name,label,concept
1,for,Census API FIPS 'for' clause,Census API Geography Specification
2,in,Census API FIPS 'in' clause,Census API Geography Specification
3,FROMDIFFCTY_M,"Movers from different county, same state margi...",Estimate Variable
4,TODIFFCTY_M,"Movers to different county, same state margin ...",Estimate Variable
5,STATE1,FIPS State code of reference geography,Selectable Geographies
6,STATE2,State code/world region code of second geography,Selectable Geographies
7,MOVEDNET_M,Total net migration margin of error,Estimate Variable
8,STATE1_NAME,FIPS State name of reference geography,Selectable Geographies
9,POP1YR_M,Population 1 year and over margin of error,Estimate Variable
10,SUMLEV2,Geographic summary level of second geography,Selectable Geographies


In [18]:
variables = ["MOVEDIN",
    "GEOID1",
    "GEOID2",
    "MOVEDOUT",
    "FULL1_NAME",
    "FULL2_NAME",
    "MOVEDNET", 
    "TODIFFSTATE",
    "FROMDIFFSTATE",
    "SEX",
    "AGE",
    "RACE"
  ]

variables = ",".join(variables)
variables

'MOVEDIN,GEOID1,GEOID2,MOVEDOUT,FULL1_NAME,FULL2_NAME,MOVEDNET,TODIFFSTATE,FROMDIFFSTATE,SEX,AGE,RACE'

In [16]:
data = {}

In [28]:
for y in tqdm(range(2010, 2012)):
  print("Adding year: ", y)

  if y in data.keys(): # skip over if the data is already there. 
    continue
  
  # create year url
  url = f"https://api.census.gov/data/{y}/acs/flows?" # can add back MOVEDIN,MOVEDOUT

  # change payload based on year
  if y < 2016: # loop over states since I can only  call one at a time apparently before 2016
    
    y_df = pd.DataFrame() # empty dataframe to store year data

    for s in tqdm(stateCodes["STATE"]):

      if s < 10: 
        s = "0"+str(s) # need two digit code
      payload = {"get": variables,"for": "county:*", "in":"state:"+str(s)}
      r = requests.get(url, params = payload)
      if r.status_code == 200:
        y_df = pd.concat([y_df, json_to_df(r)])

    # add year
    y_df["YEAR"] = y

    print(f"{y} dimensions {y_df.shape}")
    
    # write out data to larger dictionary
    data[y] = y_df
    
    del y_df

  else: 
    payload = {"get": variables,"for": "county:*"}
    r = requests.get(url, params = payload)
    
    if r.status_code == 200: # then add to the dataframe else skip
      y_df = json_to_df(r)
      y_df['YEAR'] = y
      print(f"{y} dimensions {y_df.shape}")
      data[y] = y_df
      del y_df
    
    data[y] = ""
      


  0%|          | 0/2 [00:00<?, ?it/s]

Adding year:  2010


  0%|          | 0/57 [00:00<?, ?it/s]

ChunkedEncodingError: ignored