# Population Fluidity

To Do: 
Create connection to data source and perform basic EDA

[Variable information for ACS data](https://www.census.gov/data/developers/data-sets/acs-1year/notes-on-acs-api-variable-types.html)

[Guidance for Developers](https://www.census.gov/data/developers/guidance.html)

[Example of building API Call From US Census](https://www.census.gov/library/video/2020/using-api-all-results-for-acs-table.html)

[Query String Wiki](https://en.wikipedia.org/wiki/Query_string)

[Examples for 2020 ACS flow data](https://api.census.gov/data/2020/acs/flows/examples.html)

# Libraries

In [None]:
# Libraries

## Base -------
import pandas as pd
import numpy as np
import requests
import glob
import datetime
import time
from tqdm.notebook import tqdm # progress bar library

## Handling requests -------
from requests.exceptions import HTTPError

## Timeouts if the server takes too long -------
from requests.exceptions import Timeout
import json

## Graphing --------
import seaborn as sns

## Google drive connections ---------
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

In [None]:
def get_data(url, timeout=1):
  """
    Issue an HTTP GET for a url and hand back the response when it succeeds.

    url: string, address to request
    timeout: number (seconds) or (connect, read) tuple, how long to wait on the
      server before giving up; forwarded to requests.get

    returns: the response object when the request completed with a non-error
      status; None otherwise (a message describing the failure is printed)
  """
  try:
    # Without timeout= requests waits indefinitely and the Timeout handler
    # below could never run.
    response = requests.get(url, timeout=timeout)

    # Raises HTTPError for 4xx / 5xx status codes; silent on success.
    response.raise_for_status()

  except HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')  # Python 3.6
  except Timeout:
    print('The request timed out')
  except Exception as err:
    # Best-effort helper: report anything else and fall through to None.
    print(f'Other error occurred: {err}')  # Python 3.6
  else:
    print('Success!')
    return response

  return None

In [None]:
import pickle
import os

def pickler(ob, pklFileName):
  """
    Serialize a python object to disk so it can be reloaded later or from
    another environment.

    ob: object, the python object to pickle
    pklFileName: string, destination path of the pickle file
      e.g. "User/content/drive/209/ThisDataFrame.pkl"
  """

  with open(pklFileName, "wb+") as handle:
    # Same serialization as pickle.dump(ob, handle), spelled via the class API.
    writer = pickle.Pickler(handle)
    writer.dump(ob)

def load_pickles(pklFilePath):
  """
    Read a previously pickled object back from disk using pandas.

    pklFilePath: string, path to the pickle file including its extension
    returns: whatever pd.read_pickle produces for that file
  """

  loaded = pd.read_pickle(pklFilePath)
  return loaded

# Import Data

In [None]:
# Load the Census state reference file (pipe-delimited text).
# Its STATE column is what the per-state download loop iterates over and what
# the final merge joins on.
stateCodes = pd.read_csv(
  "https://www2.census.gov/geo/docs/reference/state.txt",
  delimiter="|",
)
stateCodes.head(5)

In [None]:
# getting all the info! 
# !wget "https://geonames.usgs.gov/docs/stategaz/NationalFile.zip" &> /dev/null

In [None]:
# county codes: 
# https://geonames.usgs.gov/docs/federalcodes/AllStatesFedCodes.zip
# comprehensive file for counties, states, coordinates
# https://geonames.usgs.gov/docs/stategaz/NationalFile.zip

# Data from years 2010 - 2020

In [None]:
def json_to_df(response):
  """ Turn a JSON response into a dataframe whose headers come from its first row
    response: (response obj) returned object from requests.get(...)
    returns: (dataframe) the payload rows with the first row promoted to
      column names and removed from the data
  """

  table = pd.DataFrame(response.json())

  # The first record holds the field names, not data.
  header_row = table.iloc[0]
  first_label = table.index[0]

  renamed = table.rename(columns=header_row)
  return renamed.drop(index=first_label)

In [None]:
# Per-year download store: the fetch loop keys it by year and skips any year
# that is already present.
data = dict()

In [None]:
# Fields requested from the flows endpoint, sent as one comma-separated
# string in the "get" query parameter.
variables = ",".join([
    "MOVEDIN",
    "GEOID1",
    "GEOID2",
    "MOVEDOUT",
    "FULL1_NAME",
    "FULL2_NAME",
    "MOVEDNET",
])
variables

'MOVEDIN,GEOID1,GEOID2,MOVEDOUT,FULL1_NAME,FULL2_NAME,MOVEDNET'

In [None]:
# Download ACS flow data one year at a time into `data` (year -> dataframe).
# A year that could not be downloaded is stored as "" so later cells can
# skip it; such placeholders are retried the next time this cell runs.

# (connect, read) timeouts in seconds so a stalled server cannot hang the cell.
# The read value is generous because the nationwide responses are large.
REQUEST_TIMEOUT = (10, 300)


def _fetch_flows(url, payload):
  """ Request one slice of flow data.
    url: (string) year-specific flows endpoint
    payload: (dict) query-string parameters for the request
    returns: (dataframe) parsed rows, or None when the request failed or the
      server answered with anything other than status 200
  """
  try:
    r = requests.get(url, params=payload, timeout=REQUEST_TIMEOUT)
  except requests.RequestException as err:
    print(f"Request failed for {payload}: {err}")
    return None

  if r.status_code != 200:
    return None

  return json_to_df(r)


for y in tqdm(range(2010, 2022)):

  # Only a real dataframe counts as "already downloaded"; a "" placeholder
  # left by an earlier failed attempt is tried again.
  if isinstance(data.get(y), pd.DataFrame):
    continue

  print("Adding year: ", y)

  # create year url
  url = f"https://api.census.gov/data/{y}/acs/flows?"

  if y < 2016:
    # Before 2016 the endpoint apparently only answers one state per call,
    # so request each state separately and stitch the pieces together.
    frames = []
    skipped = []

    for s in tqdm(stateCodes["STATE"]):
      state_fips = str(s).zfill(2)  # need two digit code for state

      payload = {"get": variables, "for": "county:*", "in": "state:" + state_fips}
      state_df = _fetch_flows(url, payload)

      if state_df is None:
        skipped.append(state_fips)
      else:
        frames.append(state_df)

    if skipped:
      print(f"{y}: no data returned for state codes {skipped}")

    # Concatenate once at the end instead of growing a dataframe per state.
    y_df = pd.concat(frames) if frames else None

  else:
    y_df = _fetch_flows(url, {"get": variables, "for": "county:*"})

  if y_df is None:
    # Nothing usable came back for this year; mark it so downstream cells skip it.
    print(f"{y}: no data downloaded")
    data[y] = ""
    continue

  # add year
  y_df["YEAR"] = y

  print(f"{y} dimensions {y_df.shape}")
  print("-" * 20)

  # write out data to larger dictionary
  data[y] = y_df
      


  0%|          | 0/12 [00:00<?, ?it/s]

Adding year:  2010
Adding year:  2011
Adding year:  2012
Adding year:  2013
Adding year:  2014
Adding year:  2015
Adding year:  2016
2016 dimensions (523811, 10)
--------------------
Adding year:  2017
2017 dimensions (517364, 10)
--------------------
Adding year:  2018
2018 dimensions (513337, 10)
--------------------
Adding year:  2019
2019 dimensions (505936, 10)
--------------------
Adding year:  2020
2020 dimensions (480130, 10)
--------------------
Adding year:  2021


In [None]:
# Join all the yearly dataframes together.
# Entries that are not dataframes (the "" placeholder stored for years that
# failed to download) are left out.
frames = []

for y, year_df in tqdm(data.items()):
  if not isinstance(year_df, pd.DataFrame):
    continue

  print("Adding year: ", y)
  frames.append(year_df)

# One concat over the whole list avoids re-copying the accumulated rows for
# every year; fall back to an empty dataframe when nothing was downloaded.
df_final = pd.concat(frames) if frames else pd.DataFrame()

df_final.shape

In [None]:
# Tidy the combined table: renumber the rows, discard records with no
# MOVEDNET value, cast the numeric code columns, and attach the state
# reference columns by state code.
df_final = (
  df_final
  .reset_index(drop=True)
  .dropna(subset=["MOVEDNET"])
  .astype({"MOVEDNET": "int32", "state": "int32", "county": "int32"})
  .merge(stateCodes, how="left", left_on="state", right_on="STATE")
)
print("DF shape: ", df_final.shape)
print("DF dtypes: ", df_final.dtypes)

In [None]:
# Date stamp (YYYYMMDD) appended to the output file names.
today = datetime.date.today().strftime("%Y%m%d")

# Save a pickle of the final dataframe so analysis can reload it quickly.
pickler(df_final, f"/content/drive/MyDrive/W209_final_project/Data/PickleFiles/df_{today}.pkl")

In [None]:
# Export the final dataframe as a date-stamped csv next to the pickle folder.
df_final.to_csv(f"/content/drive/MyDrive/W209_final_project/Data/df_{today}.csv")