# Human Mobility in San Diego County
Jessica Embury

### Download LODES data for select states, decompress the files, then create/populate PostgreSQL tables and set indexes.

### Import Modules

In [None]:
#import needed libraries
import glob
import gzip
import os
import sys

import psycopg2
import requests
from bs4 import BeautifulSoup

### User Entered Variables

In [None]:
#directory where this file is located and where data files will be stored
abs_path = ''

#PostgreSQL connection settings
host = ''
db = ''
user = ''
password = ''

#name of the schema used as the prefix for the table names
schema_name = 'lodes'

#RESUME SETTINGS
#Fresh start (get all files for all states): state_num = 0 and file_num = 0
#To resume data retrieval (after SSL Error, etc.), change state_num and file_num

#Number of the CURRENT state:
#'az' = 0, 'ca' = 1, 'co' = 2, 'dc' = 3, 'fl' = 4, 'il' = 5, 'ma' = 6, 'mi'  = 7,
#'mn' = 8, 'nv' = 9, 'ny' = 10, 'tx' = 11, 'wa' = 12
state_num = 0

#Number of the last SUCCESSFUL file for the current state
#(for example, in 'az', after 71 out of 136, enter 71 and the first file will be 72)
file_num = 0

In [None]:
#states to get lodes data from, as the lowercase codes used in the download url
states = [
    'az', 'ca', 'co', 'dc', 'fl', 'il', 'ma',
    'mi', 'mn', 'nv', 'ny', 'tx', 'wa',
]
#states = ['az', 'ca']

In [None]:
#create main folders to store state .gz and .csv files
#exist_ok=True makes re-running this cell harmless and removes the gap between
#checking for a folder and creating it
os.makedirs('lodes_gz', exist_ok=True)
os.makedirs('lodes_csv', exist_ok=True)

#list of all csv folders, for deletion later
csv_folders = []

### Functions

In [None]:
#Function to get all file names from the associated url
#reference: https://stackoverflow.com/questions/11023530/python-to-list-http-files-and-directories
def listFD(url, ext=''):
    """Return the links on the page at *url* whose href ends with *ext*.

    The page is downloaded with requests and parsed with BeautifulSoup's
    'html.parser'. Each <a> tag that has an href ending in *ext* contributes
    one entry, formed by joining *url* and the href with a single '/'.

    Args:
        url: address of the page to scan for links.
        ext: suffix an href must end with to be kept; the default '' keeps
            every href.

    Returns:
        A list of strings, one per matching link.
    """
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')

    #drop trailing slashes so a url that already ends in '/' does not yield '//'
    base = url.rstrip('/')

    #href=True skips anchors that have no href attribute; calling .endswith on
    #their missing (None) href would raise AttributeError
    return [
        base + '/' + node['href']
        for node in soup.find_all('a', href=True)
        if node['href'].endswith(ext)
    ]

#function for psycopg2 to connect to the PostgreSQL database server
#reference: https://github.com/NaysanSaran/pandas2postgresql/blob/master/notebooks/CompleteExample.ipynb
def connect(params_dic):
    """Connect to the PostgreSQL database server.

    Args:
        params_dic: dictionary of keyword arguments passed to
            psycopg2.connect.

    Returns:
        The connection object returned by psycopg2.connect.

    Raises:
        SystemExit: with status 1 when the connection attempt raises; the
            error is printed before exiting.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        # connect to the PostgreSQL server
        conn = psycopg2.connect(**params_dic)
    except Exception as error:
        # psycopg2.DatabaseError is a subclass of Exception, so catching
        # Exception alone covers everything the original tuple caught
        print(error)
        sys.exit(1)
    return conn

### Database Implementation

In [None]:
#psql connection parameters, taken from the user entered variables
param_dic = dict(host=host, database=db, user=user, password=password)

#open the connection to the database server
conn = connect(param_dic)

#cursor used to run statements on that connection
cur = conn.cursor()

In [None]:
#For all files for each state: download file, decompress gzip, save as csv, create and populate table in PostgreSQL
#state_num and file_num allow an interrupted run to resume: states before state_num are skipped,
#and for the first state processed the files before index file_num are skipped
for i in range(state_num, len(states)):

    #variable for url to the current state's lodes files
    url = 'https://lehd.ces.census.gov/data/lodes/LODES7/' + states[i] + '/od/'

    #file extension of lodes data
    ext = 'csv.gz'

    #names of all files to be downloaded from url - for current state
    filenames = [os.path.basename(link) for link in listFD(url, ext)]

    print('State: ' +states[i] + ', Number of files for ' + url +' = ' + str(len(filenames)))

    #create a state specific folder for the downloaded .gz files
    state_lodes_folder_gz = 'lodes_gz/' + states[i]
    os.makedirs(state_lodes_folder_gz, exist_ok=True)

    #create a state specific folder for the decompressed csv files
    state_lodes_folder_csv = 'lodes_csv/' + states[i]
    os.makedirs(state_lodes_folder_csv, exist_ok=True)

    #remember the folder for deletion later; record it once even if this cell is re-run to resume
    if state_lodes_folder_csv not in csv_folders:
        csv_folders.append(state_lodes_folder_csv)

    #for each remaining file for the current state:
    for file in range(file_num, len(filenames)):

        #download the file
        r = requests.get(url + filenames[file])

        #stop on an HTTP error so an error page is never saved or decompressed as if it were data
        r.raise_for_status()

        #create variable to store save paths for the .gz and .csv files
        #remove .gz from end of string to save decompressed .csv.gz as a .csv file
        gz_save_path = state_lodes_folder_gz + '/' + filenames[file]
        csv_save_path = state_lodes_folder_csv + '/' + filenames[file][:-3]

        #Save as a .csv.gz file
        with open(gz_save_path, 'wb') as f:
            f.write(r.content)

        #decompress the .gz file and save as a .csv file
        with open(csv_save_path, 'wb') as f:
            f.write(gzip.decompress(r.content))

        print('State: ' + states[i] + ', Number ' + str(file + 1) + ' of ' + str(len(filenames)) + ' has been saved: ' + filenames[file][:-3])

        #DATABASE SET UP

        #file name without its extensions (.csv.gz); used for both the table and the index name
        index_name = filenames[file][:-7]

        #database table name - same as filename, but remove extensions (.csv.gz)
        table_name = schema_name + '.' + index_name

        #NOTE(review): the names below come from the remote file listing and are formatted
        #into the SQL text unquoted - confirm the listing is trusted
        drop_table = ('DROP TABLE IF EXISTS {};'.format(table_name))
        create_table = ('CREATE TABLE {}(w_geocode text, h_geocode text, s000 int, sa01 int, sa02 int, sa03 int, se01 int, se02 int, se03 int, si01 int, si02 int, si03 int, createdate text);'.format(table_name))
        #reference: https://www.mydatahack.com/how-to-bulk-load-data-into-postgresql-with-python/
        copy_table = "copy {} from STDIN CSV HEADER QUOTE '\"'".format(table_name)
        create_index = ('CREATE INDEX idx_{}_wh_geocode ON {} (w_geocode, h_geocode);'.format(index_name, table_name))

        #Run drop/create/populate/index as one transaction per file.
        #After any failed statement PostgreSQL rejects further statements until a rollback,
        #so on error the file's changes are rolled back and the loop moves on to the next file.
        #'failure' always holds the message for the step that is about to run.
        failure = "Error. Table (if exists) not dropped."
        try:
            #drop table if exists
            cur.execute(drop_table)
            print('Table dropped if exists.')

            #create table
            failure = "Error. Table not created."
            cur.execute(create_table)
            print('Table created.')

            #populate table from csv; the with-block closes the csv file afterwards
            failure = "Error. Database not populated."
            with open(csv_save_path, "r") as f:
                cur.copy_expert(copy_table, f)
            print('Table populated.')

            #create an index
            failure = "Error. Index not created."
            cur.execute(create_index)
            print('Index created.')
        except Exception as error:
            #unlike a bare except, this lets KeyboardInterrupt stop the run
            conn.rollback()
            print(failure + ' ' + str(error))
        else:
            #commit each file as soon as it is loaded, so files reported as done are really stored
            conn.commit()

    #the resume offset only applies to the first state processed;
    #every following state starts at its first file
    file_num = 0

In [None]:
#COMMIT AND CLOSE DATABASE
#commit changes to the database
conn.commit()

#close the cursor first, while its connection is still open
cur.close()

#close the database connection
conn.close()

### Delete unnecessary local files and folders

In [None]:
#delete LODES csv files
#reference: https://linuxize.com/post/python-delete-files-and-directories/

#os.path.join keeps the search inside abs_path; when abs_path is '' the pattern is
#relative to the working directory instead of starting at the filesystem root
files = glob.glob(os.path.join(abs_path, 'lodes_csv', '**', '*.csv'), recursive=True)

for f in files:
    try:
        os.remove(f)
    except OSError as e:
        #report the file that could not be removed and keep going
        print("Error: %s : %s" % (f, e.strerror))

In [None]:
#delete associated state csv folders
for folder in csv_folders:
    #os.path.join inserts the separator when abs_path does not end with one
    dir_path = os.path.join(abs_path, folder)

    try:
        #os.rmdir only removes empty folders; anything else is reported below
        os.rmdir(dir_path)
    except OSError as e:
        print("Error: %s : %s" % (dir_path, e.strerror))

In [None]:
#delete main csv folder
#os.path.join inserts the separator when abs_path does not end with one
main_csv_folder = os.path.join(abs_path, 'lodes_csv')

try:
    #os.rmdir only removes an empty folder; anything else is reported below
    os.rmdir(main_csv_folder)
except OSError as e:
    #report the folder this cell tried to remove
    print("Error: %s : %s" % (main_csv_folder, e.strerror))
