# Code to import census CSVs to SQL

### Settings

In [61]:
# Directory holding the census CSV files; passed to get_files() below
inputpath = 'to_import'
# Directory for the generated .sql files (same value as the make_table_schemas default)
outputpath = 'sql'
# SQL dialect for csvsql (same value as the make_table_schemas default)
flavour = 'mysql'
# NOTE(review): not read by any code visible in this notebook - confirm whether it is still needed
insertdata = False

## Get the CSV Files and make .SQL files from them

#### Function to get csv files to parse

In [62]:
def get_files(inputpath):
    """Return the names of the regular files directly inside ``inputpath``.

    Sub-directories are skipped and the names are returned without the
    directory prefix.

    Args:
        inputpath: Directory to list.

    Returns:
        A list of file names. If the directory cannot be listed (missing,
        not a directory, permission denied, bad argument type) a message is
        printed and an empty list is returned, so callers can always iterate
        over the result.
    """
    # Imported locally so the function does not rely on another notebook
    # cell having been run first.
    from os import listdir
    from os.path import isfile, join

    try:
        return [f for f in listdir(inputpath) if isfile(join(inputpath, f))]
    except (OSError, TypeError) as err:
        print('Couldn\'t get files for some reason: ' + str(err))
        return []

In [63]:
# Test get_files(): list the input directory and show what was found.
# The resulting `files` list is reused by later cells.
files = get_files(inputpath)
print(files)

['2016Census_G01_AUS_SA4.csv', '2016Census_G02_AUS_SA4.csv', '2016Census_G03_AUS_SA4.csv', '2016Census_G04A_AUS_SA4.csv', '2016Census_G04B_AUS_SA4.csv']


#### Functions to actually make the schemas

In [64]:
def make_table_schema(file, inputpath, outputpath, flavour):
    """Build the shell command that writes a CREATE TABLE script for one CSV.

    The command runs csvkit's ``csvsql`` on ``inputpath/file`` and redirects
    its output to ``outputpath/<file without extension>.sql``. Nothing is
    executed here; the command string is returned for the caller to run.

    Args:
        file: CSV file name, relative to ``inputpath``.
        inputpath: Directory containing the CSV.
        outputpath: Directory that receives the generated ``.sql`` file.
        flavour: SQL dialect passed to ``csvsql --dialect`` (e.g. 'mysql').

    Returns:
        The command string, or None (after printing a message) when the
        arguments are not strings.
    """
    # Imported locally so the function works regardless of cell order.
    from os.path import splitext
    from shlex import quote

    try:
        source = inputpath + '/' + file
        # splitext removes whatever extension is present instead of blindly
        # chopping the last four characters.
        target = outputpath + '/' + splitext(file)[0] + '.sql'

        # The command is run through a shell, so every caller-supplied value
        # is quoted; names without special characters come out unchanged.
        parts = [
            'csvsql',
            # set to mysql, postgresql, etc
            '--dialect', quote(flavour),
            # NOTE(review): csvkit documents --snifflimit as a byte limit for
            # CSV dialect sniffing, not a row count for type guessing -
            # confirm this is the option that was intended.
            '--snifflimit', '10000',
            # feed input and output paths
            quote(source),
            '>',
            quote(target),
        ]
    except TypeError:
        print('Couldn\'t make command to build table schema for file: ' + str(file))
        return None

    return ' '.join(parts)

def make_table_schemas(files, inputpath='to_import', outputpath='sql', flavour='mysql'):
    """Generate a .sql schema file for every CSV named in ``files``.

    For each file the csvsql command is built with make_table_schema() and
    run through the shell. One status line is printed per file.

    Args:
        files: Iterable of CSV file names (None is treated as empty).
        inputpath: Directory containing the CSV files.
        outputpath: Directory that receives the generated .sql files.
        flavour: SQL dialect handed to csvsql.
    """
    for file in files or []:
        try:
            command = make_table_schema(file, inputpath, outputpath, flavour)
            if command is None:
                # make_table_schema returns None when no command could be
                # built; there is nothing to run for this file.
                print('Fell over making schema for: ' + str(file))
                continue

            # os.system does not raise when the command fails; it only
            # returns the exit status, so that has to be checked explicitly.
            status = os.system(command)
        except Exception as err:
            print('Fell over making schema for: ' + str(file) + ' (' + str(err) + ')')
            continue

        if status == 0:
            print('Made schema for: ' + file)
        else:
            print('Fell over making schema for: ' + file + ' (exit status ' + str(status) + ')')

#### Go! Make the schemas

In [65]:
import csvkit
import os
from os import listdir
from os.path import isfile, join

# Pass the notebook settings explicitly so that changing inputpath,
# outputpath or flavour in the settings cell actually takes effect.
make_table_schemas(files, inputpath, outputpath, flavour)

Made schema for: 2016Census_G01_AUS_SA4.csv
Made schema for: 2016Census_G02_AUS_SA4.csv
Made schema for: 2016Census_G03_AUS_SA4.csv
Made schema for: 2016Census_G04A_AUS_SA4.csv
Made schema for: 2016Census_G04B_AUS_SA4.csv


## Execute the .SQL Files

#### Function to get the .SQL file names

In [66]:
def get_sql_files(path = 'sql'):
    """Return the names of the .sql files directly inside ``path``.

    Only regular files whose name ends in ``.sql`` (case-insensitive) are
    returned, so stray files in the directory are never executed as SQL.
    Names are returned without the directory prefix, sorted so the
    execution order is reproducible.

    Args:
        path: Directory to list.

    Returns:
        A sorted list of file names. If the directory cannot be listed a
        message is printed and an empty list is returned.
    """
    # Imported locally so the function does not rely on another notebook
    # cell having been run first.
    from os import listdir
    from os.path import isfile, join

    try:
        return sorted(
            f for f in listdir(path)
            if f.lower().endswith('.sql') and isfile(join(path, f))
        )
    except (OSError, TypeError) as err:
        print('Couldn\'t get the sql files for some reason: ' + str(err))
        return []

#### Function to read the contents of a .SQL file

In [67]:
def get_sql_file_contents(file, filepath=False):
    """Read a file and return its full text.

    Args:
        file: File name, or a complete path when ``filepath`` is not given.
        filepath: Optional directory; when truthy it is prepended to
            ``file`` with a '/' separator.

    Returns:
        The file contents as a string, or None (after printing a message)
        when the file cannot be opened or decoded.
    """
    if filepath:
        file = filepath + '/' + file

    try:
        # The context manager closes the handle even if read() raises.
        with open(file, 'r') as handle:
            return handle.read()
    except (OSError, UnicodeError) as err:
        print('Couldn\'t get the sql file contents for some reason: ' + str(file) + ' (' + str(err) + ')')
        return None

#### Function to monkey patch the average columns - explained in readme

In [68]:
def monkey_patch_averages(contents):
    """Give the two census "average" columns an explicit DECIMAL (4,2) type.

    Rewrites the bare ``DECIMAL NOT NULL`` definitions of the
    `Average_num_psns_per_bedroom` and `Average_household_size` columns in a
    generated SQL script and returns the patched text; everything else is
    left untouched.
    """
    # (bare definition, definition with precision and scale)
    substitutions = (
        ('`Average_num_psns_per_bedroom` DECIMAL NOT NULL',
         '`Average_num_psns_per_bedroom` DECIMAL (4,2) NOT NULL'),
        ('`Average_household_size` DECIMAL NOT NULL',
         '`Average_household_size` DECIMAL (4,2) NOT NULL'),
    )

    patched = contents
    for bare, sized in substitutions:
        patched = patched.replace(bare, sized)
    return patched

#### Function to execute the .SQL files contents

In [69]:
def execute_sql_file_contents(connection, contents):
    """Execute ``contents`` on ``connection`` and commit.

    Args:
        connection: Database connection whose ``cursor()`` result can be
            used as a context manager and which provides ``commit()`` and
            ``rollback()``.
        contents: SQL text to execute.

    Returns:
        True when the SQL ran and was committed. False when anything failed;
        in that case the error is printed and a rollback is attempted.
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute(contents)

        connection.commit()
    except Exception as err:
        # Report what went wrong instead of a bare 'SQL Error'.
        print('SQL Error: ' + str(err))
        try:
            # Leave the connection usable for the next file.
            connection.rollback()
        except Exception as rollback_err:
            print('SQL Error: rollback failed: ' + str(rollback_err))
        return False

    return True

### Actually run it all - Execute the .SQL Files

In [70]:
import pymysql
from db import host, port, user, password, db
# these variables are stored in the file db.py so they are never committed to Git!
# you could define them here if you are just using Jupyter on a laptop

# make sql connection
connection = pymysql.connect(host=host,
                             port=port,
                             user=user,
                             password=password,
                             db=db,
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

try:
    # get the list of sql files we made before
    # (treat a failed lookup the same as an empty directory)
    sql_files = get_sql_files() or []
    if sql_files:
        print('For instance we found this one: ' + sql_files[0])
    else:
        print('No sql files found')

    # loop through them
    for sql_file in sql_files:
        try:
            # read them
            query = get_sql_file_contents(sql_file, filepath='sql')
            query = monkey_patch_averages(query)
            execute_sql_file_contents(connection, query)
        except Exception as err:
            # actually report the failure, then carry on with the next file
            print('Failed to execute sql for some reason: ' + sql_file + ' (' + str(err) + ')')
finally:
    # close connection, even if something above raised
    connection.close()

For instance we found this one: 2016Census_G01_AUS_SA4.sql


## Write data from the .CSVs into the tables

#### Function to create a mysql connection string to use with Pandas

In [71]:
def create_mysql_engine_string(user,password,host,db,port=3306):
    """Build a ``mysql://user:password@host:port/db`` connection URL.

    The user name and password are percent-encoded so that characters with
    a meaning inside a URL ('@', ':', '/', ...) cannot corrupt it; values
    made only of letters, digits and ``_.-`` come out unchanged.

    NOTE(review): the bare ``mysql://`` scheme leaves the driver choice to
    the library that consumes the URL, while the rest of this notebook
    connects through pymysql - confirm whether ``mysql+pymysql://`` was
    intended.

    Args:
        user: Database user name.
        password: Database password.
        host: Server host name or address.
        db: Database (schema) name.
        port: Server port; converted to text when the URL is built.

    Returns:
        The connection URL as a string.
    """
    # Imported locally so the function works regardless of cell order.
    from urllib.parse import quote_plus

    return 'mysql://{user}:{password}@{host}:{port}/{db}'.format(
        user=quote_plus(user),
        password=quote_plus(password),
        host=host,
        port=port,
        db=db,
    )

#### Function to take a filename, read it into a Pandas dataframe, and write that dataframe to a mysql table

In [72]:
def insert_into_mysql(file, engine, path=False):
    """Append the rows of one CSV file to the table named after the file.

    The table name is the file name without its extension. The CSV is read
    into a pandas DataFrame (first row = column headers) and written with
    ``DataFrame.to_sql(..., if_exists='append', index=False)``.

    Args:
        file: CSV file name.
        engine: Connection object handed to ``DataFrame.to_sql`` as ``con``.
        path: Optional directory; when truthy it is prepended to ``file``
            with a '/' separator.

    Returns:
        True when the rows were written. False when reading or writing
        failed; in that case the error is printed.
    """
    # Imported locally so the function works regardless of cell order.
    from os.path import splitext

    try:
        # strip the extension, whatever its length
        tablename = splitext(file)[0]

        if path:
            file = path + '/' + file

        #header=0 makes it treat the first row as headers
        df = pandas.read_csv(file, header=0, sep=',')

        #if_exists = append means insert into
        #index=False means don't try to write the Pandas index as a column
        df.to_sql(con=engine, name=tablename, if_exists='append', index=False)
    except Exception as err:
        # Say which file failed and why instead of a generic message.
        print('Error: Couldn\'t insert into mysql: ' + str(file) + ' (' + str(err) + ')')
        return False

    return True

## Actually run it - Insert data into the tables

In [73]:
from sqlalchemy import create_engine
import pandas

# these both exist if you ran these in one hit, but here in case
inputpath = 'to_import'
# treat a failed lookup the same as an empty directory
files = get_files(inputpath) or []
if files:
    print('for instance we found: ' + files[0])

# again, user, password et al are defined in db.py
# build one SQLAlchemy engine up front and share it across all files,
# rather than handing pandas a raw URL string for every insert
engine = create_engine(create_mysql_engine_string(user,password,host,db,port))

# for each .csv, append its rows to the matching table
for file in files:
    insert_into_mysql(file, engine, path=inputpath)
    # NOTE: insert_into_mysql prints its own error and does not raise,
    # so this line is reached whether or not the insert succeeded
    print('Inserted data for: ' + file)

# release the pooled database connections
engine.dispose()

for instance we found: 2016Census_G01_AUS_SA4.csv
Inserted data for: 2016Census_G01_AUS_SA4.csv
Inserted data for: 2016Census_G02_AUS_SA4.csv
Inserted data for: 2016Census_G03_AUS_SA4.csv
Inserted data for: 2016Census_G04A_AUS_SA4.csv
Inserted data for: 2016Census_G04B_AUS_SA4.csv
