# Data loader 

This notebook loads the CSV as downloaded and saves
it to a PostgreSQL table with only a few minor changes (column renames).
Additional column tweaking is done in Data_clean.ipynb.

In [1]:
import pandas as pd
import csv

In [2]:
%load_ext sql

In [3]:
!psql -c"CREATE DATABASE bankcalls;"

CREATE DATABASE


In [4]:
%sql postgresql://localhost/bankcalls

'Connected: @bankcalls'

In [5]:
%%sql
CREATE TABLE bank_addl (
    -- Raw landing table for the bank-additional CSV. Apart from the id,
    -- column names are the CSV header names after the renames done by fix_names.
    bank_addl_id SERIAL UNIQUE,  -- generated by SERIAL; presumably absent from the CSV header, so the INSERTs omit it
    age INT,
    job VARCHAR(15),
    marital VARCHAR(10),
    education VARCHAR(20),
    in_default VARCHAR(10),  -- CSV column "default", renamed because default is an SQL keyword
    housing VARCHAR(10),
    loan VARCHAR(10),
    contact VARCHAR(10),
    month VARCHAR(5),
    day_of_week VARCHAR(5),
    duration INT,
    campaign INT,
    pdays INT,
    previous INT,
    poutcome VARCHAR(15),
    -- NOTE(review) the names below presumably contain periods in the CSV header
    -- (fix_names turns periods into underscores); confirm against the file.
    emp_var_rate FLOAT,
    cons_price_idx FLOAT,
    cons_conf_idx FLOAT,
    euribor3m FLOAT,
    nr_employed FLOAT,
    success VARCHAR(5)    -- CSV target column "y", renamed to indicate a successful subscription
);

 * postgresql://localhost/bankcalls
Done.


[]

In [6]:
# Location of the downloaded CSV. The two parts are joined by plain string
# concatenation when the file is opened, so dirname must keep its trailing slash.
dirname = "../data/bank-additional/"
filename = "bank-additional-full.csv"

In [7]:
# Rename a few columns from the CSV file:
# - "default" is an SQL keyword, so the loan-in-default column becomes "in_default"
# - the target variable 'y' is a cryptic column name, so it becomes
#   'success' to indicate a successful customer subscription
# - the period is also SQL syntax, so every period becomes an underscore
def fix_names(csv_names):
    """Map CSV header names to the column names used in the postgres table.

    Two names get a dedicated replacement ("default" -> "in_default",
    "y" -> "success"); in every other name each period is replaced by an
    underscore.

    Parameters
    ----------
    csv_names : iterable of str
        Column names as read from the CSV header.

    Returns
    -------
    list of str
        The translated names, in the same order as the input.
    """
    explicit_renames = {"default": "in_default", "y": "success"}
    return [
        explicit_renames[original] if original in explicit_renames
        else original.replace('.', '_')
        for original in csv_names
    ]
        

In [8]:
# Explicitly wraps in single quotes the values that SQL is supposed to
# interpret as strings
def quote_strings(row_df):
    """Turn the categorical values of one row into SQL string literals, in place.

    Each value found under one of the categorical row labels is wrapped in
    single quotes. Single quotes embedded in a value are doubled first (the
    standard SQL escape), so a value such as o'brien becomes 'o''brien'
    instead of producing a malformed statement.

    Parameters
    ----------
    row_df : pandas.DataFrame
        Frame indexed by postgres column name and holding the row's values as
        strings. All categorical labels listed below must be present in the
        index, otherwise the .loc lookup raises KeyError.

    Returns
    -------
    None
        row_df is modified in place.
    """
    categorical_columns = [ 'job', 'marital', 'education',
                            'in_default', 'housing',
                            'loan', 'contact', 'month',
                            'day_of_week', 'poutcome', 'success' ]
    raw_values = row_df.loc[categorical_columns]
    # Double embedded single quotes so each value stays a single SQL literal.
    escaped_values = raw_values.apply(
        lambda column: column.map(lambda value: value.replace("'", "''"))
    )
    row_df.loc[categorical_columns] = "'" + escaped_values + "'"
    
    

In [9]:
# 41188 PostgreSQL feedback strings (one per INSERT) are not pretty,
# so feedback is switched off for the bulk load.
# The %%capture in the next cell captures even more output.
# autocommit is switched off as well; the load is committed by an explicit
# "commit;" in a later cell.
%config SqlMagic.feedback   = False
%config SqlMagic.autocommit = False

In [10]:
%%capture
with open(dirname + filename, newline='') as csvfile:

    # figure out how to read the csv file
    dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=';')
    csvfile.seek(0)
    reader = csv.reader(csvfile, dialect)

    # pull off the header and turn it into my postgres column names
    header = next(reader)
    postgres_names = fix_names(header)
    postgres_columns = ", ".join(postgres_names)

    for row in reader:
        row_df = pd.DataFrame(index=postgres_names, data=row)
        quote_strings(row_df)
        row_values = ", ".join(row_df.loc[:, 0])
        command = ("INSERT INTO bank_addl ( " + postgres_columns + ") " +
                   " VALUES (" + row_values + " );")
        result = %sql $command
        if result == None:
            raise Exception('SQL command failed>', command)


In [11]:
# Switch feedback and autocommit back on now that the bulk load has run.
%config SqlMagic.feedback = True
%config SqlMagic.autocommit = True


In [12]:
%sql commit; select count(*) from bank_addl;

 * postgresql://localhost/bankcalls
Done.
1 rows affected.


count
41188
