In [18]:
import pandas as pd
import numpy as np
import csv

In [19]:
# Load the `sql` IPython extension, which provides the %sql / %%sql magics
# used by the query cells in this notebook.
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [20]:
# Connect to the `bankcalls` PostgreSQL database on localhost; the %sql cells
# that follow run against this connection.
%sql postgresql://localhost/bankcalls

'Connected: @bankcalls'

In [21]:
# Have every %sql / %%sql query return a pandas DataFrame directly, instead of
# a result set that must be converted to a DataFrame by hand each time.
%config SqlMagic.autopandas = True

# Overview of categorical columns

In [22]:
%%sql
SELECT column_name, data_type
FROM information_schema.columns
-- keep only the text (varchar) columns of the bank_addl table
WHERE table_name = 'bank_addl'
  AND data_type = 'character varying';

 * postgresql://localhost/bankcalls
11 rows affected.


Unnamed: 0,column_name,data_type
0,job,character varying
1,marital,character varying
2,education,character varying
3,in_default,character varying
4,housing,character varying
5,loan,character varying
6,contact,character varying
7,month,character varying
8,day_of_week,character varying
9,poutcome,character varying


In [23]:
text_cols = _.column_name

In [29]:
# Get the values for each categorical column
# Print the number of values for each category
category_values = {}
for column in text_cols:
    distinct = %sql SELECT distinct($column) FROM bank_addl;
    category_values[column] =  (distinct.loc[:,column])
    
for column in text_cols:
    print(column, len(category_values[column]))
    

 * postgresql://localhost/bankcalls
12 rows affected.
 * postgresql://localhost/bankcalls
4 rows affected.
 * postgresql://localhost/bankcalls
8 rows affected.
 * postgresql://localhost/bankcalls
3 rows affected.
 * postgresql://localhost/bankcalls
3 rows affected.
 * postgresql://localhost/bankcalls
3 rows affected.
 * postgresql://localhost/bankcalls
2 rows affected.
 * postgresql://localhost/bankcalls
10 rows affected.
 * postgresql://localhost/bankcalls
5 rows affected.
 * postgresql://localhost/bankcalls
3 rows affected.
 * postgresql://localhost/bankcalls
2 rows affected.
job 12
marital 4
education 8
in_default 3
housing 3
loan 3
contact 2
month 10
day_of_week 5
poutcome 3
success 2


So the column with the largest number of distinct categorical values is job, which has 12 values. Interestingly, there are only
10 distinct months in the dataset, so either it covers less than a year's worth of data or the bank only does telemarketing
10 months out of the year.

# Make some integer columns where categories might be ordinal.

## Months

In [31]:
# Inspect which month abbreviations actually occur in the data
category_values['month']

0    oct
1    apr
2    jul
3    may
4    nov
5    dec
6    aug
7    jun
8    mar
9    sep
Name: month, dtype: object

In [32]:
# Month abbreviations in calendar order, paired with their 1-based month
# number (jan -> 1 ... dec -> 12).
months = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]
month_map = dict(zip(months, range(1, len(months) + 1)))

In [33]:
# Display the mapping to confirm each abbreviation got the expected number
month_map

{'jan': 1,
 'feb': 2,
 'mar': 3,
 'apr': 4,
 'may': 5,
 'jun': 6,
 'jul': 7,
 'aug': 8,
 'sep': 9,
 'oct': 10,
 'nov': 11,
 'dec': 12}

In [34]:
%%sql
ALTER TABLE bank_addl
-- IF NOT EXISTS keeps this cell re-runnable once the column has been created
ADD COLUMN IF NOT EXISTS month_num int;

 * postgresql://localhost/bankcalls
Done.


In [35]:
for m in months:
    qm = "'" + m + "'"
    n = month_map[m]
    %sql  UPDATE bank_addl SET month_num = $n WHERE month = $qm;

 * postgresql://localhost/bankcalls
0 rows affected.
 * postgresql://localhost/bankcalls
0 rows affected.
 * postgresql://localhost/bankcalls
546 rows affected.
 * postgresql://localhost/bankcalls
2632 rows affected.
 * postgresql://localhost/bankcalls
13769 rows affected.
 * postgresql://localhost/bankcalls
5318 rows affected.
 * postgresql://localhost/bankcalls
7174 rows affected.
 * postgresql://localhost/bankcalls
6178 rows affected.
 * postgresql://localhost/bankcalls
570 rows affected.
 * postgresql://localhost/bankcalls
718 rows affected.
 * postgresql://localhost/bankcalls
4101 rows affected.
 * postgresql://localhost/bankcalls
182 rows affected.


## Day of week

Apparently the bank telemarketing department doesn't work weekends.

In [36]:
# Inspect which weekday abbreviations actually occur in the data
category_values['day_of_week']

0    mon
1    thu
2    tue
3    fri
4    wed
Name: day_of_week, dtype: object

In [37]:
# Working-day abbreviations in week order, paired with their 1-based position
# (mon -> 1 ... fri -> 5). The last line displays the resulting mapping.
days = ['mon', 'tue', 'wed', 'thu', 'fri']
day_map = {abbr: position for position, abbr in enumerate(days, start=1)}
day_map

{'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}

In [38]:
%%sql
ALTER TABLE bank_addl
-- IF NOT EXISTS keeps this cell re-runnable once the column has been created
ADD COLUMN IF NOT EXISTS day_of_week_num int;

 * postgresql://localhost/bankcalls
Done.


In [39]:
for d in days:
    qd = "'" + d + "'"
    n = day_map[d]
    %sql UPDATE bank_addl SET day_of_week_num = $n WHERE day_of_week = $qd;

 * postgresql://localhost/bankcalls
8514 rows affected.
 * postgresql://localhost/bankcalls
8090 rows affected.
 * postgresql://localhost/bankcalls
8134 rows affected.
 * postgresql://localhost/bankcalls
8623 rows affected.
 * postgresql://localhost/bankcalls
7827 rows affected.


In [42]:
# Quick check that my updates affected all rows
%sql SELECT * FROM bank_addl WHERE day_of_week_num IS NULL OR  month_num IS NULL;

 * postgresql://localhost/bankcalls
0 rows affected.


In [43]:
# Explicitly commit the ALTER TABLE / UPDATE changes made above. The
# connection is expected to be in autocommit mode already (not verified
# here), in which case this is a harmless no-op.
%sql commit;

 * postgresql://localhost/bankcalls
Done.
