In [1]:
# Get pandas and postgres to work together
import os
import pickle

import numpy as np
import pandas as pd
import pandas.io.sql as pd_sql
import psycopg2 as pg

# We are also going to do some basic viz
import matplotlib.pyplot as plt
%matplotlib inline

  return f(*args, **kwds)


In [2]:
# Plot styling is kept in its own cell: per the author's note, rc parameters
# cannot be set in the same cell as the "%matplotlib inline" magic command.
plt.style.use('ggplot')
plt.rcParams['font.size'] = 18

In [3]:
# Postgres connection settings.
# Each value can be overridden through an environment variable (PGHOST, PGUSER,
# PGDATABASE, PGPORT) so the notebook does not need editing when the server
# address changes; the literals below are the defaults used when a variable is
# not set.

connection_args = {
    'host': os.environ.get('PGHOST', '54.153.60.68'),   # server address
    'user': os.environ.get('PGUSER', 'ubuntu'),         # username
    'dbname': os.environ.get('PGDATABASE', 'airbnb'),   # DB that we are connecting to
    'port': int(os.environ.get('PGPORT', 5432)),        # port opened on AWS
}

# ** unpacks the dict into keyword arguments: pg.connect(host=..., user=..., ...)
connection = pg.connect(**connection_args)

In [4]:
# Open a cursor on the connection; every SQL statement below is executed through it.
cursor = connection.cursor()

In [110]:
# Create the users table, keyed by user id.
# The date-like fields are declared as TEXT / NUMERIC here; later cells
# convert them to TIMESTAMP with ALTER TABLE.
query = """
CREATE TABLE trainuser (
  id VARCHAR(10) PRIMARY KEY,
  date_account_created TEXT,
  timestamp_first_active NUMERIC,
  date_first_booking TEXT,
  gender TEXT,
  age DOUBLE PRECISION,
  signup_method TEXT,
  signup_flow INT,
  language TEXT,
  affiliate_channel TEXT,
  affiliate_provider TEXT,
  first_affiliate_tracked TEXT,
  signup_app TEXT,
  first_device_type TEXT,
  first_browser TEXT,
  country_destination TEXT
);
"""
cursor.execute(query)

In [109]:
# Manual transaction control, kept commented out; uncomment one line to run it.
# NOTE(review): no other cell shown here issues a commit, so the DDL/UPDATE
# statements in this notebook only persist if the commit below is run by hand.
#cursor.execute('commit;')
#cursor.execute("rollback;")

In [111]:
# Create the session log table: one row per recorded user action.
# user_id is a plain VARCHAR(10) with no primary key or foreign key declared.
query = """
CREATE TABLE session (
  user_id VARCHAR(10),
  action TEXT,
  action_type TEXT,
  action_detail TEXT,
  device_type TEXT,
  secs_elapsed REAL
);
"""
cursor.execute(query)

In [None]:
# Create the destination-country lookup table, keyed by the two-letter
# country_destination code.
query = """
CREATE TABLE country (
  country_destination VARCHAR(2) PRIMARY KEY,
  lat_destination REAL,
  lng_destination REAL,
  distance_km REAL,
  destination_km2 REAL,
  destination_language VARCHAR(3),
  language_levenshtein_distance REAL
);
"""
cursor.execute(query)

In [None]:
# Create the population table; country_destination is a foreign key to the
# country table, so country must be created first.
query = """
CREATE TABLE population (
  age_bucket TEXT,
  country_destination VARCHAR(2) REFERENCES country,
  gender TEXT,
  population_in_thousands REAL,
  year REAL
);
"""
cursor.execute(query)

In [None]:
# Notes on loading the CSV data into Postgres. This is a bare string, not SQL
# executed by the notebook: the commands are meant to be run by hand.
# NOTE(review): the scp line names user_subset.csv while the COPY line reads
# session_subset.csv — confirm which files were actually transferred.
"""
To copy the data into the database
scp user_subset.csv myaws:.

Log on the database and run command
COPY session FROM '/home/ubuntu/session_subset.csv' DELIMITER ',' CSV HEADER;
"""

## Data cleaning

In [8]:
def get_table_info(cursor, table):
    """Return the columns of a table as (column_name, data_type) pairs.

    The pairs are returned in the table's declared column order
    (``ordinal_position``), so the names can be used directly as labels for
    rows fetched with ``SELECT *`` from that table.

    Parameters
    ----------
    cursor : DB-API cursor
        Open cursor; its ``execute`` must accept ``%s`` placeholders
        (the psycopg2 parameter style).
    table : str
        Table name to look up in ``information_schema.columns``.

    Returns
    -------
    list of tuple
        One ``(column_name, data_type)`` tuple per column; empty if no table
        with that name is visible.
    """
    # The table name is passed as a bound parameter rather than formatted into
    # the SQL text, so quoting and escaping are handled by the driver.
    query = """
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_name = %s
    ORDER BY table_schema, ordinal_position
    """

    cursor.execute(query, (table,))
    return cursor.fetchall()

## Convert the date_account_created to date

In [118]:
# Convert date_account_created from text to timestamp, for further processing.
# The existing text values are parsed with to_date using the 'YYYY-MM-DD' format.
query = """
ALTER TABLE trainuser ALTER COLUMN date_account_created TYPE TIMESTAMP
using to_date(date_account_created, 'YYYY-MM-DD');
"""

cursor.execute(query)

In [119]:
# Add an integer column that will hold the account-creation date as a
# continuous month count (filled in by the following UPDATE cell).
query = """
ALTER TABLE trainuser ADD COLUMN account_age INT
"""

cursor.execute(query)

In [120]:
# account_age = 12 * (year - 1970) + month-of-year of date_account_created,
# i.e. a month counter starting from 1970.
# The expression is cast to int8 although the target column is declared INT.
query = """
UPDATE trainuser 
SET account_age = cast(12 * (date_part('year', date_account_created) - 1970) + date_part('month', date_account_created) AS int8)
"""

cursor.execute(query)

## Convert timestamp_first_active

In [121]:
## Convert timestamp_first_active from a numeric YYYYMMDDHH24MISS value to a timestamp.
# to_timestamp (rather than to_date) is used so the hour/minute/second part of
# the value is kept; to_date would truncate every value to midnight.
# to_timestamp yields a timestamp with time zone, hence the explicit cast to
# the column's new TIMESTAMP type.
query = """
ALTER TABLE trainuser ALTER COLUMN timestamp_first_active TYPE TIMESTAMP
using cast(to_timestamp(cast(timestamp_first_active as text), 'YYYYMMDDHH24MISS') as timestamp);
"""

cursor.execute(query)

In [122]:
# Add an integer column that will hold timestamp_first_active as a
# continuous month count (filled in by the following UPDATE cell).
query = """
ALTER TABLE trainuser ADD COLUMN active_age INT
"""

cursor.execute(query)

In [123]:
# active_age = 12 * (year - 1970) + month-of-year of timestamp_first_active,
# the same month counter used for account_age.
query = """
UPDATE trainuser 
SET active_age = cast(12 * (date_part('year', timestamp_first_active) - 1970) + date_part('month', timestamp_first_active) AS int8)
"""

cursor.execute(query)

## Convert date_first_booking

In [124]:
# Convert date_first_booking from text ('YYYY-MM-DD') to timestamp.
# Author's observation: PostgreSQL appears to cope with the missing values in
# this column without any special handling.
query = """
ALTER TABLE trainuser ALTER COLUMN date_first_booking TYPE TIMESTAMP
using to_date(date_first_booking, 'YYYY-MM-DD');
"""

cursor.execute(query);

In [125]:
# Add an integer column that will hold date_first_booking as a continuous
# month count (filled in by the following UPDATE cell).
query = """
ALTER TABLE trainuser ADD COLUMN booking_age INT
"""

cursor.execute(query)

In [126]:
# booking_age = 12 * (year - 1970) + month-of-year of date_first_booking.
# Rows without a booking date (NULL) are assigned 0 instead of staying NULL.
query = """
UPDATE trainuser
SET booking_age = CASE
    WHEN date_first_booking IS NULL THEN 0
    ELSE cast(12 * (date_part('year', date_first_booking) - 1970) + date_part('month', date_first_booking) AS int8)
    END;
"""

cursor.execute(query)

In [134]:
# Peek at five rows of trainuser. The DataFrame is built without column
# names, so the columns are displayed as positional integers.
query = """
SELECT *
FROM trainuser
LIMIT 5;
"""

cursor.execute(query)
pd.DataFrame(cursor.fetchall())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,6fsk4w7bya,2014-06-13,20140613045344,2014-06-13,MALE,,basic,0,en,sem-brand,google,omg,Web,iPad,Mobile Safari,US
1,atws2zzacv,2013-10-09,20131009180549,,-unknown-,,basic,0,en,direct,direct,untracked,Web,iPad,Mobile Safari,NDF
2,tleefrh3ox,2012-07-05,20120705233747,,FEMALE,,basic,0,en,other,other,omg,Web,Windows Desktop,Firefox,NDF
3,8b0lo9judh,2010-10-03,20101003021957,2010-10-03,-unknown-,,basic,3,en,sem-non-brand,vast,,Web,Other/Unknown,-unknown-,CA
4,nwjtkv07ef,2012-05-10,20120510151537,,-unknown-,,basic,0,en,direct,direct,product,Web,Mac Desktop,Safari,NDF


In [132]:
# Issue a ROLLBACK through the cursor.
# NOTE(review): this looks like leftover recovery from a failed statement; it
# also discards any uncommitted changes — confirm it belongs in the run order.
cursor.execute("rollback")

## Check gender column

In [None]:
# Count users per distinct gender value.
query = """
SELECT gender, count(*)
FROM trainuser
GROUP BY gender
"""

cursor.execute(query)
cursor.fetchall()

# It is clear that gender needs further data cleaning

## Check age range

In [None]:
# Count users per age, with age cast to an integer, listed from the highest
# age downwards.
query = """
SELECT cast(age as int8) as age_int, count(*)
FROM trainuser
GROUP BY cast(age as int8)
ORDER BY cast(age as int8) DESC
"""

cursor.execute(query)
cursor.fetchall()

# A lot of wrong values and missing values

## Check signup method

In [6]:
def check_col_hist(cursor, table, column):
    """Tabulate how often each distinct value of `column` occurs in `table`.

    The table and column names are formatted directly into the SQL text, so
    they must be trusted identifiers.

    Returns a DataFrame with one row per distinct value (value, row count),
    ordered by descending count. The columns carry no names and are labelled
    0 and 1.
    """
    template = """
    SELECT {}, count(*)
    FROM {}
    GROUP BY {}
    ORDER BY count DESC
    """
    statement = template.format(column, table, column)

    cursor.execute(statement)
    rows = cursor.fetchall()
    return pd.DataFrame(rows)

In [128]:
# Value counts of signup_method, most frequent first.
check_col_hist(cursor, 'trainuser', 'signup_method')

Unnamed: 0,0,1
0,basic,15269
1,facebook,6014
2,google,62


## Check signup flow

In [None]:
# Value counts of signup_flow, most frequent first.
check_col_hist(cursor, 'trainuser', 'signup_flow')

# What do these signup flow numbers mean? Index of signup pages - treat it as categorical variable

## Check language

In [None]:
# Value counts of language, most frequent first.
check_col_hist(cursor, 'trainuser','language')

In [None]:
# Value counts of affiliate_channel, most frequent first.
check_col_hist(cursor, 'trainuser','affiliate_channel')

#sem: search engine marketing
#seo: SEO is a marketing discipline focused on growing visibility in organic (non-paid) search engine results.


In [None]:
# Value counts of affiliate_provider, most frequent first.
check_col_hist(cursor, 'trainuser','affiliate_provider')

In [None]:
# Value counts of first_affiliate_tracked, most frequent first.
check_col_hist(cursor, 'trainuser','first_affiliate_tracked')

In [None]:
# Value counts of signup_app, most frequent first.
check_col_hist(cursor, 'trainuser', 'signup_app')

In [None]:
# Value counts of first_device_type, most frequent first.
check_col_hist(cursor, 'trainuser', 'first_device_type')

In [None]:
# Value counts of first_browser, most frequent first.
check_col_hist(cursor, 'trainuser', 'first_browser')

In [None]:
# Value counts of country_destination (the column later used as the target),
# most frequent first.
check_col_hist(cursor, 'trainuser', 'country_destination')

## Add splitseed to trainuser data

In [136]:
# Add a REAL column that will hold a random number per user; later cells use
# it to split users into train (<= 0.7) and test (> 0.7) sets.
query = """
ALTER TABLE trainuser
ADD COLUMN splitseed REAL;
"""

cursor.execute(query)

In [137]:
# Assign every user a random splitseed value from Postgres random().
# NOTE(review): no seed is set in the visible cells, so the split is not
# reproducible, and re-running this cell reshuffles the train/test assignment.
query = """
UPDATE trainuser
SET splitseed=random();
"""

cursor.execute(query)

In [138]:
# Number of users with splitseed > 0.7 (the test side of the split).
query = """
SELECT COUNT(*)
FROM trainuser
WHERE splitseed>.7;
"""

cursor.execute(query)
cursor.fetchone()

(6418,)

In [139]:
# Number of users with splitseed <= 0.7 (the training side of the split).
query = """
SELECT COUNT(*)
FROM trainuser
WHERE splitseed<=.7;
"""

cursor.execute(query)
cursor.fetchone()

(14927,)

## Check session data

In [None]:
# Drop the session_stats view so the next cell can (re)create it.
# IF EXISTS makes this a no-op when the view is absent; a plain DROP VIEW
# would raise and leave the open transaction in an aborted state.
query = """
DROP VIEW IF EXISTS session_stats;
"""

cursor.execute(query)

In [None]:
# Create a per-user summary view of the session table: the mean of
# secs_elapsed and the number of session rows for each user_id.
query = """
CREATE VIEW session_stats AS
SELECT user_id,  avg(secs_elapsed) AS mean_time, count(*) AS session_counts
FROM session
GROUP BY user_id;
"""

cursor.execute(query)

In [None]:
# Show the five users with the most session rows.
query = """
SELECT *
FROM session_stats
ORDER BY session_counts DESC
LIMIT 5;
"""

cursor.execute(query)
pd.DataFrame(cursor.fetchall())

In [None]:
# Column names and data types of the session table.
get_table_info(cursor, 'session')

In [None]:
# Value counts of session.action, most frequent first.
check_col_hist(cursor, 'session', 'action')

In [None]:
# Value counts of session.action_type, most frequent first.
check_col_hist(cursor, 'session', 'action_type')

In [None]:
# Value counts of session.action_detail, most frequent first.
check_col_hist(cursor, 'session', 'action_detail')

In [None]:
# Value counts of session.device_type, most frequent first.
check_col_hist(cursor, 'session', 'device_type')

In [None]:
# Value counts of session.secs_elapsed, most frequent first.
# secs_elapsed is a REAL column, so this groups by each exact value rather
# than by bins.
check_col_hist(cursor, 'session', 'secs_elapsed')

In [100]:
# Total number of rows in the session table.
query = """
SELECT count(*)
FROM session;
"""

cursor.execute(query)
cursor.fetchone()

(10567737,)

In [101]:
# Total number of rows in the trainuser table.
query = """
SELECT count(*)
FROM trainuser;
"""

cursor.execute(query)
cursor.fetchone()

(213451,)

In [102]:
# Count session rows that belong to users with splitseed <= 0.1.
# The WHERE condition on trainuser.splitseed rejects the NULL-filled rows a
# LEFT JOIN produces for sessions without a matching user, so only sessions
# with a matching user are counted.
query = """
SELECT count(*)
FROM session LEFT JOIN trainuser
ON (session.user_id = trainuser.id)
WHERE trainuser.splitseed <= 0.1
;
"""

cursor.execute(query)
cursor.fetchone()

(564114,)

In [171]:
# Issue a ROLLBACK through the cursor.
# NOTE(review): this looks like leftover recovery from a failed statement; it
# also discards any uncommitted changes — confirm it belongs in the run order.
cursor.execute("rollback")

In [172]:
## Join the session and trainuser tables in a view.
# Every session row is kept (LEFT JOIN) and extended with the columns of the
# matching user. Column order in the view is: all session columns first,
# then all trainuser columns.

query = """
CREATE VIEW session_booking AS
SELECT session.*, trainuser.*
FROM session LEFT JOIN trainuser 
ON (session.user_id = trainuser.id);
"""

cursor.execute(query)

In [99]:
## Load the training-split rows (splitseed <= 0.7) of the joined view.
# NOTE: this fetches every matching row into memory; the recorded run of
# this cell ended in a KeyboardInterrupt.

query = """
SELECT *
FROM session_booking
WHERE splitseed <= 0.7;
"""

cursor.execute(query)
data = cursor.fetchall()

# Take the column names from the result set itself. The view lists the
# session columns first and the trainuser columns second, so labelling the
# rows with user columns followed by session columns would put every name on
# the wrong column.
session_booking_cols = [col[0] for col in cursor.description]

train_session = pd.DataFrame(data, columns=session_booking_cols)


KeyboardInterrupt: 

In [None]:
# Peek at five rows of session_booking_train.
# NOTE(review): no visible cell creates a table or view with this name (only
# session_booking is created) — confirm where session_booking_train comes from.
query = """
SELECT *
FROM session_booking_train
LIMIT 5
"""

cursor.execute(query)
pd.DataFrame(cursor.fetchall())

## Try to export booking data into python to have a look

In [12]:
# Load the training split of the users table (splitseed <= 0.7).
query = """
SELECT *
FROM trainuser
WHERE splitseed <= 0.7;
"""
cursor.execute(query)
train_data = cursor.fetchall()

# Column names are read from the executed query's own description, which
# always matches the order of the fetched tuples. A separate
# information_schema lookup carries no ordering guarantee.
user_cols = [col[0] for col in cursor.description]

train_user = pd.DataFrame(train_data, columns = user_cols)

In [13]:
# First rows of the training users frame.
train_user.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,splitseed
0,ppr8sr6z33,2014-06-15,20140615004156,,-unknown-,,basic,25,en,direct,direct,untracked,iOS,iPhone,-unknown-,NDF,0.478538
1,eiwzl341qa,2013-12-16,20131216153039,,-unknown-,,basic,25,en,direct,direct,linked,iOS,Mac Desktop,Safari,NDF,0.629914
2,lor9srems2,2014-04-07,20140407202112,,-unknown-,,basic,25,en,direct,direct,untracked,iOS,iPhone,-unknown-,NDF,0.18026
3,uk2031aezt,2014-01-11,20140111154240,,-unknown-,,basic,25,en,direct,direct,untracked,iOS,iPad,-unknown-,NDF,0.561323
4,q7k5aflli0,2013-06-01,20130601223530,,-unknown-,,basic,0,en,sem-non-brand,vast,omg,Web,Windows Desktop,Firefox,NDF,0.244916


In [17]:
# Target: destination country per user, as a one-column frame ('country')
# indexed by the user id.
y_train = pd.DataFrame(
    train_user['country_destination'].values,
    index=train_user['id'],
    columns=['country'],
)
# Features: every other trainuser column (id and splitseed included).
X_train = train_user.drop(columns='country_destination')

In [18]:
# First rows of the training target frame.
y_train.head()

Unnamed: 0_level_0,country
id,Unnamed: 1_level_1
ppr8sr6z33,NDF
eiwzl341qa,NDF
lor9srems2,NDF
uk2031aezt,NDF
q7k5aflli0,NDF


In [144]:
# Persist the training features as a pickle under ./data/processed/
# (the directory must already exist; open() does not create it).
with open('./data/processed/X_train.pkl', 'wb') as picklefile:
    pickle.dump(X_train, picklefile)

In [19]:
# Persist the training target as a pickle under ./data/processed/
# (the directory must already exist; open() does not create it).
with open('./data/processed/y_train.pkl', 'wb') as picklefile:
    pickle.dump(y_train, picklefile)

In [146]:
# Load the held-out split of the users table (splitseed > 0.7), separate the
# target from the features, and pickle both.
query = """
SELECT *
FROM trainuser
WHERE splitseed > 0.7;
"""
cursor.execute(query)
test_data = cursor.fetchall()
# Column names come from the executed query's own description, which always
# matches the order of the fetched tuples.
user_cols = [col[0] for col in cursor.description]
test_user = pd.DataFrame(test_data, columns = user_cols)

# Go through .values before adding the axis: multi-dimensional indexing
# directly on a Series ([:, np.newaxis]) is deprecated/removed in pandas.
# The result is an (n, 1) ndarray. Note that y_train is instead stored as a
# DataFrame indexed by user id.
y_test = test_user['country_destination'].values[:, np.newaxis]
X_test = test_user.drop(('country_destination'), axis=1)

with open('./data/processed/X_test.pkl', 'wb') as picklefile:
    pickle.dump(X_test, picklefile)

with open('./data/processed/y_test.pkl', 'wb') as picklefile:
    pickle.dump(y_test, picklefile)

### Deal with session data

In [174]:
# Fetch the six session columns for users in the training split
# (splitseed <= 0.7) from the joined view. The result is a list of tuples;
# the next cell turns it into a DataFrame under the same name.
query = """
SELECT user_id, action, action_type, action_detail, device_type, secs_elapsed
FROM session_booking
WHERE splitseed <=0.7;
"""

cursor.execute(query)
session_train = cursor.fetchall()

In [175]:
# Label the fetched rows with the session table's column names.
# NOTE(review): this assumes the session table has exactly the six columns
# selected in the previous cell, in the same order. Once the flag columns
# added further down (session_under_2, ...) exist, the lengths no longer match.
session_cols = [x[0] for x in get_table_info(cursor, 'session')]

session_train = pd.DataFrame(session_train, columns=session_cols)

In [177]:
# Persist the training-split session rows as a pickle under ./data/raw/
# (the directory must already exist; open() does not create it).
with open('./data/raw/session_train.pkl', 'wb') as picklefile:
    pickle.dump(session_train, picklefile)

In [178]:
# Same extraction as for the training split, for users with splitseed > 0.7:
# fetch the six session columns, build a DataFrame, and pickle it.
# session_cols is reused from the earlier cell that defined it.
query = """
SELECT user_id, action, action_type, action_detail, device_type, secs_elapsed
FROM session_booking
WHERE splitseed > 0.7;
"""

cursor.execute(query)
session_test = cursor.fetchall()
session_test = pd.DataFrame(session_test, columns=session_cols)

with open('./data/raw/session_test.pkl', 'wb') as picklefile:
    pickle.dump(session_test, picklefile)

In [157]:
# Peek at five rows of trainuser; the frame is built without column names,
# so the columns are displayed as positional integers.
query = """
SELECT *
FROM trainuser
LIMIT 5
"""

cursor.execute(query)
pd.DataFrame(cursor.fetchall())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,ppr8sr6z33,2014-06-15,20140615004156,,-unknown-,,basic,25,en,direct,direct,untracked,iOS,iPhone,-unknown-,NDF,0.478538
1,bieah0w2fy,2013-06-11,20130611044134,,FEMALE,,facebook,12,en,api,other,,Moweb,Other/Unknown,-unknown-,NDF,0.740336
2,eiwzl341qa,2013-12-16,20131216153039,,-unknown-,,basic,25,en,direct,direct,linked,iOS,Mac Desktop,Safari,NDF,0.629914
3,lor9srems2,2014-04-07,20140407202112,,-unknown-,,basic,25,en,direct,direct,untracked,iOS,iPhone,-unknown-,NDF,0.18026
4,uk2031aezt,2014-01-11,20140111154240,,-unknown-,,basic,25,en,direct,direct,untracked,iOS,iPad,-unknown-,NDF,0.561323


In [8]:
# Column names and data types of the session table, before the flag columns
# below are added.
get_table_info(cursor, 'session')

[('user_id', 'character varying'),
 ('action', 'text'),
 ('action_type', 'text'),
 ('action_detail', 'text'),
 ('device_type', 'text'),
 ('secs_elapsed', 'real')]

In [16]:
# Add an integer 0/1 flag column marking session rows whose secs_elapsed is
# at most 120 seconds (filled in by the following UPDATE cell).
query = """
ALTER TABLE session ADD COLUMN session_under_2 INT
"""
cursor.execute(query)

In [17]:
# session_under_2 = 1 when secs_elapsed <= 120, otherwise 0.
# A NULL secs_elapsed fails the comparison and therefore also gets 0.
query = """
UPDATE session
SET session_under_2 = 
CASE WHEN secs_elapsed <= 120 THEN 1
ELSE 0 END;
"""
cursor.execute(query)

In [18]:
# Add an integer 0/1 flag column marking session rows whose secs_elapsed
# falls in the 120-300 second range (filled in by the following UPDATE cell).
query = """
ALTER TABLE session ADD COLUMN session_between_2_and_5 INT
"""
cursor.execute(query)

In [19]:
# session_between_2_and_5 = 1 when 120 < secs_elapsed <= 300, otherwise 0.
# The lower bound is exclusive because session_under_2 already covers
# secs_elapsed <= 120; with ">= 120" a row at exactly 120 seconds would be
# flagged in both buckets.
# A NULL secs_elapsed fails the comparison and gets 0.
query = """
UPDATE session
SET session_between_2_and_5 = 
CASE WHEN (secs_elapsed > 120 AND secs_elapsed <= 300) THEN 1
ELSE 0 END;
"""
cursor.execute(query)

In [90]:
# Fetch the secs_elapsed value of every session row (a list of 1-tuples).
# NOTE(review): the result is named `mins` although the column holds seconds
# per its name — confirm whether a conversion to minutes is intended.
query = """
SELECT secs_elapsed
FROM session;
"""

cursor.execute(query)
mins = cursor.fetchall()