In [249]:
# Get pandas and postgres to work together
import psycopg2 as pg
import pandas as pd
import numpy as np
import pandas.io.sql as pd_sql
import pickle

# We are also going to do some basic viz
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# There is a bug in matplotlib. You cannot set the rc parameters in the same
# cell that you use the "%matplotlib inline" magic command
plt.style.use('ggplot')
plt.rc('font', size=18) 

In [3]:
# Postgres info to connect

connection_args = {
    'host': '54.153.60.68', # You'll have to update this to your IP
    'user': 'ubuntu',    # username
    'dbname': 'airbnb',   # DB that we are connecting to
    'port': 5432         # port we opened on AWS
}

# We will talk about this magic Python trick!
connection = pg.connect(**connection_args)

In [4]:
cursor = connection.cursor()

In [9]:
query = """
CREATE TABLE trainuser (
  id VARCHAR(10) PRIMARY KEY,
  date_account_created TEXT,
  timestamp_first_active NUMERIC,
  date_first_booking TEXT,
  gender TEXT,
  age DOUBLE PRECISION,
  signup_method TEXT,
  signup_flow INT,
  language TEXT,
  affiliate_channel TEXT,
  affiliate_provider TEXT,
  first_affiliate_tracked TEXT,
  signup_app TEXT,
  first_device_type TEXT,
  first_browser TEXT,
  country_destination TEXT
);
"""
cursor.execute(query)

In [7]:
cursor.execute('commit;')

In [8]:
cursor.execute("rollback;")

In [16]:
query = """
CREATE TABLE session (
  user_id VARCHAR(10),
  action TEXT,
  action_type TEXT,
  action_detail TEXT,
  device_type TEXT,
  secs_elapsed REAL
);
"""
cursor.execute(query)

In [11]:
query = """
CREATE TABLE country (
  country_destination VARCHAR(2) PRIMARY KEY,
  lat_destination REAL,
  lng_destination REAL,
  distance_km REAL,
  destination_km2 REAL,
  destination_language VARCHAR(3),
  language_levenshtein_distance REAL
);
"""
cursor.execute(query)

In [17]:
query = """
CREATE TABLE population (
  age_bucket TEXT,
  country_destination VARCHAR(2) REFERENCES country,
  gender TEXT,
  population_in_thousands REAL,
  year REAL
);
"""
cursor.execute(query)

## Data cleaning

In [183]:
def get_table_info(cursor, table):
    # Check column data types
    query = """
    SELECT column_name, data_type 
    FROM information_schema.columns
    WHERE table_name = '{}'
    """.format(table)

    cursor.execute(query)
    return cursor.fetchall()
get_table_info(cursor, 'trainuser')

[('id', 'character varying'),
 ('date_account_created', 'timestamp without time zone'),
 ('timestamp_first_active', 'timestamp without time zone'),
 ('date_first_booking', 'timestamp without time zone'),
 ('gender', 'text'),
 ('age', 'double precision'),
 ('signup_method', 'text'),
 ('signup_flow', 'integer'),
 ('language', 'text'),
 ('affiliate_channel', 'text'),
 ('affiliate_provider', 'text'),
 ('first_affiliate_tracked', 'text'),
 ('signup_app', 'text'),
 ('first_device_type', 'text'),
 ('first_browser', 'text'),
 ('country_destination', 'text'),
 ('splitseed', 'real')]

##### Only 1 entry per user in the trainuser table

In [None]:
query = """
SELECT *
FROM trainuser
LIMIT 5;
"""

cursor.execute(query);

pd.DataFrame(cursor.fetchall())

## Convert the date_account_created to date

In [51]:
# Convert date_account_created from text to timestamp, for further processsing
query = """
ALTER TABLE trainuser ALTER COLUMN date_account_created TYPE TIMESTAMP
using to_date(date_account_created, 'YYYY-MM-DD');
"""

cursor.execute(query)

In [206]:
# Convert the timestamp into continous variable
query = """
ALTER TABLE trainuser ADD COLUMN account_age INT
"""

cursor.execute(query)

In [207]:
query = """
UPDATE trainuser 
SET account_age = cast(12 * (date_part('year', date_account_created) - 1970) + date_part('month', date_account_created) AS int8)
"""

cursor.execute(query)

## Convert timestamp_first_active

In [45]:
## Convert date_account_created from text to timestamp, for further processsing
query = """
ALTER TABLE trainuser ALTER COLUMN timestamp_first_active TYPE TIMESTAMP
using to_date(cast(timestamp_first_active as text), 'YYYYMMDDHH24MISS');
"""

cursor.execute(query)

In [215]:
# Convert the timestamp into continous variable
query = """
ALTER TABLE trainuser ADD COLUMN active_age INT
"""

cursor.execute(query)

In [217]:
query = """
UPDATE trainuser 
SET active_age = cast(12 * (date_part('year', timestamp_first_active) - 1970) + date_part('month', timestamp_first_active) AS int8)
"""

cursor.execute(query)

## Convert date_first_booking

In [None]:
# Seems Postgresql ignores none value pretty smartly
query = """
ALTER TABLE trainuser ALTER COLUMN date_first_booking TYPE TIMESTAMP
using to_date(date_first_booking, 'YYYY-MM-DD');
"""

cursor.execute(query);

In [222]:
# Convert the timestamp into continous variable
query = """
ALTER TABLE trainuser ADD COLUMN booking_age INT
"""

cursor.execute(query)

In [None]:
UPDATE campaigns AS cmp
    SET name = CASE
                 WHEN rc.office IS NULL OR rc.office = '' THEN ptn.first_name || ' ' || ptn.last_name
                ELSE ptn.first_name || ' ' || ptn.last_name || ' for ' || rc.office
              END
from politicians ptn, races rc 
where ptn.id = cmp.politician_id
  and rc.id = cmp.race_id

In [230]:
# Convert the timestamp into continous variable
query = """
UPDATE trainuser
SET booking_age = CASE
    WHEN date_first_booking IS NULL THEN 0
    ELSE cast(12 * (date_part('year', date_first_booking) - 1970) + date_part('month', date_first_booking) AS int8)
    END;
"""

cursor.execute(query)

## Check gender column

In [55]:
query = """
SELECT gender, count(*)
FROM trainuser
GROUP BY gender
"""

cursor.execute(query)
cursor.fetchall()

# It is clear that gender needs further data cleaning

[('MALE', 54440), ('-unknown-', 95688), ('FEMALE', 63041), ('OTHER', 282)]

## Check age range

In [58]:
query = """
SELECT cast(age as int8) as age_int, count(*)
FROM trainuser
GROUP BY cast(age as int8)
ORDER BY cast(age as int8) DESC
"""

cursor.execute(query)
cursor.fetchall()

# A lot of wrong values and missing values

[(None, 87990),
 (2014, 710),
 (2013, 39),
 (2008, 1),
 (1995, 1),
 (1953, 1),
 (1952, 1),
 (1949, 3),
 (1947, 2),
 (1942, 1),
 (1938, 1),
 (1936, 2),
 (1935, 1),
 (1933, 1),
 (1932, 3),
 (1931, 3),
 (1929, 2),
 (1928, 2),
 (1927, 1),
 (1926, 1),
 (1925, 1),
 (1924, 2),
 (150, 1),
 (132, 1),
 (115, 13),
 (113, 4),
 (112, 1),
 (111, 2),
 (110, 196),
 (109, 31),
 (108, 15),
 (107, 23),
 (106, 17),
 (105, 1131),
 (104, 48),
 (103, 26),
 (102, 34),
 (101, 23),
 (100, 26),
 (99, 17),
 (98, 15),
 (97, 10),
 (96, 25),
 (95, 49),
 (94, 12),
 (93, 18),
 (92, 14),
 (91, 12),
 (90, 18),
 (89, 13),
 (88, 12),
 (87, 31),
 (86, 27),
 (85, 26),
 (84, 20),
 (83, 25),
 (82, 26),
 (81, 30),
 (80, 46),
 (79, 50),
 (78, 50),
 (77, 72),
 (76, 68),
 (75, 94),
 (74, 123),
 (73, 149),
 (72, 189),
 (71, 180),
 (70, 259),
 (69, 276),
 (68, 373),
 (67, 402),
 (66, 435),
 (65, 516),
 (64, 549),
 (63, 573),
 (62, 593),
 (61, 680),
 (60, 734),
 (59, 780),
 (58, 823),
 (57, 915),
 (56, 941),
 (55, 1011),
 (54, 1019)

## Check signup method

In [92]:
def check_col_hist(cursor, table, column):
    query = """
    SELECT {}, count(*)
    FROM {}
    GROUP BY {}
    ORDER BY count DESC
    """.format(column, table, column)

    cursor.execute(query)
    return pd.DataFrame(cursor.fetchall())

In [93]:
check_col_hist(cursor, 'trainuser', 'signup_method')

Unnamed: 0,0,1
0,basic,152897
1,facebook,60008
2,google,546


## Check signup flow

In [94]:
check_col_hist(cursor, 'trainuser', 'signup_flow')

# What do these signup flow numbers mean? Index of signup pages - treat it as categorical variable

Unnamed: 0,0,1
0,0,164739
1,25,14659
2,12,9329
3,3,8822
4,2,6881
5,24,4328
6,23,2835
7,1,1047
8,6,301
9,8,240


## Check language

In [95]:
check_col_hist(cursor, 'trainuser','language')

Unnamed: 0,0,1
0,en,206314
1,zh,1632
2,fr,1172
3,es,915
4,ko,747
5,de,732
6,it,514
7,ru,389
8,pt,240
9,ja,225


In [96]:
check_col_hist(cursor, 'trainuser','affiliate_channel')

#sem: search engine marketing
#seo: SEO is a marketing discipline focused on growing visibility in organic (non-paid) search engine results.


Unnamed: 0,0,1
0,direct,137727
1,sem-brand,26045
2,sem-non-brand,18844
3,other,8961
4,seo,8663
5,api,8167
6,content,3948
7,remarketing,1096


In [97]:
check_col_hist(cursor, 'trainuser','affiliate_provider')

Unnamed: 0,0,1
0,direct,137426
1,google,51693
2,other,12549
3,craigslist,3471
4,bing,2328
5,facebook,2273
6,vast,829
7,padmapper,768
8,facebook-open-graph,545
9,yahoo,496


In [98]:
check_col_hist(cursor, 'trainuser','first_affiliate_tracked')

Unnamed: 0,0,1
0,untracked,109232
1,linked,46287
2,omg,43982
3,tracked-other,6156
4,,6065
5,product,1556
6,marketing,139
7,local ops,34


In [99]:
check_col_hist(cursor, 'trainuser', 'signup_app')

Unnamed: 0,0,1
0,Web,182717
1,iOS,19019
2,Moweb,6261
3,Android,5454


In [100]:
check_col_hist(cursor, 'trainuser', 'first_device_type')

Unnamed: 0,0,1
0,Mac Desktop,89600
1,Windows Desktop,72716
2,iPhone,20759
3,iPad,14339
4,Other/Unknown,10667
5,Android Phone,2803
6,Android Tablet,1292
7,Desktop (Other),1199
8,SmartPhone (Other),76


In [101]:
check_col_hist(cursor, 'trainuser', 'first_browser')

Unnamed: 0,0,1
0,Chrome,63845
1,Safari,45169
2,Firefox,33655
3,-unknown-,27266
4,IE,21068
5,Mobile Safari,19274
6,Chrome Mobile,1270
7,Android Browser,851
8,AOL Explorer,245
9,Opera,188


In [102]:
check_col_hist(cursor, 'trainuser', 'country_destination')

Unnamed: 0,0,1
0,NDF,124543
1,US,62376
2,other,10094
3,FR,5023
4,IT,2835
5,GB,2324
6,ES,2249
7,CA,1428
8,DE,1061
9,NL,762


## Add splitseed to trainuser data

In [128]:
# Insert a new column
query = """
ALTER TABLE trainuser
ALTER COLUMN splitseed REAL;
"""

cursor.execute(query)

In [132]:
# Insert data into new column
query = """
UPDATE trainuser
SET splitseed=random();
"""

cursor.execute(query)

In [133]:
query = """
SELECT COUNT(*)
FROM trainuser
WHERE splitseed>.7;
"""

cursor.execute(query)
cursor.fetchone()

(64151,)

In [134]:
query = """
SELECT COUNT(*)
FROM trainuser
WHERE splitseed<=.7;
"""

cursor.execute(query)
cursor.fetchone()

(149300,)

In [140]:
query = """
ALTER TABLE session
ADD COLUMN splitseed REAL;
"""
cursor.execute(query)

In [143]:
query = """
SELECT *
FROM session
LIMIT 5;
"""
cursor.execute(query)
pd.DataFrame(cursor.fetchall())

Unnamed: 0,0,1,2,3,4,5,6
0,jrqykh9y8x,show,view,p3,Mac Desktop,53901.0,
1,jrqykh9y8x,show,,,Mac Desktop,72.0,
2,1ltangt0hg,show,view,p3,Mac Desktop,2667.0,
3,zc77z91crs,lookup,,,Windows Desktop,1004.0,
4,i7svun9mus,show,,,Mac Desktop,1458.0,


In [145]:
[('user_id', 'character varying'),
 ('action', 'text'),
 ('action_type', 'text'),
 ('action_detail', 'text'),
 ('device_type', 'text'),
 ('secs_elapsed', 'real'),
 ('splitseed', 'real')]



[('id', 'character varying'),
 ('date_account_created', 'timestamp without time zone'),
 ('timestamp_first_active', 'timestamp without time zone'),
 ('date_first_booking', 'timestamp without time zone'),
 ('gender', 'text'),
 ('age', 'double precision'),
 ('signup_method', 'text'),
 ('signup_flow', 'integer'),
 ('language', 'text'),
 ('affiliate_channel', 'text'),
 ('affiliate_provider', 'text'),
 ('first_affiliate_tracked', 'text'),
 ('signup_app', 'text'),
 ('first_device_type', 'text'),
 ('first_browser', 'text'),
 ('country_destination', 'text')]

## Check session data

In [157]:
# CREATE VIEW
query = """
DROP VIEW session_stats;
"""

cursor.execute(query)

In [158]:
# CREATE VIEW
query = """
CREATE VIEW session_stats AS
SELECT user_id,  avg(secs_elapsed) AS mean_time, count(*) AS session_counts
FROM session
GROUP BY user_id;
"""

cursor.execute(query)

In [160]:
# CREATE VIEW
query = """
SELECT *
FROM session_stats
ORDER BY session_counts DESC
LIMIT 5;
"""

cursor.execute(query)
pd.DataFrame(cursor.fetchall())

Unnamed: 0,0,1,2
0,,17741.764287,34496
1,mxqbh3ykxl,2121.081955,2722
2,0hjoc5q8nf,2758.781309,2644
3,mjbl6rrj52,8621.766465,2476
4,l5lgm3w5pc,8940.843995,2424


In [146]:
get_table_info(cursor, 'session')

[('user_id', 'character varying'),
 ('action', 'text'),
 ('action_type', 'text'),
 ('action_detail', 'text'),
 ('device_type', 'text'),
 ('secs_elapsed', 'real')]

In [115]:
check_col_hist(cursor, 'session', 'action')

Unnamed: 0,0,1
0,show,2768278
1,index,843699
2,search_results,725226
3,personalize,706824
4,search,536057
5,ajax_refresh_subtotal,487744
6,update,365130
7,similar_listings,364624
8,social_connections,339000
9,reviews,320591


In [116]:
check_col_hist(cursor, 'session', 'action_type')

Unnamed: 0,0,1
0,view,3560902
1,data,2103770
2,click,1996183
3,,1126204
4,-unknown-,1031170
5,submit,623357
6,message_post,87103
7,partner_callback,19132
8,booking_request,18773
9,modify,1139


In [117]:
check_col_hist(cursor, 'session', 'action_detail')

Unnamed: 0,0,1
0,view_search_results,1776885
1,p3,1376550
2,,1126204
3,-unknown-,1031141
4,wishlist_content_update,706824
5,user_profile,656839
6,change_trip_characteristics,487744
7,similar_listings,364624
8,user_social_connections,336799
9,update_listing,269779


In [118]:
check_col_hist(cursor, 'session', 'device_type')

Unnamed: 0,0,1
0,Mac Desktop,3594286
1,Windows Desktop,2658539
2,iPhone,2105031
3,Android Phone,839637
4,iPad Tablet,683414
5,Android App Unknown Phone/Tablet,273652
6,-unknown-,211279
7,Tablet,139886
8,Linux Desktop,28373
9,Chromebook,22348


In [120]:
check_col_hist(cursor, 'session', 'secs_elapsed')

Unnamed: 0,0,1
0,,136031
1,0.0,104624
2,1.0,27378
3,2.0,23004
4,3.0,20375
5,5.0,19589
6,4.0,19485
7,7.0,19067
8,6.0,19022
9,8.0,18917


In [87]:
query = """
SELECT *
FROM session
LIMIT 5
"""

cursor.execute(query)
pd.DataFrame(cursor.fetchall())

Unnamed: 0,0,1,2,3,4,5
0,d1mm9tcy42,show,,,Windows Desktop,102.0
1,ncf87guaf0,lookup,,,Windows Desktop,2689.0
2,qtw88d9pbl,lookup,,,Mac Desktop,453.0
3,qtw88d9pbl,lookup,,,Mac Desktop,347.0
4,ucgks2fyez,show,,,iPad Tablet,62.0


## Try to export booking data into python to have a look

In [237]:
query = """
SELECT *
FROM trainuser
WHERE splitseed <= 0.7;
"""
cursor.execute(query)
train_data = cursor.fetchall()

user_cols = [x[0] for x in get_table_info(cursor, 'trainuser')]

train_user = pd.DataFrame(train_data, columns = user_cols)

In [240]:
train_user.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,splitseed,account_age,active_age,booking_age
0,1ok8je1zsw,2010-03-25,2010-03-25,NaT,FEMALE,45.0,basic,2,en,other,craigslist,untracked,Web,Mac Desktop,Safari,NDF,0.227831,483,483,0
1,4wxqtytl2t,2010-05-08,2010-05-08,NaT,-unknown-,,basic,3,en,direct,direct,,Web,Other/Unknown,-unknown-,NDF,0.593557,485,485,0
2,la3wo57cq4,2010-04-08,2010-04-08,NaT,-unknown-,,basic,2,en,direct,direct,omg,Web,Mac Desktop,Safari,NDF,0.563787,484,484,0
3,97oxlyiw48,2010-03-26,2010-03-26,2010-03-26,-unknown-,,basic,3,en,other,craigslist,,Web,Other/Unknown,-unknown-,US,0.364527,483,483,483
4,36m2hgioax,2010-03-26,2010-03-26,NaT,-unknown-,42.0,basic,2,en,direct,direct,untracked,Web,Mac Desktop,Safari,NDF,0.572908,483,483,0


In [241]:
y_train = train_user['country_destination'][:,np.newaxis]
X_train = train_user.drop(('country_destination'), axis=1)

In [252]:
with open('./data/processed/X_train.pkl', 'wb') as picklefile:
    pickle.dump(X_train, picklefile)

In [253]:
with open('./data/processed/y_train.pkl', 'wb') as picklefile:
    pickle.dump(y_train, picklefile)