# Redshift Notebook
- Contents
    - **ETL Part III: create Redshift model and copy in the data**
    - **Querying the final model**   
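- Note: the source path variables used by the COPY statements (`dim_model_weather`, `dim_model_listings`, `dim_model_reviews`, `dim_model_hosts`, `dim_model_reviewers`) are defined in `emr-etl-notebook.ipynb`, not in this notebook. Run this notebook against the same kernel session so that they are in scope.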

In [6]:
import configparser
import psycopg2

In [7]:
# Load the `sql` IPython extension, which provides the %sql magic used in the cells below.
%load_ext sql

In [8]:
# Read the project settings from the shared config file.
config = configparser.ConfigParser()
config_path = '../config/prj.cfg'
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here rather than with a
# confusing NoSectionError on the first config.get() call.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")

# AWS settings. iam_role_arn is interpolated into the COPY statements below.
aws_region = config.get('AWS', 'REGION')
access_id = config.get('AWS', 'AWS_ACCESS_KEY_ID')
access_key = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')
iam_role_arn = config.get('AWS', 'IAM_ROLE_ARN')

In [9]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(*config['REDSHIFT'].values())
print(conn_string)
%sql $conn_string

postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh


# Part III: create Redshift model and copy in the data

In [10]:
# DROP TABLES
# One "DROP TABLE IF EXISTS <table>" statement per table of the model,
# unpacked in the same order as the table names listed below.
(
    listings_drop,
    reviews_drop,
    hosts_drop,
    weather_drop,
    reviewers_drop,
) = (
    f"DROP TABLE IF EXISTS {table_name}"
    for table_name in ("listings", "reviews", "hosts", "weather", "reviewers")
)

# CREATE TABLES
# DDL for the `listings` table. Free-text columns (amenities, city,
# description, license, name, neighborhood_overview) are varchar(max); the
# remaining text columns use varchar without an explicit length.
# listing_id and host_id are the integer columns the analysis queries join on.
# No primary key, distribution key or sort key is declared.
listings_create = ("""
CREATE TABLE listings(
accommodates  integer,
amenities  varchar(max),
availability_30  varchar,
availability_365  varchar,
availability_60  varchar,
availability_90  varchar,
bathrooms  varchar,
bathrooms_text  varchar,
bedrooms  integer,
beds  integer,
calculated_host_listings_count  integer,
calculated_host_listings_count_entire_homes  integer,
calculated_host_listings_count_private_rooms  integer,
calculated_host_listings_count_shared_rooms  integer,
calendar_last_scraped  varchar,
calendar_updated  varchar,
city  varchar(max),
description  varchar(max),
first_review  date,
has_availability  varchar,
host_id  integer,
listing_id  integer,
instant_bookable  varchar,
last_review  date,
last_scraped  date,
latitude  float,
license  varchar(max),
listing_url  varchar,
longitude  float,
maximum_maximum_nights  integer,
maximum_minimum_nights  integer,
maximum_nights  integer,
maximum_nights_avg_ntm  float,
minimum_maximum_nights  integer,
minimum_minimum_nights  integer,
minimum_nights  integer,
minimum_nights_avg_ntm  float,
name  varchar(max),
neighborhood_overview  varchar(max),
neighbourhood  varchar,
neighbourhood_cleansed  varchar,
neighbourhood_group_cleansed  varchar,
number_of_reviews  integer,
number_of_reviews_l30d  integer,
number_of_reviews_ltm  integer,
picture_url  varchar,
price  varchar,
property_type  varchar,
review_scores_accuracy  integer,
review_scores_checkin  integer,
review_scores_cleanliness  integer,
review_scores_communication  integer,
review_scores_location  integer,
review_scores_rating  integer,
review_scores_value  integer,
reviews_per_month  float,
room_type  varchar,
scrape_id  varchar,
scrape_month  integer,
scrape_year  integer
);
""")

# DDL for the `weather` table: a temperature and a rain value per city and date.
# weather_id is the column that `reviews` joins on; in the reviews sample
# output its values look like "<city>_<date>" (e.g. Berlin_2009-11-25).
weather_create = ("""
CREATE TABLE weather(
weather_id varchar,
date date,
temperature float,
rain float,
city varchar
);
""")

# DDL for the `reviews` table. Besides the review text it carries the columns
# that link to the other tables (reviewer_id, listing_id, host_id, weather_id)
# plus two derived labels: comment_language and sentiment.
# The analysis queries filter sentiment on the values 'pos' and 'neg'.
reviews_create = ("""
CREATE TABLE reviews(
review_id integer,
reviewer_id integer,
listing_id integer,
host_id integer,
weather_id varchar,
date date,
reviewer_name varchar,
comments varchar(max),
comment_language varchar,
sentiment varchar
);
""")

# DDL for the `hosts` table, joined to `listings` on host_id.
# host_location and host_about are varchar(max).
# NOTE(review): host_since and last_scraped are declared varchar although the
# sample output shows ISO-formatted dates -- confirm whether `date` was intended.
hosts_create = ("""
CREATE TABLE hosts(
host_id  integer,
host_name  varchar,
host_url  varchar,
host_since  varchar,
host_location  varchar(max),
host_about  varchar(max),
host_response_time  varchar,
host_response_rate  varchar,
host_acceptance_rate  varchar,
host_is_superhost  varchar,
host_thumbnail_url  varchar,
host_picture_url  varchar,
host_neighbourhood  varchar,
host_listings_count  integer,
host_total_listings_count  integer,
host_verifications  varchar,
host_has_profile_pic  varchar,
host_identity_verified  varchar,
last_scraped  varchar
);
""")

# DDL for the `reviewers` table: reviewer id and name, a languages_spoken
# value, and a last_updated date.
reviewers_create = ("""
CREATE TABLE reviewers(
reviewer_id integer, 
reviewer_name varchar, 
languages_spoken varchar,
last_updated date
);
""")

# COPY statements
# All five loads have the same shape (CSV input, one header row skipped), so
# they are built from a single template instead of five hand-copied strings.
COPY_TEMPLATE = """
copy {table}
from '{path}'
iam_role {iam_role}
csv
IGNOREHEADER 1
"""


def build_copy_statement(table, source_path, iam_role):
    """Return the COPY statement that loads ``table`` from ``source_path``.

    Args:
        table: name of the target table.
        source_path: location of the CSV data; it is wrapped in single quotes
            by the template.
        iam_role: value placed after ``iam_role``. It is interpolated as-is,
            without added quotes -- presumably the config value already
            carries its own single quotes; verify against prj.cfg.

    Returns:
        The COPY statement as a string.
    """
    return COPY_TEMPLATE.format(table=table, path=source_path, iam_role=iam_role)


# NOTE: the dim_model_* path variables are not defined in this notebook. Per
# the note at the top, they come from emr-etl-notebook.ipynb and must already
# be in the kernel's namespace, otherwise this cell raises NameError.
weather_copy = build_copy_statement("weather", dim_model_weather, iam_role_arn)
listings_copy = build_copy_statement("listings", dim_model_listings, iam_role_arn)
reviews_copy = build_copy_statement("reviews", dim_model_reviews, iam_role_arn)
hosts_copy = build_copy_statement("hosts", dim_model_hosts, iam_role_arn)
reviewers_copy = build_copy_statement("reviewers", dim_model_reviewers, iam_role_arn)

## Listings

In [None]:
# Drop any existing listings table (DROP TABLE IF EXISTS) so the CREATE below can be re-run.
%sql $listings_drop

In [11]:
# Create the empty listings table from the DDL in listings_create.
%sql $listings_create

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [12]:
# Bulk-load the listings CSV data into the table with the COPY statement.
%sql $listings_copy

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [13]:
# Sanity check: number of rows loaded into listings.
%sql SELECT COUNT(*) from listings

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
630731


In [None]:
# Peek at a few loaded rows to eyeball the column mapping.
%sql SELECT * from listings LIMIT 3

## Reviews

In [None]:
# Drop any existing reviews table (DROP TABLE IF EXISTS) so the CREATE below can be re-run.
%sql $reviews_drop

In [15]:
# Create the empty reviews table from the DDL in reviews_create.
%sql $reviews_create

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [16]:
# Bulk-load the reviews CSV data into the table with the COPY statement.
%sql $reviews_copy

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [17]:
# Sanity check: number of rows loaded into reviews.
%sql SELECT COUNT(*) from reviews

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
3313766


In [18]:
# Peek at a few loaded rows to eyeball the column mapping.
%sql SELECT * from reviews LIMIT 3

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
3 rows affected.


review_id,reviewer_id,listing_id,host_id,weather_id,date,reviewer_name,comments,comment_language,sentiment
18259,50988,7071,17391,Berlin_2009-11-25,2009-11-25,Tarnia,"This room is really lovely! It is quiet, comfortable and bright and in a great location with lots of bars, cafe's, shops and transport right at your door step. A perfect way to introduce yourself to Berlin's charm. Can and his family (the cat Ginger included!) are very friendly and accomodating.",en,pos
40998,41676,19306,73530,Paris_2010-05-09,2010-05-09,C.,"The apt was extremely clean and spacious for 2 people -and very charming. It was a good choice and in a nice neigborhood, and concierge was available on hand for any questions/assistance.",en,pos
53782,140693,31435,135024,Paris_2010-06-16,2010-06-16,Dianne,"The place was very nice and functional, all amenities are available and should satisfy anybodies needs. you are in a non-traditional area of Paris for visiting but because of the great subway system, everything is close. recommended for all that don't mind walking up 6 flights of stairs. We would definitely book this unit again.",en,pos


## Hosts

In [None]:
# Drop any existing hosts table (DROP TABLE IF EXISTS) so the CREATE below can be re-run.
%sql $hosts_drop

In [19]:
# Create the empty hosts table from the DDL in hosts_create.
%sql $hosts_create

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [20]:
# Bulk-load the hosts CSV data into the table with the COPY statement.
%sql $hosts_copy

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [21]:
# Sanity check: number of rows loaded into hosts.
%sql SELECT COUNT(*) FROM hosts

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
449889


In [22]:
# Peek at a few loaded rows to eyeball the column mapping.
%sql SELECT * from hosts LIMIT 3

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
3 rows affected.


host_id,host_name,host_url,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,last_scraped
2723,Jeremy,https://www.airbnb.com/users/show/2723,2008-09-04,"San Francisco, CA",I was a twenty-something who enjoys a wide range of activities. Formerly a member Airbnb customer support team and still an avid fan! Love to travel and uncle to five of the cutest kids ever!,,,,,https://a0.muscache.com/im/users/2723/profile_pic/1379623958/original.jpg?aki_policy=profile_small,https://a0.muscache.com/im/users/2723/profile_pic/1379623958/original.jpg?aki_policy=profile_x_medium,Shaw,1,1,"email,phone,facebook,reviews,jumio,work_email",,,2017-05-10
3207,Bernadine,https://www.airbnb.com/users/show/3207,2008-09-25,"Long Beach, California, United States","Fair, open, honest and very informative for new guests to the area.",,,,,https://a0.muscache.com/im/pictures/8b82a267-bc4b-4d8b-935a-463a39c8c5ae.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/8b82a267-bc4b-4d8b-935a-463a39c8c5ae.jpg?aki_policy=profile_x_medium,Bellflower,1,1,"email,phone,facebook",,,2017-05-03
4240,Giuseppe,https://www.airbnb.com/users/show/4240,2008-11-11,"Boston, Massachusetts, United States",Pretty low key guy. No drugs or drama,within a few hours,90.0,78%,,https://a2.muscache.com/im/users/4240/profile_pic/1357156475/original.jpg?aki_policy=profile_small,https://a2.muscache.com/im/users/4240/profile_pic/1357156475/original.jpg?aki_policy=profile_x_medium,Allston-Brighton,6,6,"email,phone,facebook,reviews,kba",,,2016-09-07


## Reviewers

In [None]:
# Drop any existing reviewers table (DROP TABLE IF EXISTS) so the CREATE below can be re-run.
%sql $reviewers_drop

In [23]:
# Create the empty reviewers table from the DDL in reviewers_create.
%sql $reviewers_create

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [24]:
# Bulk-load the reviewers CSV data into the table with the COPY statement.
%sql $reviewers_copy

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [25]:
# Sanity check: number of rows loaded into reviewers.
%sql SELECT COUNT(*) FROM reviewers

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
2788446


In [26]:
# Peek at a few loaded rows to eyeball the column mapping.
%sql SELECT * FROM reviewers LIMIT 5

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
5 rows affected.


reviewer_id,reviewer_name,languages_spoken,last_updated
81,Horace,en,2019-05-27
1769,Christopher,en,2019-09-26
3226,Simone Nino,en,2013-01-22
4784,Audrey,fr,2018-10-26
7103,Andrew,en,2017-06-24


## Weather

In [None]:
# Drop any existing weather table (DROP TABLE IF EXISTS) so the CREATE below can be re-run.
%sql $weather_drop

In [27]:
# Create the empty weather table from the DDL in weather_create.
%sql $weather_create

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [28]:
# Bulk-load the weather CSV data into the table with the COPY statement.
%sql $weather_copy

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
Done.


[]

In [29]:
# Plausibility check: mean temperature and mean rain per city over all loaded rows.
%sql SELECT city, avg(temperature) as avgtemp, avg(rain) as avgrain FROM weather GROUP BY city 

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
4 rows affected.


city,avgtemp,avgrain
Paris,12.0477191413238,1.07108676207513
Amsterdam,10.323904293381,2.22906976744186
Berlin,10.0508050089445,1.54456618962433
London,11.4053890876565,1.03032200357782


In [30]:
# Sanity check: number of rows loaded into weather.
%sql SELECT count(*) from weather

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
17888


## Redshift copy error debugging query

In [None]:
query="""
select d.query, substring(d.filename,14,20), 
d.line_number as line, 
substring(d.value,1,16) as value,
substring(le.err_reason,1,48) as err_reason
from stl_loaderror_detail d, stl_load_errors le
where d.query = le.query
and d.query = pg_last_copy_id(); 
"""

query="""
SELECT *
FROM stl_load_errors
"""
%sql $query

# Querying the final model

## Are there more positive or negative reviews of Airbnb stays? 

In [70]:
# Share of 'pos' versus 'neg' among the reviews carrying one of those two
# sentiment labels; reviews with any other sentiment value are excluded from
# both the numerator and the denominator.
# NOTE: despite the column name share_perc, the value is a fraction between 0
# and 1 (no factor of 100), unlike the language-share query further down.
query = """
SELECT sentiment, count(*)::decimal/(SELECT COUNT(*) FROM reviews WHERE sentiment in ('pos','neg')) as share_perc
FROM reviews
WHERE sentiment in ('pos','neg')
GROUP BY sentiment
""" 
%sql $query

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
2 rows affected.


sentiment,share_perc
pos,0.9734946247506224
neg,0.0265053752493774


## Are comments more positive after a period of warm weather?

In [73]:
# For each review, take the moving average of temperature over the weather row
# matched by weather_id plus the 7 rows preceding it in the same city's
# date-ordered series (ROWS 7 PRECEDING, so up to 8 rows including the current
# one). Then average that moving average per sentiment class ('pos' / 'neg').
# NOTE(review): "rows" equals "days" only if weather holds exactly one row per
# city and date -- confirm.
# NOTE(review): the window's ORDER BY lists weather.date twice; the repeat is redundant.
query = """
SELECT sentiment, avg(moving_avg_temperature) as average_past_temperature_degC
FROM 
    (SELECT sentiment, review_id, reviews.date, rain, city, moving_avg_temperature 
    FROM reviews
    JOIN (SELECT *, avg(temperature) over (partition by weather.city order by weather.date, weather.date ROWS 7 PRECEDING ) as moving_avg_temperature
          FROM weather) as subq   
    USING (weather_id)    
    ORDER BY reviews.date) as reviews_weather
WHERE sentiment in ('pos','neg')
GROUP BY sentiment
ORDER BY average_past_temperature_degC DESC
""" 
%sql $query

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
2 rows affected.


sentiment,average_past_temperature_degc
neg,13.1020061534843
pos,12.8265396391706


## Are comments more negative after a rainy period?

In [72]:
# For each review, sum the rainfall over the weather row matched by weather_id
# plus the 3 rows preceding it in the same city's date-ordered series
# (ROWS 3 PRECEDING, so up to 4 rows including the current one). Then average
# that rolling sum per sentiment class ('pos' / 'neg').
# NOTE(review): "rows" equals "days" only if weather holds exactly one row per
# city and date -- confirm.
# NOTE(review): the window's ORDER BY lists weather.date twice; the repeat is redundant.
query = """
SELECT sentiment, avg(cumsum_rain) as average_past_rain_mm
FROM 
    (SELECT sentiment, review_id, reviews.date, rain, city, cumsum_rain 
    FROM reviews
    JOIN (SELECT *, sum(rain) over (partition by weather.city order by weather.date, weather.date ROWS 3 PRECEDING ) as cumsum_rain
          FROM weather) as subq   
    USING (weather_id)    
    ORDER BY reviews.date) as reviews_weather
WHERE sentiment in ('pos','neg')
GROUP BY sentiment
ORDER BY average_past_rain_mm DESC
""" 
%sql $query

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
2 rows affected.


sentiment,average_past_rain_mm
pos,7.00370532734516
neg,6.92385043505538


## Are all reviews in English?

In [67]:
# Percentage of all reviews written in each comment_language, showing the five
# largest shares. The denominator is the total review count, and the factor of
# 100 makes share_perc a true percentage.
query = """
SELECT comment_language, 100*count(*)::decimal/(SELECT count(*)
                                   FROM reviews) as share_perc
FROM reviews
GROUP BY comment_language
ORDER BY share_perc DESC
LIMIT 5
""" 
%sql $query

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
5 rows affected.


comment_language,share_perc
en,72.71602762536642
fr,13.624951188466536
de,4.696046733535199
es,2.968073183200021
zh,1.4091520040944352


## Are there hosts with listings in multiple cities?

In [79]:
# The inner query reduces hosts joined to listings to one row per distinct
# (host, city) pair. The outer query counts those rows per host, so `count` is
# the number of distinct listings.city values a host appears with. Only the
# five hosts with the most distinct cities are shown.
query = """
SELECT host_id, host_name, host_location, count(*)
FROM
    (SELECT host_id, host_name, host_location, city
    FROM hosts
    JOIN listings
    USING (host_id)    
    GROUP BY host_id, host_name, host_location, city)
GROUP BY host_id, host_name, host_location
ORDER BY count DESC
LIMIT 5

""" 
%sql $query

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
5 rows affected.


host_id,host_name,host_location,count
3625030,Angela,Spain,135
80839530,Franz,"Balearic Islands, Spain",75
43337979,Top Villas Mallorca,"Alcúdia, Illes Balears, Spain",59
7409213,Lisa L'Abode Accommodation,"Sydney, New South Wales, Australia",53
11914644,Luxico Holiday Homes,"Victoria, Australia",49


## How many guests visited more than one city?

In [84]:
# Histogram of how many distinct cities each reviewer has reviewed in.
#   innermost query: one row per distinct (reviewer_id, city) pair, where city
#                    comes from the listing that was reviewed
#   middle query:    cities_visited = number of such rows per reviewer
#   outer query:     number of reviewers for each cities_visited value
# NOTE(review): the ORDER BY inside the middle subquery does not affect the
# outer aggregation; only the final ORDER BY determines the displayed order.
query = """
SELECT cities_visited, count(*)
FROM (
    SELECT reviewer_id, count(*) as cities_visited
    FROM
        (SELECT reviewer_id, city
        FROM reviews
        JOIN listings
        ON reviews.listing_id = listings.listing_id
        GROUP BY reviewer_id, city)
    GROUP BY reviewer_id
    ORDER BY cities_visited DESC
    )
GROUP BY cities_visited
ORDER by count ASC
""" 
%sql $query

 * postgresql://dwhuser:***@dwhcluster.ccanoyw7bwhp.us-east-1.redshift.amazonaws.com:5439/dwh
4 rows affected.


cities_visited,count
4,319
3,7525
2,132009
1,2648593
