# EMR ETL Test Notebook
- this notebook has been tested on an AWS EMR cluster with the configuration listed in docs/aws_create_cluster.txt and config/spark-config
- run it alongside emr-etl-notebook.ipynb and attach both notebooks to the same kernel so that they share variables, e.g. path names
- this notebook contains tests for the first two steps of the ETL
    - **ETL Part I: preprocess raw data into parquet files**
    - **ETL Part II: create dimensional model using the preprocessed data**
        
- the first two parts of the ETL are implemented in etl_notebooks/emr-etl-notebook.ipynb; alternatively, run the .py files in the /apps folder
- to run and test ETL Part III use redshift-etl-notebook.ipynb


In [None]:
# Install pandas through the SparkContext's install_pypi_package before
# importing it; the preview cells below rely on DataFrame.toPandas().
sc.install_pypi_package("pandas")
import pandas as pd

# Never elide columns when a pandas DataFrame is displayed.
pd.options.display.max_columns = None

# Part I - preprocessing raw data

In [None]:
# Load the preprocessed global listings parquet data.
# path_out_global_listings is not defined in this notebook — presumably it
# comes from the ETL notebook sharing this kernel; verify before running.
df_listings_global = spark.read.parquet(path_out_global_listings)

In [None]:
# Total number of rows in the global listings data.
df_listings_global.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_listings_global.limit(10).toPandas()

In [None]:
# Load the preprocessed per-city listings parquet data.
# path_out_city_listings_data is not defined in this notebook — presumably it
# comes from the ETL notebook sharing this kernel; verify before running.
df_city_listings = spark.read.parquet(path_out_city_listings_data)

In [None]:
# Total number of rows in the city listings data.
df_city_listings.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_city_listings.limit(10).toPandas()

In [None]:
# Load the preprocessed per-city reviews parquet data.
# path_out_city_reviews_data is not defined in this notebook — presumably it
# comes from the ETL notebook sharing this kernel; verify before running.
df_city_reviews = spark.read.parquet(path_out_city_reviews_data)

In [None]:
# Total number of rows in the city reviews data.
df_city_reviews.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_city_reviews.limit(10).toPandas()

In [None]:
# Load the preprocessed city temperature parquet data.
# path_out_city_temperature_data is not defined in this notebook — presumably
# it comes from the ETL notebook sharing this kernel; verify before running.
df_temp = spark.read.parquet(path_out_city_temperature_data)

In [None]:
# Total number of rows in the temperature data.
df_temp.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_temp.limit(10).toPandas()

In [None]:
# Load the preprocessed city rain parquet data.
# path_out_city_rain_data is not defined in this notebook — presumably it
# comes from the ETL notebook sharing this kernel; verify before running.
df_rain = spark.read.parquet(path_out_city_rain_data)

In [None]:
# Total number of rows in the rain data.
df_rain.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_rain.limit(10).toPandas()

In [None]:
# Load the preprocessed weather stations parquet data.
# path_out_weather_stations is not defined in this notebook — presumably it
# comes from the ETL notebook sharing this kernel; verify before running.
df_stations = spark.read.parquet(path_out_weather_stations)

In [None]:
# Total number of rows in the weather stations data.
df_stations.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_stations.limit(10).toPandas()

# Part II - Dimensional model

## Listings table

In [None]:
# Read the listings table of the dimensional model from CSV.
# multiLine plus the '"' escape character let quoted fields span several
# lines and contain embedded quotes; column types are inferred from the data.
df_listings = spark.read.csv(
    dim_model_listings_new,
    header="True",
    inferSchema="True",
    multiLine="True",
    escape='"',
    ignoreLeadingWhiteSpace="True",
)

In [None]:
# Total number of rows in the listings table.
df_listings.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_listings.limit(10).toPandas()

In [None]:
# Number of distinct listing_id values.
# Presumably meant to equal the table's row count (listing_id unique) — TODO confirm.
df_listings.select("listing_id").dropDuplicates().count()

In [None]:
# Show rows whose listing_id is NULL; an empty result means no listing lacks an id.
df_listings.select("listing_id").filter("listing_id IS NULL").show()

## Hosts table

In [None]:
# Read the hosts table of the dimensional model from CSV.
# multiLine plus the '"' escape character let quoted fields span several
# lines and contain embedded quotes; column types are inferred from the data.
df_hosts = spark.read.csv(
    dim_model_hosts_new,
    header="True",
    inferSchema="True",
    multiLine="True",
    escape='"',
    ignoreLeadingWhiteSpace="True",
)

In [None]:
# Total number of rows in the hosts table.
df_hosts.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_hosts.limit(10).toPandas()

In [None]:
# Number of distinct host_id values.
# Presumably meant to equal the table's row count (host_id unique) — TODO confirm.
df_hosts.select("host_id").dropDuplicates().count()

## Weather table

In [None]:
# Read the weather table of the dimensional model from CSV.
# multiLine plus the '"' escape character let quoted fields span several
# lines and contain embedded quotes; column types are inferred from the data.
df_weather = spark.read.csv(
    dim_model_weather_new,
    header="True",
    inferSchema="True",
    multiLine="True",
    escape='"',
    ignoreLeadingWhiteSpace="True",
)

In [None]:
# Total number of rows in the weather table.
df_weather.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_weather.limit(10).toPandas()

In [None]:
# Number of weather rows per city, largest count first.
# NOTE(review): F is not imported in this notebook — presumably
# pyspark.sql.functions provided by the shared kernel; verify.
df_weather.select("city").groupBy("city").count().orderBy(F.col("count").desc()).show()

## Reviews table

In [None]:
# Read the "step 1" reviews output from CSV.
# multiLine plus the '"' escape character let quoted fields (e.g. review
# text) span several lines and contain embedded quotes.
df_reviews_step1 = spark.read.csv(
    dim_model_reviews_step1,
    header="True",
    inferSchema="True",
    multiLine="True",
    escape='"',
    ignoreLeadingWhiteSpace="True",
)

In [None]:
# Total number of rows in the step 1 reviews output.
df_reviews_step1.count()

In [None]:
# Preview 10 rows as a pandas DataFrame (only those rows are brought to the driver).
df_reviews_step1.limit(10).toPandas()

In [None]:
# Number of distinct review_id values.
# Presumably meant to equal the row count (review_id unique) — TODO confirm.
df_reviews_step1.select("review_id").dropDuplicates().count()

In [None]:
# Read the "step 2" reviews output from CSV.
# multiLine plus the '"' escape character let quoted fields (e.g. review
# text) span several lines and contain embedded quotes.
df_reviews_step2 = spark.read.csv(
    dim_model_reviews_step2,
    header="True",
    inferSchema="True",
    multiLine="True",
    escape='"',
    ignoreLeadingWhiteSpace="True",
)

In [None]:
# Total number of rows in the step 2 reviews output.
df_reviews_step2.count()

In [None]:
# Print the first 20 rows of the step 2 reviews output.
df_reviews_step2.show(20)

In [None]:
# Read the final reviews table of the dimensional model from CSV.
# multiLine plus the '"' escape character let quoted fields (e.g. review
# text) span several lines and contain embedded quotes.
df_reviews = spark.read.csv(
    dim_model_reviews_new,
    header="True",
    inferSchema="True",
    multiLine="True",
    escape='"',
    ignoreLeadingWhiteSpace="True",
)

In [None]:
# Total number of rows in the reviews table.
df_reviews.count()

In [None]:
# Print the first 20 rows of the reviews table.
df_reviews.show(20)

In [None]:
# Number of reviews per value of the sentiment column.
df_reviews.groupBy("sentiment").count().show()

In [None]:
# Fetch the comments text of up to 3 reviews whose sentiment is 'neg'.
df_reviews.filter("sentiment == 'neg'").select("comments").limit(3).collect()

## Reviewers table

In [None]:
# Read the reviewers table of the dimensional model from CSV.
# multiLine plus the '"' escape character let quoted fields span several
# lines and contain embedded quotes; column types are inferred from the data.
df_reviewers = spark.read.csv(
    dim_model_reviewers_new,
    header="True",
    inferSchema="True",
    multiLine="True",
    escape='"',
    ignoreLeadingWhiteSpace="True",
)

In [None]:
# Total number of rows in the reviewers table.
df_reviewers.count()

In [None]:
# Number of distinct reviewer_id values.
# Presumably meant to equal the table's row count (reviewer_id unique) — TODO confirm.
df_reviewers.select("reviewer_id").dropDuplicates().count()

In [None]:
# Print the first 100 rows of the reviewers table.
df_reviewers.show(100)

# Move tested model from temporary to final folder

In [None]:
# Promote the tested model: empty the final folder, copy every object from the
# temporary ("new") folder into it, then empty the temporary folder.
#
# NOTE: S3 has no atomic rename. If a copy fails part-way, the final folder is
# left incomplete, but the new-model objects are only deleted after every copy
# succeeded, so the cell can simply be re-run.


def _list_keys(prefix):
    """Return all object keys in bucket_name that start with prefix.

    A paginator is used because a single list call returns at most 1000 keys;
    without it a larger model would be moved only partially. Pages without a
    'Contents' entry (nothing under the prefix) contribute no keys.
    """
    paginator = s3_client.get_paginator("list_objects_v2")
    return [
        item["Key"]
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
        for item in page.get("Contents", [])
    ]


# if there is no new model, fail (before anything is deleted)
keys_new_model = _list_keys(dim_model_folder_new)
if not keys_new_model:
    raise RuntimeError(
        "no new model found under s3://{}/{}".format(bucket_name, dim_model_folder_new)
    )

# delete old model; an empty listing just means there is no old model yet.
# Real failures (permissions, network) are no longer silently swallowed.
keys_old_model = _list_keys(dim_model_folder + "/")
for key in keys_old_model:
    s3.Object(bucket_name, key).delete()

# copy new model to final folder (i.e. old model folder)
for key in keys_new_model:
    # Swap only the leading prefix: every listed key starts with
    # dim_model_folder_new, whereas str.replace would also rewrite any later
    # occurrence of that text inside the key.
    target_key = dim_model_folder + key[len(dim_model_folder_new):]
    # The dict form of CopySource avoids having to URL-encode the key.
    s3.Object(bucket_name, target_key).copy_from(
        CopySource={"Bucket": bucket_name, "Key": key}
    )

# delete new model
for key in keys_new_model:
    s3.Object(bucket_name, key).delete()