# Merges Cleaned Dataframes

In [20]:
import boto3
import sagemaker
import csv
import pandas as pd

# SageMaker handles: the session, its default bucket, and the execution role.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session serves both the region lookup and the SageMaker client.
_boto_session = boto3.Session()
region = _boto_session.region_name
sm = _boto_session.client(service_name="sagemaker", region_name=region)

## Get stored cleaned dataframes

In [21]:
# Restore every variable previously persisted with %store, then list the names
# now defined in the interactive namespace (the listing below is that output).
# NOTE(review): the frames are presumably saved by the upstream data-scrubbing
# notebooks - confirm those have been run before this cell on a fresh kernel.
%store -r
%who

S3_PATH	 bookings_grouped	 boto3	 bucket	 cancellations_grouped	 csv	 finaldf	 income_grouped	 income_value_grouped	 
listings_clean	 listings_date	 merge1	 merge2	 merge3	 merge4	 pd	 region	 reviews_clean	 
reviews_date	 role	 sagemaker	 sagemaker_session	 sm	 test_df	 test_size	 testdf	 train_df	 
traindf	 travelstats	 


In [22]:
# Preview the first five rows (head() defaults to n=5).
reviews_date.head()

Unnamed: 0,Date,num_stays
0,2009-05-01,2
1,2009-06-01,2
2,2009-08-01,1
3,2009-09-01,1
4,2009-10-01,2


In [23]:
# Preview the first five rows (head() defaults to n=5).
travelstats.head()

Unnamed: 0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans
0,2019-01-01,435,1326271.0,265224100.0,81.442421,1137042000.0
1,2019-02-01,385,1261292.0,266240800.0,81.774326,1148667000.0
2,2019-03-01,422,1668002.0,265091900.0,81.799085,1306565000.0
3,2019-04-01,427,1681832.0,264062600.0,81.354347,1446882000.0
4,2019-05-01,439,1766148.0,266134000.0,81.88467,1415057000.0


In [24]:
# Preview the first five rows (head() defaults to n=5).
bookings_grouped.head()

Unnamed: 0,Date,total_bookings
0,2014-10-01,286
1,2014-11-01,2
2,2015-01-01,1656
3,2015-02-01,71
4,2015-03-01,144


In [25]:
# Preview the first five rows (head() defaults to n=5).
income_grouped.head()

Unnamed: 0,Date,income_by_year
0,2013-02-01,1
1,2013-03-01,1
2,2013-04-01,1
3,2013-05-01,1
4,2013-06-01,1


In [26]:
# Preview the first five rows (head() defaults to n=5).
income_value_grouped.head()

Unnamed: 0,Date,income_total
0,2013-02-01,12224.9
1,2013-03-01,12262.1
2,2013-04-01,12299.0
3,2013-05-01,12358.6
4,2013-06-01,12361.7


## Merge stored cleaned dataframes

In [27]:
# Build the modelling frame from three of the restored frames:
#   travelstats          - travel statistics columns
#   reviews_date         - num_stays per Date
#   income_value_grouped - income_total per Date
# (bookings_grouped, income_grouped and cancellations_grouped are NOT merged
# here; they only appear in the archived code at the bottom of this cell.)
#
# how="inner" is the pandas default, spelled out so it is obvious that only
# dates present in every input survive the join.
merge1 = pd.merge(travelstats, reviews_date, on="Date", how="inner")

# NOTE(review): this cell previously ran
#     merge1.rename(columns={"numstays": "DCAirBnbStays"})
# but the stay-count column is called "num_stays", so the rename never matched
# anything and the column kept its name (see the output below). The no-op has
# been removed rather than "fixed" because train.csv / test.csv are written
# with the "num_stays" header; renaming here would change that schema.
# If the DCAirBnbStays name is wanted, rename "num_stays" and update every
# consumer of those CSVs in the same change.

finaldf = pd.merge(merge1, income_value_grouped, on="Date", how="inner")

finaldf.head(5)

# Archived old code below (kept for reference; note it carries the same
# "numstays" key mismatch described above):

#merge1 = pd.merge(travelstats, reviews_date, on='Date')
#merge1 = merge1.rename(columns={"numstays": "DCAirBnbStays"})

#merge2 = pd.merge(income_grouped, bookings_grouped, on = 'Date')
#merge2 = merge2.rename(columns={"income_by_year": "Income"})
#merge2 = merge2.rename(columns={"total_bookings": "TotalBookings"})

#merge3 = pd.merge(income_value_grouped, cancellations_grouped, on = 'Date')
#merge3 = merge3.rename(columns={"income_total": "TotalIncome"})
#merge3 = merge3.rename(columns={"cancellations": "TotalCancellations"})

#merge4 = pd.merge(merge1, merge2, on='Date')
#finaldf = pd.merge(merge3, merge4, on='Date')

#merge1.head()

Unnamed: 0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,num_stays,income_total
0,2019-01-01,435,1326271.0,265224100.0,81.442421,1137042000.0,2899,14791.2
1,2019-02-01,385,1261292.0,266240800.0,81.774326,1148667000.0,2639,14835.3
2,2019-03-01,422,1668002.0,265091900.0,81.799085,1306565000.0,5226,14843.9
3,2019-04-01,427,1681832.0,264062600.0,81.354347,1446882000.0,5608,14811.8
4,2019-05-01,439,1766148.0,266134000.0,81.88467,1415057000.0,6110,14814.7


## Store train/test split in S3

In [28]:
# Destination prefix for the persisted split.
S3_PATH = "s3://aurelia-resort-data/model_train/"

# Number of trailing rows held out as the test set.
test_size = 6

# Guard: with test_size or fewer rows the negative slice below would leave
# train_df empty and an empty train.csv would be uploaded without any warning.
if len(finaldf) <= test_size:
    raise ValueError(
        f"finaldf has {len(finaldf)} rows; need more than test_size={test_size} "
        "to produce a non-empty training set"
    )

# Positional split: the last `test_size` rows become the test set and
# everything before them the training set.
# NOTE(review): this assumes finaldf is already in chronological order - confirm.
train_df = finaldf.iloc[:-test_size]
test_df = finaldf.iloc[-test_size:]

# Storing to S3 to remove need to rerun data scrubbing notebooks
train_df.to_csv(S3_PATH + "train.csv", index=False)
test_df.to_csv(S3_PATH + "test.csv", index=False)


## Release Resources

In [29]:
%%html

<!-- Best-effort kernel shutdown: renders a hidden button tagged with the
     "kernelmenu:shutdown" command-linker attribute and clicks it from script.
     NOTE(review): presumably only has an effect in front ends that honour
     data-commandlinker-command (e.g. JupyterLab / SageMaker Studio) - confirm. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
try {
    // Declared with const: the original assigned to an undeclared `els`,
    // which leaks a global and throws in strict mode (the catch below would
    // then silently skip the click).
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: shutdown is best-effort; errors are deliberately ignored.
}
</script>

In [30]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>