# Merges Cleaned Dataframes

In [2]:
import boto3
import sagemaker
import csv
import pandas as pd

# Resolve the AWS context used by this notebook: SageMaker session, execution
# role, the session's default S3 bucket, and the configured region.
_boto_session = boto3.Session()  # one boto3 session shared by the lookups below

sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
region = _boto_session.region_name

# Low-level SageMaker API client pinned to the region resolved above.
sm = _boto_session.client(service_name="sagemaker", region_name=region)

## Get stored cleaned dataframes

In [3]:
# Restore every variable previously persisted with %store (presumably saved by
# the data-cleaning notebooks), then list the names now present in the namespace.
# NOTE(review): a bare `%store -r` restores all stored names, including ones this
# notebook later reassigns (e.g. finaldf) — consider restoring only the frames
# that are actually used here.
%store -r
%who

bookings_grouped	 boto3	 bucket	 cancellations_grouped	 csv	 finaldf	 income_grouped	 income_value_grouped	 listings_clean	 
listings_date	 pd	 region	 reviews_clean	 reviews_date	 role	 sagemaker	 sagemaker_session	 sm	 
testdf	 traindf	 travelstats	 


In [4]:
# Show the last rows of the stay-count table (Date, num_stays) to see where the series ends.
reviews_date.tail(n=5)

Unnamed: 0,Date,num_stays
158,2022-08-01,7866
159,2022-09-01,8091
160,2022-10-01,9588
161,2022-11-01,6964
162,2022-12-01,2211


In [5]:
# Show the last rows of the travel statistics table; its final Date bounds the inner join below.
travelstats.tail(n=5)

Unnamed: 0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans
37,2022-02-01,415,1069976.0,773009400.0,82.098828,3757569000.0
38,2022-03-01,448,1529721.0,776089400.0,81.843971,3843540000.0
39,2022-04-01,429,1636140.0,776088300.0,81.854948,3993425000.0
40,2022-05-01,451,1707995.0,777826500.0,82.0106,4021791000.0
41,2022-06-01,469,1649175.0,773990800.0,81.766838,3830852000.0


In [6]:
# Show the first rows of the bookings table (Date, total_bookings).
bookings_grouped.head(n=5)

Unnamed: 0,Date,total_bookings
0,2014-10-01,286
1,2014-11-01,2
2,2015-01-01,1656
3,2015-02-01,71
4,2015-03-01,144


In [7]:
# Show the first rows of the income table (Date, income_by_year).
income_grouped.head(n=5)

Unnamed: 0,Date,income_by_year
0,2013-02-01,1
1,2013-03-01,1
2,2013-04-01,1
3,2013-05-01,1
4,2013-06-01,1


In [8]:
# Show the last rows of the income totals table (Date, income_total).
income_value_grouped.tail(n=5)

Unnamed: 0,Date,income_total
116,2022-10-01,15274.2
117,2022-11-01,15332.9
118,2022-12-01,15367.3
119,2023-01-01,15591.1
120,2023-02-01,15621.5


## Merge stored cleaned dataframes

In [9]:
# Build the modelling table: inner-join travel statistics, stay counts and
# income totals on Date, so only months present in all three sources remain.
#
# A previous `rename(columns={"numstays": "DCAirBnbStays"})` step was removed:
# no column is called "numstays" (the stay-count column is "num_stays"), so the
# rename silently did nothing and the column has always been "num_stays".
# Keep that name unless every consumer of this frame is updated together.
#
# An earlier version of this cell also joined income_grouped, bookings_grouped
# and cancellations_grouped; those joins are not part of the current table.
merge1 = pd.merge(travelstats, reviews_date, on="Date", how="inner")
finaldf = pd.merge(merge1, income_value_grouped, on="Date", how="inner")

finaldf.tail(5)

Unnamed: 0,Date,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,num_stays,income_total
37,2022-02-01,415,1069976.0,773009400.0,82.098828,3757569000.0,4413,15125.6
38,2022-03-01,448,1529721.0,776089400.0,81.843971,3843540000.0,7754,15064.1
39,2022-04-01,429,1636140.0,776088300.0,81.854948,3993425000.0,9208,15055.2
40,2022-05-01,451,1707995.0,777826500.0,82.0106,4021791000.0,10146,15036.4
41,2022-06-01,469,1649175.0,773990800.0,81.766838,3830852000.0,8271,14973.1


In [10]:
# Variant that keeps every row of reviews_date: the left joins leave NaN in the
# travel-statistics / income columns for months those sources do not cover.
#
# A previous `rename(columns={"numstays": "DCAirBnbStays"})` step was removed:
# no column is called "numstays" (the stay-count column is "num_stays"), so the
# rename silently did nothing and the column has always been "num_stays".
merge1_left_join = pd.merge(reviews_date, travelstats, on="Date", how="left")
finaldf_left_join = pd.merge(
    merge1_left_join, income_value_grouped, on="Date", how="left"
)

finaldf_left_join.tail(10)

Unnamed: 0,Date,num_stays,TotalAirlineTripstoDC,TotalAirlinePassengerstoDC,TotalAmericanTravelers,PercentofAmericanswhoTraveled,TotalTripsbyAmericans,income_total
153,2022-03-01,7754,448.0,1529721.0,776089400.0,81.843971,3843540000.0,15064.1
154,2022-04-01,9208,429.0,1636140.0,776088300.0,81.854948,3993425000.0,15055.2
155,2022-05-01,10146,451.0,1707995.0,777826500.0,82.0106,4021791000.0,15036.4
156,2022-06-01,8271,469.0,1649175.0,773990800.0,81.766838,3830852000.0,14973.1
157,2022-07-01,8002,,,,,,15100.2
158,2022-08-01,7866,,,,,,15149.6
159,2022-09-01,8091,,,,,,15172.2
160,2022-10-01,9588,,,,,,15274.2
161,2022-11-01,6964,,,,,,15332.9
162,2022-12-01,2211,,,,,,15367.3


## Store train/test split in S3

In [18]:
# Destination prefix for the exported CSV files.
S3_PATH = "s3://aurelia-resort-data/model_train/data_csv/"

# Number of most recent rows held out as the test set.
test_size = 6


def _chronological_split(frame, holdout):
    """Split ``frame`` into ``(train, test)``, holding out the last ``holdout`` rows.

    Rows are ordered by ``Date`` first so the hold-out is always the most recent
    period rather than whatever order the merges happened to produce. This
    assumes ``Date`` is a datetime or an ISO ``YYYY-MM-DD`` string, so that
    sorting is chronological.

    Raises:
        ValueError: if ``holdout`` is not between 1 and ``len(frame) - 1``.
            With ``holdout == 0`` a plain ``frame[:-0]`` slice would be empty
            and ``frame[-0:]`` would contain every row.
    """
    if not 0 < holdout < len(frame):
        raise ValueError(
            f"test_size must be between 1 and {len(frame) - 1}, got {holdout}"
        )
    ordered = frame.sort_values("Date", kind="stable")
    return ordered.iloc[:-holdout], ordered.iloc[-holdout:]


train_df, test_df = _chronological_split(finaldf, test_size)

# NOTE(review): in the stored output the last rows of finaldf_left_join have NaN
# in every travel-statistics column, so this hold-out carries no travel features.
train_df_nans, test_df_nans = _chronological_split(finaldf_left_join, test_size)

# Storing to S3 to remove need to rerun data scrubbing notebooks
_exports = {
    "train.csv": train_df,
    "test.csv": test_df,
    "train_nans.csv": train_df_nans,
    "test_nans.csv": test_df_nans,
}
for _filename, _frame in _exports.items():
    _frame.to_csv(S3_PATH + _filename, index=False)


## Release Resources

In [12]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Programmatically click the hidden button above; its
// data-commandlinker-command attribute names the "kernelmenu:shutdown" command.
try {
    // `const` keeps the lookup local instead of creating an implicit global.
    const shutdownButtons = document.getElementsByClassName("sm-command-button");
    shutdownButtons[0].click();
}
catch (err) {
    // Best effort: if the button is missing or the click fails, do nothing.
}
</script>

In [13]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>