# ADS-508 Project Notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import boto3
import sagemaker
import csv
import pandas as pd

In [3]:
# SageMaker session, execution role and default S3 bucket used by this notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# Build a single boto3 session and reuse it, so the region and the SageMaker
# client below are derived from the same resolved configuration instead of
# two independently constructed sessions.
boto_session = boto3.Session()
region = boto_session.region_name

# boto3 SageMaker client, pinned to the region resolved above.
sm = boto_session.client(service_name="sagemaker", region_name=region)

## Import
### Importing Data from S3 Bucket

In [4]:
# Recursively copy the contents of the project S3 bucket into the local ./data/ directory.
# NOTE(review): the recorded download log for this cell does not list
# travel/airline_2020.csv or travel/trip_data2.csv, yet the ingestion cell reads
# both -- confirm they are present in the bucket so a fresh run can reproduce the load.
!aws s3 cp --recursive s3://aurelia-resort-data/ ./data/

download: s3://aurelia-resort-data/airbnb/neighbourhoods.csv to data/airbnb/neighbourhoods.csv
download: s3://aurelia-resort-data/income/disp_income.csv to data/income/disp_income.csv
download: s3://aurelia-resort-data/airbnb/listings.csv to data/airbnb/listings.csv
download: s3://aurelia-resort-data/travel/airline_2019.csv to data/travel/airline_2019.csv
download: s3://aurelia-resort-data/travel/airline_2017.csv to data/travel/airline_2017.csv
download: s3://aurelia-resort-data/travel/airline_2016.csv to data/travel/airline_2016.csv
download: s3://aurelia-resort-data/travel/airline_2018.csv to data/travel/airline_2018.csv
download: s3://aurelia-resort-data/airbnb/reviews.csv to data/airbnb/reviews.csv
download: s3://aurelia-resort-data/travel/airline_2021.csv to data/travel/airline_2021.csv
download: s3://aurelia-resort-data/travel/airline_2022.csv to data/travel/airline_2022.csv
download: s3://aurelia-resort-data/travel/airline_data_archived.csv to data/travel/airline_data_archived.c

### Ingesting Data via Pandas

In [5]:
# Yearly airline extracts, listed 2022 down to 2016; rows are stacked in this order.
AIRLINE_CSV_PATHS = (
    "./data/travel/airline_2022.csv",
    "./data/travel/airline_2021.csv",
    "./data/travel/airline_2020.csv",
    "./data/travel/airline_2019.csv",
    "./data/travel/airline_2018.csv",
    "./data/travel/airline_2017.csv",
    "./data/travel/airline_2016.csv",
)

# Read each file lazily and concatenate into one frame with a fresh 0..n-1 index.
airline = pd.concat(
    (pd.read_csv(csv_path) for csv_path in AIRLINE_CSV_PATHS),
    ignore_index=True,
)

# Remaining single-file datasets.
tripsurvey = pd.read_csv("./data/travel/trip_data2.csv")
airbnb_neigh = pd.read_csv("./data/airbnb/neighbourhoods.csv")
airbnb_reviews = pd.read_csv("./data/airbnb/reviews.csv")

## EDA

### Data Shapes

In [6]:
# (rows, columns) of the concatenated airline frame.
airline.shape

(1741433, 12)

In [7]:
# (rows, columns) of the trip survey frame.
tripsurvey.shape

(134148, 19)

In [8]:
# (rows, columns) of the Airbnb neighbourhoods frame.
airbnb_neigh.shape

(39, 2)

In [9]:
# (rows, columns) of the Airbnb reviews frame.
airbnb_reviews.shape

(321578, 6)

### Data Previews

In [10]:
# Preview the first five rows of the airline frame.
airline.head(5)

Unnamed: 0,PASSENGERS,ORIGIN_CITY_NAME,ORIGIN_STATE_NM,ORIGIN_WAC,DEST,DEST_CITY_NAME,DEST_STATE_NM,YEAR,QUARTER,MONTH,DISTANCE_GROUP,CLASS
0,0.0,"Aberdeen, SD",South Dakota,67,FSD,"Sioux Falls, SD",South Dakota,2022,1,1,1,G
1,0.0,"Aberdeen, SD",South Dakota,67,FSD,"Sioux Falls, SD",South Dakota,2022,1,2,1,G
2,0.0,"Aberdeen, SD",South Dakota,67,FSD,"Sioux Falls, SD",South Dakota,2022,1,3,1,G
3,0.0,"Aberdeen, SD",South Dakota,67,FSD,"Sioux Falls, SD",South Dakota,2022,2,4,1,G
4,0.0,"Aberdeen, SD",South Dakota,67,FSD,"Sioux Falls, SD",South Dakota,2022,2,5,1,G


In [11]:
# Preview the first five rows of the trip survey frame.
tripsurvey.head(5)

Unnamed: 0,Level,Date,State FIPS,State Postal Code,County FIPS,County Name,Population Staying at Home,Population Not Staying at Home,Number of Trips,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500
0,County,2019/01,1.0,AL,1001.0,Autauga County,8153.774194,47715.225806,185406.548387,39861.032258,45577.645161,25302.870968,26100.806452,35098.322581,8192.870968,3177.580645,1724.967742,317.225806,53.225806
1,County,2019/01,1.0,AL,1003.0,Baldwin County,35464.903226,187769.096774,682047.0,132886.967742,171426.83871,93248.870968,119877.483871,113967.483871,36454.580645,8091.83871,3796.709677,1762.451613,533.774194
2,County,2019/01,1.0,AL,1005.0,Barbour County,3746.483871,20939.516129,85555.322581,17446.741935,19057.612903,12462.741935,15250.322581,11596.193548,5833.612903,3003.16129,699.709677,176.387097,28.83871
3,County,2019/01,1.0,AL,1007.0,Bibb County,2858.645161,19535.354839,78483.967742,15839.451613,19777.967742,9542.774194,11619.709677,10237.193548,8677.225806,1930.419355,652.064516,179.935484,27.225806
4,County,2019/01,1.0,AL,1009.0,Blount County,7649.967742,50176.032258,192921.903226,35455.677419,38616.516129,23728.903226,30827.935484,35365.612903,21713.548387,4961.677419,1735.064516,443.741935,73.225806


In [12]:
# Preview the first five rows of the Airbnb neighbourhoods frame.
airbnb_neigh.head(5)

Unnamed: 0,neighbourhood_group,neighbourhood
0,,"Brightwood Park, Crestwood, Petworth"
1,,"Brookland, Brentwood, Langdon"
2,,"Capitol Hill, Lincoln Park"
3,,"Capitol View, Marshall Heights, Benning Heights"
4,,"Cathedral Heights, McLean Gardens, Glover Park"


In [13]:
# Preview the first five rows of the Airbnb reviews frame.
# NOTE(review): the recorded preview shows reviewer names and free-text comments;
# consider dropping or masking reviewer_name before sharing the notebook.
airbnb_reviews.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,3686,131293,2010-11-01,257234,Callie,Staying with Levita and her wonderful family w...
1,3686,150766,2010-12-08,255888,Patrick,"Vita is a very welcoming, helpful and friendly..."
2,3686,177749,2011-02-02,366688,Benjamin,"This was my first time using ""airbnb"" and it m..."
3,3686,197451,2011-03-12,213492,T.J.,"First, Vita saved my work week by providing me..."
4,3686,213212,2011-03-30,428455,Pete,Great host! Very welcoming and organised. I st...


### Summary Statistics

In [16]:
# Summary statistics for the numeric airline columns.
# NOTE(review): the recorded output reports 36,142 rows, while airline.shape
# earlier reported 1,741,433, and execution counts jump from 13 to 16. The frame
# was apparently filtered in cells that are no longer in this notebook; restore
# that step so Restart & Run All reproduces these numbers.
airline.describe()

Unnamed: 0,PASSENGERS,ORIGIN_WAC,YEAR,QUARTER,MONTH,DISTANCE_GROUP
count,36142.0,36142.0,36142.0,36142.0,36142.0,36142.0
mean,3427.064247,45.166842,2019.25386,2.530297,6.597366,1.967987
std,6020.06371,23.186547,2.258555,1.117391,3.453505,1.284688
min,1.0,1.0,2016.0,1.0,1.0,1.0
25%,181.0,33.0,2017.0,2.0,4.0,1.0
50%,1427.0,38.0,2019.0,3.0,7.0,2.0
75%,3889.0,63.0,2022.0,4.0,10.0,2.0
max,63773.0,93.0,2022.0,4.0,12.0,10.0


In [17]:
# Summary statistics for the numeric trip survey columns.
tripsurvey.describe()

Unnamed: 0,State FIPS,County FIPS,Population Staying at Home,Population Not Staying at Home,Number of Trips,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500
count,134106.0,131964.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0,133646.0
mean,30.259004,30383.649268,67996.55,240860.9,1143412.0,295701.6,281412.4,138619.2,175368.9,169074.4,54501.61,18163.85,7646.022,1644.634,1279.698
std,15.151581,15160.152747,1330054.0,4615650.0,22363350.0,5815983.0,5505153.0,2712604.0,3436352.0,3315083.0,1061576.0,352785.7,149553.3,32231.22,27649.18
min,1.0,1001.0,15.3,144.0,388.8077,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,18177.0,2042.694,9157.815,42232.23,8969.9,9345.669,4166.952,5741.245,6798.098,3418.032,1257.504,428.575,62.40484,17.67742
50%,29.0,29176.0,4890.494,21751.17,101711.5,22320.86,24719.4,11596.9,14431.24,16104.31,7151.935,2678.613,945.6887,159.9032,53.90323
75%,45.0,45081.0,14524.53,59550.01,280768.8,64986.67,71413.83,34527.23,41978.51,42602.67,16598.02,5990.772,2293.331,425.7703,203.5351
max,56.0,56045.0,94903960.0,266041300.0,1476327000.0,403764500.0,374162700.0,184596400.0,236821000.0,235415000.0,71177790.0,21803530.0,10199720.0,2456013.0,3325570.0


### Data Types and Info

In [18]:
# Column dtypes, non-null counts and memory usage for the airline frame.
# NOTE(review): the recorded output shows a non-contiguous Int64Index with 36,142
# entries even though the frame was loaded with ignore_index=True -- consistent
# with a row filter applied in a cell that is not present in this notebook.
airline.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36142 entries, 46321 to 1741254
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PASSENGERS        36142 non-null  float64
 1   ORIGIN_CITY_NAME  36142 non-null  object 
 2   ORIGIN_STATE_NM   36142 non-null  object 
 3   ORIGIN_WAC        36142 non-null  int64  
 4   DEST              36142 non-null  object 
 5   DEST_CITY_NAME    36142 non-null  object 
 6   DEST_STATE_NM     36142 non-null  object 
 7   YEAR              36142 non-null  int64  
 8   QUARTER           36142 non-null  int64  
 9   MONTH             36142 non-null  int64  
 10  DISTANCE_GROUP    36142 non-null  int64  
 11  CLASS             36142 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 3.6+ MB


In [19]:
# Column dtypes, non-null counts and memory usage for the trip survey frame.
tripsurvey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134148 entries, 0 to 134147
Data columns (total 19 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Level                           134148 non-null  object 
 1   Date                            134148 non-null  object 
 2   State FIPS                      134106 non-null  float64
 3   State Postal Code               134106 non-null  object 
 4   County FIPS                     131964 non-null  float64
 5   County Name                     131964 non-null  object 
 6   Population Staying at Home      133646 non-null  float64
 7   Population Not Staying at Home  133646 non-null  float64
 8   Number of Trips                 133646 non-null  float64
 9   Number of Trips <1              133646 non-null  float64
 10  Number of Trips 1-3             133646 non-null  float64
 11  Number of Trips 3-5             133646 non-null  float64
 12  Number of Trips 

## Release Resources

In [None]:
%%html

<!-- Shows a notice and programmatically clicks a hidden button that carries the
     "kernelmenu:shutdown" command-linker attribute. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    // Declared with const so the lookup result stays local to this block instead
    // of becoming an implicit global (which also throws in strict-mode contexts).
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: best-effort; any failure (e.g. no matching element) is deliberately ignored.
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}