# ADS-508 Project Notebook

In [1]:
import boto3
import sagemaker
import csv
import pandas as pd

# SageMaker context for this notebook: session, execution role, default bucket.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# A single boto3 session supplies both the region name and the SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client("sagemaker", region_name=region)

## Import
### Importing Data from S3 Bucket

In [20]:
# Recursively copy every object in the project bucket into ./data/ (the sub-folders
# airbnb/, income/ and travel/ are recreated locally, as the download log shows).
!aws s3 cp --recursive s3://aurelia-resort-data/ ./data/

download: s3://aurelia-resort-data/airbnb/neighbourhoods.csv to data/airbnb/neighbourhoods.csv
download: s3://aurelia-resort-data/income/disp_income.csv to data/income/disp_income.csv
download: s3://aurelia-resort-data/travel/airline_data2.csv to data/travel/airline_data2.csv
download: s3://aurelia-resort-data/airbnb/listings.csv to data/airbnb/listings.csv
download: s3://aurelia-resort-data/travel/trip_data.csv to data/travel/trip_data.csv
download: s3://aurelia-resort-data/travel/airline_data.csv to data/travel/airline_data.csv
download: s3://aurelia-resort-data/travel/airline_data3.csv to data/travel/airline_data3.csv
download: s3://aurelia-resort-data/airbnb/reviews.csv to data/airbnb/reviews.csv


### Ingesting Data via Pandas

In [21]:
# Load the CSV extracts downloaded from S3 into DataFrames.
airbnb_neigh = pd.read_csv("./data/airbnb/neighbourhoods.csv")
airbnb_reviews = pd.read_csv("./data/airbnb/reviews.csv")
airline = pd.read_csv("./data/travel/airline_data3.csv")

# tripsurvey is read by later cells (shape, head, describe, info, missing data),
# so it must be loaded here; with the load commented out those cells only work
# when the variable is left over from an earlier kernel run.
# The kernel sometimes dies at this point, maybe due to the size of tripsurvey
# (roughly 4 million rows). If that happens, just re-run the cell.
tripsurvey = pd.read_csv("./data/travel/trip_data.csv")

## EDA
### Data Shapes

In [22]:
# (rows, columns) of the airline frame; in top-to-bottom order this runs before any filtering.
airline.shape

(248522, 36)

In [23]:
# (rows, columns) of the trip survey frame; needs `tripsurvey` to have been loaded in the ingestion cell.
tripsurvey.shape

(4018052, 22)

### Data Previews

In [34]:
# Preview the first five airline rows.
# NOTE(review): the stored output lists only "Washington, DC" destinations and starts
# at index 46136, so it appears to have been captured after the DC filter further
# down (execution count 34 vs. 32) — re-run top to bottom to refresh it.
airline.head(5)

Unnamed: 0,PASSENGERS,FREIGHT,MAIL,DISTANCE,UNIQUE_CARRIER,AIRLINE_ID,UNIQUE_CARRIER_NAME,UNIQUE_CARRIER_ENTITY,REGION,CARRIER,...,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,YEAR,QUARTER,MONTH,DISTANCE_GROUP,CLASS
46136,1.0,0.0,0.0,0.0,OH,20397,PSA Airlines Inc.,07028,D,OH,...,"Washington, DC",VA,51,Virginia,38,2022,2,6,1,F
46390,1.0,0.0,0.0,24.0,OH,20397,PSA Airlines Inc.,07028,D,OH,...,"Washington, DC",VA,51,Virginia,38,2022,1,3,1,F
46391,1.0,0.0,0.0,24.0,OH,20397,PSA Airlines Inc.,07028,D,OH,...,"Washington, DC",VA,51,Virginia,38,2022,4,12,1,F
47086,1.0,0.0,0.0,94.0,OH,20397,PSA Airlines Inc.,07028,D,OH,...,"Washington, DC",VA,51,Virginia,38,2022,2,6,1,F
47412,1.0,0.0,0.0,142.0,AA,19805,American Airlines Inc.,0A050,D,AA,...,"Washington, DC",VA,51,Virginia,38,2022,2,5,1,F


In [11]:
# Preview the first five trip survey rows; needs `tripsurvey` to have been loaded in the ingestion cell.
tripsurvey.head(5)

Unnamed: 0,Level,Date,State FIPS,State Postal Code,County FIPS,County Name,Population Staying at Home,Population Not Staying at Home,Number of Trips,Number of Trips <1,...,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500,Row ID,Week,Month
0,National,2019/01/01,,,,,77433867.0,248733553.0,897784400.0,241667151.0,...,129670778.0,116904343.0,40432062.0,15686639.0,7525563.0,1806022.0,1728112.0,00-00000-20190101,0,1
1,National,2019/01/02,,,,,61305201.0,264862219.0,1139452000.0,291276735.0,...,171637514.0,167412698.0,56148976.0,17739183.0,7817044.0,1962301.0,1531219.0,00-00000-20190102,0,1
2,National,2019/01/03,,,,,63050480.0,263116940.0,1162753000.0,296375014.0,...,175775410.0,172027487.0,57632422.0,18366626.0,8124548.0,2038099.0,1567072.0,00-00000-20190103,0,1
3,National,2019/01/04,,,,,61803652.0,264363768.0,1181954000.0,293159631.0,...,181324645.0,176144493.0,58761592.0,19315785.0,8687318.0,2096065.0,1569185.0,00-00000-20190104,0,1
4,National,2019/01/05,,,,,64389745.0,261777675.0,1180477000.0,295459014.0,...,180941769.0,165239790.0,54842134.0,19363939.0,8490791.0,1991159.0,1438664.0,00-00000-20190105,0,1


In [12]:
# Preview the first five Airbnb neighbourhood rows.
airbnb_neigh.head(5)

Unnamed: 0,neighbourhood_group,neighbourhood
0,,"Brightwood Park, Crestwood, Petworth"
1,,"Brookland, Brentwood, Langdon"
2,,"Capitol Hill, Lincoln Park"
3,,"Capitol View, Marshall Heights, Benning Heights"
4,,"Cathedral Heights, McLean Gardens, Glover Park"


In [13]:
# Preview the first five Airbnb review rows.
# NOTE(review): the stored output includes reviewer names and review text; consider
# clearing it before sharing the notebook.
airbnb_reviews.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,3686,131293,2010-11-01,257234,Callie,Staying with Levita and her wonderful family w...
1,3686,150766,2010-12-08,255888,Patrick,"Vita is a very welcoming, helpful and friendly..."
2,3686,177749,2011-02-02,366688,Benjamin,"This was my first time using ""airbnb"" and it m..."
3,3686,197451,2011-03-12,213492,T.J.,"First, Vita saved my work week by providing me..."
4,3686,213212,2011-03-30,428455,Pete,Great host! Very welcoming and organised. I st...


## Relevant Data Filtering

#### Airline

In [32]:
## Filter data to where DC is destination city
## na=False counts rows with a missing DEST_CITY_NAME as non-matching, so the mask
## never contains NaN (boolean indexing raises on a mask with NaN values);
## regex=False makes "DC" a plain, case-sensitive substring match.

airline = airline[airline["DEST_CITY_NAME"].str.contains("DC", regex=False, na=False)]

## Filter data to where passengers > 0

airline = airline[airline["PASSENGERS"]>0]


In [33]:
# (rows, columns) of the airline frame after the destination and passenger filters.
airline.shape

(5236, 36)

In [31]:
# Rows removed by the filters: raw row count (248522, from the first airline.shape
# cell) minus the current filtered row count. The filtered count is read from the
# frame because a hand-typed value goes stale when the filter cell is re-run
# (248522 - 5236 is 243286, which does not match the previously stored output).
248522 - airline.shape[0] # rows removed

242976

In [29]:
# List the airline column names to decide which columns to keep.
airline.columns

Index(['PASSENGERS', 'FREIGHT', 'MAIL', 'DISTANCE', 'UNIQUE_CARRIER',
       'AIRLINE_ID', 'UNIQUE_CARRIER_NAME', 'UNIQUE_CARRIER_ENTITY', 'REGION',
       'CARRIER', 'CARRIER_NAME', 'CARRIER_GROUP', 'CARRIER_GROUP_NEW',
       'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID',
       'ORIGIN', 'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_FIPS',
       'ORIGIN_STATE_NM', 'ORIGIN_WAC', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
       'DEST_STATE_ABR', 'DEST_STATE_FIPS', 'DEST_STATE_NM', 'DEST_WAC',
       'YEAR', 'QUARTER', 'MONTH', 'DISTANCE_GROUP', 'CLASS'],
      dtype='object')

In [None]:
# Remove irrelevant columns (FREIGHT and MAIL) from the airline frame.
# DataFrame.drop returns a new frame, so the result is assigned back; the original
# cell called it on an undefined `df` and discarded the result.
# errors="ignore" keeps the cell safe to re-run once the columns are already gone.

airline = airline.drop(columns=["FREIGHT", "MAIL"], errors="ignore")

### Summary Statistics

In [10]:
# Summary statistics for the numeric airline columns.
# NOTE(review): the stored output shows "Activity Period" / "Passenger Count" with
# 50730 rows, which does not match the airline columns listed above — it looks like
# a leftover from a different input file; re-run to refresh.
airline.describe()

Unnamed: 0,Activity Period,Passenger Count
count,50730.0,50730.0
mean,201390.46164,28489.836073
std,495.261863,60100.148363
min,200507.0,1.0
25%,201001.0,4553.0
50%,201408.0,8645.5
75%,201807.0,19508.5
max,202212.0,659837.0


In [11]:
# Summary statistics for the numeric trip survey columns; needs `tripsurvey` to have been loaded in the ingestion cell.
tripsurvey.describe()

Unnamed: 0,State FIPS,County FIPS,Population Staying at Home,Population Not Staying at Home,Number of Trips,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500,Week,Month
count,4016794.0,3952636.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,3985105.0,4018052.0,4018052.0
mean,30.259,30383.65,68338.71,241887.8,1147674.0,296788.5,282696.4,139090.3,175908.8,169679.7,54699.1,18215.49,7659.008,1649.222,1286.976,24.2496,6.096184
std,15.15153,15160.1,1336336.0,4625886.0,22449480.0,5844233.0,5531007.0,2721950.0,3447392.0,3331007.0,1066654.0,354313.3,151118.2,32569.15,28530.16,15.03604,3.450463
min,1.0,1001.0,8.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,18.0,18177.0,2058.0,9281.0,42368.0,8862.0,9369.0,4127.0,5705.0,6775.0,3371.0,1230.0,410.0,56.0,15.0,11.0,3.0
50%,29.0,29176.0,4933.0,21908.0,102059.0,22331.0,24853.0,11628.0,14481.0,16152.0,7181.0,2674.0,940.0,156.0,52.0,23.0,6.0
75%,45.0,45081.0,14628.0,59944.0,282556.0,65288.0,71709.0,34748.0,42156.0,42830.0,16698.0,6022.0,2300.0,432.0,202.0,37.0,9.0
max,56.0,56045.0,110211800.0,273740000.0,1569053000.0,461592500.0,405130500.0,198018400.0,252611800.0,256509600.0,76367320.0,28735610.0,14476980.0,3651375.0,5003062.0,52.0,12.0


### Data Types and Info

In [12]:
# Column dtypes, non-null counts and memory usage for the airline frame.
# NOTE(review): the stored output lists 12 columns (Activity Period, Operating
# Airline, ...) and 50730 rows, which does not match the 36-column airline frame
# shown earlier — re-run to refresh.
airline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50730 entries, 0 to 50729
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Activity Period              50730 non-null  int64 
 1   Operating Airline            50730 non-null  object
 2   Operating Airline IATA Code  50427 non-null  object
 3   Published Airline            50730 non-null  object
 4   Published Airline IATA Code  50427 non-null  object
 5   GEO Summary                  50730 non-null  object
 6   GEO Region                   50730 non-null  object
 7   Activity Type Code           50730 non-null  object
 8   Price Category Code          50730 non-null  object
 9   Terminal                     50730 non-null  object
 10  Boarding Area                50730 non-null  object
 11  Passenger Count              50730 non-null  int64 
dtypes: int64(2), object(10)
memory usage: 4.6+ MB


In [13]:
# Column dtypes and memory usage for the trip survey frame; needs `tripsurvey` to have been loaded in the ingestion cell.
tripsurvey.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4018052 entries, 0 to 4018051
Data columns (total 22 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   Level                           object 
 1   Date                            object 
 2   State FIPS                      float64
 3   State Postal Code               object 
 4   County FIPS                     float64
 5   County Name                     object 
 6   Population Staying at Home      float64
 7   Population Not Staying at Home  float64
 8   Number of Trips                 float64
 9   Number of Trips <1              float64
 10  Number of Trips 1-3             float64
 11  Number of Trips 3-5             float64
 12  Number of Trips 5-10            float64
 13  Number of Trips 10-25           float64
 14  Number of Trips 25-50           float64
 15  Number of Trips 50-100          float64
 16  Number of Trips 100-250         float64
 17  Number of Trips 250-500    

### Missing Data

In [14]:
# Null count per airline column, followed by the total across all columns.
airline_nulls = airline.isnull().sum()

print(f'\nMissing Data:\n{airline_nulls}')

print('\nTotal Missing Data:', airline_nulls.sum())


Missing Data:
Activity Period                  0
Operating Airline                0
Operating Airline IATA Code    303
Published Airline                0
Published Airline IATA Code    303
GEO Summary                      0
GEO Region                       0
Activity Type Code               0
Price Category Code              0
Terminal                         0
Boarding Area                    0
Passenger Count                  0
dtype: int64

Total Missing Data: 606


In [15]:
# Null count per trip survey column, followed by the total across all columns.
tripsurvey_nulls = tripsurvey.isnull().sum()

print(f'\nMissing Data:\n{tripsurvey_nulls}')

print('\nTotal Missing Data:', tripsurvey_nulls.sum())


Missing Data:
Level                                 0
Date                                  0
State FIPS                         1258
State Postal Code                  1258
County FIPS                       65416
County Name                       65416
Population Staying at Home        32947
Population Not Staying at Home    32947
Number of Trips                   32947
Number of Trips <1                32947
Number of Trips 1-3               32947
Number of Trips 3-5               32947
Number of Trips 5-10              32947
Number of Trips 10-25             32947
Number of Trips 25-50             32947
Number of Trips 50-100            32947
Number of Trips 100-250           32947
Number of Trips 250-500           32947
Number of Trips >=500             32947
Row ID                                0
Week                                  0
Month                                 0
dtype: int64

Total Missing Data: 561659


## Release Resources

In [None]:
%%html

<!-- Renders a notice plus a hidden button bound to the "kernelmenu:shutdown" command;
     the script below clicks that button as soon as the cell output is displayed.
     NOTE(review): presumably the host UI's command linker handles the click; confirm. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the first element with class "sm-command-button"; any error (for example
// no such element in the page) is deliberately swallowed.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

// Save a checkpoint and then delete the notebook's session through the global
// `Jupyter` object. Any error is deliberately swallowed so the cell never fails.
// NOTE(review): presumably `Jupyter` is only defined in the classic Notebook UI,
// so this would be a no-op elsewhere; confirm.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}