# ETL Project 2 - Hawaii Tourism
### Rental Property Performance 2019

This data was extracted from the Hawai’i Tourism Authority Website in the form of csv files on the Performance of Vacation Rentals in 2019.

There are 12 different csv files one for each month

### Import Dependencies 

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2

## Import CSV Data / Create Dataframes

Data was extracted by downloading excel files and saving them as csv files

From there the csv files were uploaded using pandas

In [2]:
jan_file = "../Resources/jan.csv"
jan_df = pd.read_csv(jan_file)

feb_file = "../Resources/feb.csv"
feb_df = pd.read_csv(feb_file)

march_file = "../Resources/march.csv"
march_df = pd.read_csv(march_file)

april_file = "../Resources/april.csv"
april_df = pd.read_csv(april_file)

may_file = "../Resources/may.csv"
may_df = pd.read_csv(may_file)

june_file = "../Resources/june.csv"
june_df = pd.read_csv(june_file)

july_file = "../Resources/july.csv"
july_df = pd.read_csv(july_file)

aug_file = "../Resources/aug.csv"
aug_df = pd.read_csv(aug_file)

sept_file = "../Resources/sept.csv"
sept_df = pd.read_csv(sept_file)

oct_file = "../Resources/oct.csv"
oct_df = pd.read_csv(oct_file)

nov_file = "../Resources/nov.csv"
nov_df = pd.read_csv(nov_file)

dec_file = "../Resources/dec.csv"
dec_df = pd.read_csv(dec_file)





In [3]:
jan_df.head(10)

Unnamed: 0,Month of January 2019,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,
2,,,,Unit Supply,,,Unit Demand,,,Unit Occupancy %,,,Unit Average Daily Rate,,
3,,,,2019,2018,% Change,2019,2018,% Change,2019,2018,Percentage Pt. Change,2019,2018,% Change
4,,,,,,,,,,,,,,,
5,,State of Hawai'i,,733574,N.A.,N.A.,573458,N.A.,N.A.,78.20%,N.A.,N.A.,$218.38,N.A.,N.A.
6,,,,,,,,,,,,,,,
7,,Oahu,,259722,N.A.,N.A.,194053,N.A.,N.A.,74.70%,N.A.,N.A.,$159.61,N.A.,N.A.
8,,Oahu,Waik'k',99260,N.A.,N.A.,77152,N.A.,N.A.,77.70%,N.A.,N.A.,$150.37,N.A.,N.A.
9,,,,,,,,,,,,,,,


## Transform/Clean Dataframes (for each month) 

Firstly I used the jan_df as a test and determined the best way to clean and manipulate the data

Once I was able to delete the unnecessary columns and rows and update the column names I added a columns for the month and the year to the data frame

These steps were then done on each of the 12 dataframes and they were appended together into one large dataframe


In [4]:
#Rename the columns that will be kept
jan_df = jan_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})

In [5]:
#Make a copy of the dataframe that extracts only the columns that will be needed
jan_df = jan_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

In [6]:
jan_df.head(10)

Unnamed: 0,island_name,city,unit_supply,unit_demand,unit_occupancy,unit_avg_daily_rate
0,,,,,,
1,,,,,,
2,,,Unit Supply,Unit Demand,Unit Occupancy %,Unit Average Daily Rate
3,,,2019,2019,2019,2019
4,,,,,,
5,State of Hawai'i,,733574,573458,78.20%,$218.38
6,,,,,,
7,Oahu,,259722,194053,74.70%,$159.61
8,Oahu,Waik'k',99260,77152,77.70%,$150.37
9,,,,,,


In [7]:
#Drop all rows that contain only NaN values
jan_df = jan_df.dropna(axis = 0, how = 'all')

In [8]:
jan_df.head(40)

Unnamed: 0,island_name,city,unit_supply,unit_demand,unit_occupancy,unit_avg_daily_rate
2,,,Unit Supply,Unit Demand,Unit Occupancy %,Unit Average Daily Rate
3,,,2019,2019,2019,2019
5,State of Hawai'i,,733574,573458,78.20%,$218.38
7,Oahu,,259722,194053,74.70%,$159.61
8,Oahu,Waik'k',99260,77152,77.70%,$150.37
10,Maui County,,211328,178150,84.30%,$285.60
11,Maui County,Wailea/K'hei,95701,79737,83.30%,$289.88
12,Maui County,Lahaina/K'anapali/ N'pili/Kapalua,84882,72549,85.50%,$314.64
14,Island of Hawai'i,,165608,121054,73.10%,$172.03
15,Island of Hawai'i,Kona,81233,64244,79.10%,$146.83


In [9]:
#Drop the two unneeded/ repeat rows

jan_df = jan_df.drop([2,3])

In [10]:
jan_df.head(20)

Unnamed: 0,island_name,city,unit_supply,unit_demand,unit_occupancy,unit_avg_daily_rate
5,State of Hawai'i,,733574,573458,78.20%,$218.38
7,Oahu,,259722,194053,74.70%,$159.61
8,Oahu,Waik'k',99260,77152,77.70%,$150.37
10,Maui County,,211328,178150,84.30%,$285.60
11,Maui County,Wailea/K'hei,95701,79737,83.30%,$289.88
12,Maui County,Lahaina/K'anapali/ N'pili/Kapalua,84882,72549,85.50%,$314.64
14,Island of Hawai'i,,165608,121054,73.10%,$172.03
15,Island of Hawai'i,Kona,81233,64244,79.10%,$146.83
16,Island of Hawai'i,Hilo/Honoka'a,32492,19813,61.00%,$88.85
18,Kaua'i,,96916,80201,82.80%,$281.22


In [11]:
#Reset the index of the dataframe
jan_df.reset_index(drop=True)

Unnamed: 0,island_name,city,unit_supply,unit_demand,unit_occupancy,unit_avg_daily_rate
0,State of Hawai'i,,733574,573458,78.20%,$218.38
1,Oahu,,259722,194053,74.70%,$159.61
2,Oahu,Waik'k',99260,77152,77.70%,$150.37
3,Maui County,,211328,178150,84.30%,$285.60
4,Maui County,Wailea/K'hei,95701,79737,83.30%,$289.88
5,Maui County,Lahaina/K'anapali/ N'pili/Kapalua,84882,72549,85.50%,$314.64
6,Island of Hawai'i,,165608,121054,73.10%,$172.03
7,Island of Hawai'i,Kona,81233,64244,79.10%,$146.83
8,Island of Hawai'i,Hilo/Honoka'a,32492,19813,61.00%,$88.85
9,Kaua'i,,96916,80201,82.80%,$281.22


In [12]:
#Create lists to insert into month and year columns

jan = ["January", "January","January","January","January","January","January","January","January","January"]
feb = ["Febuary","Febuary","Febuary","Febuary","Febuary","Febuary","Febuary","Febuary","Febuary","Febuary",]
mar =["March","March","March","March","March","March","March","March","March","March"]
april = ["April","April","April","April","April","April","April","April","April","April",]
may =["May","May","May","May","May","May","May","May","May","May",]
june = ["June","June","June","June","June","June","June","June","June","June",]
july = ["July","July","July","July","July","July","July","July","July","July",]
aug = ["August","August","August","August","August","August","August","August","August","August",]
sept = ["September","September","September","September","September","September","September","September","September","September",]
october = ["October","October","October","October","October","October","October","October","October","October",] 
nov= ["November","November","November","November","November","November","November","November","November","November",]
dec= ["December","December","December","December","December","December","December","December","December","December",]

year = [2019,2019,2019,2019,2019,2019,2019,2019,2019,2019]

In [13]:
#Insert the month and year columns into the data frame
jan_df["month"] = jan
jan_df["year"] = year

In [14]:
jan_df.head()

Unnamed: 0,island_name,city,unit_supply,unit_demand,unit_occupancy,unit_avg_daily_rate,month,year
5,State of Hawai'i,,733574,573458,78.20%,$218.38,January,2019
7,Oahu,,259722,194053,74.70%,$159.61,January,2019
8,Oahu,Waik'k',99260,77152,77.70%,$150.37,January,2019
10,Maui County,,211328,178150,84.30%,$285.60,January,2019
11,Maui County,Wailea/K'hei,95701,79737,83.30%,$289.88,January,2019


In [15]:
# Do the same as above to febuary's data
feb_df = feb_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
feb_df = feb_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]


In [16]:
feb_df = feb_df.dropna(axis = 0, how = 'all')

In [17]:
feb_df.head()

Unnamed: 0,island_name,city,unit_supply,unit_demand,unit_occupancy,unit_avg_daily_rate
2,,,Unit Supply,Unit Demand,Unit Occupancy %,Unit Average Daily Rate
3,,,2019,2019,2019,2019
5,State of Hawai'i,,618826,509305,82.30%,$216.56
7,O'ahu,,225185,182026,80.80%,$158.34
8,O'ahu,Waik'k',84632,71692,84.70%,$147.19


In [18]:
feb_df = feb_df.drop([2,3])

In [19]:
feb_df = feb_df.reset_index(drop=True)

In [20]:
feb_df["month"] = feb
feb_df["year"] = year

In [21]:
feb_df.head()

Unnamed: 0,island_name,city,unit_supply,unit_demand,unit_occupancy,unit_avg_daily_rate,month,year
0,State of Hawai'i,,618826,509305,82.30%,$216.56,Febuary,2019
1,O'ahu,,225185,182026,80.80%,$158.34,Febuary,2019
2,O'ahu,Waik'k',84632,71692,84.70%,$147.19,Febuary,2019
3,Maui County,,169222,146309,86.50%,$290.75,Febuary,2019
4,Maui County,Wailea/K'hei,75855,65606,86.50%,$297.06,Febuary,2019


In [22]:
# follow the same steps above to transform the rest of the data from March - Dec
#Reset the columns names and only seclect columns needed
march_df = march_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
march_df = march_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

april_df = april_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
april_df = april_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

may_df = may_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
may_df = may_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

june_df = june_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
june_df = june_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

july_df = july_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
july_df = july_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

aug_df = aug_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
aug_df = aug_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

sept_df = sept_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
sept_df = sept_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

oct_df = oct_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
oct_df = oct_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

nov_df = nov_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
nov_df = nov_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

dec_df = dec_df.rename(columns={'Unnamed: 1': 'island_name', 'Unnamed: 2': 'city', 'Unnamed: 3': 'unit_supply', 'Unnamed: 6': 'unit_demand', 'Unnamed: 9': 'unit_occupancy', 'Unnamed: 12': 'unit_avg_daily_rate'})
dec_df = dec_df[["island_name", "city", "unit_supply", "unit_demand", "unit_occupancy", "unit_avg_daily_rate"]]

In [23]:
#Drop any rows with NAN throughout
march_df = march_df.dropna(axis = 0, how = 'all')

april_df = april_df.dropna(axis = 0, how = 'all')

may_df = may_df.dropna(axis = 0, how = 'all')

june_df = june_df.dropna(axis = 0, how = 'all')

july_df = july_df.dropna(axis = 0, how = 'all')

aug_df = aug_df.dropna(axis = 0, how = 'all')

sept_df = sept_df.dropna(axis = 0, how = 'all')

oct_df = oct_df.dropna(axis = 0, how = 'all')

nov_df = nov_df.dropna(axis = 0, how = 'all')

dec_df = dec_df.dropna(axis = 0, how = 'all')



In [24]:
#drop rows that are not need or are repeating

march_df = march_df.drop([2,3])

april_df = april_df.drop([2,3])

may_df = may_df.drop([2,3])

june_df = june_df.drop([2,3])

july_df = july_df.drop([2,3])

aug_df = aug_df.drop([2,3])

sept_df = sept_df.drop([2,3])

oct_df = oct_df.drop([2,3])

nov_df = nov_df.drop([2,3])

dec_df = dec_df.drop([2,3])

In [25]:
#Reset the index of dataframes
march_df = march_df.reset_index(drop=True)

april_df = april_df.reset_index(drop=True)

may_df = may_df.reset_index(drop=True)

june_df = june_df.reset_index(drop=True)

july_df = july_df.reset_index(drop=True)

aug_df = aug_df.reset_index(drop=True)

sept_df = sept_df.reset_index(drop=True)

oct_df = oct_df.reset_index(drop=True)

nov_df = nov_df.reset_index(drop=True)

dec_df = dec_df.reset_index(drop=True)

In [26]:
#Add month and year columns

march_df["month"] = mar
march_df["year"] = year

april_df["month"] = april
april_df["year"] = year

may_df["month"] = may
may_df["year"] = year

june_df["month"] = june
june_df["year"] = year

july_df["month"] = july
july_df["year"] = year

aug_df["month"] = aug
aug_df["year"] = year

sept_df["month"] = sept
sept_df["year"] = year

oct_df["month"] = october
oct_df["year"] = year

nov_df["month"] = nov
nov_df["year"] = year

dec_df["month"] = dec
dec_df["year"] = year

In [27]:
#Append all the dataframes together

In [28]:
rentalfinal_df = jan_df.append(feb_df, ignore_index=True)

In [29]:
rentalfinal_df = rentalfinal_df.append(march_df, ignore_index=True)

In [30]:
rentalfinal_df = rentalfinal_df.append(april_df, ignore_index=True)

In [31]:
rentalfinal_df = rentalfinal_df.append(may_df, ignore_index=True)

In [32]:
rentalfinal_df = rentalfinal_df.append(june_df, ignore_index=True)

In [33]:
rentalfinal_df = rentalfinal_df.append(july_df, ignore_index=True)

In [34]:
rentalfinal_df = rentalfinal_df.append(aug_df, ignore_index=True)

In [35]:
rentalfinal_df = rentalfinal_df.append(sept_df, ignore_index=True)

In [36]:
rentalfinal_df = rentalfinal_df.append(oct_df, ignore_index=True)

In [37]:
rentalfinal_df = rentalfinal_df.append(nov_df, ignore_index=True)

In [38]:
rentalfinal_df = rentalfinal_df.append(dec_df, ignore_index=True)

In [54]:
#Convert the final dataframe to a csv and save it to the Resources folder
rentalfinal_df.to_csv('rentals_df.csv', index=True)

## Loading the Data

The large combinded dataframe was then uploaded to an SQL database

### Connect to the SQL database 

In [61]:
#Make connection to database

connection_string = "postgres:Pooks1313@localhost:5432/HawaiiTourism_DB"
engine = create_engine(f'postgresql://{connection_string}')

In [62]:
#Check to see if the created table is in the Database
engine.table_names()

['rental_performance']

### Load data into SQL

In [63]:

rentalfinal_df.to_sql(name="rental_performance", con=engine, if_exists='append', index=True)