In [1]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine, text
import config

In [2]:
# connect to database
db_url = f"postgresql://postgres:{config.password}@localhost:5432/{config.database}"
engine = create_engine(db_url)

In [3]:
# function to load table into postgres db, save backup csv
def to_sql_and_csv(table_name, df):
    # write to csv
    df.to_csv(f"./{table_name}_cleaned.csv", index=False)
    # load into postgres db
    with engine.begin() as conn:
        conn.execute(text(f"DELETE FROM {table_name}"))
        df.to_sql(table_name, conn, if_exists="append", index=False)

In [4]:
# load schema into postgres db

# load sql schema, split by ; and run each statement
# to create tables in postgres
with engine.connect() as conn:
    with open("./schema_v2.0.sql", "r") as file:
        queries = file.read().split(";")
        for query in queries:
            # strip whitespace and ignore empty queries
            if query.strip() != "":
                conn.execute(text(query))
                conn.commit()


# confirm tables are created
with engine.connect() as conn:
    result = conn.execute(
        text(
            "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'"
        )
    )
    print(f"Tables in database: {result.fetchall()}")

Tables in database: [('listings',), ('neighbourhoods',), ('hosts',), ('availability',), ('reviews',), ('min_max_night',), ('listing_reviews',), ('host_listings_count',), ('listings_categorical',), ('calendar',)]


In [5]:
# load df
df = pd.read_csv("../raw_data/listings_detailed_2024_mar.csv")
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3686,https://www.airbnb.com/rooms/3686,20230913045152,2023-09-13,city scrape,Home in Washington · ★4.64 · 1 bedroom · 1 bed...,IMPORTANT NOTES<br />* Carefully read and be s...,We love that our neighborhood is up and coming...,https://a0.muscache.com/pictures/61e02c7e-3d66...,4645,...,4.84,3.89,4.63,,f,1,0,1,0,0.53
1,3943,https://www.airbnb.com/rooms/3943,20230913045152,2023-09-13,city scrape,Townhouse in Washington · ★4.83 · 1 bedroom · ...,You will be staying in high ceiling bedroom w...,This rowhouse is centrally located in the hear...,https://a0.muscache.com/pictures/airflow/Hosti...,5059,...,4.91,4.57,4.75,Hosted License: 5007242201001033,f,5,0,5,0,2.75
2,4197,https://www.airbnb.com/rooms/4197,20230913045152,2023-09-13,city scrape,Home in Washington · ★4.85 · 1 bedroom · 1 bed...,This is the middle bedroom upstairs in a resto...,"Our area, the Eastern Market neighborhood of C...",https://a0.muscache.com/pictures/miso/Hosting-...,5061,...,4.98,4.96,4.94,Hosted License: 5007242201000749,f,2,0,2,0,0.32
3,4529,https://www.airbnb.com/rooms/4529,20230913045152,2023-09-13,city scrape,Home in Washington · ★4.66 · 1 bedroom · 1 bed...,This is large private bedroom with plenty of...,Very quiet neighborhood and it is easy accessi...,https://a0.muscache.com/pictures/86072003/6709...,5803,...,4.93,4.51,4.83,Exempt,f,1,0,1,0,0.59
4,4967,https://www.airbnb.com/rooms/4967,20230913045152,2023-09-13,previous scrape,Home in Washington · ★4.74 · 1 bedroom · 1 bed...,"<b>The space</b><br />Hello, my name is Seveer...",,https://a0.muscache.com/pictures/2439810/bb320...,7086,...,4.93,4.21,4.64,,f,3,0,3,0,0.2


# hosts

In [6]:
hosts_df = df[
    [
        "host_id",
        "host_url",
        "host_name",
        "host_since",
        "host_location",
        "host_about",
        "host_response_time",
        "host_response_rate",
        "host_acceptance_rate",
        "host_is_superhost",
        "host_thumbnail_url",
        "host_picture_url",
        "host_neighbourhood",
        "host_listings_count",
        "host_total_listings_count",
        "host_verifications",
        "host_has_profile_pic",
        "host_identity_verified",
    ]
]
hosts_df.head()

Unnamed: 0,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified
0,4645,https://www.airbnb.com/users/show/4645,Vita,2008-11-26,"Washington D.C., DC","I am a literary scholar, teacher, poet, vegan ...",within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/User-...,https://a0.muscache.com/im/pictures/user/User-...,Anacostia,1,4,"['email', 'phone', 'work_email']",t,t
1,5059,https://www.airbnb.com/users/show/5059,Vasa,2008-12-12,"Washington, DC",I travel often and always try to immerse mysel...,within an hour,100%,92%,t,https://a0.muscache.com/im/pictures/user/8ec69...,https://a0.muscache.com/im/pictures/user/8ec69...,Eckington,5,5,"['email', 'phone']",t,t
2,5061,https://www.airbnb.com/users/show/5061,Sandra,2008-12-12,"Washington D.C., DC",I’m a California native who came to work for t...,within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/93c77...,https://a0.muscache.com/im/pictures/user/93c77...,Capitol Hill,2,2,"['email', 'phone']",t,t
3,5803,https://www.airbnb.com/users/show/5803,Bertina,2008-12-30,"Washington, DC",I am a retried teacher. I enjoy walking and wa...,,,,f,https://a0.muscache.com/im/pictures/user/0050a...,https://a0.muscache.com/im/pictures/user/0050a...,Eastland Gardens,3,4,"['email', 'phone']",t,t
4,7086,https://www.airbnb.com/users/show/7086,Edward,2009-01-26,,"I am fun, honest and very easy going and trave...",within a few hours,100%,95%,t,https://a0.muscache.com/im/pictures/user/6efb4...,https://a0.muscache.com/im/pictures/user/6efb4...,Ivy City,3,5,"['email', 'phone']",t,t


In [7]:
# check datatypes
hosts_df.dtypes

host_id                       int64
host_url                     object
host_name                    object
host_since                   object
host_location                object
host_about                   object
host_response_time           object
host_response_rate           object
host_acceptance_rate         object
host_is_superhost            object
host_thumbnail_url           object
host_picture_url             object
host_neighbourhood           object
host_listings_count           int64
host_total_listings_count     int64
host_verifications           object
host_has_profile_pic         object
host_identity_verified       object
dtype: object

In [8]:
# drop duplicate rows (primary key is host_id)
hosts_df = hosts_df.drop_duplicates(subset=["host_id"])

# convert host_since to datetime
hosts_df.host_since = pd.to_datetime(hosts_df.host_since)

# convert host_response_rate and host_acceptance_rate to float
hosts_df.host_response_rate = hosts_df.host_response_rate.str.replace("%", "").astype(float)
hosts_df.host_acceptance_rate = hosts_df.host_acceptance_rate.str.replace("%", "").astype(float)

# convert host_is_superhost, host_has_profile_pic, host_identity_verified to boolean
columns_to_bool = [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
]

for column in columns_to_bool:
    hosts_df[column] = hosts_df[column].map({"t": True, "f": False}).astype(bool)

# check datatypes
hosts_df.dtypes

host_id                               int64
host_url                             object
host_name                            object
host_since                   datetime64[ns]
host_location                        object
host_about                           object
host_response_time                   object
host_response_rate                  float64
host_acceptance_rate                float64
host_is_superhost                      bool
host_thumbnail_url                   object
host_picture_url                     object
host_neighbourhood                   object
host_listings_count                   int64
host_total_listings_count             int64
host_verifications                   object
host_has_profile_pic                   bool
host_identity_verified                 bool
dtype: object

In [9]:
hosts_df.host_neighbourhood.value_counts()

host_neighbourhood
Northwest Washington                688
Northeast Washington                438
Capitol Hill                        243
Southeast Washington                152
Near Northeast/H Street Corridor     84
                                   ... 
Redwood City                          1
Financial District                    1
Waikiki                               1
Mid-Wilshire                          1
East Ocean View                       1
Name: count, Length: 199, dtype: int64

In [10]:
# run save function
to_sql_and_csv("hosts", hosts_df)

# host_listings_count

In [11]:
hosts_lc_df = df[["host_id", "calculated_host_listings_count", "calculated_host_listings_count_entire_homes", "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms"]]
hosts_lc_df.head()

Unnamed: 0,host_id,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
0,4645,1,0,1,0
1,5059,5,0,5,0
2,5061,2,0,2,0
3,5803,1,0,1,0
4,7086,3,0,3,0


In [12]:
# drop duplicate rows (primary key is host_id)
hosts_lc_df = hosts_lc_df.drop_duplicates(subset=["host_id"])

# rename columns
hosts_lc_df.rename(columns={"calculated_host_listings_count": "host_listings_total_count", "calculated_host_listings_count_entire_homes": "host_listings_entire_homes_count", "calculated_host_listings_count_private_rooms": "host_listings_private_rooms_count", "calculated_host_listings_count_shared_rooms": "host_listings_shared_rooms_count"}, inplace=True)
hosts_lc_df.columns

Index(['host_id', 'host_listings_total_count',
       'host_listings_entire_homes_count', 'host_listings_private_rooms_count',
       'host_listings_shared_rooms_count'],
      dtype='object')

In [13]:
hosts_lc_df.dtypes

host_id                              int64
host_listings_total_count            int64
host_listings_entire_homes_count     int64
host_listings_private_rooms_count    int64
host_listings_shared_rooms_count     int64
dtype: object

In [14]:
# run save function
to_sql_and_csv("host_listings_count", hosts_lc_df)

# neighbourhoods

In [15]:
# create table of unique neighbourhoods
neighbourhoods_df = pd.DataFrame(df.neighbourhood_cleansed.unique(), columns=["neighbourhood"]).reset_index()
neighbourhoods_df.rename(columns={"index": "neighbourhood_id"}, inplace=True)
neighbourhoods_df.head()

Unnamed: 0,neighbourhood_id,neighbourhood
0,0,Historic Anacostia
1,1,"Edgewood, Bloomingdale, Truxton Circle, Eckington"
2,2,"Capitol Hill, Lincoln Park"
3,3,"Eastland Gardens, Kenilworth"
4,4,"Ivy City, Arboretum, Trinidad, Carver Langston"


In [16]:
neighbourhoods_df.dtypes

neighbourhood_id     int64
neighbourhood       object
dtype: object

In [17]:
# run save function
to_sql_and_csv("neighbourhoods", neighbourhoods_df)

# listings

In [18]:
listings_df = df[
    [
        "id",
        "host_id",
        "neighbourhood_cleansed",
        "latitude",
        "longitude",
        "accommodates",
        "bathrooms",
        "bedrooms",
        "beds",
        "price",
    ]
]
listings_df.head()

Unnamed: 0,id,host_id,neighbourhood_cleansed,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price
0,3686,4645,Historic Anacostia,38.86339,-76.98889,1,,,1.0,$67.00
1,3943,5059,"Edgewood, Bloomingdale, Truxton Circle, Eckington",38.91195,-77.00456,2,,,1.0,$78.00
2,4197,5061,"Capitol Hill, Lincoln Park",38.88719,-76.99472,1,,,1.0,$80.00
3,4529,5803,"Eastland Gardens, Kenilworth",38.90585,-76.94469,2,,,1.0,$56.00
4,4967,7086,"Ivy City, Arboretum, Trinidad, Carver Langston",38.91217,-76.99249,1,,,1.0,"$2,500.00"


In [19]:
# rename id column
listings_df.rename(columns={"id": "listing_id"}, inplace=True)

# convert price to float
listings_df.price = listings_df.price.str.replace("$", "").str.replace(",", "").astype(float)

# merge neighbourhoods_df, drop neighbourhood_cleansed, neighbourhood
listings_df = pd.merge(listings_df, neighbourhoods_df, left_on="neighbourhood_cleansed", right_on="neighbourhood", how="left")
listings_df.drop(columns=["neighbourhood_cleansed", "neighbourhood"], inplace=True)

# reorganize columns
listings_df = listings_df[
    [
        "listing_id",
        "host_id",
        "neighbourhood_id",
        "latitude",
        "longitude",
        "accommodates",
        "bathrooms",
        "bedrooms",
        "beds",
        "price",
    ]
]

listings_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_df.rename(columns={"id": "listing_id"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_df.price = listings_df.price.str.replace("$", "").str.replace(",", "").astype(float)


Unnamed: 0,listing_id,host_id,neighbourhood_id,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price
0,3686,4645,0,38.86339,-76.98889,1,,,1.0,67.0
1,3943,5059,1,38.91195,-77.00456,2,,,1.0,78.0
2,4197,5061,2,38.88719,-76.99472,1,,,1.0,80.0
3,4529,5803,3,38.90585,-76.94469,2,,,1.0,56.0
4,4967,7086,4,38.91217,-76.99249,1,,,1.0,2500.0


In [20]:
listings_df.dtypes

listing_id            int64
host_id               int64
neighbourhood_id      int64
latitude            float64
longitude           float64
accommodates          int64
bathrooms           float64
bedrooms            float64
beds                float64
price               float64
dtype: object

In [21]:
# run save function
to_sql_and_csv("listings", listings_df)

# listings_categorical

In [22]:
listings_categorical_df = df[[
    "id",
    "name",
    "description",
    "listing_url",
    "neighborhood_overview",
    "picture_url",
    "property_type",
    "room_type",
    "amenities",
    "bathrooms_text",
    "license",        
]]

listings_categorical_df.head()

Unnamed: 0,id,name,description,listing_url,neighborhood_overview,picture_url,property_type,room_type,amenities,bathrooms_text,license
0,3686,Home in Washington · ★4.64 · 1 bedroom · 1 bed...,IMPORTANT NOTES<br />* Carefully read and be s...,https://www.airbnb.com/rooms/3686,We love that our neighborhood is up and coming...,https://a0.muscache.com/pictures/61e02c7e-3d66...,Private room in home,Private room,"[""Free street parking"", ""Bed linens"", ""Extra p...",1 private bath,
1,3943,Townhouse in Washington · ★4.83 · 1 bedroom · ...,You will be staying in high ceiling bedroom w...,https://www.airbnb.com/rooms/3943,This rowhouse is centrally located in the hear...,https://a0.muscache.com/pictures/airflow/Hosti...,Private room in townhouse,Private room,"[""Mini fridge"", ""Clothing storage: wardrobe an...",1 private bath,Hosted License: 5007242201001033
2,4197,Home in Washington · ★4.85 · 1 bedroom · 1 bed...,This is the middle bedroom upstairs in a resto...,https://www.airbnb.com/rooms/4197,"Our area, the Eastern Market neighborhood of C...",https://a0.muscache.com/pictures/miso/Hosting-...,Private room in home,Private room,"[""Children\u2019s books and toys"", ""Free stree...",1.5 shared baths,Hosted License: 5007242201000749
3,4529,Home in Washington · ★4.66 · 1 bedroom · 1 bed...,This is large private bedroom with plenty of...,https://www.airbnb.com/rooms/4529,Very quiet neighborhood and it is easy accessi...,https://a0.muscache.com/pictures/86072003/6709...,Private room in home,Private room,"[""Free street parking"", ""Bed linens"", ""Self ch...",1 shared bath,Exempt
4,4967,Home in Washington · ★4.74 · 1 bedroom · 1 bed...,"<b>The space</b><br />Hello, my name is Seveer...",https://www.airbnb.com/rooms/4967,,https://a0.muscache.com/pictures/2439810/bb320...,Private room in home,Private room,"[""Smoking allowed"", ""Air conditioning"", ""Lock ...",3 baths,


In [23]:
# rename columns
listings_categorical_df.rename(columns={"id": "listing_id", "name": "listing_name"}, inplace=True)

# open listing_simple.csv
listing_simple_df = pd.read_csv("../raw_data/2024_mar/listings.csv")
listing_simple_df = listing_simple_df[['id', 'name']]
listing_simple_df.rename(columns={"id": "listing_id", "name": "hover_description"}, inplace=True)

# merge with listings_categorical_df
listings_categorical_df = pd.merge(listings_categorical_df, listing_simple_df, on="listing_id", how="left")

# reorganize columns
listings_categorical_df = listings_categorical_df[
    [
        "listing_id",
        "listing_name",
        "hover_description",
        "description",
        "listing_url",
        "neighborhood_overview",
        "picture_url",
        "property_type",
        "room_type",
        "amenities",
        "bathrooms_text",
        "license",
    ]
]

listings_categorical_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings_categorical_df.rename(columns={"id": "listing_id", "name": "listing_name"}, inplace=True)


Unnamed: 0,listing_id,listing_name,hover_description,description,listing_url,neighborhood_overview,picture_url,property_type,room_type,amenities,bathrooms_text,license
0,3686,Home in Washington · ★4.64 · 1 bedroom · 1 bed...,Vita's Hideaway,IMPORTANT NOTES<br />* Carefully read and be s...,https://www.airbnb.com/rooms/3686,We love that our neighborhood is up and coming...,https://a0.muscache.com/pictures/61e02c7e-3d66...,Private room in home,Private room,"[""Free street parking"", ""Bed linens"", ""Extra p...",1 private bath,
1,3943,Townhouse in Washington · ★4.83 · 1 bedroom · ...,Historic Rowhouse Near Monuments,You will be staying in high ceiling bedroom w...,https://www.airbnb.com/rooms/3943,This rowhouse is centrally located in the hear...,https://a0.muscache.com/pictures/airflow/Hosti...,Private room in townhouse,Private room,"[""Mini fridge"", ""Clothing storage: wardrobe an...",1 private bath,Hosted License: 5007242201001033
2,4197,Home in Washington · ★4.85 · 1 bedroom · 1 bed...,Capitol Hill Bedroom walk to Metro,This is the middle bedroom upstairs in a resto...,https://www.airbnb.com/rooms/4197,"Our area, the Eastern Market neighborhood of C...",https://a0.muscache.com/pictures/miso/Hosting-...,Private room in home,Private room,"[""Children\u2019s books and toys"", ""Free stree...",1.5 shared baths,Hosted License: 5007242201000749
3,4529,Home in Washington · ★4.66 · 1 bedroom · 1 bed...,Bertina's House Part One,This is large private bedroom with plenty of...,https://www.airbnb.com/rooms/4529,Very quiet neighborhood and it is easy accessi...,https://a0.muscache.com/pictures/86072003/6709...,Private room in home,Private room,"[""Free street parking"", ""Bed linens"", ""Self ch...",1 shared bath,Exempt
4,4967,Home in Washington · ★4.74 · 1 bedroom · 1 bed...,,"<b>The space</b><br />Hello, my name is Seveer...",https://www.airbnb.com/rooms/4967,,https://a0.muscache.com/pictures/2439810/bb320...,Private room in home,Private room,"[""Smoking allowed"", ""Air conditioning"", ""Lock ...",3 baths,


In [24]:
listings_categorical_df.dtypes

listing_id                int64
listing_name             object
hover_description        object
description              object
listing_url              object
neighborhood_overview    object
picture_url              object
property_type            object
room_type                object
amenities                object
bathrooms_text           object
license                  object
dtype: object

In [25]:
# run save function
to_sql_and_csv("listings_categorical", listings_categorical_df)

# availability

In [26]:
availability_df = df[["id", "has_availability", "availability_30", "availability_60", "availability_90", "availability_365", "calendar_last_scraped", "instant_bookable"]]
availability_df.head()

Unnamed: 0,id,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,instant_bookable
0,3686,t,0,0,0,271,2023-09-13,f
1,3943,t,14,33,59,334,2023-09-13,f
2,4197,t,14,26,56,331,2023-09-13,f
3,4529,t,29,59,89,179,2023-09-13,f
4,4967,t,30,60,90,365,2023-09-13,f


In [27]:
# rename columns
availability_df.rename(columns={"id": "listing_id"}, inplace=True)

# convert has_availability, instant_bookable to boolean
columns_to_bool = ["has_availability", "instant_bookable"]

for column in columns_to_bool:
    availability_df[column] = availability_df[column].map({"t": True, "f": False}).astype(bool)
    
# convert calendar_last_scraped to datetime
availability_df.calendar_last_scraped = pd.to_datetime(availability_df.calendar_last_scraped)

availability_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  availability_df.rename(columns={"id": "listing_id"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  availability_df[column] = availability_df[column].map({"t": True, "f": False}).astype(bool)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  availability_df[column] = availability_df[column].map({"t": True, "f": False}).astype(bool)
A value is 

listing_id                        int64
has_availability                   bool
availability_30                   int64
availability_60                   int64
availability_90                   int64
availability_365                  int64
calendar_last_scraped    datetime64[ns]
instant_bookable                   bool
dtype: object

In [28]:
# run save function
to_sql_and_csv("availability", availability_df)

# min_max_night

In [29]:
min_max_night_df = df[["id", "minimum_nights", "maximum_nights", "minimum_minimum_nights", "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "minimum_nights_avg_ntm", "maximum_nights_avg_ntm"]]
min_max_night_df.head()

Unnamed: 0,id,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm
0,3686,31,365,31,31,365,365,31.0,365.0
1,3943,1,1125,1,1,1125,1125,1.0,1125.0
2,4197,7,1125,7,7,1125,1125,7.0,1125.0
3,4529,30,1125,30,30,1125,1125,30.0,1125.0
4,4967,1125,1125,1125,1125,1125,1125,1125.0,1125.0


In [30]:
# rename column
min_max_night_df.rename(columns={"id": "listing_id"}, inplace=True)

min_max_night_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_max_night_df.rename(columns={"id": "listing_id"}, inplace=True)


listing_id                  int64
minimum_nights              int64
maximum_nights              int64
minimum_minimum_nights      int64
maximum_minimum_nights      int64
minimum_maximum_nights      int64
maximum_maximum_nights      int64
minimum_nights_avg_ntm    float64
maximum_nights_avg_ntm    float64
dtype: object

In [31]:
# run save function
to_sql_and_csv("min_max_night", min_max_night_df)

# listing_reviews

In [32]:
listing_reviews_df = df[
    [
        "id",
        "number_of_reviews",
        "number_of_reviews_ltm",
        "number_of_reviews_l30d",
        "first_review",
        "last_review",
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "reviews_per_month",
        "review_scores_value",
    ]
]
listing_reviews_df.head()

Unnamed: 0,id,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,reviews_per_month,review_scores_value
0,3686,83,2,0,2010-11-01,2023-07-21,4.64,4.73,4.45,4.91,4.84,3.89,0.53,4.63
1,3943,480,37,3,2009-05-10,2023-09-05,4.83,4.89,4.92,4.94,4.91,4.57,2.75,4.75
2,4197,56,10,1,2009-05-14,2023-08-27,4.85,4.98,4.87,5.0,4.98,4.96,0.32,4.94
3,4529,102,0,0,2009-08-08,2019-07-05,4.66,4.8,4.6,4.93,4.93,4.51,0.59,4.83
4,4967,31,0,0,2010-11-04,2016-09-22,4.74,4.68,4.89,4.93,4.93,4.21,0.2,4.64


In [33]:
listing_reviews_df.dtypes

id                               int64
number_of_reviews                int64
number_of_reviews_ltm            int64
number_of_reviews_l30d           int64
first_review                    object
last_review                     object
review_scores_rating           float64
review_scores_accuracy         float64
review_scores_cleanliness      float64
review_scores_checkin          float64
review_scores_communication    float64
review_scores_location         float64
reviews_per_month              float64
review_scores_value            float64
dtype: object

In [34]:
# rename column
listing_reviews_df.rename(columns={"id": "listing_id"}, inplace=True)

# convert first_review and last_review to datetime
listing_reviews_df = listing_reviews_df.assign(
    first_review=pd.to_datetime(listing_reviews_df["first_review"]),
    last_review=pd.to_datetime(listing_reviews_df["last_review"]),
)

listing_reviews_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listing_reviews_df.rename(columns={"id": "listing_id"}, inplace=True)


listing_id                              int64
number_of_reviews                       int64
number_of_reviews_ltm                   int64
number_of_reviews_l30d                  int64
first_review                   datetime64[ns]
last_review                    datetime64[ns]
review_scores_rating                  float64
review_scores_accuracy                float64
review_scores_cleanliness             float64
review_scores_checkin                 float64
review_scores_communication           float64
review_scores_location                float64
reviews_per_month                     float64
review_scores_value                   float64
dtype: object

In [35]:
# run save function
to_sql_and_csv("listing_reviews", listing_reviews_df)

# reviews

In [36]:
# read in csv
reviews_df = pd.read_csv("../raw_data/reviews_detailed_2024_mar.csv")
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,3686,131293,2010-11-01,257234,Callie,Staying with Levita and her wonderful family w...
1,3686,150766,2010-12-08,255888,Patrick,"Vita is a very welcoming, helpful and friendly..."
2,3686,177749,2011-02-02,366688,Benjamin,"This was my first time using ""airbnb"" and it m..."
3,3686,197451,2011-03-12,213492,T.J.,"First, Vita saved my work week by providing me..."
4,3686,213212,2011-03-30,428455,Pete,Great host! Very welcoming and organised. I st...


In [37]:
# change date to datetime
reviews_df["date"] = pd.to_datetime(reviews_df["date"])

# change column names
reviews_df.rename(
    columns={"id": "review_id", "date": "review_date"},
    inplace=True,
)

# reorder columns, drop comments as unnecessary and space-consuming
reviews_df = reviews_df[
    [
        "review_id",
        "listing_id",
        "review_date",
        "reviewer_id",
        "reviewer_name",
    ]
]

# filter out reviews where listing_id is not in listings_df
# otherwise it'll trip a foreign key constraint
reviews_df = reviews_df[reviews_df["listing_id"].isin(listings_df["listing_id"])]

reviews_df.dtypes

review_id                 int64
listing_id                int64
review_date      datetime64[ns]
reviewer_id               int64
reviewer_name            object
dtype: object

In [38]:
# run save function
to_sql_and_csv("reviews", reviews_df)

# reviews adds 30 MB to storage space, without comments (double all prior data - from 29-59MB)

# calendar

In [39]:
calendar_df = pd.read_csv("../raw_data/2024_mar/calendar.csv")
calendar_df.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,313405,2024-03-23,f,$75.00,,1.0,90.0
1,313405,2024-03-24,t,$75.00,,1.0,90.0
2,313405,2024-03-25,t,$75.00,,1.0,90.0
3,313405,2024-03-26,t,$75.00,,1.0,90.0
4,313405,2024-03-27,t,$75.00,,1.0,90.0


In [40]:
# drop rows not in listings_df
calendar_df = calendar_df[calendar_df["listing_id"].isin(listings_df["listing_id"])]

# drop empty column
calendar_df.drop(columns=["adjusted_price"], inplace=True)

# convert date to datetime
calendar_df["date"] = pd.to_datetime(calendar_df["date"])

# convert available to boolean
calendar_df["available"] = (
    calendar_df["available"].map({"t": True, "f": False}).astype("boolean")
)

# convert price to float
calendar_df["price"] = (
    calendar_df["price"].str.replace("$", "").str.replace(",", "").astype(float)
)

# add an index column, called num
calendar_df.reset_index(inplace=True)
calendar_df.rename(columns={"index": "id"}, inplace=True)

calendar_df.dtypes

id                         int64
listing_id                 int64
date              datetime64[ns]
available                boolean
price                    float64
minimum_nights           float64
maximum_nights           float64
dtype: object

In [41]:
# run save function
to_sql_and_csv("calendar", calendar_df)

# calendar adds 110MB (59 - 170MB) (although VACUUM FULL dropped 5MB to 165)

# v1.0 is 237MB, so reviews.comments is 70MB