# **BDA - Phase 1**

In [86]:
import pandas as pd
import warnings

# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this is a blanket filter, so pandas deprecation / FutureWarning
# messages raised by later cells are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

### **1. Raw data**

In [87]:
# Read the three raw CSV exports (calendar, listings, reviews) into DataFrames.
calendar_df, listings_df, reviews_df = (
    pd.read_csv(raw_path)
    for raw_path in (
        '../data/raw-data/calendar.csv',
        '../data/raw-data/listings.csv',
        '../data/raw-data/reviews.csv',
    )
)

In [88]:
# Preview the first five rows of the raw calendar table.
calendar_df.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,2992450,2024-09-05,f,$70.00,,28,1125
1,2992450,2024-09-06,f,$70.00,,28,1125
2,2992450,2024-09-07,f,$70.00,,28,1125
3,2992450,2024-09-08,f,$70.00,,28,1125
4,2992450,2024-09-09,f,$70.00,,28,1125


In [89]:
# Preview the first five rows of the raw listings table.
listings_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2992450,https://www.airbnb.com/rooms/2992450,20240905032005,2024-09-05,city scrape,Luxury 2 bedroom apartment,The apartment is located in a quiet neighborho...,,https://a0.muscache.com/pictures/44627226/0e72...,4621559,...,4.56,3.22,3.67,,f,1,1,0,0,0.07
1,3820211,https://www.airbnb.com/rooms/3820211,20240905032005,2024-09-05,city scrape,Restored Precinct in Center Sq. w/Parking,"Cozy, cool little 1BR Apt in the heart Albany'...","Great restaurants, architecture, walking, peop...",https://a0.muscache.com/pictures/prohost-api/H...,19648678,...,4.81,4.82,4.78,,f,4,4,0,0,2.49
2,5651579,https://www.airbnb.com/rooms/5651579,20240905032005,2024-09-05,city scrape,Large studio apt by Capital Center & ESP@,"Spacious studio with hardwood floors, fully eq...",The neighborhood is very eclectic. We have a v...,https://a0.muscache.com/pictures/b3fc42f3-6e5e...,29288920,...,4.87,4.76,4.63,,f,2,1,1,0,3.22
3,6623339,https://www.airbnb.com/rooms/6623339,20240905032005,2024-09-05,city scrape,Center Sq. Loft in Converted Precinct w/ Parking,Large renovated 1 bedroom apartment in convert...,"Located in Albany's finest urban neighborhood,...",https://a0.muscache.com/pictures/prohost-api/H...,19648678,...,4.7,4.8,4.72,,f,4,4,0,0,2.91
4,8035768,https://www.airbnb.com/rooms/8035768,20240905032005,2024-09-05,city scrape,Entire Beautiful French Victorian 1884,Stunningly renovated French Victorian original...,There is a Barbershop with 1900' barber chairs...,https://a0.muscache.com/pictures/103429331/a0a...,42430824,...,4.8,4.74,4.75,,f,2,1,1,0,0.51


In [90]:
# Preview the first five rows of the raw reviews table.
reviews_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2992450,15066586,2014-07-01,16827297,Kristen,Large apartment; nice kitchen and bathroom. Ke...
1,2992450,21810844,2014-10-24,22648856,Christopher,"This may be a little late, but just to say Ken..."
2,2992450,27434334,2015-03-04,45406,Altay,The apartment was very clean and convenient to...
3,2992450,28524578,2015-03-25,5485362,John,Kenneth was ready when I got there and arrange...
4,2992450,35913434,2015-06-23,15772025,Jennifer,We were pleased to see how 2nd Street and the ...


### **2. Data cleaning**

**Convert date columns to datetime**

In [91]:
# Parse the date-like columns of each table into pandas datetimes.
calendar_df['date'] = pd.to_datetime(calendar_df['date'])

for date_col in ('last_scraped', 'host_since', 'calendar_last_scraped',
                 'first_review', 'last_review'):
    listings_df[date_col] = pd.to_datetime(listings_df[date_col])

reviews_df['date'] = pd.to_datetime(reviews_df['date'])

**Convert price and percentage columns to numeric**

In [92]:
# Strip the currency symbol and thousands separators (e.g. "$70.00" -> 70.0).
# The pattern is a raw string so that "\$" is a regex-escaped dollar sign and
# not an invalid Python string escape sequence.
currency_noise = {r'\$': '', ',': ''}
calendar_df['price'] = calendar_df['price'].replace(currency_noise, regex=True).astype(float)
listings_df['price'] = listings_df['price'].replace(currency_noise, regex=True).astype(float)

# Remove the literal percent sign from the rate columns and store them as floats.
# regex=False makes the literal match explicit regardless of the pandas default.
for rate_col in ('host_response_rate', 'host_acceptance_rate'):
    listings_df[rate_col] = (
        listings_df[rate_col].str.replace('%', '', regex=False).astype('float')
    )

**Convert t, f strings to boolean**

In [93]:
# Translate the 't' / 'f' string flags into real booleans; any other value
# (including missing ones) becomes NaN.
tf_to_bool = {'t': True, 'f': False}

calendar_df['available'] = calendar_df['available'].map(tf_to_bool)

# Every flag is mapped from its OWN column. Mapping from a column that has
# already been converted would yield only NaN, because True / False are not
# keys of the lookup table.
for flag_col in ('host_is_superhost', 'has_availability', 'host_has_profile_pic',
                 'host_identity_verified', 'instant_bookable'):
    listings_df[flag_col] = listings_df[flag_col].map(tf_to_bool)

**Drop columns with all null values**

In [94]:
# Candidate columns that are expected to carry no data at all. Each candidate
# is verified to be entirely null before it is dropped, so a column that does
# contain values is kept instead of being silently discarded.
listings_all_null_columns = [
    column
    for column in ['calendar_updated', 'neighbourhood_group_cleansed', 'license',
                   'host_has_profile_pic', 'host_identity_verified']
    if listings_df[column].isna().all()
]
listings_df.drop(columns=listings_all_null_columns, inplace=True)

# Same check for the calendar table.
if calendar_df['adjusted_price'].isna().all():
    calendar_df.drop(columns=['adjusted_price'], inplace=True)

**Handle columns with some null values**

Nulls are handled column by column, depending on the kind of values each column holds.

(Numeric columns are filled with the median, because the median is less influenced by outliers than the mean.)

In [95]:
# Every fill is written back with `df[col] = df[col].fillna(...)`.
# `df[col].fillna(..., inplace=True)` operates on a temporary object selected
# from the frame and is not guaranteed to update the frame itself (it has no
# effect under pandas Copy-on-Write).

# columns with few nulls
for flag_col in ('host_is_superhost', 'has_availability'):
    # a missing flag is treated as False; astype(bool) pins the resulting dtype
    listings_df[flag_col] = listings_df[flag_col].fillna(False).astype(bool)

listings_df['description'] = listings_df['description'].fillna('no description')

for numeric_col in ('bathrooms', 'bedrooms', 'beds', 'price'):
    listings_df[numeric_col] = listings_df[numeric_col].fillna(
        listings_df[numeric_col].median()
    )

# columns with some nulls
for categorical_col in ('host_response_time', 'host_location'):
    # most frequent value of the column
    most_common = listings_df[categorical_col].mode()[0]
    listings_df[categorical_col] = listings_df[categorical_col].fillna(most_common)

for rate_col in ('host_acceptance_rate', 'host_response_rate'):
    listings_df[rate_col] = listings_df[rate_col].fillna(listings_df[rate_col].median())

text_placeholders = {
    'neighborhood_overview': 'no overview',
    'host_about': 'no about',
    'neighbourhood': 'Neighborhood highlights',
    'host_neighbourhood': 'Unknown',
}
for text_col, placeholder in text_placeholders.items():
    listings_df[text_col] = listings_df[text_col].fillna(placeholder)

reviews_df['comments'] = reviews_df['comments'].fillna('no comments')

# placeholder date for listings without a review; a real Timestamp is used so
# the fill value has the same type as the rest of the datetime column
no_review_date = pd.Timestamp('1900-01-01')
for date_col in ('first_review', 'last_review'):
    listings_df[date_col] = listings_df[date_col].fillna(no_review_date)

# score columns (fill with 0 to indicate no score)
score_columns = (
    'review_scores_rating', 'review_scores_checkin', 'reviews_per_month',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
)
for score_col in score_columns:
    listings_df[score_col] = listings_df[score_col].fillna(0)

### **3. Prepare the Data**

In [96]:
# remove calendar minimum_nights and maximum_nights
# since this is already in listings
calendar_df = calendar_df.drop(columns=['minimum_nights', 'maximum_nights'])

# listings columns removed in one pass:
#   calendar_last_scraped - all values are equal to last_scraped
#   source                - dropped as well
#   neighbourhood         - all values are 'Neighborhood highlights'
#   scrape_id             - all the values are the same
listings_df = listings_df.drop(
    columns=['calendar_last_scraped', 'source', 'neighbourhood', 'scrape_id']
)

# neighbourhood_cleansed takes over the freed 'neighbourhood' name,
# and the listings price is renamed to base_price
listings_df = listings_df.rename(
    columns={'neighbourhood_cleansed': 'neighbourhood', 'price': 'base_price'}
)

# add a 1-based id field to calendar and make it the first column
calendar_df = calendar_df.assign(id=range(1, len(calendar_df) + 1))
calendar_df = calendar_df[['id'] + [name for name in calendar_df if name != 'id']]

# reviews already has an id column; just move it to the front
reviews_df = reviews_df[['id'] + [name for name in reviews_df if name != 'id']]

In [97]:
# Split the availability / night-limit columns out of listings into their own
# availability_statistics table, keyed by listing_id.

# columns that move out of listings
columns_to_remove_from_listings = [
    'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'availability_30', 'availability_60', 'availability_90', 'availability_365'
]
# the new table additionally carries the listing's id
columns_for_availability_statistics = ['id', *columns_to_remove_from_listings]

availability_statistics_df = (
    listings_df[columns_for_availability_statistics]
    .rename(columns={'id': 'listing_id'})
)

listings_df = listings_df.drop(columns=columns_to_remove_from_listings)

In [98]:
# Give every row a 1-based id and place that id in the first column.
row_ids = range(1, len(availability_statistics_df) + 1)
availability_statistics_df = availability_statistics_df.assign(id=row_ids)
id_first = ['id', *(name for name in availability_statistics_df.columns if name != 'id')]
availability_statistics_df = availability_statistics_df[id_first]

In [99]:
# Split the review counters, review dates and review scores out of listings
# into their own reviews_statistics table, keyed by listing_id.

# columns that move out of listings
columns_to_remove_from_listings = [
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'first_review', 'last_review', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value', 'reviews_per_month'
]
# the new table additionally carries the listing's id
columns_for_reviews_statistics = ['id', *columns_to_remove_from_listings]

# select the columns and expose the listing's id as 'listing_id'
reviews_statistics_df = (
    listings_df[columns_for_reviews_statistics]
    .rename(columns={'id': 'listing_id'})
)

listings_df = listings_df.drop(columns=columns_to_remove_from_listings)


In [100]:
# Give every row a 1-based id and place that id in the first column.
row_ids = range(1, len(reviews_statistics_df) + 1)
reviews_statistics_df = reviews_statistics_df.assign(id=row_ids)
id_first = ['id', *(name for name in reviews_statistics_df.columns if name != 'id')]
reviews_statistics_df = reviews_statistics_df[id_first]

In [101]:
# Build the host table from the host-related columns of listings.

# Columns to extract for host_df
columns_for_host_df = [
    'host_id', 'host_url', 'host_name', 'host_about', 'host_is_superhost',
    'host_thumbnail_url', 'host_picture_url', 'host_verifications',
    'host_location', 'host_neighbourhood', 'host_since',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate',
    'host_listings_count', 'host_total_listings_count',
    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'
]

# listings_df holds one row per listing, so a host that owns several listings
# appears several times. Only the first row of each host_id is kept
# (drop_duplicates default) so that the host table has one row per host and
# its `id` column is unique.
host_df = (
    listings_df[columns_for_host_df]
    .drop_duplicates(subset='host_id')
    .reset_index(drop=True)
)

# Drop the 'host_' prefix from the host attributes; host_id becomes the
# table's own id. The calculated_* columns keep their names.
host_df = host_df.rename(columns={
    'host_id': 'id',
    'host_url': 'url',
    'host_name': 'name',
    'host_about': 'about',
    'host_is_superhost': 'is_superhost',
    'host_thumbnail_url': 'thumbnail_url',
    'host_picture_url': 'picture_url',
    'host_verifications': 'verifications',
    'host_location': 'location',
    'host_neighbourhood': 'neighbourhood',
    'host_since': 'since',
    'host_response_time': 'response_time',
    'host_response_rate': 'response_rate',
    'host_acceptance_rate': 'acceptance_rate',
    'host_listings_count': 'listings_count',
    'host_total_listings_count': 'total_listings_count'
})

# Everything except host_id leaves listings; host_id stays behind as the
# reference from a listing to its host.
columns_to_remove_from_listings = [
    column for column in columns_for_host_df if column != 'host_id'
]
listings_df.drop(columns=columns_to_remove_from_listings, inplace=True)

In [102]:
# Split the host activity / listing-count columns out of host_df into their
# own host_statistics table, keyed by host_id.

# columns that move out of host_df
columns_to_remove_from_host_df = [
    'since', 'response_time', 'response_rate', 'acceptance_rate',
    'listings_count', 'total_listings_count',
    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'
]
# the new table additionally carries the host's id
columns_for_host_statistics = ['id', *columns_to_remove_from_host_df]

# select the columns and expose the host's id as 'host_id'
host_statistics_df = (
    host_df[columns_for_host_statistics]
    .rename(columns={'id': 'host_id'})
)

host_df = host_df.drop(columns=columns_to_remove_from_host_df)

In [103]:
# Give every row a 1-based id and place that id in the first column.
row_ids = range(1, len(host_statistics_df) + 1)
host_statistics_df = host_statistics_df.assign(id=row_ids)
id_first = ['id', *(name for name in host_statistics_df.columns if name != 'id')]
host_statistics_df = host_statistics_df[id_first]

### **4. Store the data**

In [109]:
# Write every cleaned table to its CSV file, without the DataFrame index.
clean_outputs = {
    '../data/clean-data/host.csv': host_df,
    '../data/clean-data/host_statistics.csv': host_statistics_df,
    '../data/clean-data/listings.csv': listings_df,
    '../data/clean-data/calendar.csv': calendar_df,
    '../data/clean-data/reviews.csv': reviews_df,
    '../data/clean-data/availability_statistics.csv': availability_statistics_df,
    '../data/clean-data/reviews_statistics.csv': reviews_statistics_df,
}
for clean_path, clean_frame in clean_outputs.items():
    clean_frame.to_csv(clean_path, index=False)