### Importing libraries

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

### Reading the three raw data files (calendar, reviews and listings)

In [2]:
# Load the three raw CSV files from the local ./data folder (the data is also available online)
calendar, user_reviews, listings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/calendar_detailed.csv",
        "./data/reviews_detailed.csv",
        "./data/listings_detailed.csv",
    )
)

In [3]:
# Show full cell contents instead of truncating long text columns
pd.options.display.max_colwidth = None

### Processing the calendar dataset

In [4]:
# Turn the positional row index into an explicit "calendar_ID" key column.
calendar.reset_index(inplace=True)
calendar.rename(columns={"index":"calendar_ID"},inplace = True)
# Use the same "listing_ID" key name that the listings table is given later on.
calendar.rename(columns={"listing_id":"listing_ID"}, inplace = True)

# Parse the date column; availability and the two price columns are stored as strings.
calendar["date"] = calendar["date"].astype('datetime64[ns]')
# NOTE(review): this cast runs before the frame-wide fillna(0) at the end of the cell, so
# depending on the pandas version a missing value here may already have become the text
# "nan" and would then not be replaced by 0 - confirm which outcome is wanted.
calendar[["available","price","adjusted_price"]] = calendar[["available","price","adjusted_price"]].astype(str)
# Missing night limits get a -1 sentinel so the int cast succeeds; the sentinel is then
# turned back into NaN.
# NOTE(review): restoring NaN appears to undo the int cast (calendar.info() below reports
# float64 for both columns), and the final fillna(0) stores 0 for the missing limits.
# Confirm whether an integer dtype and/or a 0 placeholder was actually intended.
calendar[["maximum_nights","minimum_nights"]] = calendar[["maximum_nights","minimum_nights"]].fillna(-1)
calendar[["maximum_nights","minimum_nights"]] = calendar[["maximum_nights","minimum_nights"]].astype(int)
calendar[["maximum_nights","minimum_nights"]] = calendar[["maximum_nights","minimum_nights"]].replace(-1, np.nan)
# Replace every remaining NaN in the frame with 0.
calendar.fillna(0, inplace=True)

In [5]:
# Check the column dtypes and non-null counts of the processed calendar table
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450145 entries, 0 to 1450144
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   calendar_ID     1450145 non-null  int64         
 1   listing_ID      1450145 non-null  int64         
 2   date            1450145 non-null  datetime64[ns]
 3   available       1450145 non-null  object        
 4   price           1450145 non-null  object        
 5   adjusted_price  1450145 non-null  object        
 6   minimum_nights  1450145 non-null  float64       
 7   maximum_nights  1450145 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(3)
memory usage: 88.5+ MB


### Processing the listings file

In [6]:
# Rename the primary key to "listing_ID" and replace every missing value in the frame with 0
listings = listings.rename(columns={'id': 'listing_ID'}).fillna(0)

In [7]:
# Clean the listings table: these scrape-metadata, price, night-limit and
# availability columns are not kept in the processed listings output
unwanted_columns = [
    "scrape_id", "last_scraped", "source", "price", "bathrooms",
    "minimum_nights", "maximum_nights",
    "minimum_minimum_nights", "maximum_minimum_nights",
    "minimum_maximum_nights", "maximum_maximum_nights",
    "minimum_nights_avg_ntm", "maximum_nights_avg_ntm",
    "calendar_updated", "has_availability",
    "availability_30", "availability_60", "availability_90", "availability_365",
    "calendar_last_scraped", "license", "reviews_per_month",
]
listings = listings.drop(columns=unwanted_columns)

In [8]:
# Build the property_reviews table: one row per listing holding its review
# counts, first/last review dates and review scores. The listing's row index
# is exposed as the surrogate key "property_review_ID".
review_columns = [
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'first_review', 'last_review',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
]

property_reviews = (
    listings[["listing_ID"] + review_columns]
    .reset_index()
    .rename(columns={"index": "property_review_ID"})
)

# The review columns now live in property_reviews, so remove them from listings
listings = listings.drop(columns=review_columns)

In [9]:
# Create the hosts table from the host detail columns of listings.
# listing_ID is kept for now so a later cell can link listings back to host_id;
# that cell then drops listing_ID and de-duplicates the hosts rows.
host_detail_columns = [
    'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
    'host_about', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
    'host_picture_url', 'host_verifications',
    'host_has_profile_pic', 'host_identity_verified',
]
# Host-related columns that are removed from listings without being carried over to hosts
host_columns_not_kept = [
    'host_neighbourhood', 'host_listings_count', 'host_total_listings_count',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
]

# Explicit copy: hosts is modified below and must not be a slice of listings
hosts = listings[['listing_ID'] + host_detail_columns].copy()
listings = listings.drop(columns=host_detail_columns + host_columns_not_kept)

# NOTE(review): listings was filled with 0 for missing values in an earlier cell, so a
# missing host_since presumably parses as 1970-01-01 here - confirm this is acceptable.
hosts["host_since"] = hosts["host_since"].astype('datetime64[ns]')
# Strip carriage returns from the free-text field. The result is assigned back to the
# column: a chained in-place replace on hosts["host_about"] does not update the frame
# when pandas Copy-on-Write is active.
hosts["host_about"] = hosts["host_about"].replace("\r", "", regex=True)

In [10]:
# Create the neighbourhood lookup table (neighbourhood_ID, neighbourhood_name)
# and replace the neighbourhood name in listings with its numeric ID.

# Only the cleansed neighbourhood name is kept; the other two neighbourhood columns are dropped
listings = listings.drop(columns=['neighbourhood', 'neighbourhood_group_cleansed'])

# IDs are 1-based and follow the order in which each neighbourhood first appears in listings
mapping = {
    neighbourhood_name: neighbourhood_id
    for neighbourhood_id, neighbourhood_name in enumerate(
        listings["neighbourhood_cleansed"].unique(), start=1
    )
}

neighbourhood = pd.DataFrame(
    {
        "neighbourhood_ID": list(mapping.values()),
        "neighbourhood_name": list(mapping.keys()),
    }
)

# Map this one column only. A frame-wide listings.replace({name: id}) per neighbourhood
# would also overwrite any cell in any other column that happens to equal a neighbourhood
# name, and rescans the whole frame once per neighbourhood.
listings["neighbourhood_cleansed"] = listings["neighbourhood_cleansed"].map(mapping)

### Building the new "reviewers" table

In [11]:
# Build the reviewers table (reviewer ID and name): one row per reviewer_id,
# keeping the first name seen for each ID, with a fresh 0-based index
reviewers = (
    user_reviews[["reviewer_id", "reviewer_name"]]
    .drop_duplicates(subset=["reviewer_id"])
    .reset_index(drop=True)
    .rename(columns={'reviewer_id': 'reviewer_ID'})
)

### Building the processed "user_reviews" table


In [12]:
# Build the final user_reviews table:
#   - reviewer_name is dropped (it is stored in the reviewers table),
#   - key columns get their *_ID names,
#   - each review is linked to its property through property_review_ID
#     (left join, so reviews without a matching listing keep a missing key),
#   - carriage returns and "<br/>" tags are stripped from the comment text.
# Only the two key columns of property_reviews are merged; none of the review
# score columns are needed here.
user_reviews = (
    user_reviews
    .drop(columns=["reviewer_name"])
    .rename(columns={"id": "review_ID", "listing_id": "listing_ID", "reviewer_id": "reviewer_ID"})
    .merge(property_reviews[["listing_ID", "property_review_ID"]], on="listing_ID", how="left")
    .loc[:, ["property_review_ID", "review_ID", "date", "reviewer_ID", "comments"]]
    # The cleaned text is assigned back as a new column value: a chained in-place
    # replace on user_reviews["comments"] does not update the frame under Copy-on-Write.
    .assign(
        comments=lambda reviews: reviews["comments"]
        .replace("\r", "", regex=True)
        .replace("<br/>", "", regex=True)
    )
)

In [13]:
# Link every listing to its host. Only listing_ID and host_id are merged, so the
# remaining host detail columns are never copied onto listings in the first place.
# The neighbourhood column already holds numeric IDs, so it is renamed accordingly.
listings = (
    listings
    .merge(hosts[["listing_ID", "host_id"]], on="listing_ID", how="left")
    .rename(columns={"host_id": "host_ID", "neighbourhood_cleansed": "neighbourhood_ID"})
)

# hosts no longer needs the listing key: drop it and collapse identical host rows.
# NOTE(review): the key is named "host_ID" in listings but stays "host_id" in hosts -
# confirm whether the consumers of the CSV files expect that difference.
hosts = (
    hosts
    .drop(columns=["listing_ID"])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [14]:
directory = "./processed_csv_files"

# Create the output folder if it does not exist yet (no error if it already does)
os.makedirs(directory, exist_ok=True)

# Each processed table is written to <directory>/<table name>.csv without the index.
# Keeping name and frame together in one dict means they cannot get out of step.
tables_to_export = {
    'user_reviews': user_reviews,
    'calendar': calendar,
    'listings': listings,
    'property_reviews': property_reviews,
    'neighbourhood': neighbourhood,
    'hosts': hosts,
    'reviewers': reviewers,
}

for table_name, table in tables_to_export.items():
    # Build the path from `directory` so changing the folder above is enough
    table.to_csv(os.path.join(directory, f"{table_name}.csv"), index=False)