## Albany AirBnB Project

In [4]:
# Combining listings data for the four months: July,Aug,Sep and Oct into one large csv file

import os
import pandas as pd

# Specify the folder paths where your CSV files are located
folder_paths = ['July_3rd_2023/', 'Aug_9th_2023/', 'Sept_2nd_2023/', 'Oct_1st_2023/']

# Initialize an empty list to store DataFrames
dfs = []

# Iterate through each folder path
for folder_path in folder_paths:
    # Get a list of CSV files in the current folder
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('listings.csv')]
    
    # Iterate through each CSV file in the folder
    for csv_file in csv_files:
        # Read the CSV file into a DataFrame
        file_path = os.path.join(folder_path, csv_file)
        df = pd.read_csv(file_path)
        
        # Append the DataFrame to the list of DataFrames
        dfs.append(df)

# Concatenate all DataFrames in the list along the rows
parent_data = pd.concat(dfs, ignore_index=True)

# Now you have a single DataFrame containing the data from all CSV files


In [7]:
# Displaying the first 5 rows of the dataset
parent_data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,1489424,https://www.airbnb.com/rooms/1489424,20230700000000.0,7/3/2023,city scrape,Home in Albany · ★4.73 · 1 bedroom · 1 bed · 1...,"Queen size bed, extra comfy mattress, with acc...",Quiet yet convenient.,https://a0.muscache.com/pictures/21977748/1dc8...,5294164,...,4.91,4.81,4.8,,f,1,0,1,0,1.82
1,2992450,https://www.airbnb.com/rooms/2992450,20230700000000.0,7/3/2023,city scrape,Rental unit in Albany · ★3.56 · 2 bedrooms · 2...,The apartment is located in a quiet neighborho...,,https://a0.muscache.com/pictures/44627226/0e72...,4621559,...,4.56,3.22,3.67,,f,1,1,0,0,0.1
2,3820211,https://www.airbnb.com/rooms/3820211,20230700000000.0,7/3/2023,city scrape,Rental unit in Albany · ★4.75 · 1 bedroom · 1 ...,"Cozy, cool little 1BR Apt in the heart Albany'...","Great restaurants, architecture, walking, peop...",https://a0.muscache.com/pictures/678ed39b-74fd...,19648678,...,4.82,4.83,4.79,,f,11,6,5,0,2.71
3,5651579,https://www.airbnb.com/rooms/5651579,20230700000000.0,7/3/2023,city scrape,Rental unit in Albany · ★4.50 · Studio · 1 bed...,"Spacious studio with hardwood floors, fully eq...",The neighborhood is very eclectic. We have a v...,https://a0.muscache.com/pictures/b3fc42f3-6e5e...,29288920,...,4.86,4.8,4.63,,f,3,2,1,0,3.21
4,6623339,https://www.airbnb.com/rooms/6623339,20230700000000.0,7/3/2023,city scrape,Rental unit in Albany · ★4.73 · 1 bedroom · 1 ...,Large renovated 1 bedroom apartment in convert...,"Located in Albany's finest urban neighborhood,...",https://a0.muscache.com/pictures/c9f06fb9-88e0...,19648678,...,4.69,4.81,4.72,,f,11,6,5,0,3.11


In [8]:
# Checking out the size of the dataset
parent_data.shape

#Dataset has 1507 rows and 75 columns

(1507, 75)

In [9]:
# We probably don't need all 75 columns. Displaying the available columns then decide which to keep
parent_data.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [11]:
# Creating a subset dataset with only the relevant columns
main_table = parent_data[['id','listing_url','name','description','picture_url','host_id',
                        'host_name','host_location','host_since','host_response_time',
                        'host_response_rate','host_acceptance_rate','host_is_superhost',
                         'host_picture_url','host_neighbourhood','host_listings_count',
                         'host_total_listings_count','host_verifications','host_has_profile_pic',
                         'host_identity_verified','neighbourhood','latitude','longitude',
                         'property_type','room_type','accommodates','bathrooms','bedrooms','beds',
                         'amenities','price','minimum_nights','maximum_nights','number_of_reviews',
                         'review_scores_rating','review_scores_accuracy']].copy()
# Displaying top 5 rows
main_table.head()

Unnamed: 0,id,listing_url,name,description,picture_url,host_id,host_name,host_location,host_since,host_response_time,...,bathrooms,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy
0,1489424,https://www.airbnb.com/rooms/1489424,Home in Albany · ★4.73 · 1 bedroom · 1 bed · 1...,"Queen size bed, extra comfy mattress, with acc...",https://a0.muscache.com/pictures/21977748/1dc8...,5294164,Efrat,"Albany, NY",3/2/2013,within an hour,...,,,1.0,"[""Stove"", ""Dedicated workspace"", ""Coffee maker...",$50.00,1,1125,217,4.73,4.81
1,2992450,https://www.airbnb.com/rooms/2992450,Rental unit in Albany · ★3.56 · 2 bedrooms · 2...,The apartment is located in a quiet neighborho...,https://a0.muscache.com/pictures/44627226/0e72...,4621559,Kenneth,"New York, NY",1/7/2013,within an hour,...,,2.0,2.0,"[""Heating"", ""Smoke alarm"", ""TV with standard c...",$70.00,28,1125,11,3.56,3.44
2,3820211,https://www.airbnb.com/rooms/3820211,Rental unit in Albany · ★4.75 · 1 bedroom · 1 ...,"Cozy, cool little 1BR Apt in the heart Albany'...",https://a0.muscache.com/pictures/678ed39b-74fd...,19648678,Terra,,8/7/2014,within a day,...,,1.0,1.0,"[""Stove"", ""Self check-in"", ""Keypad"", ""Coffee m...",$125.00,2,365,293,4.75,4.89
3,5651579,https://www.airbnb.com/rooms/5651579,Rental unit in Albany · ★4.50 · Studio · 1 bed...,"Spacious studio with hardwood floors, fully eq...",https://a0.muscache.com/pictures/b3fc42f3-6e5e...,29288920,Gregg,"Albany, NY",3/13/2015,within an hour,...,,,1.0,"[""Stove"", ""Self check-in"", ""Air conditioning"",...",$68.00,3,365,319,4.5,4.61
4,6623339,https://www.airbnb.com/rooms/6623339,Rental unit in Albany · ★4.73 · 1 bedroom · 1 ...,Large renovated 1 bedroom apartment in convert...,https://a0.muscache.com/pictures/c9f06fb9-88e0...,19648678,Terra,,8/7/2014,within a day,...,,1.0,1.0,"[""Stove"", ""Self check-in"", ""Keypad"", ""Air cond...",$140.00,1,1125,306,4.73,4.83


In [13]:
# Checking for duplicated rows 
main_table.duplicated().sum()
# There are no duplicate rows 

0

In [19]:
# Dealing with null values if any
#Displaying only columns with null values
null_columns_or_nonulls = main_table.isnull().sum()
null_columns = null_columns_or_nonulls[null_columns_or_nonulls > 0]
null_columns.sort_values(ascending=False)

bathrooms                 1507
neighbourhood              557
bedrooms                   416
host_location              357
host_neighbourhood         308
review_scores_rating       175
review_scores_accuracy     175
host_is_superhost          122
host_response_time          73
host_response_rate          73
host_acceptance_rate        57
description                 23
beds                         5
dtype: int64

In [31]:
# Dropping bathrooms column since it has no values at all
main_table.drop(columns='bathrooms', inplace=True)

#Checking columns to make sure it's dropped successfully
main_table.columns

Index(['id', 'listing_url', 'name', 'description', 'picture_url', 'host_id',
       'host_name', 'host_location', 'host_since', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'latitude', 'longitude', 'property_type', 'room_type', 'accommodates',
       'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights',
       'maximum_nights', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy'],
      dtype='object')

In [32]:
# Revisiting the nulls result again

null_columns_or_nonulls = main_table.isnull().sum()
null_columns = null_columns_or_nonulls[null_columns_or_nonulls > 0]
null_columns.sort_values(ascending=False)

neighbourhood             557
bedrooms                  416
host_location             357
host_neighbourhood        308
review_scores_rating      175
review_scores_accuracy    175
host_is_superhost         122
host_response_time         73
host_response_rate         73
host_acceptance_rate       57
description                23
beds                        5
dtype: int64

In [38]:
# creating a copy of main table. Using the copy instead to be safer
working_table = main_table.copy()
working_table.head(2)

Unnamed: 0,id,listing_url,name,description,picture_url,host_id,host_name,host_location,host_since,host_response_time,...,accommodates,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,review_scores_accuracy
0,1489424,https://www.airbnb.com/rooms/1489424,Home in Albany · ★4.73 · 1 bedroom · 1 bed · 1...,"Queen size bed, extra comfy mattress, with acc...",https://a0.muscache.com/pictures/21977748/1dc8...,5294164,Efrat,"Albany, NY",3/2/2013,within an hour,...,2,,1.0,"[""Stove"", ""Dedicated workspace"", ""Coffee maker...",$50.00,1,1125,217,4.73,4.81
1,2992450,https://www.airbnb.com/rooms/2992450,Rental unit in Albany · ★3.56 · 2 bedrooms · 2...,The apartment is located in a quiet neighborho...,https://a0.muscache.com/pictures/44627226/0e72...,4621559,Kenneth,"New York, NY",1/7/2013,within an hour,...,4,2.0,2.0,"[""Heating"", ""Smoke alarm"", ""TV with standard c...",$70.00,28,1125,11,3.56,3.44


In [44]:
# Revisiting null values, this time on the working_table
working_table.isnull().sum()[working_table.isnull().sum() > 0].sort_values(ascending=False)

neighbourhood             557
bedrooms                  416
host_location             357
host_neighbourhood        308
review_scores_rating      175
review_scores_accuracy    175
host_is_superhost         122
host_response_time         73
host_response_rate         73
host_acceptance_rate       57
description                23
beds                        5
dtype: int64

In [62]:
# Replacing null values in all the columns
working_table['neighbourhood'] = working_table['neighbourhood'].fillna('unknown')
working_table['bedrooms'] = working_table['bedrooms'].fillna('unknown')
working_table['host_location'] = working_table['host_location'].fillna('unknown')
working_table['host_neighbourhood'] = working_table['host_neighbourhood'].fillna('unknown')
working_table['review_scores_accuracy'] = working_table['review_scores_accuracy'].fillna(
                                        working_table['review_scores_accuracy'].median())
working_table['host_is_superhost'] = working_table['host_is_superhost'].fillna('unknown')
working_table['review_scores_rating'] = working_table['review_scores_rating'].fillna('unknown')
working_table['host_response_time'] = working_table['host_response_time'].fillna('unknown')
working_table['host_response_rate'] = working_table['host_response_rate'].fillna('unknown')
working_table['host_acceptance_rate'] = working_table['host_acceptance_rate'].fillna('unknown')
working_table['description'] = working_table['description'].fillna('unknown')
working_table['beds'] = working_table['beds'].fillna('unknown')

In [66]:
# Checking column datatypes to ensure they're correct
working_table.dtypes

id                             int64
listing_url                   object
name                          object
description                   object
picture_url                   object
host_id                        int64
host_name                     object
host_location                 object
host_since                    object
host_response_time            object
host_response_rate            object
host_acceptance_rate          object
host_is_superhost             object
host_picture_url              object
host_neighbourhood            object
host_listings_count            int64
host_total_listings_count      int64
host_verifications            object
host_has_profile_pic          object
host_identity_verified        object
neighbourhood                 object
latitude                     float64
longitude                    float64
property_type                 object
room_type                     object
accommodates                   int64
bedrooms                      object
b

In [74]:
# Changing iid, host_id to object
working_table = working_table.astype({'id': 'object', 'host_id': 'object'})
# working_table.dtypes

id                            object
listing_url                   object
name                          object
description                   object
picture_url                   object
host_id                       object
host_name                     object
host_location                 object
host_since                    object
host_response_time            object
host_response_rate            object
host_acceptance_rate          object
host_is_superhost             object
host_picture_url              object
host_neighbourhood            object
host_listings_count            int64
host_total_listings_count      int64
host_verifications            object
host_has_profile_pic          object
host_identity_verified        object
neighbourhood                 object
latitude                     float64
longitude                    float64
property_type                 object
room_type                     object
accommodates                   int64
bedrooms                      object
b

In [87]:
# For host_response_rate, the following needs to happen: 
# Remove '%' sign, convert dtype to float, replace 'unknown' with numeric value

working_table['host_response_rate'] = working_table['host_response_rate'].str.replace('%','')
# working_table['host_response_rate'].unique()

# Changing "Unknown" value to median value
# Convert 'unknown' strings to NaN
working_table['host_acceptance_rate'] = pd.to_numeric(working_table['host_acceptance_rate'], errors='coerce')

# Calculate the median of the numeric values
median_value = working_table['host_acceptance_rate'].median()

# Replace NaN values with the median
working_table['host_acceptance_rate'] = working_table['host_acceptance_rate'].fillna(median_value)

working_table['host_acceptance_rate'].unique()

# The above code converted the whole column into NaN! Unfortunately

array([nan])

In [92]:
# Resetting 'host_acceptance_rate' column by retrieving it from main table
original_values = main_table['host_acceptance_rate']
working_table['host_acceptance_rate'] = original_values
working_table['host_acceptance_rate']

0        91%
1       100%
2        60%
3       100%
4        60%
        ... 
1502    100%
1503     NaN
1504     NaN
1505     NaN
1506     NaN
Name: host_acceptance_rate, Length: 1507, dtype: object

In [97]:
# Converting 'host_acceptance_rate' column dtype to float

# Step 1: Removing '%' and changing numeric values to float
working_table['host_acceptance_rate'] = working_table['host_acceptance_rate'].str.
                                        replace('%','').astype(float)
    
# Step 2: Calculating median value that will be used to fill NaN
median_value = working_table['host_acceptance_rate'].median()

# Step 3: Replacing NaN values with median
working_table['host_acceptance_rate'] = working_table['host_acceptance_rate'].fillna(median_value)

# Step 4: Checking to see that NaN have been replaced, and the values are float
working_table['host_acceptance_rate'].unique()

array([ 91., 100.,  60.,  89.,  99.,  50.,  70.,  86.,  88.,  97.,  96.,
        78.,  98.,  33.,  40.,  87.,  92.,  75.,   0.,  20.,  94.,  67.,
        93.,  81.,  44.,  95.,  80.,  65.,  85.,  59.,  90.,  25.,  57.,
        79.,  83.,  77.,  84.,  76.,  71.,  63.,  68.,  41.,  52.,  51.,
        43.])

In [98]:
working_table['host_acceptance_rate'].isnull().sum()

0

In [108]:
working_table.columns

Index(['id', 'listing_url', 'name', 'description', 'picture_url', 'host_id',
       'host_name', 'host_location', 'host_since', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'latitude', 'longitude', 'property_type', 'room_type', 'accommodates',
       'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights',
       'maximum_nights', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy'],
      dtype='object')