# Process Listings Data

In [1]:
## Let's import our requirements
import boto3
import pandas as pd
import psycopg2
import logging
import io
import gzip

import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

pd.set_option('display.max_columns', 500)

# Suppress FutureWarnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## 1. Initialize Boto3 client

In [2]:
# boto3 will initialize connection using environment variables
# (the cell output logs "Found credentials in environment variables").
# NOTE(review): the functions defined in the following cells build their own
# boto3 resource/client instead of using this handle.
s3 = boto3.resource('s3')

2024-07-28 22:46:50,493 - INFO - Found credentials in environment variables.


#### 1.1 Develop script to get files in each folder

In [3]:
def list_files_in_folder(bucket_name, folder_prefix):
    """Return the keys of all files stored under a prefix in an S3 bucket.

    Args:
        bucket_name: Name of the S3 bucket to inspect.
        folder_prefix: Key prefix used to filter the bucket's objects.

    Returns:
        list[str]: Keys of the objects under ``folder_prefix``, leaving out
        any key that ends with '/' (treated as a folder, not a file).
    """
    target_bucket = boto3.resource('s3').Bucket(bucket_name)
    matching_objects = target_bucket.objects.filter(Prefix=folder_prefix)

    # A key that ends with '/' is a folder entry rather than a file, so it is skipped.
    return [entry.key for entry in matching_objects if not entry.key.endswith('/')]


# Example call: list the raw listing files in the project bucket.
# bucket_name is kept at notebook level because read_listings_data reads it as a global.
bucket_name = 'airbnb-capstone-project'
folder_prefix = 'raw/listings/'  # Specify the folder prefix
items = list_files_in_folder(bucket_name, folder_prefix)
# Bare expression so the notebook displays the list of keys.
items

['raw/listings/albany-listings.csv.gz',
 'raw/listings/chicago-listings.csv.gz',
 'raw/listings/los-angeles-listings.csv.gz',
 'raw/listings/new-york-city-listings.csv.gz',
 'raw/listings/san-francisco-listings.csv.gz',
 'raw/listings/seattle-listings.csv.gz',
 'raw/listings/washington-dc-listings.csv.gz']

## 2. Airbnb listings
Load sample data

In [4]:
# Load a single market (Albany) straight from S3 to inspect the raw schema.
s3_path = 's3://airbnb-capstone-project/raw/listings/albany-listings.csv.gz'

# Columns that pandas should parse as dates while reading the CSV.
date_columns = ['last_scraped','host_since','calendar_last_scraped','first_review','last_review']

# The file is gzip-compressed, so the compression is stated explicitly.
df_listings = pd.read_csv(s3_path, compression='gzip', parse_dates=date_columns)

2024-07-28 22:46:51,478 - INFO - Found credentials in environment variables.


In [5]:
# Show the column names of the sample frame loaded above.
df_listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

#### 2.1 Create function to read in CSV data

In [6]:
def read_listings_data(file_path, bucket=None):
    """Load one gzip-compressed listings CSV from S3 into a DataFrame.

    Args:
        file_path: Object key of the CSV inside the bucket
            (e.g. 'raw/listings/albany-listings.csv.gz').
        bucket: Name of the S3 bucket that holds the file. When omitted, the
            notebook-level ``bucket_name`` variable is used, so existing
            ``read_listings_data(key)`` calls keep working.

    Returns:
        pandas.DataFrame: The parsed listings, with the columns listed in
        ``date_columns`` passed to pandas for date parsing.

    Raises:
        NameError: If ``bucket`` is omitted and no ``bucket_name`` global has
            been defined yet.
    """
    # Only fall back to the notebook global when no bucket is supplied, so the
    # function can be used without hidden state from another cell.
    source_bucket = bucket if bucket is not None else bucket_name
    s3_url = f"s3://{source_bucket}/{file_path}"

    # dates to be parsed
    date_columns = ['last_scraped', 'host_since', 'calendar_last_scraped', 'first_review', 'last_review']

    return pd.read_csv(s3_url, compression='gzip', parse_dates=date_columns)

In [7]:
#### 2.2 Create data cleaning function and add logging for data quality checks

In [8]:
def clean_listings_data(df_original, market_name):
    """Clean one market's raw listings and return the result as a new DataFrame.

    Steps, in order: add a 'market' column, convert price and host rate
    columns to numbers, flatten the list-like text columns, recode the
    superhost flag, cast 'license' to string, drop duplicate rows, convert
    the date columns to ``datetime.date`` values and drop unused columns.
    Progress is reported through ``logging.info``.

    Args:
        df_original: Raw listings DataFrame. It is copied first and is not
            modified.
        market_name: Value written to every row of the new 'market' column.

    Returns:
        pandas.DataFrame: The cleaned listings.

    Raises:
        KeyError: If one of the columns this function touches is missing.
        ValueError: If a host rate value is not numeric once '%' is removed.
    """
    # create copy to avoid modifying the original dataframe
    df = df_original.copy()

    # Add market name column
    logging.info('Adding market name column')
    df['market'] = market_name

    # Remove '$' AND the thousands separator before converting to numeric.
    # Stripping only '$' leaves values such as "1,250.00", which to_numeric
    # coerces to NaN. regex is passed explicitly because a bare '$' pattern is
    # an end-of-string anchor when str.replace treats the pattern as a regex.
    logging.info('Cleaning price and rate columns')
    df['price'] = df['price'].str.replace(r'[$,]', '', regex=True)
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df['host_response_rate'] = df['host_response_rate'].str.replace('%', '', regex=False).astype(float)
    df['host_acceptance_rate'] = df['host_acceptance_rate'].str.replace('%', '', regex=False).astype(float)

    # Convert list of amenities into a string. Null values are passed through
    # untouched (same guard as host_verifications below) instead of raising
    # AttributeError on a float NaN. The last replace targets the literal
    # escape text \u2013 found in the raw data.
    logging.info('Converting amenities from list to string')
    df['amenities'] = df['amenities'].apply(
        lambda x: x.replace('"', '').replace('[', '').replace(']', '').replace('\\u2013', '-')
        if pd.notnull(x) else x
    )

    # Convert superhost column from 'f' or 't' to 0 or 1, then to integer.
    # Missing flags become 0; errors='ignore' leaves the column as-is if the
    # integer cast is not possible.
    logging.info('Converting array columns to string')
    df['host_is_superhost'] = df['host_is_superhost'].replace({'f': 0, 't': 1})
    df['host_is_superhost'] = df['host_is_superhost'].fillna(0).astype(int, errors='ignore')
    df['host_verifications'] = df['host_verifications'].apply(
        lambda x: x.replace("'", "").replace("[", "").replace("]", "") if pd.notnull(x) else x
    )

    # 'calendar_updated' is not cast here: it is dropped below, so converting
    # it first would be wasted work.
    logging.info('Converting columns to string')
    df['license'] = df['license'].astype('str')

    # Check for duplicates
    logging.info('Checking for duplicates')
    duplicate_rows = df.duplicated().sum()
    logging.info(f'Found {duplicate_rows} duplicate rows')
    df = df.drop_duplicates()

    # Ensure date columns are formatted as date type; unparseable values become NaT
    logging.info('Converting date columns to date type')
    date_columns = ['last_scraped', 'host_since', 'calendar_last_scraped', 'first_review', 'last_review']
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce').dt.date

    # Drop unnecessary columns
    logging.info('Dropping unecessary columns')
    columns_to_drop = [
        'source', 'calendar_updated', 'listing_url', 'picture_url', 'host_url',
        'host_thumbnail_url', 'host_picture_url', 'neighbourhood_group_cleansed',
    ]
    df = df.drop(columns=columns_to_drop)

    logging.info('Data cleaning completed for market: %s', market_name)
    return df

#### 2.3 Function to convert to parquet and upload to S3

In [9]:
def upload_cleaned_data_to_s3(df, bucket_name, market_name):
    """Serialize a cleaned listings DataFrame to Parquet and upload it to S3.

    The object is written to
    ``processed/listings/{market_name}-listings_processed.parquet``.

    Args:
        df: DataFrame to upload; written without its index.
        bucket_name: Name of the destination S3 bucket.
        market_name: Market identifier used to build the object key.

    Returns:
        None.
    """
    # Progress goes through logging, like the other pipeline steps, so the
    # messages share one stream and appear in order with the cleaning logs.
    logging.info('Upload starting...')
    s3 = boto3.client('s3')

    # Convert DataFrame to Parquet in memory
    parquet_buffer = io.BytesIO()
    df.to_parquet(parquet_buffer, index=False)

    # Construct the S3 key
    s3_key = f"processed/listings/{market_name}-listings_processed.parquet"

    # Upload the Parquet file to S3. getvalue() returns the entire buffer
    # regardless of the current stream position, so no seek(0) is needed.
    s3.put_object(Bucket=bucket_name, Key=s3_key, Body=parquet_buffer.getvalue())
    logging.info(f"File uploaded to S3 bucket '{bucket_name}' with key '{s3_key}'")

In [10]:
#### 2.4 Main script to process files

In [11]:
def process_files(bucket_name, folder_prefix):
    """Read, clean and re-upload every listings file found under a prefix.

    Args:
        bucket_name: S3 bucket that is listed and that receives the uploads.
        folder_prefix: Key prefix whose files are processed one by one.

    Returns:
        None.
    """
    for s3_key in list_files_in_folder(bucket_name, folder_prefix):
        # The market name is the file name up to (not including) its last '-',
        # e.g. 'new-york-city-listings.csv.gz' -> 'new-york-city'.
        base_name = s3_key.rpartition('/')[2]
        market_name = base_name.rpartition('-')[0]

        # Read the raw file; only the key is passed to the reader.
        logging.info(f'Downloading file: {s3_key}')
        raw_listings = read_listings_data(s3_key)

        # Clean, then push the processed frame back to S3.
        cleaned_listings = clean_listings_data(raw_listings, market_name)
        upload_cleaned_data_to_s3(cleaned_listings, bucket_name, market_name)

## 3. Process and upload files

In [12]:
# Re-state the inputs and list the raw files that the pipeline will process.
bucket_name = 'airbnb-capstone-project'
folder_prefix = 'raw/listings/'

# Bare call so the notebook displays the returned list of keys.
list_files_in_folder(bucket_name, folder_prefix)

['raw/listings/albany-listings.csv.gz',
 'raw/listings/chicago-listings.csv.gz',
 'raw/listings/los-angeles-listings.csv.gz',
 'raw/listings/new-york-city-listings.csv.gz',
 'raw/listings/san-francisco-listings.csv.gz',
 'raw/listings/seattle-listings.csv.gz',
 'raw/listings/washington-dc-listings.csv.gz']

In [13]:
# Run the full pipeline: read, clean and upload every file under the prefix.
bucket_name = 'airbnb-capstone-project'
input_folder_path = 'raw/listings/'
process_files(bucket_name, input_folder_path)

2024-07-28 22:46:55,915 - INFO - Downloading file: raw/listings/albany-listings.csv.gz
2024-07-28 22:46:56,147 - INFO - Adding market name column
2024-07-28 22:46:56,147 - INFO - Cleaning price and rate columns
2024-07-28 22:46:56,147 - INFO - Converting amenities from list to string
2024-07-28 22:46:56,147 - INFO - Converting array columns to string
2024-07-28 22:46:56,147 - INFO - Converting columns to string
2024-07-28 22:46:56,163 - INFO - Checking for duplicates
2024-07-28 22:46:56,172 - INFO - Found 0 duplicate rows
2024-07-28 22:46:56,172 - INFO - Converting date columns to date type
2024-07-28 22:46:56,181 - INFO - Dropping unecessary columns
2024-07-28 22:46:56,184 - INFO - Data cleaning completed for market: albany


Upload starting...


2024-07-28 22:46:58,671 - INFO - Downloading file: raw/listings/chicago-listings.csv.gz


File uploaded to S3 bucket 'airbnb-capstone-project' with key 'processed/listings/albany-listings_processed.parquet'


2024-07-28 22:47:00,532 - INFO - Adding market name column
2024-07-28 22:47:00,534 - INFO - Cleaning price and rate columns
2024-07-28 22:47:00,534 - INFO - Converting amenities from list to string
2024-07-28 22:47:00,554 - INFO - Converting array columns to string
2024-07-28 22:47:00,571 - INFO - Converting columns to string
2024-07-28 22:47:00,573 - INFO - Checking for duplicates
2024-07-28 22:47:00,608 - INFO - Found 0 duplicate rows
2024-07-28 22:47:00,655 - INFO - Converting date columns to date type
2024-07-28 22:47:00,718 - INFO - Dropping unecessary columns
2024-07-28 22:47:00,718 - INFO - Data cleaning completed for market: chicago


Upload starting...


2024-07-28 22:47:08,390 - INFO - Downloading file: raw/listings/los-angeles-listings.csv.gz


File uploaded to S3 bucket 'airbnb-capstone-project' with key 'processed/listings/chicago-listings_processed.parquet'


2024-07-28 22:47:24,169 - INFO - Adding market name column
2024-07-28 22:47:24,171 - INFO - Cleaning price and rate columns
2024-07-28 22:47:24,187 - INFO - Converting amenities from list to string
2024-07-28 22:47:24,308 - INFO - Converting array columns to string
2024-07-28 22:47:24,325 - INFO - Converting columns to string
2024-07-28 22:47:24,356 - INFO - Checking for duplicates
2024-07-28 22:47:24,525 - INFO - Found 0 duplicate rows
2024-07-28 22:47:24,728 - INFO - Converting date columns to date type
2024-07-28 22:47:24,776 - INFO - Dropping unecessary columns
2024-07-28 22:47:24,794 - INFO - Data cleaning completed for market: los-angeles


Upload starting...


2024-07-28 22:48:03,103 - INFO - Downloading file: raw/listings/new-york-city-listings.csv.gz


File uploaded to S3 bucket 'airbnb-capstone-project' with key 'processed/listings/los-angeles-listings_processed.parquet'


2024-07-28 22:48:06,975 - INFO - Adding market name column
2024-07-28 22:48:06,976 - INFO - Cleaning price and rate columns
2024-07-28 22:48:07,001 - INFO - Converting amenities from list to string
2024-07-28 22:48:07,061 - INFO - Converting array columns to string
2024-07-28 22:48:07,073 - INFO - Converting columns to string
2024-07-28 22:48:07,091 - INFO - Checking for duplicates
2024-07-28 22:48:07,206 - INFO - Found 0 duplicate rows
2024-07-28 22:48:07,373 - INFO - Converting date columns to date type
2024-07-28 22:48:07,406 - INFO - Dropping unecessary columns
2024-07-28 22:48:07,420 - INFO - Data cleaning completed for market: new-york-city


Upload starting...


2024-07-28 22:48:37,783 - INFO - Downloading file: raw/listings/san-francisco-listings.csv.gz


File uploaded to S3 bucket 'airbnb-capstone-project' with key 'processed/listings/new-york-city-listings_processed.parquet'


2024-07-28 22:48:39,366 - INFO - Adding market name column
2024-07-28 22:48:39,367 - INFO - Cleaning price and rate columns
2024-07-28 22:48:39,367 - INFO - Converting amenities from list to string
2024-07-28 22:48:39,391 - INFO - Converting array columns to string
2024-07-28 22:48:39,397 - INFO - Converting columns to string
2024-07-28 22:48:39,400 - INFO - Checking for duplicates
2024-07-28 22:48:39,432 - INFO - Found 0 duplicate rows
2024-07-28 22:48:39,467 - INFO - Converting date columns to date type
2024-07-28 22:48:39,487 - INFO - Dropping unecessary columns
2024-07-28 22:48:39,487 - INFO - Data cleaning completed for market: san-francisco


Upload starting...


2024-07-28 22:48:46,891 - INFO - Downloading file: raw/listings/seattle-listings.csv.gz


File uploaded to S3 bucket 'airbnb-capstone-project' with key 'processed/listings/san-francisco-listings_processed.parquet'


2024-07-28 22:48:48,475 - INFO - Adding market name column
2024-07-28 22:48:48,476 - INFO - Cleaning price and rate columns
2024-07-28 22:48:48,481 - INFO - Converting amenities from list to string
2024-07-28 22:48:48,496 - INFO - Converting array columns to string
2024-07-28 22:48:48,508 - INFO - Converting columns to string
2024-07-28 22:48:48,511 - INFO - Checking for duplicates
2024-07-28 22:48:48,542 - INFO - Found 0 duplicate rows
2024-07-28 22:48:48,576 - INFO - Converting date columns to date type
2024-07-28 22:48:48,594 - INFO - Dropping unecessary columns
2024-07-28 22:48:48,594 - INFO - Data cleaning completed for market: seattle


Upload starting...


2024-07-28 22:48:55,265 - INFO - Downloading file: raw/listings/washington-dc-listings.csv.gz


File uploaded to S3 bucket 'airbnb-capstone-project' with key 'processed/listings/seattle-listings_processed.parquet'


2024-07-28 22:48:58,445 - INFO - Adding market name column
2024-07-28 22:48:58,446 - INFO - Cleaning price and rate columns
2024-07-28 22:48:58,448 - INFO - Converting amenities from list to string
2024-07-28 22:48:58,448 - INFO - Converting array columns to string
2024-07-28 22:48:58,469 - INFO - Converting columns to string
2024-07-28 22:48:58,469 - INFO - Checking for duplicates
2024-07-28 22:48:58,493 - INFO - Found 0 duplicate rows
2024-07-28 22:48:58,520 - INFO - Converting date columns to date type
2024-07-28 22:48:58,535 - INFO - Dropping unecessary columns
2024-07-28 22:48:58,538 - INFO - Data cleaning completed for market: washington-dc


Upload starting...
File uploaded to S3 bucket 'airbnb-capstone-project' with key 'processed/listings/washington-dc-listings_processed.parquet'
