Airbnb OpenData Cleaning - RentVisionNYC


In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
import os


In [None]:
# --- Load Dataset ---
# Read the raw Airbnb CSV from its absolute location into the working frame.
data_path = "/Users/jainam/Downloads/Class_Fall_25/FDS/RentVisionNYC/Airbnb_Dataset/rawdata/Airbnb_Open_Data.csv"
data = pd.read_csv(data_path)
print("Dataset loaded successfully!")
print(f"Shape: {data.shape}")

# --- Initial Inspection ---
# Show the raw column labels, then dtypes / non-null counts via DataFrame.info().
print(f"\nColumns:\n {data.columns}")
data.info()


In [None]:
# Airbnb NYC Data Cleaning - RentVisionNYC Project
# Author: Jainam Jain

# --- Drop Redundant or Unnecessary Columns ---
# These names are matched against the headers as loaded (before the
# normalisation below); errors='ignore' skips any that are not present.
columns_to_drop = [
    'reviews per month',
    'review rate number',
    'calculated host listings count',
    'availability 365',
    'house_rules',
    'license',
    'id',
]
data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# --- Rename Columns for Consistency ---
# Normalise every header: trim whitespace, lower-case, spaces -> underscores
# (e.g. "NAME" ends up as "name").
header_fixes = {"NAME": "Name"}
data.rename(columns=header_fixes, inplace=True)
data.columns = [label.strip().lower().replace(' ', '_') for label in data.columns]

# --- Remove Duplicates ---
# Count fully identical rows first so the number removed can be reported.
duplicate_mask = data.duplicated()
duplicates_before = duplicate_mask.sum()
data.drop_duplicates(inplace=True)
print(f"\nRemoved {duplicates_before} duplicate rows.")

# --- Handle Missing Values ---
print("\nMissing values before cleaning:\n", data.isna().sum())
# 'last_review' is removed as a whole column (a no-op when it is absent);
# afterwards every row that still has a missing value in any column is dropped.
# NOTE(review): presumably the column is dropped first so its gaps do not
# cause rows to be discarded by dropna -- confirm.
data.drop(columns=['last_review'], inplace=True, errors='ignore')
data.dropna(inplace=True)
print("\nMissing values after cleaning:\n", data.isna().sum())

# --- Clean and Convert Data Types ---
# Clean price column: strip currency formatting ("$" and thousands commas)
# and parse the remainder as a number.
if 'price' in data.columns:
    price_text = data['price'].astype(str).str.strip()
    price_text = price_text.str.replace("$", "", regex=False)
    price_text = price_text.str.replace(",", "", regex=False)
    data['price'] = pd.to_numeric(price_text, errors='coerce')

    # errors='coerce' turns unparseable prices into NaN. The dataset-wide
    # dropna has already run by this point, so those rows must be removed
    # here or they would end up in the saved "clean" file with no price.
    unparsed_prices = int(data['price'].isna().sum())
    if unparsed_prices:
        data.dropna(subset=['price'], inplace=True)
        print(f"\nDropped {unparsed_prices} rows with unparseable price values.")

# Convert instant_bookable to numeric (1 = bookable, 0 = anything else).
# Values are compared as lower-cased text so that booleans, "t"/"true"
# strings and already-converted 0/1 values all map correctly; the last case
# keeps the step idempotent when the cell is re-run.
if 'instant_bookable' in data.columns:
    truthy_flags = {'t', 'true', '1', '1.0', 'y', 'yes'}
    flag_text = data['instant_bookable'].astype(str).str.strip().str.lower()
    data['instant_bookable'] = flag_text.isin(truthy_flags).astype(int)

# Standardize host verification column: upper-case text labels.
if 'host_identity_verified' in data.columns:
    data['host_identity_verified'] = data['host_identity_verified'].astype(str).str.upper()

# --- Feature Engineering ---
# price_per_night = price / minimum_nights. Both columns are required, so
# the step is skipped unless both are present.
if {'price', 'minimum_nights'}.issubset(data.columns):
    min_nights = pd.to_numeric(data['minimum_nights'], errors='coerce')
    # A zero or negative minimum stay would produce inf or a negative price;
    # mask those to NaN so the derived feature is simply missing for them.
    data['price_per_night'] = data['price'] / min_nights.where(min_nights > 0)

# Neighborhood-level aggregation (optional future feature)
# listing_density = number of listings sharing the row's neighbourhood.
if 'neighbourhood' in data.columns:
    # Count non-null names per neighbourhood; fall back to counting the
    # neighbourhood values themselves when there is no 'name' column.
    counted_column = 'name' if 'name' in data.columns else 'neighbourhood'
    # transform() broadcasts the per-group count back onto every row and
    # assigns in place, so re-running the cell overwrites the column instead
    # of producing listing_density_x / listing_density_y as a merge would.
    data['listing_density'] = (
        data.groupby('neighbourhood')[counted_column].transform('count')
    )

# --- Reset Index ---
# Discard the old row labels and renumber the remaining rows from 0.
data.reset_index(inplace=True, drop=True)

# --- Save Cleaned Dataset ---
# Create the target folder if needed, then write the frame without its index.
output_dir = "/Users/jainam/Downloads/Class_Fall_25/FDS/RentVisionNYC/Airbnb_Dataset/processeddata"
output_name = "nyc_airbnb_clean.csv"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, output_name)
data.to_csv(output_path, index=False)

print(f"\n Data cleaned and saved successfully at:\n{output_path}")
final_shape = data.shape
print("Final shape:", final_shape)
