<a href="https://colab.research.google.com/github/harika373/cleaning-data-data-Analysis-/blob/main/Cleaning%20Airbnb%20NYC%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

# Load dataset (Airbnb NYC Open Data).
# NOTE(review): the absolute "/content/..." path is kept unchanged so existing
# runs keep working; it will need adjusting if the CSV lives elsewhere.
df_airbnb = pd.read_csv("/content/AB_NYC_2019.csv")

# 1. Standardization
# Done first so every later numeric comparison on `price` (the `> 0` check and
# the IQR quantiles) sees numeric values. Unparseable prices become NaN and are
# removed by the `price > 0` filter below, because NaN > 0 evaluates to False.
df_airbnb['price'] = pd.to_numeric(df_airbnb['price'], errors='coerce')
df_airbnb['room_type'] = df_airbnb['room_type'].str.title()

# 2. Data Integrity
# Keep only rows inside the coordinate window, with a valid availability
# count and a strictly positive price.
# NOTE(review): the latitude/longitude limits are a hand-picked bounding box;
# confirm they do not clip legitimate listings at the edges of the city.
is_valid_row = (
    df_airbnb['latitude'].between(40.5, 40.9)
    & df_airbnb['longitude'].between(-74.25, -73.7)
    & df_airbnb['availability_365'].between(0, 365)
    & (df_airbnb['price'] > 0)
)
# Explicit copy: the column assignments below then operate on an independent
# frame rather than on the result of a boolean selection.
df_airbnb = df_airbnb.loc[is_valid_row].copy()

# 3. Missing Data Handling
# Unparseable or missing review dates become NaT; a missing review rate is
# replaced with 0.
df_airbnb['last_review'] = pd.to_datetime(df_airbnb['last_review'], errors='coerce')
df_airbnb['reviews_per_month'] = df_airbnb['reviews_per_month'].fillna(0)

# 4. Duplicate Removal
# Reassign instead of `inplace=True`; the first occurrence of each id is kept.
df_airbnb = df_airbnb.drop_duplicates(subset='id')
assert df_airbnb['id'].is_unique

# 5. Outlier Detection (IQR method on price)
# Rows whose price falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are discarded.
Q1 = df_airbnb['price'].quantile(0.25)
Q3 = df_airbnb['price'].quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# `between` is inclusive on both ends, matching `>= lower` and `<= upper`.
df_airbnb = df_airbnb[df_airbnb['price'].between(lower, upper)]

# Save cleaned dataset (row index is not written to the file).
df_airbnb.to_csv("cleaned_airbnb_nyc.csv", index=False)

print("✅ NYC Airbnb dataset cleaned and saved as cleaned_airbnb_nyc.csv")


✅ NYC Airbnb dataset cleaned and saved as cleaned_airbnb_nyc.csv
