<a href="https://colab.research.google.com/github/laxmivaishnavee22/OasisInfobyte/blob/main/cleaned_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only step: bring the input files into the session's working directory.
# The recorded output of this cell shows two files being saved:
# AB_NYC_2019.csv and New_York_City_.png.
from google.colab import files
# NOTE(review): `uploaded` is not read again in the cells visible here; the
# later cell opens the files by name from the working directory instead.
uploaded = files.upload()


Saving AB_NYC_2019.csv to AB_NYC_2019.csv
Saving New_York_City_.png to New_York_City_.png


In [None]:

!pip install -q seaborn matplotlib pandas numpy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os


# --- Configuration ---------------------------------------------------------
# Every artefact written by this notebook (cleaned CSV, figures) goes here.
OUTDIR = 'outputs'
os.makedirs(OUTDIR, exist_ok=True)  # exist_ok=True keeps re-runs from failing


# --- Load ------------------------------------------------------------------
filename = 'AB_NYC_2019.csv'
df = pd.read_csv(filename)
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(df.head())


# --- Inspect ---------------------------------------------------------------
print("\n--- Basic Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
print("\nMissing values per column:")
print(df.isnull().sum())

# --- Clean -----------------------------------------------------------------
# Fill missing values with one DataFrame-level fillna. The previous form,
# df[col].fillna(value, inplace=True), operates on an intermediate Series:
# pandas warns about it today and it stops modifying `df` from pandas 3.0.
df = df.fillna({
    'reviews_per_month': 0,       # no reviews -> zero reviews per month
    'last_review': 'No Review',
    'name': 'No Name',
    'host_name': 'Unknown',
})

# Remove fully duplicated rows.
df = df.drop_duplicates()

# Force the numeric columns to numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['availability_365'] = pd.to_numeric(df['availability_365'], errors='coerce')

print("\nCleaned dataset shape:", df.shape)


# --- Persist ---------------------------------------------------------------
cleaned_path = os.path.join(OUTDIR, 'AB_NYC_2019_cleaned.csv')
df.to_csv(cleaned_path, index=False)
print("Cleaned data saved to outputs/AB_NYC_2019_cleaned.csv")


# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
summary_stats = df.describe()
print("\n--- Basic Statistics ---")
print(summary_stats)


# Number of listings per neighbourhood group, saved as a PNG in OUTDIR.
fig, ax = plt.subplots(figsize=(8, 4))
# seaborn deprecates `palette` without `hue`; mapping the x variable to hue
# and disabling the legend gives the same per-bar colouring without the
# warning (the `legend` argument needs seaborn >= 0.13, which is the version
# range that emits this deprecation).
sns.countplot(
    data=df,
    x='neighbourhood_group',
    hue='neighbourhood_group',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Listings by Neighbourhood Group')
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, 'listings_by_neighbourhood.png'))
plt.close(fig)  # free the figure; it is only needed on disk


# Number of listings per room type, saved as a PNG in OUTDIR.
fig, ax = plt.subplots(figsize=(6, 4))
# `palette` without `hue` is deprecated in seaborn; assign the x variable to
# hue and turn the legend off to keep the same colouring (seaborn >= 0.13).
sns.countplot(
    data=df,
    x='room_type',
    hue='room_type',
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_title('Listings by Room Type')
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, 'room_types.png'))
plt.close(fig)  # free the figure; it is only needed on disk


# Mean listing price per neighbourhood group, highest first.
# `avg_price` is read again by the summary printed at the end of the cell.
price_by_group = df.groupby('neighbourhood_group')['price']
avg_price = price_by_group.mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(7, 4))
avg_price.plot(kind='bar', color='teal', ax=ax)
ax.set_title('Average Price by Neighbourhood Group')
ax.set_ylabel('Average Price ($)')
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, 'avg_price_by_neighbourhood.png'))
plt.close(fig)


# Price distribution restricted to listings priced at or below PRICE_CAP.
# Binning the full price range and only then cropping the x-axis left a
# handful of very wide bins inside the visible window (the recorded describe()
# output shows prices reaching 10000). Filtering first makes all 100 bins, and
# the KDE, describe the range that is actually displayed.
PRICE_CAP = 1000  # upper bound of the plotted price range, in dollars
capped_prices = df.loc[df['price'].between(0, PRICE_CAP), 'price']

plt.figure(figsize=(8,5))
sns.histplot(capped_prices, bins=100, color='skyblue', kde=True)
plt.xlim(0, PRICE_CAP)
plt.title('Price Distribution (up to $1000)')
plt.xlabel('Price ($)')
plt.ylabel('Number of listings')
plt.tight_layout()
plt.savefig(os.path.join(OUTDIR, 'price_distribution.png'))
plt.close()


# Histogram of the availability_365 column (50 bins, no KDE), saved to OUTDIR.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['availability_365'], bins=50, color='orange', kde=False, ax=ax)
ax.set_title('Availability Distribution (days per year)')
ax.set_xlabel('Availability (days)')
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, 'availability_distribution.png'))
plt.close(fig)

# Pairwise correlation between the numeric listing attributes, drawn as an
# annotated heatmap and saved to OUTDIR.
numeric_cols = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
corr_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, 'correlation_heatmap.png'))
plt.close(fig)


# The ten neighbourhoods with the most listings, as a horizontal bar chart.
top_neighbourhoods = df['neighbourhood'].value_counts().head(10)

# Tidy two-column frame so the bars can be coloured through `hue`; seaborn
# deprecates passing `palette` without `hue`. rename_axis pins the label
# column name regardless of how the installed pandas names the index.
top_df = (
    top_neighbourhoods
    .rename_axis('neighbourhood')
    .reset_index(name='listings')
)

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    data=top_df,
    x='listings',
    y='neighbourhood',
    hue='neighbourhood',
    palette='mako',
    legend=False,  # the y tick labels already name each bar (seaborn >= 0.13)
    ax=ax,
)
ax.set_title('Top 10 Neighbourhoods by Listings')
ax.set_xlabel('Number of Listings')
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, 'top_neighbourhoods.png'))
plt.close(fig)

import matplotlib.image as mpimg

# Optional step: scatter every listing over a basemap image of the city.
# The upload cell saves the basemap as 'New_York_City_.png'; the previous
# '.jpg' name never existed, so this step always fell through to the except
# branch (see the recorded "No such file or directory" output).
MAP_IMAGE = 'New_York_City_.png'

try:
    img = mpimg.imread(MAP_IMAGE)
    plt.figure(figsize=(8,8))
    # extent = [lon_min, lon_max, lat_min, lat_max] that the image is stretched to.
    # NOTE(review): the recorded describe() output shows listings outside this
    # box (longitude down to about -74.24, latitude 40.50 to 40.91) -- confirm
    # the extent matches the real geographic bounds of the image.
    plt.imshow(img, extent=[-74.05, -73.7, 40.63, 40.85], alpha=0.6)
    plt.scatter(df['longitude'], df['latitude'], c='red', s=1, alpha=0.5)
    plt.title('NYC Airbnb Listings Map Overlay')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, 'nyc_listings_map_overlay.png'))
    print("Saved NYC listings overlay map.")
except Exception as e:
    # Deliberately best-effort: the overlay is optional, so report and move on.
    print("Map image not found or plotting failed (optional):", e)
finally:
    # Close the figure on both paths so a failure after plt.figure() does not
    # leave an open figure behind; a no-op when no figure was created.
    plt.close()


# Headline numbers, assembled first and then printed one per line.
insight_lines = [
    "\n--- Quick Insights ---",
    f"Total listings: {len(df)}",
    f"Neighbourhood groups: {df['neighbourhood_group'].nunique()}",
    f"Average price overall: ${df['price'].mean():.2f}",
    f"Most common room type: {df['room_type'].mode()[0]}",
]
for line in insight_lines:
    print(line)

# `avg_price` holds the mean price per neighbourhood group computed earlier.
priciest_group = avg_price.idxmax()
print("Neighbourhood group with highest average price:", priciest_group)

print("\nAll plots and cleaned data saved to 'outputs/' folder.")


Dataset loaded successfully!
Shape: (48895, 16)
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private ro

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reviews_per_month'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['last_review'].fillna('No Review', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

Cleaned data saved to outputs/AB_NYC_2019_cleaned.csv

--- Basic Statistics ---
                 id       host_id      latitude     longitude         price  \
count  4.889500e+04  4.889500e+04  48895.000000  48895.000000  48895.000000   
mean   1.901714e+07  6.762001e+07     40.728949    -73.952170    152.720687   
std    1.098311e+07  7.861097e+07      0.054530      0.046157    240.154170   
min    2.539000e+03  2.438000e+03     40.499790    -74.244420      0.000000   
25%    9.471945e+06  7.822033e+06     40.690100    -73.983070     69.000000   
50%    1.967728e+07  3.079382e+07     40.723070    -73.955680    106.000000   
75%    2.915218e+07  1.074344e+08     40.763115    -73.936275    175.000000   
max    3.648724e+07  2.743213e+08     40.913060    -73.712990  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count    48895.000000       48895.000000       48895.000000   
mean         7.029962          23.274466           1.090910   
std         20.5105


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='neighbourhood_group', data=df, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='room_type', data=df, palette='Set2')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_neighbourhoods.values, y=top_neighbourhoods.index, palette='mako')


Map image not found or plotting failed (optional): [Errno 2] No such file or directory: 'New_York_City_.jpg'

--- Quick Insights ---
Total listings: 48895
Neighbourhood groups: 5
Average price overall: $152.72
Most common room type: Entire home/apt
Neighbourhood group with highest average price: Manhattan

All plots and cleaned data saved to 'outputs/' folder.
