<a href="https://colab.research.google.com/github/kolallen/OASIS_Data_Analytics_Internship_Task-3/blob/main/OASIS_INFOBYTE_Internship_Data_Analytics_Task_3__Data_Cleaning_New_York_City_Airbnb_Open_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the NYC Airbnb 2019 listings CSV into a DataFrame
df = pd.read_csv('AB_NYC_2019.csv')

# Preview the top rows (head() returns 5 by default) to sanity-check the load
print(df.head(n=5))

     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room    149               1                  9  20

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy import stats

In [None]:
def clean_dataset(df, z_threshold=3.0):
    """Return a cleaned, standardized copy of ``df`` without mutating the input.

    Steps, in order:

    1. Normalize column labels (strip surrounding whitespace and lowercase;
       non-string labels are left untouched).
    2. Forward-fill missing values.
    3. Drop exact duplicate rows.
    4. Replace every numeric column with its z-score (population standard
       deviation, ``ddof=0``). Columns with zero spread become all zeros.
    5. Drop rows where any numeric column has ``|z| >= z_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified.
    z_threshold : float, default 3.0
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Numeric columns hold z-scores, not original units.
        The original index labels of the surviving rows are preserved.

    Notes
    -----
    A value that is still missing after forward fill (e.g. a NaN in the very
    first row) cannot be scored; such a row is kept rather than treated as an
    outlier.
    """
    # Data integrity: rename() builds a new frame, so the caller's column
    # labels are never touched.
    df = df.rename(
        columns=lambda col: col.strip().lower() if isinstance(col, str) else col
    )

    # Missing data handling: DataFrame.ffill() is the supported spelling of
    # the deprecated fillna(method='ffill').
    df = df.ffill()

    # Duplicate removal. The explicit copy gives us a frame we own, so the
    # column assignment below cannot trigger SettingWithCopyWarning.
    df = df.drop_duplicates().copy()

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Nothing to standardize or screen for outliers.
        return df

    # Standardization: compute the z-scores once and reuse them for both the
    # scaled output and the outlier mask.
    numeric = df[numeric_cols].astype(float)
    spread = numeric.std(ddof=0).replace(0, 1.0)  # constant column -> z of 0, not NaN
    z_scores = (numeric - numeric.mean()) / spread
    df[numeric_cols] = z_scores

    # Outlier detection: NaN compares False against the threshold, so rows
    # with unscored values are retained instead of wiping out the frame.
    is_outlier = (z_scores.abs() >= z_threshold).any(axis=1)
    return df[~is_outlier]

In [None]:
# Run the cleaning pipeline on the loaded listings
cleaned_df = clean_dataset(df)

# Preview the top rows (head() returns 5 by default) of the cleaned result
print(cleaned_df.head(n=5))

         id                                              name   host_id  \
0 -1.731277                Clean & quiet apt home by the park -0.860159   
1 -1.731272                             Skylit Midtown Castle -0.860158   
2 -1.731176               THE VILLAGE OF HARLEM....NEW YORK ! -0.860135   
4 -1.731051  Entire Apt: Spacious Studio/Loft by central park -0.860103   
5 -1.731044         Large Cozy 1 BR Apartment In Midtown East -0.860101   

   host_name neighbourhood_group neighbourhood  latitude  longitude  \
0       John            Brooklyn    Kensington -1.493849  -0.437652   
1   Jennifer           Manhattan       Midtown  0.452436  -0.684639   
2  Elisabeth           Manhattan        Harlem  1.468399   0.222497   
4      Laura           Manhattan   East Harlem  1.275660   0.177216   
5      Chris           Manhattan   Murray Hill  0.343321  -0.494632   

         room_type     price  minimum_nights  number_of_reviews last_review  \
0     Private room -0.015493       -0.29399

In [None]:
# Persist the cleaned listings as CSV; index=False leaves the row index out of the file
cleaned_df.to_csv(path_or_buf='AB_NYC_2019_cleaned.csv', index=False)