<a href="https://colab.research.google.com/github/kprashantsingh/colab_pub/blob/main/Data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset
url = "https://raw.githubusercontent.com/coursesAM/IOCL2025/refs/heads/main/datasets/AB_NYC_2019.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# Basic Information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [4]:
# Check for missing values
df.isnull().sum()

Unnamed: 0,0
id,0
name,16
host_id,0
host_name,21
neighbourhood_group,0
neighbourhood,0
latitude,0
longitude,0
room_type,0
price,0


In [5]:
# Handle missing values (example: fill numeric with median or mean)
imputer = SimpleImputer(strategy='median')
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [6]:
# Remove duplicates
df = df.drop_duplicates()

In [7]:
# Correct erroneous values (example: cap outliers using interquartile range IQR)
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower, lower, df[col])
    df[col] = np.where(df[col] > upper, upper, df[col])

In [8]:
# df['date_column'] = pd.to_datetime(df['date_column'])  # uncomment if needed

In [9]:
# Feature Scaling - Normalization
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [10]:
# Encode categorical variables using one-hot encoding
categorical_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)



In [11]:
# Dimensionality reduction (using correlation if any feature is correlated by more than 0.95)
#cor_matrix = df.corr().abs()
#upper = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
#to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
#df_reduced = df.drop(columns=to_drop)

In [12]:
df.head()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,...,last_review_2019-06-29,last_review_2019-06-30,last_review_2019-07-01,last_review_2019-07-02,last_review_2019-07-03,last_review_2019-07-04,last_review_2019-07-05,last_review_2019-07-06,last_review_2019-07-07,last_review_2019-07-08
0,-1.731277,-0.863811,-1.502384,-0.489583,0.191791,-0.850824,-0.327981,-0.824365,1.990847,1.91625,...,False,False,False,False,False,False,False,False,False,False
1,-1.731272,-0.86381,0.455527,-0.800554,1.101648,-0.850824,1.44112,-0.668187,0.406686,1.840275,...,False,False,False,False,False,False,False,False,False,False
2,-1.731176,-0.863787,1.477559,0.341584,0.203763,-0.248539,-0.770256,-0.35583,-0.649421,1.91625,...,False,False,False,False,False,False,False,False,False,False
3,-1.731159,-0.863784,-0.807808,-0.145605,-0.526517,-0.850824,2.104533,2.225707,-0.649421,0.617065,...,False,False,False,False,False,False,True,False,False,False
4,-1.731051,-0.863754,1.283668,0.284573,-0.634263,1.859459,-0.327981,-0.925422,-0.649421,-0.856865,...,False,False,False,False,False,False,False,False,False,False


In [16]:
# Train-test splitting
X = df.drop(columns=['price'])  # Replace 'Target' with your label column
y = df['price']                 # Replace 'Target' with your label column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)