# Gurgaon Real Estate Data Analysis

This notebook performs:
- Data loading
- Exploration and visualization
- Missing values handling
- Outlier detection and trimming

Dataset: `../data/Gurgaon_RealEstate.csv`

In [2]:
# Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import KNNImputer

%matplotlib inline

In [None]:
# Load Dataset: read the listings CSV into `df` and preview the first rows.
DATA_PATH = '../data/Gurgaon_RealEstate.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Inspect Data: print pandas' concise summary of `df` (columns, non-null
# counts, dtypes) to spot missing values and wrongly typed columns early.
df.info()

In [None]:
# Remove Duplicates: keep the first occurrence of each fully identical row.
# Rebinding `df` (rather than mutating in place) keeps the step explicit.
df = df.drop_duplicates()

## Property Type Distribution

In [None]:
# Count of listings per property type, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='property_type', data=df, ax=ax)
ax.set_title('Property Type Distribution')
plt.show()

## Remove Societies with Fewer Than 3 Records

In [None]:
# Drop every row whose society appears fewer than MIN_SOCIETY_RECORDS times.
MIN_SOCIETY_RECORDS = 3

society_freq = df['society'].value_counts()
societies_to_remove = society_freq[society_freq < MIN_SOCIETY_RECORDS].index

# value_counts() ignores NaN, so rows with a missing society are never listed
# in `societies_to_remove` and therefore survive this filter.
# .copy() detaches the filtered frame from the original so that the column
# assignments performed in later cells do not raise SettingWithCopyWarning.
df = df[~df['society'].isin(societies_to_remove)].copy()

## Price Statistics and Plots

In [None]:
# Heading line followed by the descriptive statistics of the price column.
print("Price Summary Statistics:", df['price'].describe(), sep="\n")

In [None]:
# Distribution of listing prices with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], kde=True, ax=ax)
ax.set_title('Price Histogram')
plt.show()

In [None]:
# Box plot of price: median, quartiles and points beyond the whiskers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y='price', data=df, ax=ax)
ax.set_title('Price Boxplot')
plt.show()

In [None]:
# Shape of the price distribution: asymmetry (skew) and tail weight (kurtosis).
price_series = df['price']
shape_stats = (
    ("Price Skewness:", price_series.skew()),
    ("Price Kurtosis:", price_series.kurt()),
)
for label, value in shape_stats:
    print(label, value)

## Bathroom Statistics

In [None]:
# Heading line followed by the descriptive statistics of the bathroom column.
print("Bathroom Summary Statistics:", df['bathroom'].describe(), sep="\n")

In [None]:
# Distribution of bathroom counts with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['bathroom'], kde=True, ax=ax)
ax.set_title('Bathroom Histogram')
plt.show()

## Scatter and Box Plots

In [None]:
# Relationship between area (x) and price (y), one point per listing.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='area', y='price', data=df, ax=ax)
ax.set_title('Price vs Area')
plt.show()

In [None]:
# Price distribution split by property type, one box per type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='property_type', y='price', data=df, ax=ax)
ax.set_title('Property Type vs Price')
plt.show()

## Missing Values

In [None]:
# Number of missing entries in each column of `df`.
missing_per_column = df.isnull().sum()
print("Missing Values Distribution:", missing_per_column, sep="\n")

## KNN Imputation

In [None]:
# Fill missing numeric values with KNNImputer (imported in the top cell).
#
# The imputer must see all numeric columns at once: neighbours are found by
# comparing rows on the features they both have. Fitting one column at a time
# leaves a row with a missing value nothing to compare on, so the imputer
# falls back to the plain column mean and no neighbour search happens.
#
# NOTE(review): distances are computed on raw values, so large-scale columns
# dominate the neighbour search; consider scaling before imputing.
numeric_cols = df.select_dtypes(include='number').columns

# KNNImputer drops columns that have no observed value at fit time, which
# would make the output narrower than the selection; skip those columns so
# the result can be assigned back column-for-column.
imputable_cols = [col for col in numeric_cols if df[col].notna().any()]

knn = KNNImputer(n_neighbors=5)
if imputable_cols:
    df[imputable_cols] = knn.fit_transform(df[imputable_cols])

## Encode 'facing'

In [None]:
# Replace the 'facing' labels with integer category codes. Categories are the
# sorted distinct labels; missing values receive pandas' sentinel code -1.
#
# The dtype guard makes the cell safe to re-run: encoding the already-numeric
# codes a second time would treat -1 as a real category and shift every code.
if not pd.api.types.is_numeric_dtype(df['facing']):
    df['facing'] = df['facing'].astype('category').cat.codes

## Fill 'society' with Mode

In [None]:
# Fill missing 'society' values with the most frequent society.
# mode() returns an empty Series when the column has no observed values, so
# guard that case instead of failing on the positional lookup.
society_modes = df['society'].mode()
if society_modes.empty:
    print("No non-missing 'society' values found; skipping mode fill.")
else:
    # On ties mode() returns several values; take the first.
    mode_society = society_modes.iloc[0]
    df['society'] = df['society'].fillna(mode_society)

## Outlier Detection

In [None]:
# Flag outlier rows in every numeric column using two rules, then report how
# many distinct rows each rule catches across all columns.
outliers_z = []
outliers_iqr = []
for col in numeric_cols:
    values = df[col]

    # Rule 1: more than 3 standard deviations away from the column mean.
    z_scores = ((values - values.mean()) / values.std()).abs()
    outliers_z.extend(values.index[(z_scores > 3).to_numpy()])

    # Rule 2: outside the 1.5 * IQR fences around the quartiles.
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    below_fence = values.lt(Q1 - 1.5 * IQR)
    above_fence = values.gt(Q3 + 1.5 * IQR)
    outliers_iqr.extend(values.index[(below_fence | above_fence).to_numpy()])

# A row flagged in several columns is counted once thanks to set().
print("Outliers Z-score:", len(set(outliers_z)))
print("Outliers IQR:", len(set(outliers_iqr)))

## Trimming Outliers

In [None]:
# Keep only rows whose numeric values all lie inside per-column fences.
# NOTE: despite the names, Q1/Q3 here are the 15th and 85th percentiles (not
# the quartiles), so these fences are wider than the usual 1.5 * IQR rule.
numeric_part = df[numeric_cols]
Q1, Q3 = numeric_part.quantile(0.15), numeric_part.quantile(0.85)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# The bounds are computed once from the untrimmed frame, so a single row-wise
# mask selects the same rows as filtering column after column.
within_bounds = (
    numeric_part.ge(lower, axis=1) & numeric_part.le(upper, axis=1)
).all(axis=1)
df_trimmed = df.copy()[within_bounds]

## Histograms After Trimming

In [None]:
# One histogram per numeric column of the trimmed data, on a shared figure.
hist_axes = df_trimmed.hist(figsize=(12, 10))
plt.gcf().tight_layout()  # prevent subplot titles and tick labels overlapping
plt.show()