# Imports

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read Csv file

In [46]:
df = pd.read_csv('surat_uncleaned.csv')


In [47]:
df

Unnamed: 0,property_name,areaWithType,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft,price
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"Luxury project with basement parking, Solar ro...","₹2,891 per sqft",₹33.8 Lac
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,"₹3,551 per sqft",₹45.4 Lac
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,"₹3,800 per sqft",₹44.6 Lac
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,"₹3,966 per sqft",₹47 Lac
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,2,"Multistorey Apartment for Sale in Palanpur, Su...","₹3,600 per sqft",₹45 Lac
...,...,...,...,...,...,...,...,...,...,...,...
4520,6 BHK Apartment for Sale in Millionaires Lifes...,Carpet Area,2000 sqft,New Property,Poss. by Dec '26,5 out of 12,Unfurnished,South - East,"Check out Millionaires Lifestyle in Vesu, one ...",,Call for Price
4521,"4 BHK Apartment for Sale in Savan Superia, Alt...",Super Area,3600 sqft,New Property,Poss. by Dec '25,5 out of 16,Unfurnished,South - East,Superia is a premium residential project launc...,,Call for Price
4522,5 BHK Apartment for Sale in Roongta Green Vall...,Carpet Area,2250 sqft,New Property,Poss. by Dec '25,7 out of 13,Unfurnished,North - East,"When it comes to beautiful homes, nothing beat...",,Call for Price
4523,"6 BHK Apartment for Sale in Cellestial Dreams,...",Carpet Area,3450 sqft,New Property,Ready to Move,7 out of 18,Unfurnished,North - West,"DRB Ravani Cellestial Dreams in Vesu, Surat is...",,Call for Price


## Look at the data

In [48]:
print("Data Shape:", df.shape)

Data Shape: (4525, 11)


In [49]:
print("Data Types:", df.dtypes)

Data Types: property_name     object
areaWithType      object
square_feet       object
transaction       object
status            object
floor             object
furnishing        object
facing            object
description       object
price_per_sqft    object
price             object
dtype: object


In [50]:
print("Missing Values:", df.isnull().sum())


Missing Values: property_name        0
areaWithType         0
square_feet          0
transaction        104
status               1
floor               45
furnishing         340
facing             589
description       1371
price_per_sqft     368
price                0
dtype: int64


# PREPROCCESSING

In [51]:
def price_to_inr(x):
    if pd.isna(x): 
        return np.nan
    s = str(x).lower().replace('₹','').replace(',','').strip()
    if 'request' in s or 'call' in s:   # ignore "on request", "call for price"
        return np.nan

    total = 0.0
    # find numbers with optional unit after them
    for num, unit in re.findall(r'(\d+(?:\.\d+)?)\s*(cr|crore|lac|lakh)?', s):
        n = float(num)
        if unit in ('cr','crore'): n *= 1e7
        elif unit in ('lac','lakh'): n *= 1e5
        total += n

    if total > 0: 
        return total
    # no unit words: try plain number like "5500000"
    try: 
        return float(s)
    except:
        return np.nan

In [52]:
df['price_num'] = df['price'].apply(price_to_inr)


In [53]:
df

Unnamed: 0,property_name,areaWithType,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft,price,price_num
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"Luxury project with basement parking, Solar ro...","₹2,891 per sqft",₹33.8 Lac,3380000.0
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,"₹3,551 per sqft",₹45.4 Lac,4540000.0
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,"₹3,800 per sqft",₹44.6 Lac,4460000.0
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,"₹3,966 per sqft",₹47 Lac,4700000.0
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,2,"Multistorey Apartment for Sale in Palanpur, Su...","₹3,600 per sqft",₹45 Lac,4500000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4520,6 BHK Apartment for Sale in Millionaires Lifes...,Carpet Area,2000 sqft,New Property,Poss. by Dec '26,5 out of 12,Unfurnished,South - East,"Check out Millionaires Lifestyle in Vesu, one ...",,Call for Price,
4521,"4 BHK Apartment for Sale in Savan Superia, Alt...",Super Area,3600 sqft,New Property,Poss. by Dec '25,5 out of 16,Unfurnished,South - East,Superia is a premium residential project launc...,,Call for Price,
4522,5 BHK Apartment for Sale in Roongta Green Vall...,Carpet Area,2250 sqft,New Property,Poss. by Dec '25,7 out of 13,Unfurnished,North - East,"When it comes to beautiful homes, nothing beat...",,Call for Price,
4523,"6 BHK Apartment for Sale in Cellestial Dreams,...",Carpet Area,3450 sqft,New Property,Ready to Move,7 out of 18,Unfurnished,North - West,"DRB Ravani Cellestial Dreams in Vesu, Surat is...",,Call for Price,


In [54]:
def normalize_furnishing(x):
    if pd.isna(x): return np.nan
    t = str(x).strip().lower()
    mapping = {
        "semifurnished":"semi-furnished", "semi furnished":"semi-furnished",
        "semi-furnished":"semi-furnished",
        "fully furnished":"furnished", "full furnished":"furnished", "furnished":"furnished",
        "unfurnished":"unfurnished", "not furnished":"unfurnished", "raw":"unfurnished"
    }
    return mapping.get(t, t)


In [55]:
def normalize_facing(x):
    if pd.isna(x): return np.nan
    t = str(x).strip().lower()
    m = {
        "e":"east","east":"east","w":"west","west":"west",
        "n":"north","north":"north","s":"south","south":"south",
        "ne":"northeast","north-east":"northeast","north east":"northeast",
        "nw":"northwest","north-west":"northwest","north west":"northwest",
        "se":"southeast","south-east":"southeast","south east":"southeast",
        "sw":"southwest","south-west":"southwest","south west":"southwest",
    }
    return m.get(t, t)


In [56]:
NUM = re.compile(r'[\d,.]+')

def num_from_text(x):
    """First numeric token -> float (handles commas)."""
    if pd.isna(x): return np.nan
    m = NUM.search(str(x))
    return float(m.group(0).replace(",", "")) if m else np.nan

In [57]:
def normalize_floor(x):
    if pd.isna(x): return np.nan
    s = str(x).lower().strip()
    if "basement" in s: return -1
    if s == "g" or "ground" in s: return 0
    m = re.search(r"(\d+)", s)
    return int(m.group(1)) if m else np.nan


## text tiding

In [58]:
for c in df.select_dtypes(include="object").columns:
    df[c] = df[c].astype(str).str.strip().replace({"nan": np.nan})


In [59]:
print("Cleaning numeric fields...")
df["square_feet_num"]    = df["square_feet"].apply(num_from_text)
df["price_num"]          = df["price"].apply(price_to_inr)
df["price_per_sqft_num"] = df["price_per_sqft"].apply(num_from_text)

Cleaning numeric fields...


In [60]:
print("Deriving missing price/per-sqft where possible...")
mask_pps = df["price_per_sqft_num"].isna() & df["price_num"].notna() & df["square_feet_num"].gt(0)
df.loc[mask_pps, "price_per_sqft_num"] = (df.loc[mask_pps, "price_num"] / df.loc[mask_pps, "square_feet_num"]).round(2)

mask_price = df["price_num"].isna() & df["price_per_sqft_num"].notna() & df["square_feet_num"].gt(0)
df.loc[mask_price, "price_num"] = (df.loc[mask_price, "price_per_sqft_num"] * df.loc[mask_price, "square_feet_num"]).round(0)


Deriving missing price/per-sqft where possible...


In [61]:
df

Unnamed: 0,property_name,areaWithType,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft,price,price_num,square_feet_num,price_per_sqft_num
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"Luxury project with basement parking, Solar ro...","₹2,891 per sqft",₹33.8 Lac,3380000.0,644.0,2891.0
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,"₹3,551 per sqft",₹45.4 Lac,4540000.0,1278.0,3551.0
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,"₹3,800 per sqft",₹44.6 Lac,4460000.0,1173.0,3800.0
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,"₹3,966 per sqft",₹47 Lac,4700000.0,700.0,3966.0
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,2,"Multistorey Apartment for Sale in Palanpur, Su...","₹3,600 per sqft",₹45 Lac,4500000.0,1250.0,3600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4520,6 BHK Apartment for Sale in Millionaires Lifes...,Carpet Area,2000 sqft,New Property,Poss. by Dec '26,5 out of 12,Unfurnished,South - East,"Check out Millionaires Lifestyle in Vesu, one ...",,Call for Price,,2000.0,
4521,"4 BHK Apartment for Sale in Savan Superia, Alt...",Super Area,3600 sqft,New Property,Poss. by Dec '25,5 out of 16,Unfurnished,South - East,Superia is a premium residential project launc...,,Call for Price,,3600.0,
4522,5 BHK Apartment for Sale in Roongta Green Vall...,Carpet Area,2250 sqft,New Property,Poss. by Dec '25,7 out of 13,Unfurnished,North - East,"When it comes to beautiful homes, nothing beat...",,Call for Price,,2250.0,
4523,"6 BHK Apartment for Sale in Cellestial Dreams,...",Carpet Area,3450 sqft,New Property,Ready to Move,7 out of 18,Unfurnished,North - West,"DRB Ravani Cellestial Dreams in Vesu, Surat is...",,Call for Price,,3450.0,


In [62]:
print("Normalizing categories...")
if "furnishing" in df: df["furnishing"] = df["furnishing"].apply(normalize_furnishing)
if "facing" in df:     df["facing"]     = df["facing"].apply(normalize_facing)
if "floor" in df:      df["floor_num"]  = df["floor"].apply(normalize_floor)

# Simple tidy on a few others
if "transaction" in df: df["transaction"] = df["transaction"].astype(str).str.lower().str.strip()
if "status" in df:      df["status"]      = df["status"].astype(str).str.lower().str.strip()
if "areaWithType" in df: df["areaWithType"] = df["areaWithType"].astype(str).str.title()
if "property_name" in df: df["property_name"] = df["property_name"].astype(str).str.title()


Normalizing categories...


# Fill mising values

In [64]:
print("Imputing missing values (median for numeric, mode for categorical)...")
for c in ["square_feet_num","price_num","price_per_sqft_num","floor_num"]:
    if c in df:
        df[c] = df[c].fillna(df[c].median())

for c in ["transaction","status","furnishing","facing"]:
    if c in df:
        mode = df[c].mode(dropna=True)
        if len(mode): df[c] = df[c].fillna(mode.iloc[0])

Imputing missing values (median for numeric, mode for categorical)...


# Remove Duplicates

In [65]:
key_cols = [c for c in ["property_name","areaWithType","square_feet_num","price_num"] if c in df]
before = len(df)
df = df.drop_duplicates(subset=key_cols, keep="first").reset_index(drop=True)
after = len(df)
print(f"Duplicates removed: {before - after}")


Duplicates removed: 282


In [66]:

order = [
    "property_name","areaWithType",
    "square_feet","square_feet_num",
    "transaction","status",
    "floor","floor_num","furnishing","facing",
    "description",
    "price_per_sqft","price_per_sqft_num",
    "price","price_num"
]
df = df[[c for c in order if c in df.columns]]

In [67]:
df

Unnamed: 0,property_name,areaWithType,square_feet,square_feet_num,transaction,status,floor,floor_num,furnishing,facing,description,price_per_sqft,price_per_sqft_num,price,price_num
0,2 Bhk Apartment For Sale In Dindoli Surat,Carpet Area,644 sqft,644.0,new property,poss. by oct '24,5 out of 10,5.0,unfurnished,west,"Luxury project with basement parking, Solar ro...","₹2,891 per sqft",2891.0,₹33.8 Lac,3380000.0
1,2 Bhk Apartment For Sale In Althan Surat,Super Area,1278 sqft,1278.0,new property,poss. by jan '26,6 out of 14,6.0,unfurnished,south -west,2 And 3 BHK Luxurious Flat for Sell In New Alt...,"₹3,551 per sqft",3551.0,₹45.4 Lac,4540000.0
2,2 Bhk Apartment For Sale In Pal Gam Surat,Super Area,1173 sqft,1173.0,resale,ready to move,5 out of 13,5.0,semi-furnished,east,This affordable 2 BHK flat is situated along a...,"₹3,800 per sqft",3800.0,₹44.6 Lac,4460000.0
3,2 Bhk Apartment For Sale In Jahangirabad Surat,Carpet Area,700 sqft,700.0,new property,ready to move,6 out of 14,6.0,unfurnished,east,2 BHK Flat For sell IN Jahangirabad Prime Loca...,"₹3,966 per sqft",3966.0,₹47 Lac,4700000.0
4,"2 Bhk Apartment For Sale In Orchid Fantasia, P...",Super Area,1250 sqft,1250.0,orchid fantasia,new property,Unfurnished,5.0,2,2,"Multistorey Apartment for Sale in Palanpur, Su...","₹3,600 per sqft",3600.0,₹45 Lac,4500000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4238,6 Bhk Apartment For Sale In Millionaires Lifes...,Carpet Area,2000 sqft,2000.0,new property,poss. by dec '26,5 out of 12,5.0,unfurnished,south - east,"Check out Millionaires Lifestyle in Vesu, one ...",,4690.0,Call for Price,7100000.0
4239,"4 Bhk Apartment For Sale In Savan Superia, Alt...",Super Area,3600 sqft,3600.0,new property,poss. by dec '25,5 out of 16,5.0,unfurnished,south - east,Superia is a premium residential project launc...,,4690.0,Call for Price,7100000.0
4240,5 Bhk Apartment For Sale In Roongta Green Vall...,Carpet Area,2250 sqft,2250.0,new property,poss. by dec '25,7 out of 13,7.0,unfurnished,north - east,"When it comes to beautiful homes, nothing beat...",,4690.0,Call for Price,7100000.0
4241,"6 Bhk Apartment For Sale In Cellestial Dreams,...",Carpet Area,3450 sqft,3450.0,new property,ready to move,7 out of 18,7.0,unfurnished,north - west,"DRB Ravani Cellestial Dreams in Vesu, Surat is...",,4690.0,Call for Price,7100000.0


In [77]:
print("\n=== SUMMARY ===")
print("Shape:", df.shape)
print("Nulls after cleaning:\n", df.isna().sum())
print("Price per sqft (min/mean/max):", df["price_per_sqft_num"].min(), df["price_per_sqft_num"].mean(), df["price_per_sqft_num"].max())
print("Price (min/mean/max):", df["price_num"].min(), df["price_num"].mean(), df["price_num"].max())


=== SUMMARY ===
Shape: (4243, 10)
Nulls after cleaning:
 property_name         0
areaWithType          0
square_feet_num       0
transaction           0
status                0
floor_num             0
furnishing            0
facing                0
price_per_sqft_num    0
price_num             0
dtype: int64
Price per sqft (min/mean/max): 1.0 10004.86283290125 12500000.0
Price (min/mean/max): 100000.0 13069686.542540655 5344400000.000001


In [69]:
df.drop(columns=['description'], inplace=True)


In [78]:
df.dtypes


property_name          object
areaWithType           object
square_feet_num       float64
transaction            object
status                 object
floor_num             float64
furnishing             object
facing                 object
price_per_sqft_num    float64
price_num             float64
dtype: object

In [73]:
df.drop(columns=['square_feet'], inplace=True)

KeyError: "['square_feet'] not found in axis"

In [72]:
df.drop(columns=['price_per_sqft'], inplace=True)

In [75]:
df.drop(columns=['price'], inplace=True)

In [76]:
df.drop(columns=['floor'], inplace=True)