## FLAT DATASET CLEANING

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('surat_uncleaned.csv')

## DATA PREPROCESSING

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4525 entries, 0 to 4524
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   property_name   4525 non-null   object
 1   areaWithType    4525 non-null   object
 2   square_feet     4525 non-null   object
 3   transaction     4421 non-null   object
 4   status          4524 non-null   object
 5   floor           4480 non-null   object
 6   furnishing      4185 non-null   object
 7   facing          3936 non-null   object
 8   description     3154 non-null   object
 9   price_per_sqft  4157 non-null   object
 10  price           4525 non-null   object
dtypes: object(11)
memory usage: 389.0+ KB


In [4]:
df.head()

Unnamed: 0,property_name,areaWithType,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft,price
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"Luxury project with basement parking, Solar ro...","₹2,891 per sqft",₹33.8 Lac
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,"₹3,551 per sqft",₹45.4 Lac
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,"₹3,800 per sqft",₹44.6 Lac
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,"₹3,966 per sqft",₹47 Lac
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,2,"Multistorey Apartment for Sale in Palanpur, Su...","₹3,600 per sqft",₹45 Lac


In [5]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')


In [6]:
df.duplicated().sum()

np.int64(109)

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.isnull().sum()

property_name        0
areawithtype         0
square_feet          0
transaction        103
status               1
floor               45
furnishing         331
facing             563
description       1331
price_per_sqft     358
price                0
dtype: int64

In [9]:
df['transaction'].fillna(df['transaction'].mode())
df['status'].fillna(df['status'].mode())
df['floor'].fillna(df['floor'].mode)
df['furnishing'].fillna(df['furnishing'].mode())
df['facing'].fillna(df['facing'].mode())

0               West
1        South -West
2               East
3               East
4                  2
            ...     
4520    South - East
4521    South - East
4522    North - East
4523    North - West
4524           North
Name: facing, Length: 4416, dtype: object

## Feature Engineering

In [10]:
df['unit_only'] = df['square_feet'].astype(str).str.replace(r'\d+', '', regex=True).str.strip()
unique_units = df['unit_only'].unique()
unique_units

array(['sqft', 'sqm', 'sqyrd', 'Resale', ', sqft', 'Ready to Move',
       'acre', 'rood', 'ground', ', sqm'], dtype=object)

In [11]:
units = ['sqft', 'sqm', 'sqyrd', 'acre', 'rood', 'ground']

for unit in units:
    df[unit] = df['square_feet'].str.extract(r'([\d,]+)\s*' + unit, expand=False)
    df[unit] = df[unit].str.replace(',', '').astype(float)

In [12]:
df

Unnamed: 0,property_name,areawithtype,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft,price,unit_only,sqft,sqm,sqyrd,acre,rood,ground
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"Luxury project with basement parking, Solar ro...","₹2,891 per sqft",₹33.8 Lac,sqft,644.0,,,,,
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,"₹3,551 per sqft",₹45.4 Lac,sqft,1278.0,,,,,
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,"₹3,800 per sqft",₹44.6 Lac,sqft,1173.0,,,,,
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,"₹3,966 per sqft",₹47 Lac,sqft,700.0,,,,,
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,2,"Multistorey Apartment for Sale in Palanpur, Su...","₹3,600 per sqft",₹45 Lac,sqft,1250.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4520,6 BHK Apartment for Sale in Millionaires Lifes...,Carpet Area,2000 sqft,New Property,Poss. by Dec '26,5 out of 12,Unfurnished,South - East,"Check out Millionaires Lifestyle in Vesu, one ...",,Call for Price,sqft,2000.0,,,,,
4521,"4 BHK Apartment for Sale in Savan Superia, Alt...",Super Area,3600 sqft,New Property,Poss. by Dec '25,5 out of 16,Unfurnished,South - East,Superia is a premium residential project launc...,,Call for Price,sqft,3600.0,,,,,
4522,5 BHK Apartment for Sale in Roongta Green Vall...,Carpet Area,2250 sqft,New Property,Poss. by Dec '25,7 out of 13,Unfurnished,North - East,"When it comes to beautiful homes, nothing beat...",,Call for Price,sqft,2250.0,,,,,
4523,"6 BHK Apartment for Sale in Cellestial Dreams,...",Carpet Area,3450 sqft,New Property,Ready to Move,7 out of 18,Unfurnished,North - West,"DRB Ravani Cellestial Dreams in Vesu, Surat is...",,Call for Price,sqft,3450.0,,,,,


In [13]:
df['acre'].unique()

array([nan, 90.])

In [14]:
UNIT_CONVERSION = {
    'sqft': 1,
    'sqm': 10.764,
    'sqyrd': 9,
    'acre': 43560,
    'rood': 10890,
    'ground': 2400
}


df['cleaned'] = df['square_feet'].str.replace(',', '').str.strip()


pattern = r'(?P<value>\d+\.?\d*)\s*(?P<unit>' + '|'.join(UNIT_CONVERSION.keys()) + ')?$'
extracted = df['cleaned'].str.extract(pattern)
extracted['value'] = extracted['value'].astype(float)


df['total_sqft'] = np.nan

for unit in UNIT_CONVERSION:
    mask = extracted['unit'] == unit
    df.loc[mask, 'total_sqft'] = extracted.loc[mask, 'value'] * UNIT_CONVERSION[unit]

no_unit_mask = extracted['unit'].isna() & extracted['value'].notna()
df.loc[no_unit_mask, 'total_sqft'] = extracted.loc[no_unit_mask, 'value']

df = df.drop(['cleaned'], axis=1)

print(df[['square_feet', 'total_sqft']])

     square_feet  total_sqft
0       644 sqft       644.0
1      1278 sqft      1278.0
2      1173 sqft      1173.0
3       700 sqft       700.0
4      1250 sqft      1250.0
...          ...         ...
4520   2000 sqft      2000.0
4521   3600 sqft      3600.0
4522   2250 sqft      2250.0
4523   3450 sqft      3450.0
4524   4500 sqft      4500.0

[4416 rows x 2 columns]


In [15]:
columns_to_drop = ['sqft','sqm', 'sqyrd', 'acre', 'rood', 'ground', 'unit_only', 'description']
df = df.drop(columns=columns_to_drop)
df = df.drop(columns=['cleaned', 'value', 'unit'], errors='ignore')

In [16]:
df['property_name'].unique()

array(['2 BHK Apartment for Sale in Dindoli Surat',
       '2 BHK Apartment for Sale in Althan Surat',
       '2 BHK Apartment for Sale in Pal Gam Surat', ...,
       '5 BHK Apartment for Sale in Sangini Arise, Bharthana Surat',
       '5 BHK Apartment for Sale in D and M Elysium Avenue, Piplod Surat',
       '6 BHK Apartment for Sale in Millionaires Lifestyle, Vesu Surat'],
      shape=(1992,), dtype=object)

In [17]:
df['bhk'] = df['property_name'].str.extract(r'(\d+)\s*BHK').astype(float)
df = df.drop(columns=['property_name'])
print(df['bhk'].value_counts().sort_index())

bhk
1.0      337
2.0     1142
3.0     1443
4.0      644
5.0      181
6.0       39
7.0        8
8.0        6
9.0        1
10.0       2
Name: count, dtype: int64


In [18]:
df

Unnamed: 0,areawithtype,square_feet,transaction,status,floor,furnishing,facing,price_per_sqft,price,total_sqft,bhk
0,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"₹2,891 per sqft",₹33.8 Lac,644.0,2.0
1,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,"₹3,551 per sqft",₹45.4 Lac,1278.0,2.0
2,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,"₹3,800 per sqft",₹44.6 Lac,1173.0,2.0
3,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,East,"₹3,966 per sqft",₹47 Lac,700.0,2.0
4,Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,2,"₹3,600 per sqft",₹45 Lac,1250.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
4520,Carpet Area,2000 sqft,New Property,Poss. by Dec '26,5 out of 12,Unfurnished,South - East,,Call for Price,2000.0,6.0
4521,Super Area,3600 sqft,New Property,Poss. by Dec '25,5 out of 16,Unfurnished,South - East,,Call for Price,3600.0,4.0
4522,Carpet Area,2250 sqft,New Property,Poss. by Dec '25,7 out of 13,Unfurnished,North - East,,Call for Price,2250.0,5.0
4523,Carpet Area,3450 sqft,New Property,Ready to Move,7 out of 18,Unfurnished,North - West,,Call for Price,3450.0,6.0


In [19]:
df.isnull().sum()

areawithtype        0
square_feet         0
transaction       103
status              1
floor              45
furnishing        331
facing            563
price_per_sqft    358
price               0
total_sqft          6
bhk               613
dtype: int64

In [20]:
print(df['areawithtype'].value_counts())

areawithtype
Super Area     2542
Carpet Area    1626
Plot Area       207
Built Area       35
Status            4
Transaction       2
Name: count, dtype: int64


In [21]:
valid_areas = ['Super Area', 'Carpet Area', 'Plot Area', 'Built Area']
incorrect_mask = ~df['areawithtype'].isin(valid_areas)
correct_mode = df.loc[~incorrect_mask, 'areawithtype'].mode()[0]
df.loc[incorrect_mask, 'areawithtype'] = correct_mode
print("Cleaned 'areawithtype' counts:")
print(df['areawithtype'].value_counts())

Cleaned 'areawithtype' counts:
areawithtype
Super Area     2548
Carpet Area    1626
Plot Area       207
Built Area       35
Name: count, dtype: int64


In [22]:
print(df['square_feet'].value_counts())

square_feet
1000 sqft    74
800 sqft     63
1100 sqft    62
1200 sqft    54
700 sqft     53
             ..
9222 sqft     1
2338 sqft     1
2090 sqft     1
2309 sqft     1
5701 sqft     1
Name: count, Length: 1399, dtype: int64


In [23]:
ready_to_move_mask = df['square_feet'] == 'Ready to Move'
resale_mask = df['square_feet'] == 'Resale'

df.loc[ready_to_move_mask, 'status'] = 'Ready to Move'
df.loc[resale_mask, 'transaction'] = 'Resale'

df.loc[ready_to_move_mask | resale_mask, 'square_feet'] = None

print("Moved 'Ready to Move' count:", ready_to_move_mask.sum())
print("Moved 'Resale' count:", resale_mask.sum())
print("\nRemaining square_feet values:", df['square_feet'].unique())

Moved 'Ready to Move' count: 4
Moved 'Resale' count: 2

Remaining square_feet values: ['644 sqft' '1278 sqft' '1173 sqft' ... '2545 sqft' '5175 sqft'
 '2620 sqft']


In [24]:
df.drop(columns=['square_feet'], inplace=True)

In [25]:
print(df['transaction'].value_counts())

transaction
Resale                          2131
New Property                    1471
Unfurnished                      398
Semi-Furnished                    82
Furnished                         66
2                                 23
3                                 21
No                                19
Congo                             17
1                                 16
Yes                               15
5                                 10
4                                  7
Main Road                          5
Soham Elegance                     3
Orchid Gardenia                    3
6                                  3
Orchid Fantasia                    2
Jolly Residency                    2
Raghuvir Silverstone               2
Casa Rivera                        2
Main Road, Garden/Park             1
1 Covered                          1
Other                              1
10 m                               1
Shagun Residency                   1
Mahavir Nagar             

In [26]:
swap_values = ['New Property', 'Resale']
cols_to_check = ['transaction', 'floor', 'furnishing', 'facing']

for col in cols_to_check:
    swap_mask = df[col].isin(swap_values)
    df.loc[swap_mask, ['status', col]] = df.loc[swap_mask, [col, 'status']].values
    
print("Swapped counts per column:")
for col in cols_to_check:
    count = df[col].isin(swap_values).sum()
    print(f"{col}: {count} remaining swap values (should be 0)")

Swapped counts per column:
transaction: 0 remaining swap values (should be 0)
floor: 0 remaining swap values (should be 0)
furnishing: 0 remaining swap values (should be 0)
facing: 0 remaining swap values (should be 0)


In [27]:
print(df['status'].value_counts())

status
Resale                         2635
New Property                   1759
Ready to Move                    21
Const. Age New Construction       1
Name: count, dtype: int64


In [28]:
ready_mask = (
    (df['status'] == 'Ready to Move') |
    (df['floor'] == 'Ready to Move') |
    (df['furnishing'] == 'Ready to Move') |
    (df['facing'] == 'Ready to Move')
)

poss_mask = (
    df['status'].str.contains('Poss. by', na=False) |
    df['floor'].str.contains('Poss. by', na=False) |
    df['furnishing'].str.contains('Poss. by', na=False) |
    df['facing'].str.contains('Poss. by', na=False)
)

swap_mask = ready_mask | poss_mask

cols_to_check = ['status', 'floor', 'furnishing', 'facing']
for col in cols_to_check:

    col_mask = (
        (df[col] == 'Ready to Move') | 
        (df[col].str.contains('Poss. by', na=False))
    )
    df.loc[col_mask, ['transaction', col]] = df.loc[col_mask, [col, 'transaction']].values

In [29]:
target_cols = ['status', 'transaction', 'furnishing', 'facing']

swap_mask = df[target_cols].apply(
    lambda x: x.str.contains('out of|Basement', case=False, na=False, regex=True)
).any(axis=1)

for col in target_cols:
    df.loc[swap_mask, ['floor', col]] = df.loc[swap_mask, [col, 'floor']].values

In [30]:
df['floor'].unique()

array(['5 out of 10', '6 out of 14', '5 out of 13', 'Unfurnished',
       '7 out of 10', '3 out of 14', '6 out of 13', '1 out of 1',
       '1 out of 13', '4 out of 7', '3 out of 3', '2 out of 6',
       '3 out of 5', '2 out of 5', '3 out of 12', '5 out of 5',
       '1 out of 7', 'Ground out of 1', 'Lower Basement out of 7',
       '1 out of 5', '7 out of 14', '9 out of 14', '5 out of 12',
       '7 out of 13', 'Freehold', '1 out of 2', '2 out of 10',
       '12 out of 14', '10 out of 14', '2 out of 12', '1 out of 4',
       '13 out of 14', '4 out of 14', '6 out of 10', '2 out of 4',
       '3 out of 4', '3 out of 10', '9 out of 13',
       'Upper Basement out of 5', '1 out of 3', '4 out of 4',
       'Ground out of 2', 'Lower Basement out of 2', 'Ground out of 4',
       nan, 'Ground out of 5', '4 out of 10', 'Ground out of 14',
       'Ground out of 6', '5 out of 6', '8 out of 9', '4 out of 5',
       '2 out of 7', '7 out of 7', 'Ground out of 3', '4 out of 8',
       'Co-operative 

In [31]:
swap_values = ['Unfurnished', 'Furnished', 'Semi-Furnished']
cols_to_check = ['transaction', 'floor', 'status', 'facing']

for col in cols_to_check:
    swap_mask = df[col].isin(swap_values)
    df.loc[swap_mask, ['furnishing', col]] = df.loc[swap_mask, [col, 'furnishing']].values
    

In [32]:
df['furnishing'].unique()

array(['Unfurnished', 'Semi-Furnished', '1', 'Furnished', nan, '2', '3',
       '1 Covered,', '2 Covered', '1 Covered', '26 Covered', 'No', '6',
       '> 10', '4', '10 Open', '5 Covered,', '7 m', '30 m', '50 Covered,',
       'Main Road', '1 Open', '12 m', 'Yes', '2 Covered,', 'Congo',
       'Vaishnodevi Ideal Homes', '100 Covered', 'West', '5 Covered', '5',
       '50 Open', '20 m', '20 Open', '10 m', '3 Covered', '2 Open', '6 m',
       '15 m', '10 Covered,'], dtype=object)

In [33]:
df['facing'].unique()

array(['West', 'South -West', 'East', '2', 'Main Road', nan,
       'North - East', 'South - East', '1', 'Freehold', 'Garden/Park',
       '3', 'North - West', 'South', '6', 'Co-operative Society',
       'Garden/Park, Main Road', '1 Covered', 'No', '30 m', 'North',
       'Varni Siddheshwar Heights', '5', 'Salasar Icon', 'Green Valley',
       'Shyam enclave', 'Rajhans Platinum', 'Vitthal Bunglows', '1 Open',
       'Royal Nest', 'Rajhans Wings', '2 Open',
       'Gordhan Green Valley Mangalam Park Bldg S',
       'Apt Swaminagar Society', 'Swagat Clifton', 'Akash',
       'Green Arcade Phase 1 And 2', 'Satyam Tower', '4',
       'Suryanjali Residency', 'GHB Mukhya Mantri Gruh Yojana',
       'Vibrant Eco Park', 'Madhav Residency', 'Shiv Drashti Residency',
       'Gruham Luxuria', 'SHANKHESHWER COMPLAX', 'JT Stuti Highland',
       'Times Galaxy', '100 m', 'Savani Prayosha Pride',
       'Shayona Janki Residency', 'Sundaram Residency', '6 m',
       'Silicon Palm', 'Raj Abhishek City

In [34]:
allowed_values = {
    'areawithtype': ['Super Area', 'Carpet Area', 'Plot Area', 'Built Area'],
    'transaction': ['Ready to Move'] + df['transaction'][df['transaction'].str.contains('Poss. by', na=False)].tolist(),
    'status': ['New Property', 'Resale'],
    'floor': df['floor'][df['floor'].str.contains('out of|Basement', na=False, regex=True)].tolist(),
    'furnishing': ['Unfurnished', 'Furnished', 'Semi-Furnished']
}

for col, valid in allowed_values.items():
    if col == 'facing': 
        continue

    invalid_mask = ~df[col].isin(valid)

    df.loc[invalid_mask, ['facing', col]] = df.loc[invalid_mask, [col, 'facing']].values

In [35]:
df['facing'] = df['facing'].astype(str)

contains_numbers = df['facing'].str.contains(r'\d', regex=True)

non_numeric = df[~contains_numbers]['facing']
facing_mode = non_numeric.mode()[0] if not non_numeric.empty else 'Unknown'

df.loc[contains_numbers, 'facing'] = facing_mode


print(f"Replaced {contains_numbers.sum()} entries containing numbers with mode: '{facing_mode}'")
print("\nCleaned 'facing' values:")
print(df['facing'].unique())

Replaced 1027 entries containing numbers with mode: 'East'

Cleaned 'facing' values:
['West' 'South -West' 'East' 'Anand Aspire' 'North - East' 'South - East'
 'nan' 'Main Road' 'North - West' 'Freehold' 'South'
 'Co-operative Society' 'Garden/Park, Main Road' 'Garden/Park' 'No'
 'North' 'Varni Siddheshwar Heights' 'Salasar Icon' 'Green Valley'
 'Shyam enclave' 'Rajhans Platinum' 'Vitthal Bunglows' 'Royal Nest'
 'Rajhans Wings' 'Gordhan Green Valley Mangalam Park Bldg S'
 'Apt Swaminagar Society' 'Swagat Clifton' 'Satyam Tower'
 'Suryanjali Residency' 'GHB Mukhya Mantri Gruh Yojana' 'Madhav Residency'
 'Shiv Drashti Residency' 'Gruham Luxuria' 'SHANKHESHWER COMPLAX'
 'JT Stuti Highland' 'Times Galaxy' 'Savani Prayosha Pride'
 'Nilkanth Seven Homes' 'Shayona Janki Residency' 'Sundaram Residency'
 'Silicon Palm' 'Aagam Navkar' 'Raj Abhishek City Homes'
 'Dhvani Royal Residency' 'Residency' 'Vinayak Enclave' 'Pan Sharanam'
 'Pool, Garden/Park, Main Road' 'Rajhans Swapna' 'Panchtatva Resid

In [36]:
valid_directions = [
    'East', 'South -West', 'North - East', 'West', 
    'North', 'South - East', 'South', 'North - West'
]


df['facing'] = df['facing'].str.strip()

facing_mode = df[df['facing'].isin(valid_directions)]['facing'].mode()[0]

invalid_mask = ~df['facing'].isin(valid_directions)
df.loc[invalid_mask, 'facing'] = facing_mode

print(f"Replaced {invalid_mask.sum()} invalid entries with mode: '{facing_mode}'")
print("\nFinal facing distribution:")
print(df['facing'].value_counts())

Replaced 739 invalid entries with mode: 'East'

Final facing distribution:
facing
East            3475
South -West      358
North - East     245
West             108
North            107
South - East      56
South             40
North - West      27
Name: count, dtype: int64


In [37]:

def extract_components(s):
    if pd.isna(s) or not str(s).strip():
        return (None, None, None)
    
    s = str(s).strip()
    
    currency_pattern = r'^([₹$€£¥₽₩₤₭֏؋ƒ₡₢₣₤₥₦₧₨₩₪₫₭₮₯₰₱₲₳₴₵₶₷₸₹₺₼₽₾₿ℳ元円圆圓﷼¤]|CA\$|US\$|A\$|C\$)'
    currency = re.match(currency_pattern, s)
    currency = currency.group(1) if currency else None

    value = re.search(r'([0-9,\.]+)', s.replace(' ', ''))
    value = value.group(1).replace(',', '') if value else None
    
    unit_pattern = r'(?:per\s*(?:sq|square)\s*[\.\-\s]*(?:ft|foot|feet|meter|m|m²|m2)|psf|pm|psm)'
    unit = re.search(unit_pattern, s, re.IGNORECASE)
    unit = unit.group().lower() if unit else None
    
    return (currency, value, unit)


components = df['price_per_sqft'].apply(extract_components)
df[['currency', 'numeric_value', 'unit']] = pd.DataFrame(components.tolist(), index=df.index)

df['price_sqft'] = pd.to_numeric(df['numeric_value'], errors='coerce')

unit_map = {
    'per sq.ft.': 'per sqft',
    'per square foot': 'per sqft',
    'psf': 'per sqft',
    'per sqm': 'per sqm',
    'per square meter': 'per sqm',
    'psm': 'per sqm',
    'pm': 'per sqm'
}
df['unit'] = df['unit'].map(unit_map)

In [38]:
df

Unnamed: 0,areawithtype,transaction,status,floor,furnishing,facing,price_per_sqft,price,total_sqft,bhk,currency,numeric_value,unit,price_sqft
0,Carpet Area,Poss. by Oct '24,New Property,5 out of 10,Unfurnished,West,"₹2,891 per sqft",₹33.8 Lac,644.0,2.0,₹,2891,,2891.0
1,Super Area,Poss. by Jan '26,New Property,6 out of 14,Unfurnished,South -West,"₹3,551 per sqft",₹45.4 Lac,1278.0,2.0,₹,3551,,3551.0
2,Super Area,Ready to Move,Resale,5 out of 13,Semi-Furnished,East,"₹3,800 per sqft",₹44.6 Lac,1173.0,2.0,₹,3800,,3800.0
3,Carpet Area,Ready to Move,New Property,6 out of 14,Unfurnished,East,"₹3,966 per sqft",₹47 Lac,700.0,2.0,₹,3966,,3966.0
4,Super Area,2,New Property,Orchid Fantasia,Unfurnished,East,"₹3,600 per sqft",₹45 Lac,1250.0,2.0,₹,3600,,3600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4520,Carpet Area,Poss. by Dec '26,New Property,5 out of 12,Unfurnished,South - East,,Call for Price,2000.0,6.0,,,,
4521,Super Area,Poss. by Dec '25,New Property,5 out of 16,Unfurnished,South - East,,Call for Price,3600.0,4.0,,,,
4522,Carpet Area,Poss. by Dec '25,New Property,7 out of 13,Unfurnished,North - East,,Call for Price,2250.0,5.0,,,,
4523,Carpet Area,Ready to Move,New Property,7 out of 18,Unfurnished,North - West,,Call for Price,3450.0,6.0,,,,


In [39]:
df.drop('price_per_sqft', axis=1, inplace=True)
df.drop('currency', axis=1, inplace=True)
df.drop('numeric_value', axis=1, inplace=True)
df.drop('unit', axis=1, inplace=True)

In [40]:
df

Unnamed: 0,areawithtype,transaction,status,floor,furnishing,facing,price,total_sqft,bhk,price_sqft
0,Carpet Area,Poss. by Oct '24,New Property,5 out of 10,Unfurnished,West,₹33.8 Lac,644.0,2.0,2891.0
1,Super Area,Poss. by Jan '26,New Property,6 out of 14,Unfurnished,South -West,₹45.4 Lac,1278.0,2.0,3551.0
2,Super Area,Ready to Move,Resale,5 out of 13,Semi-Furnished,East,₹44.6 Lac,1173.0,2.0,3800.0
3,Carpet Area,Ready to Move,New Property,6 out of 14,Unfurnished,East,₹47 Lac,700.0,2.0,3966.0
4,Super Area,2,New Property,Orchid Fantasia,Unfurnished,East,₹45 Lac,1250.0,2.0,3600.0
...,...,...,...,...,...,...,...,...,...,...
4520,Carpet Area,Poss. by Dec '26,New Property,5 out of 12,Unfurnished,South - East,Call for Price,2000.0,6.0,
4521,Super Area,Poss. by Dec '25,New Property,5 out of 16,Unfurnished,South - East,Call for Price,3600.0,4.0,
4522,Carpet Area,Poss. by Dec '25,New Property,7 out of 13,Unfurnished,North - East,Call for Price,2250.0,5.0,
4523,Carpet Area,Ready to Move,New Property,7 out of 18,Unfurnished,North - West,Call for Price,3450.0,6.0,


In [41]:
print(df['price'].value_counts())

price
Call for Price    172
₹35 Lac            79
₹65 Lac            73
₹45 Lac            70
₹40 Lac            69
                 ... 
₹2.57 Cr            1
₹3.02 Cr            1
₹2.79 Cr            1
₹16.50 Cr           1
₹4.05 Cr            1
Name: count, Length: 841, dtype: int64


In [42]:
df['price'].unique()

array(['₹33.8 Lac ', '₹45.4 Lac ', '₹44.6 Lac ', '₹47 Lac ', '₹45 Lac ',
       '₹43.2 Lac ', '₹42.1 Lac ', '₹44.1 Lac ', '₹44.3 Lac ', '₹40 Lac ',
       '₹44 Lac ', '₹12.5 Lac ', '₹50 Lac ', '₹35.8 Lac ', '₹26.5 Lac ',
       '₹42 Lac ', '₹43 Lac ', '₹20 Lac ', '₹37 Lac ', '₹29 Lac ',
       '₹38 Lac ', '₹13 Lac ', '₹46.5 Lac ', '₹41.6 Lac ', '₹33.5 Lac ',
       '₹42.5 Lac ', '₹48 Lac ', '₹32 Lac ', '₹35 Lac ', '₹46.2 Lac ',
       '₹38.5 Lac ', '₹23 Lac ', '₹24 Lac ', '₹28 Lac ', '₹35.7 Lac ',
       '₹36.5 Lac ', '₹36 Lac ', '₹43.6 Lac ', '₹25 Lac ', '₹41 Lac ',
       '₹28.5 Lac ', '₹9 Lac ', '₹30 Lac ', '₹39.5 Lac ', '₹21.5 Lac ',
       '₹10.5 Lac ', '₹18 Lac ', '₹10 Lac ', '₹23.5 Lac ', '₹16.3 Lac ',
       '₹13.5 Lac ', '₹49 Lac ', '₹21 Lac ', '₹39 Lac ', '₹16 Lac ',
       '₹27 Lac ', '₹15 Lac ', '₹39.9 Lac ', '₹7.5 Lac ', '₹12.8 Lac ',
       '₹9.5 Lac ', '₹31.5 Lac ', '₹46 Lac ', '₹6 Lac ', '₹34 Lac ',
       '₹22 Lac ', '₹19 Lac ', '₹8.5 Lac ', '₹11 Lac ', '₹16.5 Lac ',
 

In [43]:

df['unit'] = df['price'].str.extract(r'(?:₹|\$|Rs?\.?)\s*[\d.,]+\s*([A-Za-z]+)\b', flags=re.IGNORECASE)

# ------------------------------------------------------------
# STEP 2: Show COMPLETE frequency analysis
print("🔍 ALL UNIT TYPES FOUND:")
unit_counts = df['unit'].value_counts(dropna=False)
print(unit_counts)

# ------------------------------------------------------------
# STEP 3: Display EXAMPLES OF EACH UNIT
print("\n📝 Sample Entries for Each Unit:")
for unit in unit_counts.index:
    if pd.isna(unit):
        samples = df[df['unit'].isna()]['price'].head(2).tolist()
        print(f"• No unit: {samples}")
    else:
        samples = df[df['unit'].str.lower() == unit.lower()]['price'].head(2).tolist()
        print(f"• {unit}: {samples}")

🔍 ALL UNIT TYPES FOUND:
unit
Lac    2757
Cr     1487
NaN     172
Name: count, dtype: int64

📝 Sample Entries for Each Unit:
• Lac: ['₹33.8 Lac ', '₹45.4 Lac ']
• Cr: ['₹1 Cr ', '₹1 Cr ']
• No unit: ['Call for Price', 'Call for Price']


In [44]:

CR_TO_LAC = 100  # 1 Cr = 100 Lac

# Step 1: Create clean 'price_lac' column directly
def convert_to_lac(price):
    if pd.isna(price) or str(price).strip() == '':
        return np.nan
    
    price = str(price).strip()
    
    # Handle 'Call for Price'
    if 'Call' in price:
        return median_lac  # Will be set later
    
    # Extract numeric value
    try:
        num = float(re.sub(r'[^\d.]', '', price))
    except:
        return np.nan
    
    # Convert Cr to Lac
    if 'Cr' in price:
        num *= CR_TO_LAC
    
    return round(num, 2)

# Calculate median from Lac prices first (excluding Cr/Call prices)
valid_lac_mask = df['price'].str.contains('Lac', na=False)
median_lac = (df[valid_lac_mask]['price']
              .str.replace(r'[^\d.]', '', regex=True)
              .astype(float)
              .median())

# Apply conversion
df['price_lac'] = df['price'].apply(convert_to_lac)

# Step 2: Remove original and intermediate columns
df.drop(columns=['price', 'unit', 'numeric_value'], errors='ignore', inplace=True)

# Step 3: Remove unconvertible rows
df = df[df['price_lac'].notna()]

print("✅ Final Clean DataFrame:")
print(df.head())

✅ Final Clean DataFrame:
  areawithtype       transaction        status            floor  \
0  Carpet Area  Poss. by Oct '24  New Property      5 out of 10   
1   Super Area  Poss. by Jan '26  New Property      6 out of 14   
2   Super Area     Ready to Move        Resale      5 out of 13   
3  Carpet Area     Ready to Move  New Property      6 out of 14   
4   Super Area                 2  New Property  Orchid Fantasia   

       furnishing       facing  total_sqft  bhk  price_sqft  price_lac  
0     Unfurnished         West       644.0  2.0      2891.0       33.8  
1     Unfurnished  South -West      1278.0  2.0      3551.0       45.4  
2  Semi-Furnished         East      1173.0  2.0      3800.0       44.6  
3     Unfurnished         East       700.0  2.0      3966.0       47.0  
4     Unfurnished         East      1250.0  2.0      3600.0       45.0  


In [45]:
df

Unnamed: 0,areawithtype,transaction,status,floor,furnishing,facing,total_sqft,bhk,price_sqft,price_lac
0,Carpet Area,Poss. by Oct '24,New Property,5 out of 10,Unfurnished,West,644.0,2.0,2891.0,33.8
1,Super Area,Poss. by Jan '26,New Property,6 out of 14,Unfurnished,South -West,1278.0,2.0,3551.0,45.4
2,Super Area,Ready to Move,Resale,5 out of 13,Semi-Furnished,East,1173.0,2.0,3800.0,44.6
3,Carpet Area,Ready to Move,New Property,6 out of 14,Unfurnished,East,700.0,2.0,3966.0,47.0
4,Super Area,2,New Property,Orchid Fantasia,Unfurnished,East,1250.0,2.0,3600.0,45.0
...,...,...,...,...,...,...,...,...,...,...
4520,Carpet Area,Poss. by Dec '26,New Property,5 out of 12,Unfurnished,South - East,2000.0,6.0,,50.9
4521,Super Area,Poss. by Dec '25,New Property,5 out of 16,Unfurnished,South - East,3600.0,4.0,,50.9
4522,Carpet Area,Poss. by Dec '25,New Property,7 out of 13,Unfurnished,North - East,2250.0,5.0,,50.9
4523,Carpet Area,Ready to Move,New Property,7 out of 18,Unfurnished,North - West,3450.0,6.0,,50.9


In [46]:
median_lac = df['price_sqft'].median()
df['price_sqft'] = df['price_sqft'].fillna(median_lac)

In [47]:
df

Unnamed: 0,areawithtype,transaction,status,floor,furnishing,facing,total_sqft,bhk,price_sqft,price_lac
0,Carpet Area,Poss. by Oct '24,New Property,5 out of 10,Unfurnished,West,644.0,2.0,2891.0,33.8
1,Super Area,Poss. by Jan '26,New Property,6 out of 14,Unfurnished,South -West,1278.0,2.0,3551.0,45.4
2,Super Area,Ready to Move,Resale,5 out of 13,Semi-Furnished,East,1173.0,2.0,3800.0,44.6
3,Carpet Area,Ready to Move,New Property,6 out of 14,Unfurnished,East,700.0,2.0,3966.0,47.0
4,Super Area,2,New Property,Orchid Fantasia,Unfurnished,East,1250.0,2.0,3600.0,45.0
...,...,...,...,...,...,...,...,...,...,...
4520,Carpet Area,Poss. by Dec '26,New Property,5 out of 12,Unfurnished,South - East,2000.0,6.0,4697.5,50.9
4521,Super Area,Poss. by Dec '25,New Property,5 out of 16,Unfurnished,South - East,3600.0,4.0,4697.5,50.9
4522,Carpet Area,Poss. by Dec '25,New Property,7 out of 13,Unfurnished,North - East,2250.0,5.0,4697.5,50.9
4523,Carpet Area,Ready to Move,New Property,7 out of 18,Unfurnished,North - West,3450.0,6.0,4697.5,50.9


In [48]:
df.isnull().sum()

areawithtype      0
transaction     142
status           19
floor           164
furnishing      385
facing            0
total_sqft        6
bhk             613
price_sqft        0
price_lac         0
dtype: int64

In [49]:
df['transaction'] = df['transaction'].fillna(df['transaction'].mode()[0])

df['status'] = df['status'].fillna('Unknown')

df['floor'] = df['floor'].fillna(df['floor'].mode()[0])

df['furnishing'] = df['furnishing'].fillna(df['furnishing'].mode()[0])

df['total_sqft'] = df.groupby('bhk')['total_sqft'].transform(
    lambda x: x.fillna(x.median())
)
df['bhk'] = df.groupby('total_sqft')['bhk'].transform(
    lambda x: x.fillna(x.median())
)

overall_sqft_median = df['total_sqft'].median()
overall_bhk_median = df['bhk'].median()

df['total_sqft'] = df['total_sqft'].fillna(overall_sqft_median)
df['bhk'] = df['bhk'].fillna(overall_bhk_median)


In [50]:
df.isnull().sum()

areawithtype    0
transaction     0
status          0
floor           0
furnishing      0
facing          0
total_sqft      0
bhk             0
price_sqft      0
price_lac       0
dtype: int64

In [51]:
df

Unnamed: 0,areawithtype,transaction,status,floor,furnishing,facing,total_sqft,bhk,price_sqft,price_lac
0,Carpet Area,Poss. by Oct '24,New Property,5 out of 10,Unfurnished,West,644.0,2.0,2891.0,33.8
1,Super Area,Poss. by Jan '26,New Property,6 out of 14,Unfurnished,South -West,1278.0,2.0,3551.0,45.4
2,Super Area,Ready to Move,Resale,5 out of 13,Semi-Furnished,East,1173.0,2.0,3800.0,44.6
3,Carpet Area,Ready to Move,New Property,6 out of 14,Unfurnished,East,700.0,2.0,3966.0,47.0
4,Super Area,2,New Property,Orchid Fantasia,Unfurnished,East,1250.0,2.0,3600.0,45.0
...,...,...,...,...,...,...,...,...,...,...
4520,Carpet Area,Poss. by Dec '26,New Property,5 out of 12,Unfurnished,South - East,2000.0,6.0,4697.5,50.9
4521,Super Area,Poss. by Dec '25,New Property,5 out of 16,Unfurnished,South - East,3600.0,4.0,4697.5,50.9
4522,Carpet Area,Poss. by Dec '25,New Property,7 out of 13,Unfurnished,North - East,2250.0,5.0,4697.5,50.9
4523,Carpet Area,Ready to Move,New Property,7 out of 18,Unfurnished,North - West,3450.0,6.0,4697.5,50.9


## Ecoding and Scaling

In [53]:
import pandas as pd

categorical_cols = [
    'areawithtype', 
    'transaction', 
    'status', 
    'floor', 
    'furnishing', 
    'facing'
]

df = pd.get_dummies(
    df, 
    columns=categorical_cols,
    prefix=categorical_cols,
    drop_first=True
)

In [54]:
from sklearn.preprocessing import StandardScaler
numeric_cols = ['total_sqft', 'bhk', 'price_sqft',]
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4416 entries, 0 to 4524
Columns: 412 entries, total_sqft to facing_West
dtypes: bool(408), float64(4)
memory usage: 1.9 MB


In [58]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define features (X) and target (y) from the DataFrame
X = df.drop('price_lac', axis=1)
y = df['price_lac']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model (metrics explained later)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("--- Linear Regression Results ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

--- Linear Regression Results ---
Mean Squared Error (MSE): 70024.54
R-squared (R2): -3.91
