In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Set styling
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


In [2]:
# Load Cleaned Data
df = pd.read_csv('../data/processed/cleaned_real_estate_data.csv')

print(f"üìÅ Dataset loaded: {df.shape[0]} rows √ó {df.shape[1]} columns")
print("\nüìã Columns:")
print(df.columns.tolist())
print("\nüîç First 5 rows:")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_real_estate_data.csv'

## üí∞ Feature 1: Price per Square Foot

In [3]:
# Calculate Price per Sq.Ft (converting Lakhs to Rupees)
df['Price_Per_SqFt'] = (df['Price_Lakhs'] * 100000) / df['Area_SqFt']

print("üí∞ Price per Square Foot Statistics:")
print(df['Price_Per_SqFt'].describe())

# Visualize distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram
axes[0].hist(df['Price_Per_SqFt'], bins=50, color='skyblue', edgecolor='black')
axes[0].set_xlabel('Price per Sq.Ft (‚Çπ)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Price per Square Foot')
axes[0].axvline(df['Price_Per_SqFt'].median(), color='red', linestyle='--', label=f'Median: ‚Çπ{df["Price_Per_SqFt"].median():.0f}')
axes[0].legend()

# Box plot
axes[1].boxplot(df['Price_Per_SqFt'].dropna(), vert=True)
axes[1].set_ylabel('Price per Sq.Ft (‚Çπ)')
axes[1].set_title('Price per Sq.Ft - Box Plot')

plt.tight_layout()
plt.show()

print(f"\n‚úÖ Price_Per_SqFt feature created")

NameError: name 'df' is not defined

## üìè Feature 2: Area Categories

In [4]:
# Define area categories
def categorize_area(area):
    if area < 800:
        return 'Small'
    elif area < 1500:
        return 'Medium'
    elif area < 2500:
        return 'Large'
    else:
        return 'Extra Large'

df['Area_Category'] = df['Area_SqFt'].apply(categorize_area)

print("üìè Area Category Distribution:")
print(df['Area_Category'].value_counts())

# Visualize
plt.figure(figsize=(10, 6))
area_counts = df['Area_Category'].value_counts()
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
plt.pie(area_counts.values, labels=area_counts.index, autopct='%1.1f%%', 
        colors=colors, startangle=90)
plt.title('Property Distribution by Area Category', fontsize=14, fontweight='bold')
plt.axis('equal')
plt.show()

print(f"\n‚úÖ Area_Category feature created")

NameError: name 'df' is not defined

## üè∑Ô∏è Feature 3: Property Type Classification

In [5]:
# Classify based on BHK
def classify_property_type(bhk):
    if bhk == 1:
        return 'Studio/1BHK'
    elif bhk == 2:
        return '2BHK'
    elif bhk == 3:
        return '3BHK'
    elif bhk == 4:
        return '4BHK'
    else:
        return 'Luxury (5+ BHK)'

df['Property_Type'] = df['BHK'].apply(classify_property_type)

print("üè∑Ô∏è Property Type Distribution:")
print(df['Property_Type'].value_counts())

# Visualize
plt.figure(figsize=(12, 6))
prop_counts = df['Property_Type'].value_counts().sort_index()
bars = plt.bar(prop_counts.index, prop_counts.values, color='teal', edgecolor='black', alpha=0.7)
plt.xlabel('Property Type', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Property Distribution by Type', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}',
            ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

print(f"\n‚úÖ Property_Type feature created")

NameError: name 'df' is not defined

## üíé Feature 4: Price Segments

In [6]:
# Create price segments
def categorize_price(price):
    if price < 50:
        return 'Budget (< 50L)'
    elif price < 100:
        return 'Affordable (50L-1Cr)'
    elif price < 200:
        return 'Premium (1-2Cr)'
    else:
        return 'Luxury (> 2Cr)'

df['Price_Segment'] = df['Price_Lakhs'].apply(categorize_price)

print("üíé Price Segment Distribution:")
print(df['Price_Segment'].value_counts())

# Visualize
plt.figure(figsize=(12, 6))
segment_order = ['Budget (< 50L)', 'Affordable (50L-1Cr)', 'Premium (1-2Cr)', 'Luxury (> 2Cr)']
segment_counts = df['Price_Segment'].value_counts().reindex(segment_order)
colors_gradient = ['#52B788', '#74C69D', '#95D5B2', '#B7E4C7']
bars = plt.bar(segment_counts.index, segment_counts.values, color=colors_gradient, edgecolor='black')
plt.xlabel('Price Segment', fontsize=12)
plt.ylabel('Number of Properties', fontsize=12)
plt.title('Property Distribution by Price Segment', fontsize=14, fontweight='bold')
plt.xticks(rotation=15)

for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}',
            ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

print(f"\n‚úÖ Price_Segment feature created")

NameError: name 'df' is not defined

## üó∫Ô∏è Feature 5: Locality-Based Features

In [7]:
# Calculate property count per locality (this is OK - no data leakage)
locality_property_count = df.groupby('Locality').size().to_dict()
df['Locality_Property_Count'] = df['Locality'].map(locality_property_count)

print("üó∫Ô∏è Locality-Based Features Summary:")
print(f"   Locality_Property_Count: {df['Locality_Property_Count'].describe()}")

# Top 10 localities by property count
print("\nüèôÔ∏è Top 10 Localities by Property Count:")
top_localities = df['Locality'].value_counts().head(10)
print(top_localities)

print(f"\n‚úÖ Locality-based features created")
print("‚ö†Ô∏è  NOTE: Removed Locality_Avg_Price features to prevent data leakage")

NameError: name 'df' is not defined

## üî¢ Feature 6: Bathroom to BHK Ratio

In [8]:
# Calculate bathroom to BHK ratio
df['Bathroom_BHK_Ratio'] = df['Bathrooms'] / df['BHK']

print("üî¢ Bathroom to BHK Ratio Statistics:")
print(df['Bathroom_BHK_Ratio'].describe())

# Visualize
plt.figure(figsize=(10, 6))
plt.hist(df['Bathroom_BHK_Ratio'].dropna(), bins=20, color='coral', edgecolor='black')
plt.xlabel('Bathroom to BHK Ratio', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Bathroom to BHK Ratio', fontsize=14, fontweight='bold')
plt.axvline(df['Bathroom_BHK_Ratio'].mean(), color='red', linestyle='--', 
            label=f'Mean: {df["Bathroom_BHK_Ratio"].mean():.2f}')
plt.legend()
plt.tight_layout()
plt.show()

print(f"\n‚úÖ Bathroom_BHK_Ratio feature created")

NameError: name 'df' is not defined

## üìä Feature 7: Area per Bedroom

In [9]:
# Calculate area per bedroom
df['Area_Per_Bedroom'] = df['Area_SqFt'] / df['BHK']

# Classify space quality
def classify_space_quality(area_per_bed):
    if area_per_bed < 400:
        return 'Compact'
    elif area_per_bed < 600:
        return 'Standard'
    elif area_per_bed < 800:
        return 'Spacious'
    else:
        return 'Very Spacious'

df['Space_Quality'] = df['Area_Per_Bedroom'].apply(classify_space_quality)

print("üìä Area per Bedroom Statistics:")
print(df['Area_Per_Bedroom'].describe())
print("\nüè† Space Quality Distribution:")
print(df['Space_Quality'].value_counts())

print(f"\n‚úÖ Area_Per_Bedroom and Space_Quality features created")

NameError: name 'df' is not defined

## üî§ Feature 9: Encode Categorical Variables

In [None]:
# Create encoded versions for machine learning
label_encoders = {}

categorical_columns = ['Furnishing', 'Area_Category', 'Property_Type', 
                      'Price_Segment', 'Space_Quality']

for col in categorical_columns:
    if col in df.columns:
        le = LabelEncoder()
        df[f'{col}_Encoded'] = le.fit_transform(df[col])
        label_encoders[col] = le
        
        print(f"\n{col} Encoding:")
        for i, class_name in enumerate(le.classes_):
            print(f"   {class_name}: {i}")

print(f"\n‚úÖ Categorical variables encoded")

NameError: name 'df' is not defined

## üèôÔ∏è Feature 10: Top Locality Flag

In [11]:
# Identify top 20 localities by property count
top_localities = df['Locality'].value_counts().head(20).index.tolist()
df['Is_Top_Locality'] = df['Locality'].isin(top_localities).astype(int)

print("üèôÔ∏è Top 20 Localities:")
print(df['Locality'].value_counts().head(20))
print(f"\n‚úÖ Is_Top_Locality feature created")
print(f"   Properties in top localities: {df['Is_Top_Locality'].sum()}")
print(f"   Properties in other localities: {(df['Is_Top_Locality'] == 0).sum()}")

NameError: name 'df' is not defined

## üìã Summary: All Features Created

In [12]:
print("="*70)
print("FEATURE ENGINEERING SUMMARY")
print("="*70)

new_features = [
    'Price_Per_SqFt',
    'Area_Category',
    'Property_Type',
    'Price_Segment',
    'Locality_Property_Count',
    'Bathroom_BHK_Ratio',
    'Area_Per_Bedroom',
    'Space_Quality',
    'Is_Top_Locality'
]

print(f"\n‚úÖ Total New Features Created: {len(new_features)}")
print("\nüìã New Feature List:")
for i, feature in enumerate(new_features, 1):
    print(f"   {i}. {feature}")

print(f"\nüìä Final Dataset Shape: {df.shape}")
print(f"   Original columns: 10")
print(f"   Total columns now: {df.shape[1]}")
print(f"   New features added: {df.shape[1] - 10}")

print("\n‚ö†Ô∏è  DATA LEAKAGE PREVENTION:")
print("   ‚ùå Removed: Locality_Avg_Price (derived from target)")
print("   ‚ùå Removed: Locality_Avg_PriceSqFt (derived from target)")
print("   ‚ùå Removed: Value_Score (derived from target)")
print("   ‚ùå Removed: Locality_Price_Category (derived from target)")
print("   ‚úÖ These features would artificially inflate model performance!")

FEATURE ENGINEERING SUMMARY

‚úÖ Total New Features Created: 9

üìã New Feature List:
   1. Price_Per_SqFt
   2. Area_Category
   3. Property_Type
   4. Price_Segment
   5. Locality_Property_Count
   6. Bathroom_BHK_Ratio
   7. Area_Per_Bedroom
   8. Space_Quality
   9. Is_Top_Locality


NameError: name 'df' is not defined

## üíæ Export Enhanced Dataset

In [None]:
# Export full feature-engineered dataset
output_file = '../data/processed/featured_real_estate_data.csv'
df.to_csv(output_file, index=False)

print(f"\nüíæ Feature-engineered dataset saved: {output_file}")
print(f"   Shape: {df.shape}")
print(f"\n‚úÖ FEATURE ENGINEERING COMPLETED SUCCESSFULLY!")

# Preview
print("\nüëÄ Preview of Enhanced Data:")
display_cols = ['Property_Title', 'Price_Lakhs', 'Area_SqFt', 'BHK', 'Price_Per_SqFt', 
                'Area_Category', 'Property_Type', 'Price_Segment']
df[display_cols].head(10)

NameError: name 'df' is not defined

## üìä Quick Feature Correlation Analysis

In [14]:
# Correlation with price
numeric_features = ['Area_SqFt', 'BHK', 'Bathrooms', 'Price_Per_SqFt', 
                   'Bathroom_BHK_Ratio', 'Area_Per_Bedroom', 'Is_Top_Locality']

correlations = df[numeric_features + ['Price_Lakhs']].corr()['Price_Lakhs'].sort_values(ascending=False)

print("\nüîó Feature Correlation with Price:")
print(correlations)

# Visualize
plt.figure(figsize=(10, 8))
correlations_sorted = correlations.drop('Price_Lakhs').sort_values()
colors = ['red' if x < 0 else 'green' for x in correlations_sorted.values]
plt.barh(range(len(correlations_sorted)), correlations_sorted.values, color=colors, alpha=0.7)
plt.yticks(range(len(correlations_sorted)), correlations_sorted.index)
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.title('Feature Correlation with Price (No Data Leakage)', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
plt.tight_layout()
plt.show()

print("\n‚úÖ Clean features with no data leakage!")

NameError: name 'df' is not defined