In [None]:
# Import required libraries
import re
import sys
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler

# Add parent directory to path so project-local modules can be imported.
# Guarded so that re-running this cell does not append a duplicate entry
# to sys.path every time.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Configure settings
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')
# NOTE(review): this suppresses every warning for the rest of the session,
# including any raised by pandas or scikit-learn in later cells. Consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

print("🔧 Feature Engineering Pipeline Started")
print(f"📅 Timestamp: {pd.Timestamp.now()}")


In [None]:
# Load raw data; fall back to a seeded synthetic sample when the CSV is absent
try:
    df_raw = pd.read_csv('../data/raw/bengaluru_house_prices.csv')
    print("✅ Raw data loaded successfully")
except FileNotFoundError:
    print("❌ Dataset not found. Creating sample data...")
    # Demonstration-only stand-in. Every draw below advances NumPy's global
    # RNG, so the order of the dictionary entries determines the values.
    np.random.seed(42)
    n_samples = 1000
    sample_data = {
        'area_type': np.random.choice(
            ['Super built-up Area', 'Built-up Area', 'Plot Area'], n_samples),
        'availability': np.random.choice(
            ['Ready To Move', '18-Jun', '19-Dec'], n_samples, p=[0.7, 0.2, 0.1]),
        'location': np.random.choice(
            ['Whitefield', 'Electronic City', 'Marathahalli', 'BTM Layout', 'Koramangala'],
            n_samples),
        'size': [f"{np.random.choice([1,2,3,4])} BHK" for _ in range(n_samples)],
        'society': [f"Society {i}" for i in np.random.randint(1, 100, n_samples)],
        'total_sqft': np.random.normal(1200, 400, n_samples),
        'bath': np.random.randint(1, 5, n_samples),
        'balcony': np.random.randint(0, 4, n_samples),
        'price': np.random.normal(80, 30, n_samples)
    }
    df_raw = pd.DataFrame(sample_data)
    # Floor the normal draws so no synthetic listing falls below these minimums
    for column, floor in (('total_sqft', 500), ('price', 20)):
        df_raw[column] = np.maximum(df_raw[column], floor)
    print("📊 Sample data created")

# Summarise what was loaded
memory_mb = df_raw.memory_usage(deep=True).sum() / 1024**2
print(f"\n📋 Dataset Info:")
print(f"Shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}")
print(f"Memory usage: {memory_mb:.2f} MB")

# Display sample
display(df_raw.head())


In [None]:
# Initial data quality assessment
# Per-column null counts are computed once and reused for every figure below.
null_counts = df_raw.isnull().sum()

print("🔍 Initial Data Quality Assessment:")
print(f"Missing values: {null_counts.sum()}")
print(f"Duplicate rows: {df_raw.duplicated().sum()}")

# One row per column: how many values are missing, as a count and a percentage
missing_summary = pd.DataFrame({
    'Column': df_raw.columns,
    'Missing_Count': null_counts,
    'Missing_Percent': (null_counts / len(df_raw)) * 100,
    'Data_Type': df_raw.dtypes
})

# Only show the columns that actually have gaps
has_missing = missing_summary['Missing_Count'] > 0
display(missing_summary[has_missing])


In [None]:
class DataCleaner:
    """Chainable data cleaning pipeline for the house price dataset.

    The cleaner works on a private copy of the frame passed to the
    constructor. Each cleaning method replaces or updates ``self.df``,
    appends a row-count entry to ``self.cleaning_log`` and returns ``self``
    so that the steps can be chained.
    """

    # Lower-cased spelling variants mapped to the canonical location name.
    # Names that are already canonical need no entry: they pass through
    # unchanged after title-casing.
    _LOCATION_ALIASES = {
        'e city': 'Electronic City',
        'ecity': 'Electronic City',
        'white field': 'Whitefield',
        'sarjapura road': 'Sarjapur Road',
    }

    # Outlier strategies implemented by remove_outliers().
    _OUTLIER_METHODS = ('iqr',)

    def __init__(self, df):
        """Store a copy of ``df`` and start with an empty cleaning log."""
        self.df = df.copy()
        self.cleaning_log = []

    def log_step(self, step, before_shape, after_shape, description):
        """Record how many rows a cleaning step removed.

        Args:
            step: Short name of the step.
            before_shape: ``DataFrame.shape`` before the step ran.
            after_shape: ``DataFrame.shape`` after the step ran.
            description: Human-readable summary of the step.
        """
        rows_before = before_shape[0]
        rows_removed = rows_before - after_shape[0]
        # An empty input frame removes nothing; avoid dividing by zero.
        percent_removed = (rows_removed / rows_before) * 100 if rows_before else 0.0
        self.cleaning_log.append({
            'Step': step,
            'Description': description,
            'Rows_Before': rows_before,
            'Rows_After': after_shape[0],
            'Rows_Removed': rows_removed,
            'Percent_Removed': percent_removed
        })

    def remove_duplicates(self):
        """Drop fully duplicated rows. Returns ``self``."""
        before_shape = self.df.shape
        self.df = self.df.drop_duplicates()
        self.log_step('Remove Duplicates', before_shape, self.df.shape,
                      'Removed duplicate rows')
        return self

    def handle_missing_values(self):
        """Drop rows missing a critical value and default missing balconies to 0.

        Returns ``self``.
        """
        before_shape = self.df.shape

        # Rows without a location, area or price cannot be used at all.
        critical_columns = ['location', 'total_sqft', 'price']
        self.df = self.df.dropna(subset=critical_columns)

        # Fill missing balcony with 0 (reasonable default)
        if 'balcony' in self.df.columns:
            self.df['balcony'] = self.df['balcony'].fillna(0)

        self.log_step('Handle Missing Values', before_shape, self.df.shape,
                      'Dropped rows with missing critical values')
        return self

    @staticmethod
    def _parse_sqft(value):
        """Convert one raw ``total_sqft`` entry to a float, or NaN if unusable.

        Accepted forms are a plain number ("1200", "1,500") and a two-sided
        range ("1000 - 1200"), which is replaced by its midpoint.
        """
        if pd.isna(value):
            return np.nan

        value_str = str(value).strip()

        # Entries that carry a unit ("1000Sq. Meter", "4125Perch") are not
        # square feet. Stripping the unit text would silently misreport the
        # area, so they are treated as invalid instead.
        if re.search(r'[A-Za-z]', value_str):
            return np.nan

        # Handle range values like "1200 - 1300"
        if '-' in value_str:
            parts = value_str.split('-')
            if len(parts) != 2:
                # Malformed range: falling through to the digit-stripping
                # path below would concatenate all the numbers into one.
                return np.nan
            try:
                lower, upper = (float(part.strip()) for part in parts)
            except ValueError:
                return np.nan
            return (lower + upper) / 2

        # Handle single values: drop separators such as "," and stray symbols
        cleaned_value = re.sub(r'[^\d.]', '', value_str)
        try:
            return float(cleaned_value) if cleaned_value else np.nan
        except ValueError:
            return np.nan

    def clean_total_sqft(self):
        """Parse ``total_sqft`` to floats and drop rows that cannot be parsed.

        Returns ``self``.
        """
        before_shape = self.df.shape

        self.df['total_sqft'] = self.df['total_sqft'].apply(self._parse_sqft)

        # Remove rows with invalid sqft values
        self.df = self.df.dropna(subset=['total_sqft'])

        self.log_step('Clean Total Sqft', before_shape, self.df.shape,
                      'Cleaned and standardized sqft values')
        return self

    @staticmethod
    def _parse_bhk(size_str):
        """Return the first integer in ``size_str`` if it lies in 1..10, else NaN."""
        if pd.isna(size_str):
            return np.nan

        # Works for strings like "2 BHK", "3BHK", "4 Bedroom"
        match = re.search(r'\d+', str(size_str).strip())
        if match is None:
            return np.nan

        bhk_num = int(match.group())
        # Reasonable bounds for BHK
        return bhk_num if 1 <= bhk_num <= 10 else np.nan

    def extract_bhk(self):
        """Add a ``bhk`` column parsed from ``size`` and drop rows where parsing fails.

        Returns ``self``.
        """
        before_shape = self.df.shape

        self.df['bhk'] = self.df['size'].apply(self._parse_bhk)

        # Remove rows where BHK extraction failed
        self.df = self.df.dropna(subset=['bhk'])

        self.log_step('Extract BHK', before_shape, self.df.shape,
                      'Extracted BHK numbers from size column')
        return self

    @classmethod
    def _normalize_location(cls, location):
        """Trim, collapse whitespace, title-case and canonicalize one location name."""
        if pd.isna(location):
            return 'Unknown'

        # Remove surrounding and repeated whitespace, then title-case
        location = re.sub(r'\s+', ' ', str(location).strip()).title()

        # Map known spelling variants to their canonical name
        return cls._LOCATION_ALIASES.get(location.lower(), location)

    def clean_location(self):
        """Standardize the ``location`` column in place. Returns ``self``."""
        before_shape = self.df.shape

        self.df['location'] = self.df['location'].apply(self._normalize_location)

        self.log_step('Clean Location', before_shape, self.df.shape,
                      'Cleaned and standardized location names')
        return self

    def remove_outliers(self, method='iqr', multiplier=1.5):
        """Drop statistical outliers, then rows that violate basic sanity rules.

        Args:
            method: Outlier strategy. Only ``'iqr'`` is implemented.
            multiplier: Width of the accepted band in IQR units beyond Q1/Q3.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``method`` is not a supported strategy. Without
                this check the step would skip the statistical filter and
                still log it as applied.
        """
        if method not in self._OUTLIER_METHODS:
            raise ValueError(
                f"Unsupported outlier method {method!r}; "
                f"expected one of {self._OUTLIER_METHODS}"
            )

        before_shape = self.df.shape

        # Columns are filtered one after another, so each column's quartiles
        # are computed on the rows that survived the previous columns.
        for col in ('price', 'total_sqft', 'bhk', 'bath'):
            if col not in self.df.columns:
                continue

            q1 = self.df[col].quantile(0.25)
            q3 = self.df[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - multiplier * iqr
            upper_bound = q3 + multiplier * iqr

            # Rows with NaN in this column fail both comparisons and are dropped.
            self.df = self.df[
                (self.df[col] >= lower_bound) &
                (self.df[col] <= upper_bound)
            ]

        # Business logic filtering
        if 'bath' in self.df.columns and 'bhk' in self.df.columns:
            # Bathrooms should not exceed BHK by more than 2
            self.df = self.df[self.df['bath'] <= self.df['bhk'] + 2]

        if 'total_sqft' in self.df.columns and 'bhk' in self.df.columns:
            # Minimum 300 sqft per room. assign() builds a new frame, which
            # avoids writing a column into a filtered slice of the old one.
            self.df = self.df.assign(
                sqft_per_room=self.df['total_sqft'] / self.df['bhk']
            )
            self.df = self.df[self.df['sqft_per_room'] >= 300]

        self.log_step('Remove Outliers', before_shape, self.df.shape,
                      f'Removed outliers using {method} method')
        return self

    def get_cleaned_data(self):
        """Return ``(cleaned_frame, cleaning_log_frame)``."""
        return self.df, pd.DataFrame(self.cleaning_log)


In [None]:
# Apply cleaning pipeline
print("🧹 Applying Data Cleaning Pipeline...")
cleaner = DataCleaner(df_raw)

# Each stage updates the cleaner's private copy and returns the cleaner
# itself, so the stages can also be run one statement at a time.
cleaner.remove_duplicates()
cleaner.handle_missing_values()
cleaner.clean_total_sqft()
cleaner.extract_bhk()
cleaner.clean_location()
cleaner.remove_outliers()
df_cleaned, cleaning_log = cleaner.get_cleaned_data()

print("\n📊 Cleaning Summary:")
display(cleaning_log)

# Share of the raw rows that survived every stage
retention_pct = len(df_cleaned)/len(df_raw)*100
print(f"\n📈 Final Dataset:")
print(f"Original shape: {df_raw.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")
print(f"Data retention: {retention_pct:.1f}%")


In [None]:
class FeatureEngineer:
    """Chainable builder of derived columns.

    Operates on a private copy of the frame given to the constructor. Every
    ``create_*`` method adds columns only when the inputs it needs are
    present, records each new column in ``feature_log`` and returns ``self``.
    """

    def __init__(self, df):
        """Copy ``df`` and start with an empty feature log."""
        self.feature_log = []
        self.df = df.copy()

    def _has(self, *names):
        """Return True when every name in ``names`` is a column of the frame."""
        return all(name in self.df.columns for name in names)

    def log_feature(self, feature_name, description):
        """Append one entry describing a newly derived column."""
        entry = {
            'Feature': feature_name,
            'Description': description,
            'Type': 'Derived'
        }
        self.feature_log.append(entry)

    def create_price_features(self):
        """Add ``price_per_sqft`` and ``price_category`` (needs price and total_sqft)."""
        if not self._has('price', 'total_sqft'):
            return self

        frame = self.df

        # Price per square foot; price is multiplied by 100000 before dividing
        frame['price_per_sqft'] = (frame['price'] * 100000) / frame['total_sqft']
        self.log_feature('price_per_sqft', 'Price per square feet in rupees')

        # Bucket the price into five labelled segments
        segment_edges = [0, 30, 60, 100, 200, float('inf')]
        segment_names = ['Budget', 'Mid-range', 'Premium', 'Luxury', 'Ultra-luxury']
        frame['price_category'] = pd.cut(
            frame['price'], bins=segment_edges, labels=segment_names
        )
        self.log_feature('price_category', 'Categorical price segments')

        return self

    def create_size_features(self):
        """Add ``sqft_per_room``, ``bath_per_bhk`` and ``total_rooms`` where possible."""
        frame = self.df

        if self._has('total_sqft', 'bhk'):
            frame['sqft_per_room'] = frame['total_sqft'] / frame['bhk']
            self.log_feature('sqft_per_room', 'Square feet per room (room size)')

        if self._has('bath', 'bhk'):
            frame['bath_per_bhk'] = frame['bath'] / frame['bhk']
            self.log_feature('bath_per_bhk', 'Bathroom to bedroom ratio')

        # Row-wise sum of bhk and bath, plus balcony when that column exists
        room_cols = ['bhk', 'bath'] + (['balcony'] if self._has('balcony') else [])
        if self._has(*room_cols):
            frame['total_rooms'] = frame[room_cols].sum(axis=1)
            self.log_feature('total_rooms', 'Total number of rooms')

        return self

    def create_location_features(self):
        """Add ``location_frequency`` and, when price exists, ``location_tier``."""
        if not self._has('location'):
            return self

        frame = self.df

        # How many rows share each row's location
        rows_per_location = frame['location'].value_counts()
        frame['location_frequency'] = frame['location'].map(rows_per_location)
        self.log_feature('location_frequency', 'Number of properties in same location')

        if not self._has('price'):
            return self

        # Three tiers split at the 0.33 and 0.67 quantiles of the
        # per-location mean price
        location_avg_price = frame.groupby('location')['price'].mean()
        cutoffs = location_avg_price.quantile([0.33, 0.67])
        mid_cutoff = cutoffs[0.33]
        premium_cutoff = cutoffs[0.67]

        def tier_for(location):
            # A location without a mean price is treated as price 0
            avg_price = location_avg_price.get(location, 0)
            if avg_price >= premium_cutoff:
                return 'Premium'
            if avg_price >= mid_cutoff:
                return 'Mid-tier'
            return 'Budget'

        frame['location_tier'] = frame['location'].apply(tier_for)
        self.log_feature('location_tier', 'Location tier based on average price')

        return self

    def create_amenity_features(self):
        """Add the ``has_balcony`` and ``is_luxury`` 0/1 indicators where possible."""
        frame = self.df

        if self._has('balcony'):
            frame['has_balcony'] = (frame['balcony'] > 0).astype(int)
            self.log_feature('has_balcony', 'Binary indicator for balcony presence')

        # One boolean Series per available criterion
        luxury_minimums = (('bhk', 4), ('bath', 3), ('total_sqft', 2000))
        criteria_met = [
            frame[column] >= minimum
            for column, minimum in luxury_minimums
            if column in frame.columns
        ]

        if criteria_met:
            # Luxury means at least two of the criteria hold
            frame['is_luxury'] = (sum(criteria_met) >= 2).astype(int)
            self.log_feature('is_luxury', 'Luxury property indicator')

        return self

    def create_efficiency_features(self):
        """Add ``price_efficiency`` and ``space_efficiency`` where possible."""
        frame = self.df

        if self._has('price_per_sqft', 'location'):
            # Each row's price per sqft relative to the mean of its location
            location_avg_psqft = frame.groupby('location')['price_per_sqft'].mean()
            frame['price_efficiency'] = (
                frame['price_per_sqft'] / frame['location'].map(location_avg_psqft)
            )
            self.log_feature('price_efficiency', 'Price relative to location average')

        if self._has('total_sqft', 'total_rooms'):
            frame['space_efficiency'] = frame['total_sqft'] / frame['total_rooms']
            self.log_feature('space_efficiency', 'Square feet per total room')

        return self

    def get_engineered_data(self):
        """Return ``(engineered_frame, feature_log_frame)``."""
        return self.df, pd.DataFrame(self.feature_log)


In [None]:
# Apply feature engineering
print("⚙️ Applying Feature Engineering Pipeline...")
engineer = FeatureEngineer(df_cleaned)

# Every create_* call returns the engineer itself, so the stages can be run
# as separate statements; the order matters because the efficiency features
# read columns produced by the price and size stages.
engineer.create_price_features()
engineer.create_size_features()
engineer.create_location_features()
engineer.create_amenity_features()
engineer.create_efficiency_features()
df_featured, feature_log = engineer.get_engineered_data()

print("\n📊 Feature Engineering Summary:")
display(feature_log)

n_before = len(df_cleaned.columns)
n_after = len(df_featured.columns)
print(f"\n📈 Feature Engineering Results:")
print(f"Original features: {n_before}")
print(f"Final features: {n_after}")
print(f"New features created: {n_after - n_before}")

# Display sample of new features (columns absent from the cleaned frame)
new_features = [col for col in df_featured.columns if col not in df_cleaned.columns]
if new_features:
    print(f"\n🆕 New Features Sample:")
    display(df_featured[new_features].head())


In [None]:
# Categorical encoding pipeline
print("🏷️ Categorical Encoding Pipeline")

# Object and category columns are the ones that get an encoded companion column
categorical_cols = list(df_featured.select_dtypes(include=['object', 'category']).columns)
print(f"Categorical columns: {categorical_cols}")

# Fitted encoders / frequency tables are collected here
encoders = {}
df_encoded = df_featured.copy()

for col in categorical_cols:
    if col not in df_encoded.columns:
        continue

    n_unique = df_encoded[col].nunique()
    print(f"\nEncoding {col}:")
    print(f"  Unique values: {n_unique}")

    if n_unique > 20:
        # High cardinality: replace each value by how often it occurs
        freq_col = f'{col}_frequency'
        freq_encoding = df_encoded[col].value_counts().to_dict()
        df_encoded[freq_col] = df_encoded[col].map(freq_encoding)
        encoders[freq_col] = freq_encoding

        print(f"  Applied Frequency Encoding (high cardinality)")
        print(f"  Frequency range: {df_encoded[freq_col].min()} to {df_encoded[freq_col].max()}")
        continue

    # At most 20 distinct values: integer label encoding of the string form
    le = LabelEncoder()
    df_encoded[f'{col}_encoded'] = le.fit_transform(df_encoded[col].astype(str))
    encoders[col] = le

    n_classes = len(le.classes_)
    print(f"  Applied Label Encoding")
    print(f"  Encoded range: 0 to {n_classes - 1}")

    # Show encoding mapping for small sets
    if n_classes <= 10:
        encoding_map = {label: code for code, label in enumerate(le.classes_)}
        print(f"  Encoding map: {encoding_map}")

print(f"\n✅ Categorical encoding completed")
print(f"Encoders created: {len(encoders)}")


In [None]:
# Feature selection for modeling
print("🎯 Feature Selection for Modeling")

# Columns that are computed from the target `price` (directly, or through a
# per-location price aggregate) and their encoded companions. They must not be
# model inputs: `price_per_sqft` combined with `total_sqft` reproduces the
# target exactly, and none of them can be computed for an unpriced listing.
target_derived_features = {
    'price_per_sqft', 'price_category', 'price_category_encoded',
    'price_efficiency', 'location_tier', 'location_tier_encoded',
}

# Define features for modeling
modeling_features = [
    # Core features
    'total_sqft', 'bhk', 'bath', 'balcony',

    # Encoded categorical features
    'location_encoded',

    # Derived features (independent of the target)
    'sqft_per_room', 'bath_per_bhk',
    'location_frequency', 'has_balcony'
]

# Guard against a target-derived column being added back to the list later
leaked_features = sorted(target_derived_features.intersection(modeling_features))
assert not leaked_features, f"Target-derived columns in modeling_features: {leaked_features}"

# Select features that exist in the dataset
available_features = [col for col in modeling_features if col in df_encoded.columns]
missing_features = [col for col in modeling_features if col not in df_encoded.columns]

print(f"Available features ({len(available_features)}): {available_features}")
if missing_features:
    print(f"Missing features ({len(missing_features)}): {missing_features}")

# Create final feature matrix
X = df_encoded[available_features].copy()
y = df_encoded['price'].copy() if 'price' in df_encoded.columns else None

print(f"\n📊 Final Feature Matrix:")
print(f"Features (X): {X.shape}")
if y is not None:
    print(f"Target (y): {y.shape}")

# Feature statistics
print(f"\n📈 Feature Statistics:")
display(X.describe().round(2))

# Check for any remaining missing values
missing_in_features = X.isnull().sum()
if missing_in_features.sum() > 0:
    print(f"\n⚠️ Missing values in features:")
    print(missing_in_features[missing_in_features > 0])

    # Fill remaining missing values
    # NOTE(review): the medians are taken over all rows, i.e. before the
    # train/test split performed in a later cell.
    X = X.fillna(X.median())
    print("✅ Filled missing values with median")

print(f"\n✅ Final feature matrix prepared successfully")


In [None]:
# Data scaling and train-test split
print("⚖️ Data Scaling and Train-Test Split")

if y is None:
    print("⚠️ No target variable found, skipping train-test split")
    X_train, X_test, y_train, y_test = X, None, None, None

    # Without a target there is nothing to split, so scale the whole matrix
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    print("✅ Applied scaling to full dataset")
else:
    split_options = {'test_size': 0.2, 'random_state': 42}

    # Prefer a split stratified on price quintiles; fall back to a plain
    # random split if the binning or the stratified split raises.
    try:
        price_bins = pd.qcut(y, q=5, labels=False, duplicates='drop')
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=price_bins, **split_options
        )
        print("✅ Stratified train-test split completed")
    except Exception as err:
        print(f"⚠️ Stratified split failed: {err}")
        print("Using random split instead")
        X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    print(f"\nTrain set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")

    # Fit the scaler on the training rows only, then apply it to both parts.
    # The results are wrapped back into DataFrames to keep labels and index.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    print(f"\n✅ Feature scaling completed")
    print(f"Scaler fitted on {X_train.shape[0]} training samples")

    # Mean and standard deviation of each training feature, before and after
    print(f"\n📊 Scaling Statistics:")
    scaling_stats = pd.DataFrame({
        'Feature': X_train.columns,
        'Original_Mean': X_train.mean(),
        'Original_Std': X_train.std(),
        'Scaled_Mean': X_train_scaled.mean(),
        'Scaled_Std': X_train_scaled.std()
    }).round(3)

    display(scaling_stats.head(10))


In [None]:
# Save processed data
print("💾 Saving Processed Data")

# Create processed data directory
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Save cleaned and featured dataset
df_featured.to_csv(processed_dir / 'processed_data.csv', index=False)
print(f"✅ Processed data saved: {processed_dir / 'processed_data.csv'}")

# Save feature matrix
if y is not None:
    # Save train-test splits. All files are written without the index, so
    # the X and y files line up by row position.
    X_train.to_csv(processed_dir / 'X_train.csv', index=False)
    X_test.to_csv(processed_dir / 'X_test.csv', index=False)
    y_train.to_csv(processed_dir / 'y_train.csv', index=False, header=['price'])
    y_test.to_csv(processed_dir / 'y_test.csv', index=False, header=['price'])

    # Save scaled versions
    X_train_scaled.to_csv(processed_dir / 'X_train_scaled.csv', index=False)
    X_test_scaled.to_csv(processed_dir / 'X_test_scaled.csv', index=False)

    print(f"✅ Train-test splits saved")
    print(f"✅ Scaled features saved")

# Save encoders and scaler using joblib (imported in the notebook's first
# cell with the other dependencies, so a missing package fails fast)
models_dir = Path('../models/trained_models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save encoders
if encoders:
    joblib.dump(encoders, models_dir / 'encoders.pkl')
    print(f"✅ Encoders saved: {models_dir / 'encoders.pkl'}")

# Save scaler (only if the scaling cell has been run in this session)
if 'scaler' in locals():
    joblib.dump(scaler, models_dir / 'feature_scaler.pkl')
    print(f"✅ Scaler saved: {models_dir / 'feature_scaler.pkl'}")

print(f"\n🎉 Feature Engineering Pipeline Completed Successfully!")
print(f"📊 Final Summary:")
print(f"   Original records: {len(df_raw):,}")
print(f"   Final records: {len(df_featured):,}")
print(f"   Data retention: {len(df_featured)/len(df_raw)*100:.1f}%")
print(f"   Original features: {len(df_raw.columns)}")
print(f"   Final features: {len(df_featured.columns)}")
print(f"   Features for modeling: {len(available_features)}")
print(f"\n✅ Ready for model training!")
