In [1]:
# House Price Prediction - Complete and Improved Code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("="*60)
print("HOUSE PRICE PREDICTION MODEL")
print("="*60)

# ============================================================================
# 1. LOAD THE DATASET
# ============================================================================
print("\n[1/12] Loading Dataset...")
df = pd.read_csv(r"D:\PROJECT\Daily-Python-Projects\house-price-predicition\House Price India.csv")
print(f"✓ Dataset Loaded Successfully: {df.shape[0]} rows, {df.shape[1]} columns")

# ============================================================================
# 2. DATA CLEANING
# ============================================================================
print("\n[2/12] Cleaning Data...")
# Strip spaces from column names
df.columns = df.columns.str.strip()

# Rename columns for convenience
df.rename(columns={
    'number of bedrooms': 'bedrooms',
    'number of bathrooms': 'bathrooms',
    'living area': 'living_area',
    'lot area': 'lot_area',
    'Built Year': 'built_year',
    'Renovation Year': 'renovation_year',
    'Area of the house(excluding basement)': 'area_no_basement',
    'Area of the basement': 'basement_area',
    'Distance from the airport': 'distance_airport',
    'Number of schools nearby': 'schools_nearby'
}, inplace=True)
print("✓ Column names cleaned and standardized")

# ============================================================================
# 3. EXPLORATORY DATA ANALYSIS
# ============================================================================
print("\n[3/12] Performing Exploratory Data Analysis...")
print("\nDataset Info:")
print(f"  - Total Records: {len(df)}")
print(f"  - Total Features: {len(df.columns)}")
print(f"  - Missing Values: {df.isnull().sum().sum()}")
print(f"\nPrice Statistics:")
print(f"  - Mean Price: ₹{df['Price'].mean():,.2f}")
print(f"  - Median Price: ₹{df['Price'].median():,.2f}")
print(f"  - Min Price: ₹{df['Price'].min():,.2f}")
print(f"  - Max Price: ₹{df['Price'].max():,.2f}")

# ============================================================================
# 4. HANDLE MISSING VALUES
# ============================================================================
print("\n[4/12] Handling Missing Values...")
# Drop rows where target (Price) is missing
df = df.dropna(subset=['Price'])

# Handle numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

# Handle categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])

print(f"✓ Missing values handled. Remaining nulls: {df.isnull().sum().sum()}")

# ============================================================================
# 5. FEATURE ENGINEERING
# ============================================================================
print("\n[5/12] Engineering New Features...")
# Create new useful features
df['house_age'] = 2024 - df['built_year']
df['is_renovated'] = (df['renovation_year'] > 0).astype(int)
df['years_since_renovation'] = df.apply(
    lambda x: 2024 - x['renovation_year'] if x['renovation_year'] > 0 else x['house_age'], 
    axis=1
)
df['total_rooms'] = df['bedrooms'] + df['bathrooms']
df['price_per_sqft'] = df['Price'] / (df['living_area'] + 1)  # +1 to avoid division by zero

print(f"✓ Created 5 new features: house_age, is_renovated, years_since_renovation, total_rooms, price_per_sqft")

# ============================================================================
# 6. ENCODE CATEGORICAL FEATURES
# ============================================================================
print("\n[6/12] Encoding Categorical Features...")
categorical_cols = ['waterfront present', 'condition of the house', 'grade of the house']

# Store original shape
original_shape = df.shape

for col in categorical_cols:
    if col in df.columns:
        df = pd.get_dummies(df, columns=[col], prefix=col, drop_first=True)

print(f"✓ One-hot encoding applied. Shape: {original_shape} → {df.shape}")

# ============================================================================
# 7. DROP IRRELEVANT COLUMNS
# ============================================================================
print("\n[7/12] Dropping Irrelevant Columns...")
drop_cols = ['id', 'Date', 'Postal Code', 'Lattitude', 'Longitude', 'price_per_sqft']
df = df.drop([col for col in drop_cols if col in df.columns], axis=1, errors='ignore')
print(f"✓ Dropped {len([col for col in drop_cols if col in df.columns])} columns")

# ============================================================================
# 8. PREPARE FEATURES AND TARGET
# ============================================================================
print("\n[8/12] Preparing Features and Target...")
X = df.drop('Price', axis=1)
y = df['Price']

print(f"✓ Features (X): {X.shape}")
print(f"✓ Target (y): {y.shape}")
print(f"\nFeatures included: {list(X.columns[:10])}{'...' if len(X.columns) > 10 else ''}")

# ============================================================================
# 9. FEATURE SCALING
# ============================================================================
print("\n[9/12] Scaling Features...")
scaler = StandardScaler()
num_features = X.select_dtypes(include=['int64', 'float64']).columns
X_scaled = X.copy()
X_scaled[num_features] = scaler.fit_transform(X[num_features])
print(f"✓ Scaled {len(num_features)} numerical features")

# ============================================================================
# 10. TRAIN-TEST SPLIT
# ============================================================================
print("\n[10/12] Splitting Data...")
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
print(f"✓ Training set: {X_train.shape[0]} samples")
print(f"✓ Test set: {X_test.shape[0]} samples")

# ============================================================================
# 11. TRAIN MULTIPLE MODELS
# ============================================================================
print("\n[11/12] Training Multiple Models...")
print("-" * 60)

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=15),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=15, n_jobs=-1)
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Cross-validation score
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    
    results[name] = {
        'model': model,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }
    
    print(f"  ✓ RMSE: ₹{rmse:,.2f}")
    print(f"  ✓ MAE: ₹{mae:,.2f}")
    print(f"  ✓ R² Score: {r2:.4f}")
    print(f"  ✓ Cross-Val R² (5-fold): {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# ============================================================================
# 12. MODEL COMPARISON AND SELECTION
# ============================================================================
print("\n[12/12] Model Comparison and Selection...")
print("=" * 80)
print(f"{'Model':<20} {'RMSE':<20} {'MAE':<20} {'R² Score':<15}")
print("=" * 80)

best_model_name = None
best_r2 = -float('inf')

for name, result in results.items():
    print(f"{name:<20} ₹{result['rmse']:>15,.2f}   ₹{result['mae']:>15,.2f}   {result['r2']:>12.4f}")
    if result['r2'] > best_r2:
        best_r2 = result['r2']
        best_model_name = name

print("=" * 80)
print(f"\n Best Model: {best_model_name} (R² = {best_r2:.4f})")

# ============================================================================
# 13. SAVE MODELS AND SCALER
# ============================================================================
print("\n[13/12] Saving Models...")
for name, result in results.items():
    filename = name.lower().replace(' ', '_') + '_model.pkl'
    joblib.dump(result['model'], filename)
    print(f"  ✓ Saved: {filename}")

joblib.dump(scaler, 'scaler.pkl')
print(f"  ✓ Saved: scaler.pkl")

# Save feature names for future predictions
feature_names = list(X.columns)
joblib.dump(feature_names, 'feature_names.pkl')
print(f"  ✓ Saved: feature_names.pkl")

# ============================================================================
# 14. SAMPLE PREDICTION
# ============================================================================
print("\n" + "="*60)
print("SAMPLE PREDICTION")
print("="*60)

# Create a realistic sample using median values from training data
sample_input = X_train.median().values.reshape(1, -1)

best_model = results[best_model_name]['model']
predicted_price = best_model.predict(sample_input)[0]

print(f"\nUsing {best_model_name} for prediction:")
print(f"Predicted Price: ₹{predicted_price:,.2f}")

# Show some sample feature values used
print("\nSample Feature Values (first 5):")
for i, col in enumerate(X.columns[:5]):
    print(f"  - {col}: {sample_input[0][i]:.2f}")

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE!")
print("="*60)
print("\nSaved Files:")
print("  - linear_regression_model.pkl")
print("  - decision_tree_model.pkl")
print("  - random_forest_model.pkl")
print("  - scaler.pkl")
print("  - feature_names.pkl")
print("\nYou can now use these models for predictions!")

HOUSE PRICE PREDICTION MODEL

[1/12] Loading Dataset...
✓ Dataset Loaded Successfully: 14620 rows, 23 columns

[2/12] Cleaning Data...
✓ Column names cleaned and standardized

[3/12] Performing Exploratory Data Analysis...

Dataset Info:
  - Total Records: 14620
  - Total Features: 23
  - Missing Values: 0

Price Statistics:
  - Mean Price: ₹538,932.22
  - Median Price: ₹450,000.00
  - Min Price: ₹78,000.00
  - Max Price: ₹7,700,000.00

[4/12] Handling Missing Values...
✓ Missing values handled. Remaining nulls: 0

[5/12] Engineering New Features...
✓ Created 5 new features: house_age, is_renovated, years_since_renovation, total_rooms, price_per_sqft

[6/12] Encoding Categorical Features...
✓ One-hot encoding applied. Shape: (14620, 28) → (14620, 39)

[7/12] Dropping Irrelevant Columns...
✓ Dropped 0 columns

[8/12] Preparing Features and Target...
✓ Features (X): (14620, 32)
✓ Target (y): (14620,)

Features included: ['bedrooms', 'bathrooms', 'living_area', 'lot_area', 'number of floo