In [6]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import random
import pickle

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [7]:
# Load data
data = pd.read_csv('cleaned_data.csv')

# Basic feature engineering
data['Price_log'] = np.log1p(data['Price'])
data['baths_beds_interaction'] = data['Baths'] * data['Beds']  # New interaction feature

# Remove extreme outliers (top/bottom 1%)
q_low = data['Price_log'].quantile(0.01)
q_high = data['Price_log'].quantile(0.99)
data = data[(data['Price_log'] > q_low) & (data['Price_log'] < q_high)]

# Select features and target
numerical_features = ['Baths', 'Beds', 'House size', 'baths_beds_interaction']  # Added interaction
X = data[numerical_features]
y = data['Price_log']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create preprocessing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Define log transformer for House size
log_transformer = FunctionTransformer(np.log1p, validate=False)

# Preprocessing pipeline
preprocessor = Pipeline([
    ('log', ColumnTransformer([
        ('log_size', log_transformer, ['House size']),
        ('passthrough', 'passthrough', ['Baths', 'Beds', 'baths_beds_interaction'])
    ])),
    ('scaler', StandardScaler()),
    ('transformer', PowerTransformer())
])

# Fit and transform data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Save preprocessor to pickle file
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
print("Preprocessor saved as 'preprocessor.pkl'")

Preprocessor saved as 'preprocessor.pkl'


In [9]:
# Define model architecture
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Now 4 features
    Dense(256, activation='relu'),  # Added layer
    BatchNormalization(),
    Dropout(0.2),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dense(1)
])

# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='mse',
    metrics=['mae']
)

# Define callbacks
checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)

# Train model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

Epoch 1/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 265.5157 - mae: 16.2363 - val_loss: 121.8010 - val_mae: 11.0111
Epoch 2/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 85.4416 - mae: 8.9491 - val_loss: 4.9847 - val_mae: 2.1590
Epoch 3/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 2.8742 - mae: 1.4618 - val_loss: 0.2941 - val_mae: 0.3947
Epoch 4/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.3488 - mae: 0.4462 - val_loss: 0.3321 - val_mae: 0.4228
Epoch 5/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.2867 - mae: 0.4052 - val_loss: 0.2939 - val_mae: 0.3868
Epoch 6/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.2726 - mae: 0.3960 - val_loss: 0.2373 - val_mae: 0.3504
Epoch 7/100
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[

In [10]:
# Load best model weights
model.load_weights('best_model.keras')

# Make predictions
y_pred = model.predict(X_test).flatten()

# Calculate metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Model Performance:")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step 
Model Performance:
RMSE: 0.4476
R²: 0.6118
