# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib
import os

# Train a RandomForestRegressor on 'raw_data.csv' and persist both the fitted
# model and the ordered list of feature columns under 'models/'.

# Load the raw dataset (path is resolved against the current working directory)
df = pd.read_csv('raw_data.csv')

# --- Data Cleaning and Preprocessing ---

# 'amenities_encoded' is treated as a numerical feature. When the CSV does not
# provide it, a constant 0 column is added so the rest of the pipeline still
# runs with the same feature list.
if 'amenities_encoded' not in df.columns:
    df['amenities_encoded'] = 0  # Placeholder if the column is missing

# Columns the model is trained on, and the column it predicts
feature_cols = ['area', 'bedrooms', 'location', 'amenities_encoded']
target_col = 'price'

# Drop every row that is missing any of the features or the target
df = df.dropna(subset=feature_cols + [target_col])

# --- Feature Engineering ---

# Restrict the frame to the declared features and the target before encoding.
# Without this, every other column present in the CSV would flow into X even
# though it is neither NaN-filtered above nor one-hot encoded below.
# One-hot encode the 'location' column (drop_first=True omits the indicator
# for the first category).
df_processed = pd.get_dummies(
    df[feature_cols + [target_col]],
    columns=['location'],
    drop_first=True,
)

# --- Model Training ---

# Features (X) are the selected columns after encoding; target (y) is 'price'
X = df_processed.drop(columns=[target_col])
y = df_processed[target_col]

# Split data into training and testing sets (80/20, fixed seed for repeatability).
# NOTE(review): X_test / y_test are not used in the visible part of this script.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Save the Model and Columns ---

# Create the 'models' directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Save the trained model
joblib.dump(model, 'models/real_estate_model.pkl')

# Save the feature column names in the order used for training
# (presumably consumed at prediction time to rebuild the same layout — confirm).
joblib.dump(X.columns.tolist(), 'models/model_columns.pkl')

print("✅ Model and column list saved successfully!")
print("Model trained on the following columns:")
for col in X.columns:
    print(f"- {col}")