# 3. Model Training
---
This notebook handles model training, including train-test splitting, feature standardization, fitting several regression models, and saving the trained models and test data for later use.

In [6]:
# Import necessary libraries
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [10]:
# Read the housing data from disk
data = pd.read_csv('housing.csv')

# Target column: the value the models will learn to predict
y = data['median_house_value']

# Features: every remaining column, with categorical columns one-hot encoded.
# drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(data.drop(columns='median_house_value'), drop_first=True)


In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits, so nothing
# about the test rows influences the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Candidate regressors, keyed by a human-readable name
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
}

# Fit each candidate on the scaled training data. fit() returns the fitted
# estimator itself, so the returned object is stored directly.
trained_models = {}
for name, model in models.items():
    trained_models[name] = model.fit(X_train_scaled, y_train)

In [14]:
# Save the trained models, the fitted scaler, and the test data using joblib.
# (joblib is imported with the other libraries in the imports cell at the top.)

# One file per model; spaces in the model name become underscores,
# e.g. 'Linear Regression' -> 'Linear_Regression.pkl'.
for name, model in trained_models.items():
    joblib.dump(model, f'{name.replace(" ", "_")}.pkl')

# The models were fit on standardized features, so the fitted scaler must be
# saved as well: without it the models cannot be applied to new, raw data.
joblib.dump(scaler, 'scaler.pkl')

# The test features are stored already scaled, paired with their targets.
joblib.dump((X_test_scaled, y_test), 'test_data.pkl')

['test_data.pkl']