In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from IPython.display import display
import joblib

# Load dataset
# NOTE: the path is relative to the kernel's working directory; pd.read_csv
# raises FileNotFoundError when the CSV is not present there.
dataset_path = 'housing_data.csv'
housing_df = pd.read_csv(dataset_path)

# Display dataset shape (as read from the file, before any column is dropped)
print("Dataset shape:", housing_df.shape)

# Remove unnecessary columns BEFORE counting features, so that the reported
# feature count is not inflated by the 'No' column, which is never used as a
# model input.
if 'No' in housing_df.columns:
    housing_df = housing_df.drop(columns=['No'])

# Count X (features) and Y (target)
# Every remaining column except the target counts as a feature; .count()
# reports the number of non-null target values.
print("Number of Features (X):", len(housing_df.columns) - 1)
print("Number of Target Values (Y):", housing_df['median_house_value'].count())

# Target column to predict; every other column is treated as an input feature
housing_target = 'median_house_value'
housing_features = housing_df.columns[housing_df.columns != housing_target].tolist()

# Positional hold-out split: the final 190 rows form the test set and
# everything before them is used for training (no shuffling).
housing_train = housing_df.head(-190).copy()
housing_test = housing_df.tail(190).copy()

# Feature matrix / target vector for each partition
X_train_housing, y_train_housing = housing_train[housing_features], housing_train[housing_target]
X_test_housing, y_test_housing = housing_test[housing_features], housing_test[housing_target]

# Partition the feature columns by dtype: numeric columns vs. everything else
numeric_features = list(X_train_housing.select_dtypes(include=[np.number]).columns)
categorical_features = list(X_train_housing.select_dtypes(exclude=[np.number]).columns)

# Numerical features: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: One-Hot Encoding.
# handle_unknown="ignore" encodes a category unseen during fit as an all-zero
# row. It must not be combined with drop='first': the dropped category is
# also encoded as all zeros, so the two cases would be indistinguishable
# (and older scikit-learn releases reject the combination with a ValueError).
# Keeping every category column avoids that ambiguity.
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Combining transformations using ColumnTransformer: each pipeline is applied
# only to its own list of columns.
preprocessor_housing = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Transform features
# Fit on the training partition only, then reuse the fitted statistics
# (medians, scaling parameters, category lists) for the test partition so no
# information from the test rows leaks into preprocessing.
X_train_housing = preprocessor_housing.fit_transform(X_train_housing)
X_test_housing = preprocessor_housing.transform(X_test_housing)

# Candidate regressors, keyed by the label shown in the result tables
models_reg = {
    "Linear Regression": LinearRegression(),
    "KNN Regression": KNeighborsRegressor(n_neighbors=5),
    "Random Forest Regression": RandomForestRegressor(n_estimators=100, random_state=42),
}

# Per-model evaluation metrics and per-model mean of the test-set predictions
results_reg = {}
predicted_values = {}

# Metric label -> scoring function, each called as score(y_true, y_pred)
metric_functions = {
    "MAE": mean_absolute_error,
    "MSE": mean_squared_error,
    "R2 Score": r2_score,
}

for name, model in models_reg.items():
    # Fit on the preprocessed training data, then predict the held-out rows
    y_pred = model.fit(X_train_housing, y_train_housing).predict(X_test_housing)
    results_reg[name] = {
        label: score(y_test_housing, y_pred)
        for label, score in metric_functions.items()
    }
    predicted_values[name] = np.mean(y_pred)

# Metrics table: one row per model, one column per metric
results_df = pd.DataFrame(results_reg).transpose()
print("\nEvaluation Metrics table for Regression Models:")
display(results_df)

# Single-column table holding each model's mean predicted value, indexed by model name
predicted_values_df = pd.DataFrame(
    list(predicted_values.values()),
    index=list(predicted_values.keys()),
    columns=['Predicted Mean House Value'],
)

# Show the mean predicted house value per model
print("\nPredicted Mean House Value for Each Model:")
display(predicted_values_df)

# Save each trained model
# File name pattern: spaces in the model label become underscores,
# e.g. "Linear Regression" -> "Linear_Regression_model.pkl".
for name, model in models_reg.items():
    joblib.dump(model, f"{name.replace(' ', '_')}_model.pkl")

# The estimators above were fitted on the OUTPUT of preprocessor_housing, not
# on raw columns. Persist the fitted preprocessor as well; without it the
# saved models cannot be applied to new raw data after reloading.
joblib.dump(preprocessor_housing, "housing_preprocessor.pkl")

print("All models saved successfully!")

# One bar chart per metric, stacked vertically in a single figure
fig, axes = plt.subplots(3, 1, figsize=(10, 18))

# (results_df column, panel title, y-axis label, bar colour) for each panel, top to bottom
metric_panels = [
    ("MAE", "Mean Absolute Error (MAE)", "MAE", "blue"),
    ("MSE", "Mean Squared Error (MSE)", "MSE", "green"),
    ("R2 Score", "R² Score", "R² Score", "red"),
]

for ax, (column, title, ylabel, bar_color) in zip(axes, metric_panels):
    results_df[[column]].plot(kind='bar', ax=ax, color=bar_color, legend=False)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.grid(True)

plt.tight_layout()
plt.show()



FileNotFoundError: [Errno 2] No such file or directory: 'housing_data.csv'