In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
df = pd.read_csv("D:\IITG\other courses\AIML project1\Data_file - data_file.csv")

# Preprocessing
df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']
df.drop(columns=['total_rooms', 'total_bedrooms', 'population', 'households'], inplace=True)
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)

# Normalize numerical features
scaler = MinMaxScaler()
numerical_features = ['longitude', 'latitude', 'housing_median_age', 'median_income', 'median_house_value']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Add polynomial features
df['median_income_squared'] = df['median_income'] ** 2
df['median_income_cubed'] = df['median_income'] ** 3

# Add interaction terms
df['income_age_interaction'] = df['median_income'] * df['housing_median_age']

# Log transformation of skewed features
df['log_median_income'] = np.log1p(df['median_income'])
df['log_median_house_value'] = np.log1p(df['median_house_value'])

# Scale new features
df[['median_income_squared', 'median_income_cubed', 'income_age_interaction']] = scaler.fit_transform(
    df[['median_income_squared', 'median_income_cubed', 'income_age_interaction']]
)

# Regression Models
# Simple Linear Regression
X_simple = df[['median_income']]
y = df['median_house_value']
X_train_simple, X_test_simple, y_train, y_test = train_test_split(X_simple, y, test_size=0.2, random_state=42)

model_simple = LinearRegression()
model_simple.fit(X_train_simple, y_train)
y_pred_simple = model_simple.predict(X_test_simple)
rmse_simple = np.sqrt(mean_squared_error(y_test, y_pred_simple))
r2_simple = r2_score(y_test, y_pred_simple)

# Multiple Linear Regression
X_multiple = df.drop(columns=['median_house_value'])
X_train_multiple, X_test_multiple, y_train, y_test = train_test_split(X_multiple, y, test_size=0.2, random_state=42)

model_multiple = LinearRegression()
model_multiple.fit(X_train_multiple, y_train)
y_pred_multiple = model_multiple.predict(X_test_multiple)
rmse_multiple = np.sqrt(mean_squared_error(y_test, y_pred_multiple))
r2_multiple = r2_score(y_test, y_pred_multiple)

# Print evaluation metrics
print("Simple Linear Regression:")
print(f"RMSE: {rmse_simple}")
print(f"R² Score: {r2_simple}\n")

print("Multiple Linear Regression:")
print(f"RMSE: {rmse_multiple}")
print(f"R² Score: {r2_multiple}")


Simple Linear Regression:
RMSE: 0.17362611373591147
R² Score: 0.45885918903846656

Multiple Linear Regression:
RMSE: 0.016973398647176022
R² Score: 0.9948284846046926
