<a href="https://colab.research.google.com/github/jesinthkal/Projects/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# ==========================================
# 1. Import & Inspect Data
# ==========================================
print("Loading dataset...")

# The file is parsed as whitespace-delimited text with no header row, after
# skipping the first 22 lines (presumably a descriptive preamble - confirm
# against the file if the source ever changes).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is not a valid Python string escape, so a non-raw literal
# makes the interpreter emit an invalid-escape warning for this line.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each record is spread over two consecutive parsed rows. Stitch them back
# together: every even row, plus the first two values of the following odd
# row, form the 13 feature columns; the third value of the odd row is the
# target.
data_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
df = pd.DataFrame(data_data, columns=feature_names)
df['PRICE'] = target
print("Dataset Loaded: Boston Housing")

# 1.1 Quick Inspection
print("\n--- Data Head ---")
print(df.head())
print("\n--- Data Info ---")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()
print("\n--- Missing Values ---")
print(df.isnull().sum())


  raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)


Loading dataset...


In [None]:
# ==========================================
# 2. Exploratory Data Analysis (EDA)
# ==========================================
# 2.1 Correlation Heatmap
plt.figure(figsize=(12, 10))
correlation_matrix = df.corr()
# Keep only the columns whose absolute correlation with PRICE exceeds 0.1
# (PRICE itself always qualifies, since its self-correlation is 1).
price_corr_strength = correlation_matrix["PRICE"].abs()
top_corr_features = correlation_matrix.index[price_corr_strength > 0.1]
heatmap_data = df[top_corr_features].corr()
sns.heatmap(heatmap_data, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap (Features with high correlation to PRICE)")
plt.show()

# 2.2 Scatter Plots for Key Features
# Pick the columns to plot from whichever dataset is loaded: the California
# housing column names when present, otherwise the Boston equivalents.
feature1 = 'MedInc' if 'MedInc' in df.columns else 'RM'    # Median Income (CA) or Rooms (Boston)
feature2 = 'AveRooms' if 'AveRooms' in df.columns else 'LSTAT' # Avg Rooms (CA) or Lower Status (Boston)

# One panel per feature, drawn on explicit Axes so both panels share a single
# code path instead of two copy-pasted pyplot sequences.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, feature in zip(axes, (feature1, feature2)):
    sns.scatterplot(x=df[feature], y=df['PRICE'], alpha=0.5, ax=ax)
    ax.set_title(f"Price vs {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Price")
plt.show()

# ==========================================
# 3. Preprocessing
# ==========================================
# 3.1 Split Data
y = df['PRICE']
X = df.drop(columns='PRICE')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3.2 Feature Scaling (Standardization)
# The scaler is fitted on the training split only; the same fitted transform
# is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nData splitting and scaling complete.")
train_shape, test_shape = X_train_scaled.shape, X_test_scaled.shape
print(f"Train shape: {train_shape}, Test shape: {test_shape}")



In [None]:
# ==========================================
# 4. Modeling
# ==========================================
# fit() hands back the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train_scaled, y_train)
print("\nModel training complete.")



In [None]:
# ==========================================
# 5. Evaluation
# ==========================================
# Predict on both scaled splits with the fitted model.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

def print_metrics(y_true, y_pred, set_name):
    """Print MAE, MSE, RMSE and R² for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        set_name: Label shown in the report header (e.g. "Training").

    Returns:
        None. The report is written to standard output.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # derived from the MSE rather than recomputed
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {set_name} Metrics ---")
    print(f"MAE:  {mae:.4f}")
    print(f"MSE:  {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    # The label previously rendered as mojibake (the UTF-8 bytes of the
    # superscript two decoded as Latin-1); restored to the intended "R²".
    print(f"R²:   {r2:.4f}")

# Report the same metrics for each split.
for split_name, actual, predicted in (
    ("Training", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    print_metrics(actual, predicted, split_name)

# 5-Fold Cross Validation
# The scorer used here yields negated RMSE values, hence the sign flip on the
# mean below (the standard deviation is unaffected by the sign).
cv_scores = cross_val_score(
    model,
    X_train_scaled,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
cv_rmse_mean = -cv_scores.mean()
cv_rmse_spread = cv_scores.std()
print(f"\n5-Fold CV RMSE: {cv_rmse_mean:.4f} (+/- {cv_rmse_spread:.4f})")



In [None]:
# ==========================================
# 6. Residual Diagnostics
# ==========================================
residuals = y_test - y_test_pred

# Two side-by-side panels on one figure.
fig, (fitted_ax, hist_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Residuals vs Fitted
sns.scatterplot(x=y_test_pred, y=residuals, alpha=0.5, ax=fitted_ax)
fitted_ax.axhline(0, color='r', linestyle='--')  # zero-error reference line
fitted_ax.set_xlabel("Fitted Values (Predicted Price)")
fitted_ax.set_ylabel("Residuals (Actual - Predicted)")
fitted_ax.set_title("Residuals vs Fitted Values")

# Histogram of Residuals
sns.histplot(residuals, kde=True, bins=30, ax=hist_ax)
hist_ax.set_title("Distribution of Residuals")
hist_ax.set_xlabel("Residual Value")
plt.show()



In [None]:
# ==========================================
# 7. Interpretation
# ==========================================
# One row per feature: the fitted coefficient and its magnitude. The model
# was trained on standardized inputs, so coefficients are per standard
# deviation of each feature.
coefficients = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': model.coef_}
).assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())

print("\n--- Top 5 Feature Importance ---")
ranked = coefficients.sort_values(by='Abs_Coefficient', ascending=False)
print(ranked[['Feature', 'Coefficient']].head(5))