Task 1: Linear Regression
        Optimized Regression

In [None]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# --- Data loading ------------------------------------------------------------
# Read the insurance CSV into a DataFrame.
dataset_path = "/mnt/data/insurance.csv"
df = pd.read_csv(dataset_path)

# Quick inspection: first rows, then the per-column summary.
display(df.head())
df.info()

# --- Preprocessing -----------------------------------------------------------
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column.
df = pd.get_dummies(df, drop_first=True)

# 'charges' is the regression target; every other column is a feature.
y = df['charges']
X = df.drop(columns=['charges'])

# --- Train/test split --------------------------------------------------------
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Feature scaling ---------------------------------------------------------
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Target distribution: 30-bin histogram of `y` with a KDE curve overlaid.
fig_dist, ax_dist = plt.subplots(figsize=(10, 5))
sns.histplot(y, bins=30, kde=True, ax=ax_dist)
ax_dist.set_title("Distribution of Medical Charges")
plt.show()

# Pairwise correlations between all columns of the encoded DataFrame,
# annotated with two-decimal values.
corr_matrix = df.corr()
fig_corr, ax_corr = plt.subplots(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax_corr)
ax_corr.set_title("Feature Correlation Heatmap")
plt.show()

# Linear Regression (Gradient Descent Optimization)
# A linear model fitted by stochastic gradient descent. The grid covers the
# regularization multiplier (`alpha`) and the maximum number of epochs
# (`max_iter`); each candidate is scored by 5-fold cross-validated negative MSE.
param_grid_lr = {'alpha': [0.0001, 0.001, 0.01, 0.1], 'max_iter': [500, 1000, 5000]}
lr_grid_search = GridSearchCV(
    # SGD shuffles the training data between epochs, so an unseeded estimator
    # yields different scores (and possibly different "best" params) on every
    # run. Seed it with the same value used for the train/test split.
    SGDRegressor(random_state=42),
    param_grid_lr,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
lr_grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the best candidate refitted
# on the full training set.
best_lr = lr_grid_search.best_estimator_
print(f"Best Linear Regression params: {lr_grid_search.best_params_}")

# Decision Tree Regression
# The grid covers tree depth and the minimum sample counts required to split a
# node / form a leaf; each candidate is scored by 5-fold cross-validated
# negative MSE.
param_grid_dt = {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
dt_grid_search = GridSearchCV(
    # The tree builder permutes features at each split, so equally good splits
    # can be resolved differently between runs. Seeding the estimator makes the
    # search results repeatable, matching the seeded train/test split.
    DecisionTreeRegressor(random_state=42),
    param_grid_dt,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
dt_grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the best candidate refitted
# on the full training set.
best_dt = dt_grid_search.best_estimator_
print(f"Best Decision Tree params: {dt_grid_search.best_params_}")

# Random Forest Regression
# The grid covers forest size, tree depth and the minimum sample counts
# required to split a node / form a leaf: 3 * 4 * 3 * 3 = 108 candidates,
# i.e. 540 forest fits under 5-fold cross-validation.
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
rf_grid_search = GridSearchCV(
    # Bootstrap sampling makes an unseeded forest differ on every run; seeding
    # it keeps the selected hyper-parameters and scores repeatable.
    RandomForestRegressor(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    # The candidate fits are independent, so spread them over all CPU cores.
    # With the estimator seeded this does not change the results.
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the best candidate refitted
# on the full training set.
best_rf = rf_grid_search.best_estimator_
print(f"Best Random Forest params: {rf_grid_search.best_params_}")

# Model Evaluation
# Test-set MSE of each tuned model. These are reported (and plotted further
# down) but deliberately NOT used to choose the winner.
mse_lr = mean_squared_error(y_test, best_lr.predict(X_test))
mse_dt = mean_squared_error(y_test, best_dt.predict(X_test))
mse_rf = mean_squared_error(y_test, best_rf.predict(X_test))

# Choose the winner by cross-validated MSE from the grid searches. Selecting on
# test MSE would let the test set influence model choice and make the reported
# test figure optimistically biased. The searches score with
# 'neg_mean_squared_error', so best_score_ is negated to get an MSE.
model_candidates = [
    (-lr_grid_search.best_score_, mse_lr, best_lr, "Linear Regression"),
    (-dt_grid_search.best_score_, mse_dt, best_dt, "Decision Tree"),
    (-rf_grid_search.best_score_, mse_rf, best_rf, "Random Forest"),
]
selected_candidate = min(model_candidates, key=lambda candidate: candidate[0])

# best_model keeps its (test MSE, estimator, name) layout.
best_model = (selected_candidate[1], selected_candidate[2], selected_candidate[3])
print(f"Best model: {best_model[2]} with MSE: {best_model[0]}")

# Save Best Model
# NOTE: only the estimator is persisted. It was trained on features transformed
# by `scaler`, so inputs must go through that same fitted scaler before predict.
joblib.dump(best_model[1], "best_health_model.pkl")

# Bar chart comparing the test-set MSE of the three tuned models.
models = ["Linear Regression", "Decision Tree", "Random Forest"]
mse_values = [mse_lr, mse_dt, mse_rf]

fig_perf, ax_perf = plt.subplots(figsize=(8, 5))
sns.barplot(x=models, y=mse_values, palette="viridis", ax=ax_perf)
ax_perf.set_title("Model Performance Comparison (MSE)")
ax_perf.set_ylabel("Mean Squared Error")
plt.show()

# Round-trip check: reload the persisted estimator and predict for one test row.
loaded_model = joblib.load("best_health_model.pkl")

# predict() takes a 2-D (n_samples, n_features) input, so the first test row
# is given a leading axis to form a single-sample batch.
sample_input = np.expand_dims(np.array(X_test[0]), axis=0)
prediction = loaded_model.predict(sample_input)
print(f"Predicted medical charge: {prediction[0]}")
