In [None]:
#----------------------------------------
# ✅ Import Necessary Libraries
#----------------------------------------

# For data manipulation
import pandas as pd
import numpy as np

# For visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline  # To display plots within the notebook

# For preprocessing and machine learning models
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

# Regression algorithms
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# For model evaluation
from sklearn.metrics import r2_score

#----------------------------------------
# ✅ Data Loading and Cleaning
#----------------------------------------

# Read the raw car-mpg CSV into a dataframe and preview its first rows
data = pd.read_csv('../input/carmpg/car-mpg (1).csv')
print(data.head())

# 'car_name' is not used as a regression feature, so discard that column
data = data.drop(columns=['car_name'])

# Replace numeric codes in 'origin' column with meaningful labels
data['origin'] = data['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})

# Convert 'origin' categorical text labels into dummy/indicator variables (one-hot encoding)
data = pd.get_dummies(data, columns=['origin'])

# Replace any '?' placeholder (non-numeric) with np.nan so it counts as missing
data = data.replace('?', np.nan)

# A column that contained '?' was read as text and is still object dtype here.
# Convert such columns to real numbers so that the median below is computed on
# numeric values and the column is numeric afterwards. The default
# errors='raise' is kept on purpose: any other unexpected text fails loudly
# instead of silently turning into NaN.
text_columns = data.select_dtypes(include='object').columns
for column in text_columns:
    data[column] = pd.to_numeric(data[column])

# Replace all missing values (NaN) with the median of the respective column
data = data.apply(lambda x: x.fillna(x.median()), axis=0)

print(data.head())

#----------------------------------------
# ✅ Train-Test Split and Feature Scaling
#----------------------------------------

# Define independent variables (features) and dependent variable (target)
X = data.drop(['mpg'], axis=1)  # All features except 'mpg'
y = data[['mpg']]              # Target variable in dataframe form

# Split dataset into 70% training and 30% testing BEFORE scaling. Computing
# the scaling mean/std on the full dataset would leak information about the
# test rows into the training data.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.30, random_state=1
)

# Learn the standardization (mean=0, std=1) from the training rows only
x_scaler = preprocessing.StandardScaler().fit(X_train_raw)
y_scaler = preprocessing.StandardScaler().fit(y_train_raw)


def _standardize(scaler, frame):
    """Apply a fitted scaler to *frame*, keeping its column names and row index."""
    return pd.DataFrame(
        scaler.transform(frame), columns=frame.columns, index=frame.index
    )


# Apply the training-set statistics to both partitions
X_train = _standardize(x_scaler, X_train_raw)
X_test = _standardize(x_scaler, X_test_raw)
y_train = _standardize(y_scaler, y_train_raw)
y_test = _standardize(y_scaler, y_test_raw)

# Full dataset on the same (training-derived) scale, kept under the original
# names for any code that still refers to X_s / y_s
X_s = _standardize(x_scaler, X)
y_s = _standardize(y_scaler, y)

print("Training data shape:", X_train.shape)

#----------------------------------------
# ✅ Linear Regression Model
#----------------------------------------

# Fit an ordinary least-squares model on the training partition
regression_model = LinearRegression().fit(X_train, y_train)

# The target is a single-column frame, so coef_ is 2-D with one row;
# pair each feature name with its weight from that row
for col_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print(f'The coefficient for {col_name} is {weight}')

# Report the fitted intercept (one entry, for the single target column)
intercept = regression_model.intercept_[0]
print(f'The intercept is {intercept}')

#----------------------------------------
# ✅ Ridge Regression
#----------------------------------------

# L2-regularized regression; alpha=0.3 sets the regularization strength
ridge_model = Ridge(alpha=0.3).fit(X_train, y_train)

print('Ridge model coefficients:', ridge_model.coef_)

#----------------------------------------
# ✅ Lasso Regression
#----------------------------------------

# L1-regularized regression; alpha=0.1 (the penalty can drive coefficients to exactly zero)
lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)

print('Lasso model coefficients:', lasso_model.coef_)

#----------------------------------------
# ✅ Model Performance Evaluation (R^2)
#----------------------------------------

# One entry per fitted model: (train label, test label, estimator)
score_report = (
    ("Linear Regression R^2 on Train:",
     "Linear Regression R^2 on Test :",
     regression_model),
    ("Ridge Regression R^2 on Train:",
     "Ridge Regression R^2 on Test :",
     ridge_model),
    ("Lasso Regression R^2 on Train:",
     "Lasso Regression R^2 on Test :",
     lasso_model),
)

# Print train/test R^2 for each model, with a separator line between models
for position, (train_label, test_label, estimator) in enumerate(score_report):
    if position > 0:
        print('*************************')
    print(train_label, estimator.score(X_train, y_train))
    print(test_label, estimator.score(X_test, y_test))

#----------------------------------------
# ✅ Error Metrics (MSE and RMSE)
#----------------------------------------

import math

# Work on flat NumPy arrays: averaging a DataFrame can return a pandas object
# rather than a number (depending on the pandas version), and flattening both
# sides rules out accidental (n, 1) vs (n,) broadcasting.
test_predictions = np.ravel(regression_model.predict(X_test))
test_actuals = np.ravel(y_test)

# Mean Squared Error on test data, as a plain Python float
mse = float(np.mean((test_predictions - test_actuals) ** 2))

# Root Mean Squared Error (square root of MSE). The target was standardized
# earlier, so this is expressed in standardized units, not raw mpg.
rmse = math.sqrt(mse)
print('Root Mean Squared Error (RMSE):', rmse)

#----------------------------------------
# ✅ Visualization: Residual Plots
#----------------------------------------

# Draw one residual plot per feature: 'hp' (horsepower) and 'acc' (acceleration).
# NOTE: sns.residplot fits its own one-feature regression of the target on the
# given column and plots those residuals (with a LOWESS trend line); it does
# not use regression_model.
residual_panels = (
    ('hp', 'Residuals vs Horsepower'),
    ('acc', 'Residuals vs Acceleration'),
)
for feature, panel_title in residual_panels:
    fig = plt.figure(figsize=(10,8))
    sns.residplot(x=X_test[feature], y=y_test['mpg'], color='green', lowess=True)
    plt.title(panel_title)
    plt.show()

#----------------------------------------
# ✅ Visualization: Actual vs Predicted
#----------------------------------------

# Model predictions for the held-out rows
y_pred = regression_model.predict(X_test)

# Scatter of actual target values (x-axis) against predictions (y-axis);
# both are on the standardized scale used for training
plt.scatter(x=y_test['mpg'], y=y_pred, color='blue')
plt.title('Actual vs Predicted MPG')
plt.xlabel('Actual MPG')
plt.ylabel('Predicted MPG')
plt.show()
