In [None]:
# ========================================
# 1. Import Libraries
# ========================================
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn._oldcore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Apply seaborn's white-grid look to every figure drawn below
sns.set(style='whitegrid')

# ========================================
# 2. Load Dataset
# ========================================
# Location of the CSV inside the Kaggle input directory; change it when
# running somewhere else.
csv_path = "/kaggle/input/life-expectancy-data/Life Expectancy Data.csv"
life_df = pd.read_csv(csv_path)

# Quick look at the first rows
print(life_df.head())
# ========================================
# 3. Preprocessing
# ========================================
# Replace inf/-inf with NaN so the dropna below removes those rows as well
life_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with missing values (including those from infs)
life_df.dropna(inplace=True)

# Basic info about the dataset.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
life_df.info()
print(life_df.describe())

# Encoding categorical variables (Country and Status columns) as integer codes.
# NOTE(review): the codes are arbitrary integer labels, so models that treat
# features as numeric see an artificial ordering of countries — consider
# one-hot encoding or dropping 'Country'.
life_df['Country'] = life_df['Country'].astype('category').cat.codes
life_df['Status'] = life_df['Status'].astype('category').cat.codes

# Defining features (X) and target (y); the target column is addressed with
# its trailing space, exactly as it is named in the frame.
X_reg = life_df.drop('Life expectancy ', axis=1)
y_reg = life_df['Life expectancy ']

# ========================================
# 4. Train/Test Split
# ========================================
# Split BEFORE scaling: the scaler's mean/std must be estimated from the
# training rows only, otherwise information about the test rows leaks into
# the features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Feature scaling (for regression models): fit on train, apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-only statistics; kept so that any
# code still referring to X_reg_scaled continues to work.
X_reg_scaled = scaler.transform(X_reg)

# ========================================
# 5. Model Training
# ========================================
# All four regressors are trained on the same training split. fit() returns
# the estimator itself, so construction and training are chained.

# Ordinary least-squares baseline
lr_model = LinearRegression().fit(X_train, y_train)

# Random forest: 100 trees, depth capped at 10, seeded for reproducibility
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Support vector regression with an RBF kernel
svr_model = SVR(kernel='rbf').fit(X_train, y_train)

# Gradient boosting: 100 boosting stages, seeded for reproducibility
gbr_model = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# ========================================
# 6. Evaluation
# ========================================

def evaluate_regression(name, y_true, y_pred):
    """Print RMSE, MAE and R² for one set of predictions.

    Args:
        name: Label printed in the header line.
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The metrics are only written to standard output.
    """
    print(f"--- {name} ---")
    # (label, metric) pairs, reported in this order; RMSE is the square root
    # of the mean squared error.
    reports = (
        ("RMSE:", lambda truth, guess: np.sqrt(mean_squared_error(truth, guess))),
        ("MAE:", mean_absolute_error),
        ("R² Score:", r2_score),
    )
    for label, metric in reports:
        print(label, metric(y_true, y_pred))

# Report the test-set metrics of every fitted model, in training order
for _title, _fitted in (
    ("Linear Regression", lr_model),
    ("Random Forest Regressor", rf_model),
    ("Support Vector Regressor", svr_model),
    ("Gradient Boosting Regressor", gbr_model),
):
    evaluate_regression(_title, y_test, _fitted.predict(X_test))

# ========================================
# 6.5 Evaluation Visualizations
# ========================================
model_names = ['Linear Regression', 'Random Forest', 'SVR', 'Gradient Boosting']
models = [lr_model, rf_model, svr_model, gbr_model]

# One entry per model, filled in the same order as `models`
rmse_list, mae_list, r2_list = [], [], []

# End points of the "perfect prediction" diagonal; they depend only on y_test
actual_lo, actual_hi = y_test.min(), y_test.max()

plt.figure(figsize=(14, 10))
for panel, (label, model) in enumerate(zip(model_names, models), start=1):
    y_pred = model.predict(X_test)

    # Compute all three test-set metrics, then record them
    scores = (
        np.sqrt(mean_squared_error(y_test, y_pred)),
        mean_absolute_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )
    for bucket, value in zip((rmse_list, mae_list, r2_list), scores):
        bucket.append(value)

    # Actual vs Predicted scatter in a 2x2 grid, one panel per model
    plt.subplot(2, 2, panel)
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
    plt.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--')
    plt.title(f'{label}: Actual vs Predicted')
    plt.xlabel("Actual Life Expectancy")
    plt.ylabel("Predicted Life Expectancy")

plt.tight_layout()
plt.show()

# Grouped bar chart comparing the three metrics across models
metric_columns = {
    'Model': model_names,
    'RMSE': rmse_list,
    'MAE': mae_list,
    'R2 Score': r2_list,
}
metrics_df = pd.DataFrame(metric_columns)

metrics_by_model = metrics_df.set_index('Model')
metrics_by_model.plot.bar(figsize=(10, 6))
plt.title("Regression Model Comparison")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# ========================================
# 7. Visualizations
# ========================================
# Pairwise relationships between the target and six predictors. Column names
# are spelled exactly as in the frame, including leading/trailing spaces.
selected_features = [
    'Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol',
    ' BMI ', 'Income composition of resources', 'Schooling',
]

sns.pairplot(life_df, vars=selected_features)
plt.suptitle("Focused Pairplot of Selected Features", y=1.02)
plt.show()

# Annotated heatmap of the pairwise correlations between all columns
correlation_matrix = life_df.corr()
plt.figure(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title("Correlation Heatmap", fontsize=16)
plt.show()

# One 20-bin histogram per column
life_df.hist(bins=20, figsize=(12, 8))
plt.suptitle("Histograms of Features", fontsize=16)
plt.tight_layout()
plt.show()