In [238]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the WHO life-expectancy CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/life-expectancy-who/Life Expectancy Data.csv"

# Load the raw dataset and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


# EDA:

In [239]:
# Data-quality overview: shape, column names, missing values and dtypes.
print("DATA QUALITY:")
print("-" * 50)
print(" DATA SHAPE:\n")
print(f"-  {df.shape[0]} Rows\n-  {df.shape[1]} Columns\n")
print("   Columns:", list(df.columns))
print(" COMPLETENESS CHECK:\n")

# Completeness check: count nulls once, keep only columns that have any.
null_counts = df.isna().sum()
missing_values = null_counts[null_counts > 0]
missing_percent = (missing_values / len(df)) * 100

# Combine missing count and percentage into a DataFrame.
# No extra "> 0" filter is needed: `missing_values` is already filtered.
missing_data = pd.DataFrame({
    "Missing Values": missing_values,
    "Percent Missing": missing_percent
})
print(missing_data)
print("\n")
print(" DATA TYPES AND ROW COUNTS:\n")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

DATA QUALITY:
--------------------------------------------------
 DATA SHAPE:

-  2938 Rows
-  22 Columns

   Columns: ['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']
 COMPLETENESS CHECK:

                                 Missing Values  Percent Missing
Life expectancy                              10         0.340368
Adult Mortality                              10         0.340368
Alcohol                                     194         6.603131
Hepatitis B                                 553        18.822328
 BMI                                         34         1.157250
Polio                                        19         0.646698
Total expenditure                           226 

## Data Cleaning and Preprocessing:

In [240]:
# Bring column headers to a uniform format: trim leading/trailing whitespace
# and lower-case them. Whitespace inside a name is left as it is.
df.columns = df.columns.str.strip().str.lower()

## Missing values:

In [241]:
# Names of the columns that contain at least one missing value.
null_counts = df.isnull().sum()
cols_miss = null_counts[null_counts > 0].index
cols_miss

Index(['life expectancy', 'adult mortality', 'alcohol', 'hepatitis b', 'bmi',
       'polio', 'total expenditure', 'diphtheria', 'gdp', 'population',
       'thinness  1-19 years', 'thinness 5-9 years',
       'income composition of resources', 'schooling'],
      dtype='object')

In [None]:
from sklearn.impute import SimpleImputer
# Mean-impute every non-object column that has missing values.
# NOTE(review): `cols_miss` includes the target column 'life expectancy', so
# missing targets are filled with the mean as well — confirm this is intended.
imputer_num = SimpleImputer(strategy="mean")

numeric_cols_to_fill = [name for name in cols_miss if df[name].dtype != "object"]

for col in numeric_cols_to_fill:
    # The imputer is refitted per column, so each column gets its own mean.
    df[col] = imputer_num.fit_transform(df[[col]])
    print(f"{col} done")

## Categoricals

We have only two categorical variables: country and status. As part of the EDA, we do the following to get a general idea of what we are dealing with and to see which encoding method suits our case best:
- we check how many countries we have to see if there are any multiple appearances of certain countries
- we then check the unique values of status to see the frequency of each value

In [None]:
# Number of distinct countries in the dataset (missing values are not counted).
df["country"].nunique(dropna=True)

In [None]:
# Step 1: number of rows (non-null `year` values) recorded for each country.
rows_per_country = (
    df.groupby("country")["year"]
    .count()
    .reset_index(name="country frequencies")
)

# Step 2: how many countries share each row count.
rows_per_country.groupby("country frequencies").count()

We have 193 unique countries, which is too many for a numerical encoding like Label or One-Hot encoding.

- Label encoding introduces an artificial order, which doesn't make sense for countries.
- Regression models will misinterpret the numbers, leading to poor performance

Our best option here is Target Encoding because it handles high cardinality well and captures meaningful relationships. However, there are only two distinct country frequencies, 1 and 16, which is likely to lead to high collinearity between the target variable and the encoded countries (183 countries will be encoded as the mean of the target variable across their 16 rows, and the other 10 will be encoded as exactly the target value of their single row).

For that we are going to improve the target encoding using a smoothing approach.

In [None]:
# Smoothed target encoding of `country`.
#
# A plain per-country mean reproduces the target exactly for countries that
# have a single row. Additive smoothing shrinks every country mean towards the
# global mean; the fewer rows a country has, the stronger the shrinkage:
#
#     encoded = (n * country_mean + m * global_mean) / (n + m)
#
# where n is the number of non-null target values for the country and m is
# the smoothing weight below.
SMOOTHING_WEIGHT = 10  # pseudo-count m; 0 gives back the plain country mean

global_mean = df['life expectancy'].mean()

# Per-country mean and number of non-null target values.
country_stats = df.groupby('country')['life expectancy'].agg(['mean', 'count'])

# A country with no usable target value has a NaN mean and a count of 0;
# filling the mean makes it fall back to the global mean instead of NaN.
country_mean_filled = country_stats['mean'].fillna(global_mean)

# Smoothed mean life expectancy per country (Series indexed by country name).
country_means = (
    country_stats['count'] * country_mean_filled + SMOOTHING_WEIGHT * global_mean
) / (country_stats['count'] + SMOOTHING_WEIGHT)

# Replace country names with their corresponding smoothed mean life expectancy
df['country_encoded'] = df['country'].map(country_means)

In [None]:
# Number of rows (non-null `country` values) for each value of `status`.
df.groupby("status")["country"].count()

As we can see, status is a binary categorical column. In this case both One-Hot encoding and label encoding are valid, but since we are using Linear Regression, OHE is the better choice:
- Label Encoding is generally better for tree-based models because these models are not affected by numerical ordering.
- Linear models might misinterpret label encoding as a numerical relationship (i.e., "Developed" is better than "Developing"), which can bias predictions.

=> so OHE it is...


In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode `status`, dropping the first category so the column is
# replaced by a single dense indicator column.
ohe = OneHotEncoder(drop="first", sparse_output=False)
status_indicator = ohe.fit_transform(df[["status"]])
df["status"] = status_indicator

# Training:

In [None]:
# Pairwise correlations between all columns except the raw country names.
corr = df.drop(columns=["country"]).corr()

# Draw the annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

Remarks:

- We notice some high collinearity between certain features, starting with:
    - infant deaths and under-five deaths, corr = 1.0 => redundancy, we drop under-five deaths.
    - Percentage expenditure and gdp with corr = 0.89 => redundancy, we drop percentage expenditure.
    - country_encoded and life expectancy (target variable) with corr = 0.96 => redundancy, we drop them both from the training set.

In [None]:
# Single-feature training set for the simple regression.
X_simple = df[["schooling"]]

# Training set for the other regression techniques: everything except the
# target, the country columns and the two features dropped as redundant.
excluded_columns = [
    "life expectancy",
    "country",
    "country_encoded",
    "under-five deaths",
    "percentage expenditure",
]
X = df.drop(columns=excluded_columns)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

## Simple regression:

In [None]:

# Target variable shared by the regression cells.
y = df["life expectancy"]

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_simple, y, test_size=0.2, random_state=42
)

# Fit on the training split, then score on the held-out split.
linear_reg = LinearRegression().fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(
    "Simple Linear Regression:",
    f"MSE: {mse:.2f}",
    f"R²: {r2:.2f}",
    sep="\n",
)

## Multiple regression

In [None]:
# Same 80/20 seeded split as before, now on the full feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares on all retained features.
multiple_linear_reg = LinearRegression().fit(X_train, y_train)
y_pred = multiple_linear_reg.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(
    "\nMultiple Linear Regression:",
    f"MSE: {mse:.2f}",
    f"R²: {r2:.2f}",
    sep="\n",
)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the full feature matrix.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# Seeded 80/20 split of the expanded features; X_train_poly / X_test_poly are
# reused by the regularised models that follow.
X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)

# Linear regression on the expanded features.
poly_reg = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_reg.predict(X_test_poly)

mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print(
    "\nPolynomial Regression (Degree=2):",
    f"MSE: {mse_poly:.2f}",
    f"R²: {r2_poly:.2f}",
    sep="\n",
)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression on the degree-2 polynomial features.
#
# The L2 penalty acts on raw coefficient magnitudes, so its effect depends on
# the units of each feature. The features are therefore standardised first.
# The scaler is fitted on the training split only, so no information from the
# test split leaks into the model.
from sklearn.preprocessing import StandardScaler

ridge_scaler = StandardScaler()
X_train_poly_ridge = ridge_scaler.fit_transform(X_train_poly)
X_test_poly_ridge = ridge_scaler.transform(X_test_poly)

ridge_reg = Ridge(alpha=1.0)  # Alpha controls the strength of regularization
ridge_reg.fit(X_train_poly_ridge, y_train)

y_pred_ridge = ridge_reg.predict(X_test_poly_ridge)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("\nRidge Regression:")
print(f"MSE: {mse_ridge:.2f}")
print(f"R²: {r2_ridge:.2f}")

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression on the degree-2 polynomial features.
#
# The L1 penalty acts on raw coefficient magnitudes, so features measured on
# very different scales are penalised unevenly and the solver converges
# poorly. The features are therefore standardised first, with the scaler
# fitted on the training split only to avoid leaking test information.
from sklearn.preprocessing import StandardScaler

lasso_scaler = StandardScaler()
X_train_poly_lasso = lasso_scaler.fit_transform(X_train_poly)
X_test_poly_lasso = lasso_scaler.transform(X_test_poly)

# A higher iteration cap gives coordinate descent room to converge on the
# large number of polynomial terms.
lasso_reg = Lasso(alpha=0.1, max_iter=10_000)
lasso_reg.fit(X_train_poly_lasso, y_train)

y_pred_lasso = lasso_reg.predict(X_test_poly_lasso)

mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print("\nLasso Regression:")
print(f"MSE: {mse_lasso:.2f}")
print(f"R²: {r2_lasso:.2f}")