# S2 - L3 - E1 / Life Expectancy prediction

In this exercise, I will create a feed-forward neural network using PyTorch to solve a regression problem. The goal is to predict the life expectancy of different countries from features such as Gross Domestic Product (GDP), health expenditure, adult mortality, access to education, and others. I will use a dataset provided by the World Health Organization (WHO), available on Kaggle.

### 1. Data preprocessing

In [None]:
import seaborn as sns
import numpy as np 
import pandas as pd 
import matplotlib as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import torch
import torch.nn as nn
from  torch.utils.data import DataLoader,Dataset

In [None]:
# Load the life-expectancy dataset from a CSV file in the working directory.
df = pd.read_csv('dataset.csv')

In [None]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [None]:
# DataFrame.shape is (rows, columns): rows are samples, columns are features.
n_samples, n_features = df.shape
print("Number of features: ", n_features)
print("Number of samples: ", n_samples)

Number of features:  22
Number of samples:  2938


In [None]:
# Show the raw headers first: several carry stray leading/trailing spaces.
print(df.columns)

# Trim each header, then swap every remaining space for an underscore.
df.columns = df.columns.str.strip().str.replace(" ", "_", regex=False)
print(df.columns)

Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')
Index(['Country', 'Year', 'Status', 'Life_expectancy', 'Adult_Mortality',
       'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B',
       'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness__1-19_years',
       'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling'],
      dtype='object')


Check null values in key columns

In [None]:
# Count missing values per column to decide what to drop and what to impute.
df.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life_expectancy                     10
Adult_Mortality                     10
infant_deaths                        0
Alcohol                            194
percentage_expenditure               0
Hepatitis_B                        553
Measles                              0
BMI                                 34
under-five_deaths                    0
Polio                               19
Total_expenditure                  226
Diphtheria                          19
HIV/AIDS                             0
GDP                                448
Population                         652
thinness__1-19_years                34
thinness_5-9_years                  34
Income_composition_of_resources    167
Schooling                          163
dtype: int64

Drop the rows that have null values in the key columns

In [None]:
# Columns that must be populated in every row that is kept.
required_cols = [
    "BMI",
    "GDP",
    "Schooling",
    "Adult_Mortality",
    "Income_composition_of_resources",
    "Life_expectancy",
]

# Keep only the rows where all required columns are present, then renumber
# the index so it is contiguous again.
has_all_required = df[required_cols].notna().all(axis=1)
df = df.loc[has_all_required].reset_index(drop=True)

Fill the remaining missing values in the numeric columns with each column's mean

In [None]:

# Only numeric columns can be mean-imputed; text columns are left untouched.
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Replace every remaining NaN with the mean of its column.
imp = SimpleImputer(strategy="mean")
imputed_values = imp.fit_transform(df[numeric_cols])
df[numeric_cols] = imputed_values

# Confirm that no missing values are left.
df.isnull().sum()

Country                            0
Year                               0
Status                             0
Life_expectancy                    0
Adult_Mortality                    0
infant_deaths                      0
Alcohol                            0
percentage_expenditure             0
Hepatitis_B                        0
Measles                            0
BMI                                0
under-five_deaths                  0
Polio                              0
Total_expenditure                  0
Diphtheria                         0
HIV/AIDS                           0
GDP                                0
Population                         0
thinness__1-19_years               0
thinness_5-9_years                 0
Income_composition_of_resources    0
Schooling                          0
dtype: int64

### 2. Split Dataset

In [None]:
# Target: the life expectancy column.
y = df['Life_expectancy'].copy()

# Features: the five predictors used by the model. The selection is taken as
# an explicit copy so later transformations never write back into `df`.
# (A previous `x = df.drop(...)` assignment was dead code: it was overwritten
# immediately by this selection.)
desired_cols = ["BMI","GDP","Schooling","Adult_Mortality","Income_composition_of_resources"]
x = df[desired_cols].copy()

print(x.shape)
print(y.shape)


(2458, 5)
(2458,)


### 3. Standardize variables to speed up convergence

In [None]:
# Hold out 15% of the samples as the test set. The seed is fixed so the split
# (and therefore every metric reported below) is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, shuffle=True, random_state=42
)

# Fit the scaler on the training portion only, then apply the same
# mean/variance to the test portion so no test statistics leak into training.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Carve a validation set (20%) out of the training data.
# train_test_split returns its outputs grouped per input array:
# (X_train, X_val, y_train, y_val). Unpacking them in any other order silently
# assigns features to target variables and vice versa.
# NOTE: this cell overwrites x_train / y_train, so re-running it on its own
# shrinks the training set again; re-run the previous split cell first.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, shuffle=True, random_state=42
)

### 4. Create Dataset Class and transform into tensors 

In [None]:
class LifeDataset(Dataset):
    """Map-style dataset pairing feature rows with their regression targets.

    Both inputs are copied into float32 tensors at construction time.

    Args:
        x: Array-like of features; the first dimension indexes samples.
        y: Array-like of targets, one per sample (anything ``np.asarray``
            accepts, e.g. a list, NumPy array or pandas Series).
    """

    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        # Go through NumPy first so that non-array inputs are converted to a
        # plain float32 array before the tensor copy is made.
        targets = np.asarray(y, dtype=np.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (size of the first feature dimension)."""
        return self.x.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        features = self.x[idx]
        target = self.y[idx]
        return features, target

In [None]:
# Wrap the training split as a LifeDataset (features and targets as tensors).
train_dataset = LifeDataset(x_train,y_train)

# Wrap the held-out test split the same way.
test_dataset = LifeDataset(x_test,y_test)


In [None]:
# Wrap the validation split as a LifeDataset for the per-epoch evaluation.
val_dataset = LifeDataset(x_val,y_val)

### 5. Use DataLoaders to arrange the data in batches

In [None]:
batch_size = 32

# Load batches in the main process. Worker processes (num_workers > 0) have to
# import the Dataset class, which fails for classes defined inside a notebook
# on platforms that spawn workers (e.g. Windows) and surfaces as
# "DataLoader worker exited unexpectedly". The data already lives in memory,
# so extra workers would bring no benefit here anyway.
num_workers = 0

# Only the training data is shuffled; validation and test keep a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

### 6. Create model 

In [None]:
class MLPRegressor(nn.Module):
    """Feed-forward regressor: input -> 16 -> 32 -> 1 with ReLU activations.

    Args:
        input_shape: Number of input features per sample.
    """

    def __init__(self, input_shape):
        super().__init__()

        self.fc1 = nn.Linear(input_shape, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        """Map a batch of feature rows to one prediction per row.

        Args:
            x: Float tensor whose last dimension equals ``input_shape``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        # torch.relu applies the activation to the tensor. Calling
        # nn.ReLU(tensor) would instead construct a new ReLU *module* (the
        # tensor being taken as its `inplace` argument) and break the next layer.
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # No activation on the output layer: this is an unbounded regression.
        return self.fc3(x)



In [None]:
# Size the input layer from the number of columns in the feature matrix `x`.
model = MLPRegressor(input_shape = x.shape[1])

### 7. Define cost function and optimizer

In [None]:
# Initial hyper-parameters: mean-squared-error loss and Adam with L2 weight decay.
# NOTE(review): the following cell rebinds `learning_rate`, `optimizer` and
# `loss_fn`, so the values set here (lr = 0.1) are not the ones used for training.
learning_rate = 0.1
weight_decay = 0.01
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr = learning_rate,weight_decay=weight_decay) 

In [None]:
# Training hyper-parameters.
epochs = 400  # number of epochs
learning_rate = 0.01
weight_decay = 0.01

# Cost function: mean squared error between predictions and targets.
loss_fn = nn.MSELoss()

# Optimizer: Adam over the model parameters with the learning rate and
# weight decay chosen above.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

### 8. Model Training and Evaluation functions


In [None]:
def train(model,train_dataloader,optimizer,loss_fn):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Module to optimize; it is switched to training mode.
        train_dataloader: Iterable yielding ``(features, targets)`` batches.
        optimizer: Optimizer bound to the parameters of ``model``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.

    Returns:
        The average of the per-batch losses over the epoch, as a float.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    num_batches = 0

    for x_batch, y_batch in train_dataloader:
        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()

        pred = model(x_batch)

        # Targets arrive as a flat vector; reshape them to a column so they
        # line up with the (batch, 1) predictions instead of broadcasting.
        batch_loss = loss_fn(pred, y_batch.reshape(-1, 1))

        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_dataloader yielded no batches")

    # Divide by the number of batches processed, not by the last batch index
    # (which is one too small and is zero when there is a single batch).
    return epoch_loss / num_batches





In [None]:
def evaluation(model, val_dataloader, loss_fn):
    """Compute the mean loss per batch over a dataloader without training.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        val_dataloader: Iterable yielding ``(features, targets)`` batches.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.

    Returns:
        The average of the per-batch losses, as a float.

    Raises:
        ValueError: If ``val_dataloader`` yields no batches.
    """
    model.eval()
    epoch_loss = 0.0
    num_batches = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for x_batch, y_batch in val_dataloader:
            preds = model(x_batch)

            # Compare the predictions against the targets reshaped to a column
            # so both have shape (batch, 1).
            batch_loss = loss_fn(preds, y_batch.reshape(-1, 1))

            epoch_loss += batch_loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("val_dataloader yielded no batches")

    # Divide by the number of batches processed, not by the last batch index
    # (which is one too small and is zero when there is a single batch).
    return epoch_loss / num_batches

### 9. Training and validation loop

In [None]:
import time

def training_evaluation_loop(epochs, model, train_dataloader, val_dataloader, optimizer, loss_fn):
    """Train for ``epochs`` epochs, evaluating on the validation data after each.

    Args:
        epochs: Number of passes over the training data.
        model: Module to train.
        train_dataloader: Batches passed to ``train`` each epoch.
        val_dataloader: Batches passed to ``evaluation`` each epoch.
        optimizer: Optimizer bound to the parameters of ``model``.
        loss_fn: Loss callable shared by training and evaluation.

    Returns:
        Tuple ``(loss_values_train, loss_values_val)``: two lists holding one
        loss value per epoch.
    """
    start = time.time()

    # Per-epoch loss history for both splits.
    loss_values_train = []
    loss_values_val = []

    for epoch in range(epochs):

        # One optimization pass over the training data.
        loss_train = train(model, train_dataloader, optimizer, loss_fn)
        loss_values_train.append(loss_train)

        # Loss on the validation data, without updating the weights.
        loss_val = evaluation(model, val_dataloader, loss_fn)
        loss_values_val.append(loss_val)

        # Report progress every 10 epochs, and always on the final epoch.
        if (epoch + 1) % 10 == 0 or epoch + 1 == epochs:
            print(f'Epoch {epoch + 1}/{epochs} - train loss: {loss_train:.4f} - val loss: {loss_val:.4f}')

    end = time.time()
    total_time = end - start

    print(f'Total training time: {total_time}')

    return loss_values_train, loss_values_val

In [None]:
def predictions(model,test_dataloader):
    """Run the model over a dataloader and collect predictions and targets.

    Also prints the mean squared error between the two.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        test_dataloader: Iterable yielding ``(features, targets)`` batches.

    Returns:
        Tuple ``(predictions, real_values)`` of NumPy arrays: the model outputs
        of every batch stacked row-wise, and the targets concatenated into one
        flat array.
    """
    predicted_batches = []
    real_batches = []

    model.eval()
    with torch.no_grad():
        for x_batch, y_batch in test_dataloader:
            outputs = model(x_batch)
            # Tensor -> NumPy conversion is `.numpy()`; tensors have no `.np()`.
            predicted_batches.append(outputs.detach().cpu().numpy())
            real_batches.append(y_batch.detach().cpu().numpy())

    predicted_values = np.vstack(predicted_batches)
    real_values = np.hstack(real_batches)

    # Regression metric: mean squared error between predicted and real values.
    # Both arrays are flattened so they are compared element by element.
    mse = float(np.mean((predicted_values.ravel() - real_values.ravel()) ** 2))
    print(f'Test MSE: {mse:.4f}')

    return predicted_values, real_values

### 10. Train and test model

In [None]:
# Number of passes over the training data for this run.
EPOCHS = 100

# Train the model and keep the per-epoch loss history of both splits.
# Arguments are passed in the order the function declares them:
# epochs, model, train_dataloader, val_dataloader, optimizer, loss_fn.
train_losses, val_losses = training_evaluation_loop(
    EPOCHS, model, train_dataloader, val_dataloader, optimizer, loss_fn
)


RuntimeError: DataLoader worker (pid(s) 13664, 7452, 16852, 1444) exited unexpectedly

In [None]:
import matplotlib.pyplot as plt

# Learning curve: training vs. validation MSE, one point per epoch (1-based).
fig, ax = plt.subplots(figsize=(8, 5))

train_epochs = range(1, len(train_losses) + 1)
val_epochs = range(1, len(val_losses) + 1)
ax.plot(train_epochs, train_losses, label="Train MSE")
ax.plot(val_epochs, val_losses, label="Val MSE")

ax.set_xlabel("Época")
ax.set_ylabel("MSE")
ax.set_title("Curva de aprendizaje (MSE Train vs Val)")
ax.legend()
ax.grid(True)
plt.show()


NameError: name 'train_losses' is not defined

<Figure size 800x500 with 0 Axes>

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

model.eval()

# Run inference on whichever device the model's weights live on, instead of
# relying on a DEVICE constant that is never defined in this notebook.
device = next(model.parameters()).device

y_true, y_pred = [], []

with torch.no_grad():
    for xb, yb in test_dataloader:
        preds = model(xb.to(device))
        # Flatten each batch before collecting it: targets are 1-D and the last
        # batch is usually shorter, so the batches cannot be stacked row-wise.
        y_pred.append(preds.cpu().numpy().ravel())
        y_true.append(yb.cpu().numpy().ravel())

y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred)

# Regression metrics on the held-out test set.
test_mse = mean_squared_error(y_true, y_pred)
test_mae = mean_absolute_error(y_true, y_pred)
test_r2  = r2_score(y_true, y_pred)

print("======= Métricas en TEST =======")
print(f"MSE : {test_mse:.4f}")
print(f"MAE : {test_mae:.4f}")
print(f"R²  : {test_r2:.4f}")
