In [23]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [8]:
# Load the dataset from a CSV file located relative to the working directory.
data_path = "cleaned_data.csv"
df = pd.read_csv(data_path)

In [9]:
# Normalise the column labels: trim surrounding whitespace, lower-case them,
# then turn spaces and hyphens into underscores (other characters such as '/'
# are left as they are).
normalized_columns = df.columns.str.strip().str.lower()
for separator in (' ', '-'):
    normalized_columns = normalized_columns.str.replace(separator, '_')
df.columns = normalized_columns

Training the Linear Regression Model

In [10]:
# Predictor columns and the column to be predicted.
features = [
    'status',
    'gdp',
    'adult_mortality',
    'schooling',
    'hiv/aids',
    'income_composition_of_resources',
    'bmi',
]
target = 'life_expectancy'

# Rows without a target value cannot be used for supervised training, so
# build X and y only from rows where the target is present.
has_target = df[target].notna()
X = df.loc[has_target, features]
y = df.loc[has_target, target]

In [11]:
# Group the feature columns by the preprocessing they need.
categorical_features = ['status']
numerical_features = ['gdp', 'adult_mortality', 'schooling', 'hiv/aids', 'income_composition_of_resources', 'bmi']
# GDP has its own branch below, so it is excluded from the generic numeric branch.
non_gdp_numeric_features = [name for name in numerical_features if name != 'gdp']

# GDP branch: median-impute, apply log(1 + x), then standardise.
# log1p stays finite at zero, unlike a plain log.
gdp_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler()),
])

# Remaining numeric columns: median-impute, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch. Any column not listed in a
# branch is passed through untouched (remainder='passthrough').
column_branches = [
    ('gdp', gdp_pipeline, ['gdp']),
    ('num', numeric_pipeline, non_gdp_numeric_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_branches,
    remainder='passthrough',
)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each partition.
for label, part in (("Training", X_train), ("Testing", X_test)):
    print(f"{label} set shape: {part.shape}")

Training set shape: (2350, 7)
Testing set shape: (588, 7)


In [20]:
# Chain the preprocessing and the linear estimator so both are fitted together
# on the training data only.
linear_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model_pipeline = Pipeline(steps=linear_steps)

# Fit the whole pipeline on the training partition.
print("Training the Linear Regression model...")
model_pipeline.fit(X_train, y_train)
print("Training complete")

Training the Linear Regression model...
Training complete


In [22]:
# Predict on the held-out partition.
y_pred = model_pipeline.predict(X_test)

# Score the predictions: R², MAE, and RMSE (square root of the MSE).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Emit the report as one block of text.
report_lines = [
    "\nModel Evaluation Results",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Error (MAE): {mae:.4f} years",
    f"Root Mean Squared Error (RMSE): {rmse:.4f} years",
]
print("\n".join(report_lines))


Model Evaluation Results
R-squared (R²): 0.7880
Mean Absolute Error (MAE): 3.0892 years
Root Mean Squared Error (RMSE): 4.2864 years



Linear Regression has given us good results according to the error metrics above.
- Now let's try another model.


In [24]:
# Preprocessing for the random forest: numeric columns are only median-imputed
# (no scaling or log transform is applied in this variant).
numeric_pipeline_rf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding,
# ignoring categories that were not seen during fit.
categorical_pipeline_rf = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch.
rf_column_branches = [
    ('num', numeric_pipeline_rf, numerical_features),
    ('cat', categorical_pipeline_rf, categorical_features),
]
preprocessor_rf = ColumnTransformer(transformers=rf_column_branches)

In [25]:
# Random forest pipeline: preprocessing followed by a 100-tree forest.
# random_state fixes the forest's randomness so the fit is repeatable.
# n_jobs=-1 is now actually passed (the original comment mentioned it but the
# argument was missing), so trees are built in parallel on all available cores.
rf_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_rf),
    ('regressor', RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,  # use all available CPU cores
    ))
])

# Train the model!
print("Training the Random Forest Regressor model...")
rf_model_pipeline.fit(X_train, y_train)
print("Training complete!")

Training the Random Forest Regressor model...
Training complete!


In [26]:
# Predict on the held-out partition with the random forest pipeline.
y_pred_rf = rf_model_pipeline.predict(X_test)

# Score the predictions: R², MAE, and RMSE (square root of the MSE).
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))

# Emit the report as one block of text.
rf_report_lines = [
    "\n--- Random Forest Evaluation Results ---",
    f"R-squared (R²): {r2_rf:.4f}",
    f"Mean Absolute Error (MAE): {mae_rf:.4f} years",
    f"Root Mean Squared Error (RMSE): {rmse_rf:.4f} years",
]
print("\n".join(rf_report_lines))


--- Random Forest Evaluation Results ---
R-squared (R²): 0.9635
Mean Absolute Error (MAE): 1.2088 years
Root Mean Squared Error (RMSE): 1.7780 years


Conclusion: 

The Random Forest Regressor is the clear winner. 
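It has dramatically reduced the prediction error (MAE) by more than half (from 3.09 to 1.21 years) and significantly increased the R-squared value (from 0.788 to 0.964). 
This suggests it is much better at capturing the complex and non-linear relationships in the dataset, although the comparison rests on a single train/test split rather than cross-validation.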

Based on this, the Random Forest model is the best one to use for your predictions.