# Linear Regression 

> Team Name: *S Legends*
> 
> Team Members: Myles, Tani, Arjan, Archie 

## Train-Test Split

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.tools

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [5]:
df = pd.read_csv('Life Expectancy Data.csv')

df.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,...,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,...,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,India,Asia,2007,51.5,67.9,201.0765,1.57,60,35,21.2,...,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,Guyana,South America,2006,32.8,40.5,222.1965,5.68,93,74,25.3,...,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,...,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [6]:
feature_cols = list(df.columns)
feature_cols.remove('Life_expectancy')

In [7]:
X = df[feature_cols]
y = df['Life_expectancy']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Feature Engineering

In [10]:
X_train.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing
2026,Sri Lanka,Asia,2014,7.9,9.3,111.2825,2.45,99,99,22.9,99,99,0.01,3694,20.78,15.2,15.0,10.9,0,1
651,Czechia,European Union,2004,3.7,4.6,114.2985,13.42,98,98,26.6,96,98,0.08,14070,10.2,2.1,2.2,11.6,1,0
2225,"Venezuela, RB",South America,2014,15.4,18.0,143.0785,6.6,78,83,26.6,79,78,0.4,16056,30.04,1.6,1.5,10.0,0,1
2357,Albania,Rest of Europe,2010,11.8,13.3,80.9365,4.88,99,98,26.1,99,99,0.03,3577,2.91,1.4,1.5,9.3,0,1
670,Namibia,Africa,2003,43.3,74.4,495.7265,2.29,83,64,23.2,82,79,9.74,3298,1.88,14.2,14.3,5.8,0,1


### One-Hot Endcoding

In [12]:
def ohe(df):
    df = df.copy()
    df.drop(columns = ['Country'], inplace = True)
    df = pd.get_dummies(df, columns = ['Region'], drop_first = True, prefix = 'Region', dtype=int)
    return df

**Get new column names from after the OHE and feature engineering**  
 (OHE created many new columns and the feature engineering added a constant column)

In [14]:
columns = list(ohe(X_train))
columns.insert(0,'Constant')
columns

['Constant',
 'Year',
 'Infant_deaths',
 'Under_five_deaths',
 'Adult_mortality',
 'Alcohol_consumption',
 'Hepatitis_B',
 'Measles',
 'BMI',
 'Polio',
 'Diphtheria',
 'Incidents_HIV',
 'GDP_per_capita',
 'Population_mln',
 'Thinness_ten_nineteen_years',
 'Thinness_five_nine_years',
 'Schooling',
 'Economy_status_Developed',
 'Economy_status_Developing',
 'Region_Asia',
 'Region_Central America and Caribbean',
 'Region_European Union',
 'Region_Middle East',
 'Region_North America',
 'Region_Oceania',
 'Region_Rest of Europe',
 'Region_South America']

### Scaling

In [16]:
from sklearn.preprocessing import PowerTransformer

In [17]:
def scaling(df):
    pt = PowerTransformer()
    pt.fit(df)
    df = pt.transform(df)
    return df

In [18]:
def feature_eng(df):
        df = df.copy()
        df = ohe(df)
        df = scaling(df)
        df = sm.add_constant(df) # CRUCIAL for statsmodels!!
        return pd.DataFrame(df, columns = columns)

In [20]:
X_train_fe = feature_eng(X_train)

### Feature Selection (By Variation Inflation Factor Method)

In [23]:
from statsmodels.stats.outliers_influence import variance_inflation_factor # a module to evaluate the (VIF)

In [25]:
## This a piece of code from stats.stackexchange.com

## It runs the model with all the variables.
## If any of them have a higher VIF than 5, it drops the max. 
## Then it keeps going until none of them have a higher VIF than 5.
## This leaves us with a nice set of features with no collineraity

def calculate_vif(X, thresh = 5.0):
    variables = list(range(X.shape[1]))
    dropped = True
    while dropped:
        dropped = False
        # this bit uses list comprehension to gather all the VIF values of the different variables
        vif = [variance_inflation_factor(X.iloc[:, variables].values, ix)
               for ix in range(X.iloc[:, variables].shape[1])]
        
        maxloc = vif.index(max(vif)) # getting the index of the highest VIF value
        if max(vif) > thresh:
            print('dropping \'' + X.iloc[:, variables].columns[maxloc] +
                  '\' at index: ' + str(maxloc))
            del variables[maxloc] # we delete the highest VIF value on condition that it's higher than the threshold
            dropped = True # if we deleted anything, we set the 'dropped' value to True to stay in the while loop

    print('Remaining variables:')
    return list(X.columns[variables]) # finally, we print the variables that are still in our set
   

**Remember to exclude the constant column during the feature selection process, but to then use it when fitting the model**

In [28]:
selected_features = list(calculate_vif(X_train_fe.iloc[:,1:]))
selected_features

  vif = 1. / (1. - r_squared_i)


dropping 'Economy_status_Developed' at index: 16
dropping 'Under_five_deaths' at index: 2
dropping 'Diphtheria' at index: 8
dropping 'Infant_deaths' at index: 1
dropping 'Thinness_five_nine_years' at index: 11
dropping 'Economy_status_Developing' at index: 12
dropping 'GDP_per_capita' at index: 8
Remaining variables:


['Year',
 'Adult_mortality',
 'Alcohol_consumption',
 'Hepatitis_B',
 'Measles',
 'BMI',
 'Polio',
 'Incidents_HIV',
 'Population_mln',
 'Thinness_ten_nineteen_years',
 'Schooling',
 'Region_Asia',
 'Region_Central America and Caribbean',
 'Region_European Union',
 'Region_Middle East',
 'Region_North America',
 'Region_Oceania',
 'Region_Rest of Europe',
 'Region_South America']

## Test on Train

**Check that indices line up between X and y**

In [31]:
# Sanity check 1: Check that all record lengths match
print(f'Same number of records in Train: {X_train_fe.shape[0] == y_train.shape[0]}')
print(f'Same number of records in Test: {X_test.shape[0] == y_test.shape[0]}')

print('~'*50)
# Sanity check 2: Check that all indices match
print(f'Same indices in X_train and y_train: {all(X_train_fe.index == y_train.index)}')
print(f'Same indices in X_test and y_test: {all(X_test.index == y_test.index)}')

Same number of records in Train: True
Same number of records in Test: True
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Same indices in X_train and y_train: False
Same indices in X_test and y_test: True


In [50]:
X_train_fe.index = y_train.index

**Re-add the Constant column for the lin reg model**

In [35]:
selected_features.insert(0,'Constant')

### Fit the Linear Regression Model

In [37]:
lin_reg = sm.OLS(y_train, X_train_fe[selected_features])
results = lin_reg.fit()
results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.949
Method:,Least Squares,F-statistic:,2257.0
Date:,"Sat, 12 Jul 2025",Prob (F-statistic):,0.0
Time:,23:17:07,Log-Likelihood:,-4977.6
No. Observations:,2291,AIC:,9995.0
Df Residuals:,2271,BIC:,10110.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Constant,68.7972,0.045,1542.913,0.000,68.710,68.885
Year,0.2685,0.048,5.599,0.000,0.174,0.363
Adult_mortality,-6.2569,0.083,-75.528,0.000,-6.419,-6.094
Alcohol_consumption,0.4253,0.082,5.197,0.000,0.265,0.586
Hepatitis_B,0.1411,0.079,1.788,0.074,-0.014,0.296
Measles,-0.1879,0.063,-2.990,0.003,-0.311,-0.065
BMI,0.4907,0.086,5.726,0.000,0.323,0.659
Polio,0.9143,0.090,10.174,0.000,0.738,1.091
Incidents_HIV,-1.0450,0.083,-12.609,0.000,-1.207,-0.882

0,1,2,3
Omnibus:,58.046,Durbin-Watson:,2.031
Prob(Omnibus):,0.0,Jarque-Bera (JB):,129.034
Skew:,-0.079,Prob(JB):,9.57e-29
Kurtosis:,4.152,Cond. No.,7.97


### Find RMSE of the model predicting the training target

In [56]:
y_pred = results.predict(X_train_fe[selected_features])

rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)

print(rmse)

2.124896129320496


## Test on the Test Set

In [41]:
## We apply feature_eng to the X_test set! 
## This is why having a nice neat function is very nice! 
X_test_fe = feature_eng(X_test)

## Now we predict using the X_test_fe set!
## We don't "fit" the model again! 
## We want to see test results that are similar to the training results!

In [43]:
y_test_pred = results.predict(X_test_fe[selected_features])
rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)
print(rmse)

2.225885732823611
