# Linear Regression 

> Team Name: *S Legends*
> 
> Team Members: Myles, Tani, Arjan, Archie 

## Train-Test Split

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.tools

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [5]:
# Load the life-expectancy dataset; the CSV path is relative to the notebook's working directory.
df = pd.read_csv('Life Expectancy Data.csv')

# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,...,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,...,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,India,Asia,2007,51.5,67.9,201.0765,1.57,60,35,21.2,...,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,Guyana,South America,2006,32.8,40.5,222.1965,5.68,93,74,25.3,...,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,...,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [6]:
# Every column except the target ('Life_expectancy') is kept as a candidate feature.
# list.remove raises ValueError if the target column is missing from df.
feature_cols = list(df.columns)
feature_cols.remove('Life_expectancy')

In [7]:
# X holds the candidate features, y the target to predict.
X = df[feature_cols]
y = df['Life_expectancy']

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Feature Engineering

In [10]:
# Preview the training features before any feature engineering is applied.
X_train.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing
2026,Sri Lanka,Asia,2014,7.9,9.3,111.2825,2.45,99,99,22.9,99,99,0.01,3694,20.78,15.2,15.0,10.9,0,1
651,Czechia,European Union,2004,3.7,4.6,114.2985,13.42,98,98,26.6,96,98,0.08,14070,10.2,2.1,2.2,11.6,1,0
2225,"Venezuela, RB",South America,2014,15.4,18.0,143.0785,6.6,78,83,26.6,79,78,0.4,16056,30.04,1.6,1.5,10.0,0,1
2357,Albania,Rest of Europe,2010,11.8,13.3,80.9365,4.88,99,98,26.1,99,99,0.03,3577,2.91,1.4,1.5,9.3,0,1
670,Namibia,Africa,2003,43.3,74.4,495.7265,2.29,83,64,23.2,82,79,9.74,3298,1.88,14.2,14.3,5.8,0,1


### One-Hot Encoding

In [73]:
def ohe(df, columns = ('Country', 'Region')):
    """One-hot encode categorical columns of ``df`` and return a new frame.

    Each listed column is replaced by integer (0/1) dummy columns named
    ``<column>_<category>``. The first category of every column is dropped
    (``drop_first=True``) so the dummies of one column are not perfectly
    collinear with a constant term.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : str or iterable of str, optional
        Names of the columns to encode, processed in the given order.
        Defaults to ``('Country', 'Region')``.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns in their original order, followed by the
        dummy columns of each encoded column in the order of ``columns``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    encoded = df.copy()
    # Encode one column at a time so the dummy blocks are appended in the
    # order of `columns`.
    for col in columns:
        encoded = pd.get_dummies(encoded, columns = [col], drop_first = True, prefix = col, dtype=int)
    return encoded

**Get the new column names produced by the OHE and feature engineering steps**  
 (OHE creates many new columns, and the feature engineering adds a constant column)

### Scaling

In [62]:
from sklearn.preprocessing import PowerTransformer

In [64]:
def scaling(df, transformer = None):
    """Apply a power transform to every column of ``df``.

    Parameters
    ----------
    df : array-like
        Numeric data accepted by ``PowerTransformer``.
    transformer : fitted PowerTransformer, optional
        When omitted, a new ``PowerTransformer`` with default settings is
        created and fitted on ``df`` (the original behaviour). Pass a
        transformer that was already fitted on the training data to scale
        validation/test data with the training parameters instead of
        re-estimating them from the held-out data.

    Returns
    -------
    The output of ``transformer.transform(df)``. Under scikit-learn's
    default output setting this is a NumPy array, so column labels and the
    index of a DataFrame input are not carried over.
    """
    if transformer is None:
        # No fitted transformer supplied: estimate the parameters from df itself.
        transformer = PowerTransformer()
        transformer.fit(df)
    return transformer.transform(df)

In [66]:
def feature_eng(df):
    """Run the full feature-engineering pipeline on ``df``.

    Steps, in order: one-hot encode the categorical columns (``ohe``),
    power-transform the result (``scaling``), then add a constant column
    with ``sm.add_constant``. The input frame is copied first and is not
    modified.

    Returns whatever ``sm.add_constant`` returns for the scaled data.
    """
    encoded = ohe(df.copy())
    scaled = scaling(encoded)
    # The constant (intercept) column must be added explicitly for statsmodels.
    with_constant = sm.add_constant(scaled)
    return with_constant

In [112]:
# Build the column labels here instead of relying on a `columns` variable left
# over from an earlier kernel state: feature_eng returns an unlabelled array
# with the constant column first (sm.add_constant prepends by default),
# followed by the one-hot-encoded columns in the order ohe() produces them.
columns = ['const'] + list(ohe(X_train).columns)

# Keep X_train's index so the engineered frame stays row-aligned with y_train.
X_train_fe = pd.DataFrame(feature_eng(X_train), columns = columns, index = X_train.index)

### Feature Selection (Variance Inflation Factor Method)

In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor # a module to evaluate the (VIF)

In [None]:
## This is a piece of code from stats.stackexchange.com

## It computes the VIF of every variable that is still in the set.
## If any of them has a VIF higher than the threshold (5 by default), it drops the one with the highest VIF.
## Then it keeps going until none of them has a VIF higher than the threshold.
## This leaves us with a set of features without strong collinearity.

def calculate_vif(X, thresh = 5.0):
    """Iteratively drop the column with the highest variance inflation factor.

    On every pass the VIF of each remaining column is computed against the
    other remaining columns. If the largest VIF exceeds ``thresh`` that
    column is dropped and the process repeats; otherwise the loop ends.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features (numeric columns).
    thresh : float, optional
        Largest VIF that is tolerated. Defaults to 5.0.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the columns that were kept, in their original
        order. Each drop and the final column set are also printed.
    """
    variables = list(range(X.shape[1]))

    # With fewer than two columns there is nothing left to be collinear with,
    # and max() on an empty VIF list would raise, so stop there.
    while len(variables) > 1:
        subset = X.iloc[:, variables]
        # Materialise the array once per pass instead of once per column.
        values = subset.values
        vif = [variance_inflation_factor(values, ix)
               for ix in range(values.shape[1])]

        max_vif = max(vif)
        # Also ends the loop when max_vif is NaN (the comparison is False),
        # which matches the behaviour of the original condition.
        if not max_vif > thresh:
            break

        maxloc = vif.index(max_vif) # position of the first column with the highest VIF
        # f-string so non-string column labels do not break the message.
        print(f"dropping '{subset.columns[maxloc]}' at index: {maxloc}")
        del variables[maxloc] # remove that column and re-evaluate the rest

    print('Remaining variables:')
    print(X.columns[variables]) # finally, we print the variables that are still in our set
    return X.iloc[:, variables] # and return our X cut down to the remaining variables

**Remember to exclude the constant column during the feature selection process, but to then use it when fitting the model**

In [147]:
# iloc[:, 1:] skips the first column (the constant added in feature_eng) so that
# only the actual features take part in the VIF-based selection.
calculate_vif(X_train_fe.iloc[:,1:]).head()

  vif = 1. / (1. - r_squared_i)


dropping 'Economy_status_Developed' at index: 16
dropping 'Economy_status_Developing' at index: 16
dropping 'Country_Albania' at index: 16
dropping 'Country_Algeria' at index: 16
dropping 'Country_Antigua and Barbuda' at index: 17
dropping 'Country_Argentina' at index: 17
dropping 'Country_Australia' at index: 18
dropping 'Country_Austria' at index: 18
dropping 'Country_Bahrain' at index: 20
dropping 'Country_Canada' at index: 38
dropping 'Under_five_deaths' at index: 2
dropping 'Population_mln' at index: 11
dropping 'BMI' at index: 6
dropping 'Region_European Union' at index: 185
dropping 'Region_Asia' at index: 183
dropping 'Infant_deaths' at index: 1


LinAlgError: SVD did not converge

##