# Linear Regression Model 

## GDP, Undernourishment and Life Expectancy Correlation for Pakistan

### Importing the libraries

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats as st
np.set_printoptions(suppress=True) #to discard the scientific notation 

### Reading the data 

In [32]:
df = pd.read_csv('pakistan.csv') #NaN values were already dropped
df.index = np.arange(1, (len(df)+1)) 
df.head()

Unnamed: 0,date,GDP Per capita,Prevalence of undernourishment (% of population),"Life expectancy at birth, total (years)"
1,2019-01-01,4888.845087,12.9,67.273
2,2018-01-01,4853.266557,12.2,67.114
3,2017-01-01,4571.205078,12.1,66.947
4,2016-01-01,4410.006565,12.4,66.77
5,2015-01-01,4373.014475,12.8,66.577


In [7]:
df.count() 

date                                                19
GDP Per capita                                      19
Prevalence of undernourishment (% of population)    19
Life expectancy at birth, total (years)             19
dtype: int64

In [4]:
df.columns

Index(['date', 'GDP Per capita',
       'Prevalence of undernourishment (% of population)',
       'Life expectancy at birth, total (years)'],
      dtype='object')

### Generating matrices of the outcome the and explanatory variables 

In [8]:
#explanatory variables
X = df[['GDP Per capita','Prevalence of undernourishment (% of population)']]
X = np.array(X)
X

array([[4888.84508713,   12.9       ],
       [4853.2665574 ,   12.2       ],
       [4571.20507841,   12.1       ],
       [4410.00656495,   12.4       ],
       [4373.01447488,   12.8       ],
       [4238.6338674 ,   13.9       ],
       [4103.62956352,   15.1       ],
       [4018.73273732,   15.9       ],
       [3993.55936699,   15.9       ],
       [3890.29061861,   15.9       ],
       [3868.74394234,   15.9       ],
       [3818.07198504,   15.8       ],
       [3766.85556187,   15.8       ],
       [3579.96243152,   16.4       ],
       [3357.36342582,   17.6       ],
       [3127.69076877,   19.        ],
       [2898.03298256,   21.        ],
       [2753.54073973,   21.8       ],
       [2709.02790117,   21.1       ]])

In [9]:
#a column of ones is created to add to the explanatory variable matrix 
column_of_ones = np.ones((len(X),1), dtype=int)
column_of_ones

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [10]:
#concatenating the explanatory variables matrix with the column of ones 
X_final = np.concatenate((column_of_ones,X), axis=1)
X_final

array([[   1.        , 4888.84508713,   12.9       ],
       [   1.        , 4853.2665574 ,   12.2       ],
       [   1.        , 4571.20507841,   12.1       ],
       [   1.        , 4410.00656495,   12.4       ],
       [   1.        , 4373.01447488,   12.8       ],
       [   1.        , 4238.6338674 ,   13.9       ],
       [   1.        , 4103.62956352,   15.1       ],
       [   1.        , 4018.73273732,   15.9       ],
       [   1.        , 3993.55936699,   15.9       ],
       [   1.        , 3890.29061861,   15.9       ],
       [   1.        , 3868.74394234,   15.9       ],
       [   1.        , 3818.07198504,   15.8       ],
       [   1.        , 3766.85556187,   15.8       ],
       [   1.        , 3579.96243152,   16.4       ],
       [   1.        , 3357.36342582,   17.6       ],
       [   1.        , 3127.69076877,   19.        ],
       [   1.        , 2898.03298256,   21.        ],
       [   1.        , 2753.54073973,   21.8       ],
       [   1.        , 2709.

In [11]:
#creating an outcome matrix 
Y = np.array(np.array(df['Life expectancy at birth, total (years)']))
Y

array([67.273, 67.114, 66.947, 66.77 , 66.577, 66.36 , 66.117, 65.849,
       65.562, 65.264, 64.969, 64.685, 64.42 , 64.176, 63.951, 63.736,
       63.522, 63.3  , 63.066])

In [12]:
Y = np.array(Y).reshape((len(Y),1))

### The Normal Equation  

In [13]:
#I divided the equation into four pieces rather than doing it
#in one line because I made use of them 
#in variance calculation.
res1 = np.dot(X_final.T,X_final) 
res1

array([[1.90000000e+01, 7.32204737e+04, 3.03500000e+02],
       [7.32204737e+04, 2.89885153e+08, 1.13500744e+06],
       [3.03500000e+02, 1.13500744e+06, 5.01217000e+03]])

In [14]:
#(X^T.X)^-1
res2 = np.linalg.inv(res1)
res2

array([[124.31775184,  -0.0169962 ,  -3.67896985],
       [ -0.0169962 ,   0.00000235,   0.00049608],
       [ -3.67896985,   0.00049608,   0.11063273]])

In [15]:
#(X^T.X)^-1.X.T
res3 = np.dot(res2,X_final.T)
res3

array([[-6.23275618, -3.05277742,  2.10909334,  3.74516483,  2.90230191,
         1.13939498, -0.98080847, -2.48106077, -2.0532091 , -0.29803262,
         0.06817903,  1.29730682,  2.16779148,  3.1368829 ,  2.50545667,
         1.25846167, -2.19616799, -2.68352457,  0.64830351],
       [ 0.00091199,  0.00048098, -0.00023262, -0.00046327, -0.00035192,
        -0.00012258,  0.00015491,  0.00035193,  0.00029267,  0.00004956,
        -0.00000116, -0.00017005, -0.00029062, -0.00043293, -0.00036165,
        -0.0002078 ,  0.00024373,  0.00030045, -0.0001516 ],
       [ 0.1734627 ,  0.0783699 , -0.07261912, -0.11939706, -0.09349509,
        -0.03846295,  0.02732307,  0.07371343,  0.06122536,  0.00999554,
        -0.00069338, -0.03689413, -0.0623017 , -0.08863646, -0.06630464,
        -0.02535539,  0.08198088,  0.098807  , -0.00071796]])

In [16]:
#(X^T.X)^-1.X.T.Y
res4 = np.dot(res3,Y)
res4
#Variable Coefficients:
#Intercept
#GDP Per capita
#Undernourishment

array([[54.88897342],
       [ 0.00238766],
       [ 0.07229738]])

In [17]:
Y_pred = np.dot(X_final,res4)
Y_pred

array([[67.49448637],
       [67.35892894],
       [66.67823363],
       [66.31503636],
       [66.25563096],
       [66.01430351],
       [65.77871663],
       [65.63385018],
       [65.57374485],
       [65.32717468],
       [65.27572864],
       [65.14751174],
       [65.02522458],
       [64.62236664],
       [64.17763381],
       [63.73047101],
       [63.32672215],
       [63.03956239],
       [62.88267291]])

### Errors of the Estimate

In [18]:
#Sum of Squared Errors (SSE)
#E = Errors 
E = Y - Y_pred
sse = np.dot(E.T,E)
sse

array([[1.84041486]])

In [19]:
#Mean of Squared Errors (MSE) 
E = Y - Y_pred
n = len(Y)
mse = sse / (n - X.shape[1])
mse

array([[0.1082597]])

In [20]:
#Standard Error of the Estimate (SE)
se = math.sqrt(mse)
se

0.32902841484366707

### Standard Errors of the Variables via the Variance

In [21]:
#a list of variable standard errors 
se_variables = np.sqrt(np.diag(mse*res2)).tolist()
se_variables

[3.6685967674061812, 0.0005048286039150371, 0.10943978379893991]

In [22]:
#a list of betas
betalist = res4.reshape(res4.shape[0],).tolist()
betalist

[54.88897341528781, 0.002387655269639688, 0.07229738168175182]

In [23]:
#a dictionary of beta - se tuples to be used in the CI formula 
d1 = dict(zip(betalist,se_variables))
print(d1)

{54.88897341528781: 3.6685967674061812, 0.002387655269639688: 0.0005048286039150371, 0.07229738168175182: 0.10943978379893991}


### Calculating the t-stat using the scipy module 

In [24]:
#%95 confidence
#n > 1000 
t = st.t.ppf(1-0.025,n-X.shape[1])
#round(t,2)
t

2.1098155778331806

### %95 Confidence intervals 

In [25]:
#Lower CI's

#intercept lower
#GDP lower
#Undenourishment lower 
for beta in d1:
    print(beta-(t*d1[beta]))

47.1489108066258
0.001322560016963966
-0.15860037901194696


In [26]:
#Upper CI's

#intercept upper
#GDP upper
#Undernourishment upper

for beta in d1:
    print(beta+(t*d1[beta]))

62.629036023949816
0.0034527505223154098
0.3031951423754506


### t-values 

In [27]:
t_list = []

In [28]:
#intercept t-value
#GDP t-value
#Undernourishment t-value 
for beta in d1:  
    beta/d1[beta]
    t_list.append(beta/d1[beta])

In [29]:
t_list

[14.961844240542174, 4.729635466617758, 0.6606133452764745]

In [30]:
#intercept t
#GDP t
#Undernourishment t 
for t_v in t_list:
    if t_v >= t:
        print('Null hypothesis rejected.')
    else:
        print("Failed to reject the null hypothesis.")

Null hypothesis rejected.
Null hypothesis rejected.
Failed to reject the null hypothesis.
