In [1]:
import requests
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns


In [2]:
df = pd.read_csv('Cleaned-data-vc.csv', index_col = 0)

In [3]:
print(df.keys())

Index(['vaxView', 'year', 'value', 'lowerLimit', 'upperLimit', 'sampleSize',
       'confidenceInterval', 'demographicClass', 'upperError', 'lowerError'],
      dtype='object')


In [4]:
np.random.seed(0)
df_train, df_test = train_test_split(df, train_size = 0.7, test_size = 0.3, random_state = 100)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Applying scaler() to all the columns except the 'yes-no' and 'dummy' variables
num_vars = ['vaxView', 'lowerLimit', 'upperLimit', 'sampleSize', 'confidenceInterval','demographicClass','upperError','lowerError']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,vaxView,year,value,lowerLimit,upperLimit,sampleSize,confidenceInterval,demographicClass,upperError,lowerError
676,0.000000,2016,75.8,0.674919,0.810251,0.012329,0.232394,0.416667,0.245211,0.226415
1097,0.000000,2001,93.4,0.904198,0.958561,0.011280,0.091549,0.000000,0.091954,0.110063
1045,0.666667,2017,44.9,0.240043,0.607415,0.000393,0.647887,0.833333,0.716475,0.525157
1396,0.000000,2006,88.2,0.817008,0.924755,0.004722,0.179577,0.416667,0.172414,0.201258
1403,0.666667,2016,34.3,0.247578,0.359869,0.011455,0.204225,0.333333,0.252874,0.169811
...,...,...,...,...,...,...,...,...,...,...
56,0.000000,2005,88.8,0.809473,0.935660,0.005815,0.207746,0.166667,0.187739,0.242138
385,0.666667,2009,45.4,0.240043,0.617230,0.000481,0.665493,0.333333,0.731801,0.540881
86,0.000000,2008,93.9,0.891281,0.968375,0.009138,0.126761,0.333333,0.107280,0.163522
876,0.000000,2013,70.5,0.571582,0.788441,0.002842,0.373239,0.083333,0.371648,0.361635


In [5]:
# Separating the training set into target and predictor variables

y_train = df_train.pop('value')
x_train = df_train
x_train

Unnamed: 0,vaxView,year,lowerLimit,upperLimit,sampleSize,confidenceInterval,demographicClass,upperError,lowerError
676,0.000000,2016,0.674919,0.810251,0.012329,0.232394,0.416667,0.245211,0.226415
1097,0.000000,2001,0.904198,0.958561,0.011280,0.091549,0.000000,0.091954,0.110063
1045,0.666667,2017,0.240043,0.607415,0.000393,0.647887,0.833333,0.716475,0.525157
1396,0.000000,2006,0.817008,0.924755,0.004722,0.179577,0.416667,0.172414,0.201258
1403,0.666667,2016,0.247578,0.359869,0.011455,0.204225,0.333333,0.252874,0.169811
...,...,...,...,...,...,...,...,...,...
56,0.000000,2005,0.809473,0.935660,0.005815,0.207746,0.166667,0.187739,0.242138
385,0.666667,2009,0.240043,0.617230,0.000481,0.665493,0.333333,0.731801,0.540881
86,0.000000,2008,0.891281,0.968375,0.009138,0.126761,0.333333,0.107280,0.163522
876,0.000000,2013,0.571582,0.788441,0.002842,0.373239,0.083333,0.371648,0.361635


In [6]:
# Building a model with all the variables

x_train_lin_model = sm.add_constant(x_train)

limmodel_full = sm.OLS(y_train, x_train_lin_model).fit()

limmodel_full.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,value,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,4.462e+23
Date:,"Wed, 05 May 2021",Prob (F-statistic):,0.0
Time:,11:49:38,Log-Likelihood:,25605.0
No. Observations:,1265,AIC:,-51190.0
Df Residuals:,1256,BIC:,-51150.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.9280,3.96e-09,1.75e+09,0.000,6.928,6.928
vaxView,3.197e-14,4.94e-11,0.001,0.999,-9.68e-11,9.69e-11
year,-1.955e-13,1.98e-12,-0.099,0.921,-4.07e-12,3.68e-12
lowerLimit,45.1510,1.76e-10,2.56e+11,0.000,45.151,45.151
upperLimit,47.1322,2.1e-10,2.25e+11,0.000,47.132,47.132
sampleSize,1.776e-14,1.78e-10,9.96e-05,1.000,-3.5e-10,3.5e-10
confidenceInterval,-2.061e-13,1.22e-09,-0.000,1.000,-2.39e-09,2.39e-09
demographicClass,2.465e-14,4.93e-11,0.001,1.000,-9.66e-11,9.67e-11
upperError,-13.4149,7.83e-10,-1.71e+10,0.000,-13.415,-13.415

0,1,2,3
Omnibus:,363.125,Durbin-Watson:,0.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75.132
Skew:,-0.29,Prob(JB):,4.85e-17
Kurtosis:,1.957,Cond. No.,2.09e+19


In [None]:
# The model fitting indicates a strong multicollinearity. We shall use VIF to find the issues.
 # Creating a dataframe that will contain the names of all the feature variables and their VIFs
vif = pd.DataFrame()
vif['Features'] = x_train.columns
vif['VIF'] = [variance_inflation_factor(x_train.values, i) for i in range(x_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif


In [None]:
x = x_train.drop('lowerLimit', 1,)

# Build a fitted model after dropping the variable
x_train_lm1 = sm.add_constant(x)

lr_1 = sm.OLS(y_train, x_train_lm1).fit()

# Printing the summary of the model
print(lr_1.summary())

In [None]:
vif = pd.DataFrame()
vif['Features'] = x.columns
vif['VIF'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
x = x.drop('confidenceInterval', 1,)

# Build a fitted model after dropping the variable
x_train_lm2 = sm.add_constant(x)

lr_2 = sm.OLS(y_train, x_train_lm2).fit()

# Printing the summary of the model
print(lr_2.summary())

In [None]:
vif = pd.DataFrame()
vif['Features'] = x.columns
vif['VIF'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
x = x.drop('year', 1,)

# Build a fitted model after dropping the variable
x_train_lm3 = sm.add_constant(x)

lr_3 = sm.OLS(y_train, x_train_lm3).fit()

# Printing the summary of the model
print(lr_3.summary())

In [None]:
vif = pd.DataFrame()
vif['Features'] = x.columns
vif['VIF'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
x = x.drop('lowerError', 1,)

# Build a fitted model after dropping the variable
x_train_lm4 = sm.add_constant(x)

lr_4 = sm.OLS(y_train, x_train_lm4).fit()

# Printing the summary of the model
print(lr_4.summary())

In [None]:
vif = pd.DataFrame()
vif['Features'] = x.columns
vif['VIF'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
y_train_price = lr_4.predict(x_train_lm4)

# Plot the histogram of the error terms
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Term Distribution', fontsize = 15)                  
plt.xlabel('Errors', fontsize = 15)     

In [None]:
y_test = df_test.pop('value')
x_test = df_test

# Adding constant variable to test dataframe
x_test_m4 = sm.add_constant(x_test)

# Creating X_test_m4 dataframe by dropping variables from X_test_m4
x_test_m4 = x_test_m4.drop(["lowerLimit", "confidenceInterval", "year", "lowerError"], axis = 1)

# Making predictions using the final model
y_pred_m4 = lr_4.predict(x_test_m4)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_true = y_test, y_pred = y_pred_m4)