-
Notifications
You must be signed in to change notification settings - Fork 0
/
021_multiple_regression.R
214 lines (130 loc) · 5.13 KB
/
021_multiple_regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
## Multiple Regression ----
### loading libraries ----
library(ggplot2) # for plotting
library(modelr)  # for some model functions (gather_predictions, geom_ref_line)
library(broom)   # for some model functions (tidy, glance, augment)
library(dplyr)   # for %>%, mutate, group_by, summarise used throughout below
### loading data ----
# Advertising data: sales against TV/radio/newspaper ad spend.
# Expects "Advertising.csv" in the working directory.
advertising <- read.csv("Advertising.csv")
# read.csv() already returns a data.frame, so no as.data.frame() coercion is needed
head(advertising)
# Generate multiple-regression model: sales on all three media budgets
mod2 <- lm(sales ~ TV + radio + newspaper, data = advertising)
summary(mod2)
# Confidence intervals (95% by default) for the coefficient estimates
confint(mod2)
# Simple linear regression from before, for comparison
mod1 <- lm(sales ~ TV, data = advertising)
# One-row fit summaries (R2, adj R2, sigma, F, AIC, ...) side by side
list(mod1 = broom::glance(mod1), mod2 = broom::glance(mod2))
# Model 2 R2 > Model 1 R2
# Model 2 Adj R2 > Model 1 Adj R2 (Adj R2 adjusted for number of predictors in the model)
# Model 2 sigma (residual standard error) < Model 1 sigma
# Model 2 therefore has better fit of predicted values
# (remember sigma = the mean distance the observed values are from the regression line)
summary(mod1)
summary(mod2)
# Model 2 F-statistic > Model 1 F-statistic
# Generate comparison of fit of model 1 vs model 2.
# augment() attaches .fitted/.resid columns to the data; Model labels the facets.
model1_results <- broom::augment(mod1, advertising) %>% mutate(Model = "Model 1")
model2_results <- broom::augment(mod2, advertising) %>% mutate(Model = "Model 2")
rbind(model1_results, model2_results) %>%
ggplot(., aes(.fitted, .resid)) +
geom_ref_line(h = 0) +
geom_point() +
geom_smooth(se = FALSE) +
facet_wrap(~ Model) +
ggtitle("Residuals vs Fitted")
# Model 1 has issues with heteroskedasticity.
# Model 2 has issues with linearity (pattern in residuals).
# Further diagnostics: Q-Q plots of the residuals, side by side
old_par <- par(mfrow = c(1, 2)) # save current settings so they can be restored
# Model 1
qqnorm(model1_results$.resid); qqline(model1_results$.resid)
# Model 2
qqnorm(model2_results$.resid); qqline(model2_results$.resid) # issues with normality of residuals
# Restore the saved layout; safer than dev.off(), which errors if no device is open
# and closes the whole device rather than just resetting par()
par(old_par)
### Making Predictions ----
# 1. Split into train/test datasets (~75% / 25%)
set.seed(17) # reproducible split
my.sample <- sample(c(TRUE, FALSE), nrow(advertising), replace = TRUE, prob = c(0.75, 0.25))
train <- advertising[my.sample, ]
test <- advertising[!my.sample, ]
# NOTE(review): mod1/mod2 were fit on the FULL advertising data above, not on
# `train`, so this "test" MSE is not a true out-of-sample estimate — refit the
# models on `train` for an honest comparison. Left as-is to preserve results.
# 2. Calculate MSE for both models on the held-out test set
test %>%
  modelr::gather_predictions(mod1, mod2) %>%
  group_by(model) %>%
  summarise(MSE = mean((sales - pred)^2))
# model MSE
# <chr> <dbl>
# 1 mod1 9.62
# 2 mod2 1.78
# Model 2 has better fit against the held-out test data than Model 1,
# but there are still issues to do with the normality of residuals
### Interactions ----
# Two equivalent ways to include the TV:radio interaction.
# option A: main effects written explicitly plus the interaction
mod3 <- lm(sales ~ TV + radio + TV * radio, data = advertising)
# option B: TV * radio expands to TV + radio + TV:radio, so this fits the
# same model; note it overwrites the mod3 from option A
mod3 <- lm(sales ~ TV * radio, data = advertising)
# Coefficient table for the interaction model
tidy(mod3)
# Compare overall fit of all three models
list(model1 = broom::glance(mod1),
model2 = broom::glance(mod2),
model3 = broom::glance(mod3))
#### Collinearity ----
# When two or more predictor variables are closely related to each other,
# it is difficult to parse out the respective influence of each predictor.
# Expects "credit.csv" in the working directory.
credit <- read.csv("credit.csv")
head(credit)
mod4 <- lm(Balance ~ Age + Limit, data = credit)
mod5 <- lm(Balance ~ Rating + Limit, data = credit)
# Label the output with the actual model numbers (mod4/mod5), not "Model 1/2",
# which would clash with the earlier mod1/mod2
list(`Model 4` = tidy(mod4),
     `Model 5` = tidy(mod5))
summary(mod4)
summary(mod5)
# mod4: Age and Limit are highly significant.
# mod5: collinearity between Limit and Rating hugely increases the standard
# error of the Limit coefficient, so that estimate is not significant.
# Plot a correlation of the predictors
plot(credit$Rating, credit$Limit)
# However, we also need to more formally test this:
# it is theoretically possible for collinearity to exist between multiple variables,
# and this may even occur without 2 variables appearing to be too highly correlated.
# Multicollinearity
# Test with the Variance Inflation Factor (VIF)
# (a value close to 1 indicates an absence of collinearity)
# none VIF = 1, moderate 1 < VIF < 5, ** high VIF > 5
# Some suggest that the highest VIF should be < 10, and the average across all
# predictors should be close to 1.
car::vif(mod4)
# Age Limit
# 1.010283 1.010283
car::vif(mod5)
# Rating Limit
# 160.4933 160.4933
# mod5 has collinearity
## Durbin-Watson Test
# Tests that the errors are independent:
# residual terms should be uncorrelated (autocorrelation / patterns are an issue).
# The DW statistic ranges from 0-4, with 2 indicating no correlation between residuals.
car::durbinWatsonTest(mod4)
car::durbinWatsonTest(mod5)
# Both p-values > .05, so no evidence that this assumption is violated.
######
### Testing a subset of variables using a partial F-test ----
# Tests whether certain variables significantly add to the model or not.
# Use base read.csv for consistency with the rest of the script
# (read_csv would require readr, which is never loaded here).
housing <- read.csv("housing.csv")
head(housing)
# Nested models, smallest to largest (use <- for assignment, per R convention)
simple  <- lm(price ~ sqft, data = housing)                    # simple model
reduced <- lm(price ~ sqft + bedrooms, data = housing)         # reduced model
full    <- lm(price ~ sqft + bedrooms + baths, data = housing) # full model
# Sequential (partial) F-tests comparing each model to the next
anova(simple, reduced, full)
# Analysis of Variance Table
#
# Model 1: price ~ sqft
# Model 2: price ~ sqft + bedrooms
# Model 3: price ~ sqft + bedrooms + baths
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 81 3.9969e+12
# 2 80 3.8991e+12 1 9.7842e+10 1.9913 0.1621
# 3 79 3.8817e+12 1 1.7399e+10 0.3541 0.5535
summary(simple)
summary(reduced)
summary(full)
# Both p > .16, so we cannot reject the null;
# therefore, adding baths to the model does not contribute significantly improved fit.