-
Notifications
You must be signed in to change notification settings - Fork 0
/
013_distributions.R
228 lines (137 loc) · 6.95 KB
/
013_distributions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# Distributions
library(tidyverse)
# load data
dlf <- read.csv("dlf.csv")
dim(dlf)
head(dlf)
#low scores indicate poorer hygeine
#### Examining Normality:
### visualize distributions using histogram and density plots ----
ggplot(dlf, aes(x = day1)) + geom_histogram(color='white')
ggplot(dlf, aes(x = day1)) + geom_histogram(aes(y = ..density..), color='white')
# add a normal curve
ggplot(dlf, aes(x = day1)) + geom_histogram(aes(y = ..density..), color='white') +
stat_function(fun = dnorm,
args = list(mean = mean(dlf$day1, na.rm=T, sd = sd(dlf$day1, na.rm=T))),
color="red",
size = 1)
# change binwidth
ggplot(dlf, aes(x = day1)) + geom_histogram(aes(y = ..density..), color='white', bins=40) +
stat_function(fun = dnorm,
args = list(mean = mean(dlf$day1, na.rm=T, sd = sd(dlf$day1, na.rm=T))),
color="red",
size = 1)
# to plot a smoothed density plot
ggplot(dlf, aes(x = day1)) + geom_density()
### QQ plot ----
# Quantile-Quantile plots
# plot the proportion of cases below each value
# plots the cumulative values in our data against theoretical cumulative values from normal distribution
qplot(sample = dlf$day1)
#another way
qqnorm(dlf$day1)
qqline(dlf$day1, col="red") # adds a line to existing one
# Days 1, 2 and 3 together
#devtools::install_github("thomasp85/patchwork")
library(patchwork)
library(ggplot2)
p1a <- ggplot(dlf, aes(x = day1)) + geom_histogram(aes(y = ..density..), color='white', bins=40) +
stat_function(fun = dnorm, args = list(mean = mean(dlf$day1, na.rm=T, sd = sd(dlf$day1, na.rm=T))),
color="red",size = 1)
p2a <- ggplot(dlf, aes(x = day2)) + geom_histogram(aes(y = ..density..), color='white', bins=20) +
stat_function(fun = dnorm, args = list(mean = mean(dlf$day2, na.rm=T, sd = sd(dlf$day2, na.rm=T))),
color="red",size = 1)
p3a <- ggplot(dlf, aes(x = day3)) + geom_histogram(aes(y = ..density..), color='white', bins=20) +
stat_function(fun = dnorm, args = list(mean = mean(dlf$day3, na.rm=T, sd = sd(dlf$day3, na.rm=T))),
color="red",size = 1)
p1b <- qplot(sample = dlf$day1)
p2b <- qplot(sample = dlf$day2)
p3b <- qplot(sample = dlf$day3)
# layout in a 3 x 2 grid
(p1a | p1b) /(p2a | p2b) / (p3a | p3b)
### Descriptives ----
# 'describe' from the 'psych' package gives useful information:
psych::describe(dlf$day1)
psych::describe(dlf$day2)
# 'stat.desc' from the 'pastecs' package is also useful:
pastecs::stat.desc(dlf$day1, basic = F, norm = T)
pastecs::stat.desc(dlf$day2, basic = F, norm = T)
pastecs::stat.desc(dlf[,3:5], basic = F, norm = T)
round(pastecs::stat.desc(dlf[,3:5], basic = F, norm = T), 3)
# skew and kurtosis should be 0 in a truly normal distribution.
# positive skew values mean a long right tail (more scores on left of distribution)
# positive kurtosis values = pointy and heavy-tailed distribution
# negative kurtosis values = flatter and thinner-tailed distribution
# convert skew or kurtosis to a Z score (S-0 / SES, K-0 / SEK) to evaluate significance
# here, if skew.2SE/kurt.2SE > 1 (or < -1) then likely significant
# for alpha = .01 cut-off is 1.29, for alpha = 0.001 cut-off is 1.65
# this holds true only for smaller samples (<200)
# normtest.W normtest.p - these are outcomes of Shapiro-Wilk test (discussed in a moment)
### analyzing by group ----
# load data
exam <- read.csv('exam.csv')
head(exam)
str(exam)
# all numeric variables
round( pastecs::stat.desc(exam[,1:4], basic = F, norm = T) , 3)
# remember to visualize data too!
ggplot(exam, aes(x = numeracy)) + geom_histogram(aes(y = ..density..), color='white',binwidth = 1)
ggplot(exam, aes(x = exam)) + geom_histogram(aes(y = ..density..), color='white')
# by group
by(data = exam$exam, INDICES = exam$uni, FUN = psych::describe)
by(data = exam$exam, INDICES = exam$uni, FUN = pastecs::stat.desc, basic = F, norm = T)
by(data = exam[,1:4], INDICES = exam$uni, FUN = pastecs::stat.desc, basic = F, norm = T)
# when comparing individual groups, appears that data is normally distributed within groups
ggplot(exam, aes(x = exam)) + geom_histogram(aes(y = ..density..), color='white') + facet_wrap(~uni)
ggplot(exam, aes(x = exam)) +
geom_histogram(aes(y = ..density.., fill=uni), alpha =.6, position="identity", color='white') +
scale_fill_manual(values = c("dodgerblue", "orange"))
### Shapiro-Wilk Test ----
# compare sample to a normal distribution with mean and sd same as the sample
# if p<.05 distributions significantly differ
# can be overly sensitive with large samples
# combine with visual inspection (histograms, qqplot) and evaluation of skew and kurtosis
shapiro.test(exam$exam)
shapiro.test(exam$numeracy)
by(data = exam$exam,
INDICES = exam$uni,
FUN = shapiro.test) # when comparing groups, need to know the distribution within each group
by(data = exam$numeracy,
INDICES = exam$uni,
FUN = shapiro.test)
### Levene's Test ----
# Test for homogeneity of variance across all levels of a variable
# Essentially a one-way anova across groups on deviations of each score from its group mean
car::leveneTest(exam$exam, exam$uni) #1st = continuous outcome variable, 2nd = grouping variable
# if p-value <.05 suggests that there are differences in variances across groups (i.e. heterogeneity)
# also has issues with being overly sensitive for large sample sizes
# (power of test leads to increased sensitivity)
## Hartley's Fmax (variance ratio)
# ratio of the biggest group variance to smallest group variance.
# An assumption is that there are an equal number of participants in each group.
# It's also very sensitive with deviations from normality
# This can be calculated easily - but the value needs to be compared to a critical table
# e.g. http://archive.bio.ed.ac.uk/jdeacon/statistics/table8.htm
vars <- by(data = exam$numeracy,
INDICES = exam$uni,
FUN = var)
vars
vars[2]/vars[1] #largest/smallest #2.21 #critical value (approx 2.0), so NS
### Some Normality Test Extensions ----
library(nortest)
# Most statisticians suggest using Shapiro-Wilk (along with visual inspection and skew/kurtosis),
# as it has more power than others - but several others exist:
# e.g. Kolmogorov Smirnov Test (though sensitive to outliers)
# e.g. Lilliefors Test (less conservative than KS)
nortest::lillie.test(exam$exam) #Lillie
nortest::ad.test(exam$exam) #Anderson-Darling
by(data = exam$exam, INDICES = exam$uni, FUN = nortest::lillie.test)
by(data = exam$exam, INDICES = exam$uni, FUN = nortest::ad.test)
# the package 'nortest' has several other normality tests, as does the package 'normtest'
# 'normtest' also includes some more robust skewness/kurtosis tests:
library(normtest)
by(data = exam$exam, INDICES = exam$uni, FUN = nortest::lillie.test)
examA <- subset(exam, uni == "Uni-A")
examB <- subset(exam, uni == "Uni-B")
normtest::kurtosis.norm.test(examA$numeracy, nrepl=2000) #p>.05, ok for kurtosis
normtest::skewness.norm.test(examA$numeracy, nrepl=2000) #p>.05, ok for skewness