-
Notifications
You must be signed in to change notification settings - Fork 1
/
1-exclusions.R
114 lines (95 loc) · 5.77 KB
/
1-exclusions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
## exclusions
## execute this to create the "complete cases" file that is loaded in almost all
## other analyses
rm(list=ls())
source("helper/useful.R")
d <- read.csv("data/zenith all data.csv")
######## COMPUTE COMPLETE CASES (EG NO DROPOUT) ########
# policy: use only complete cases (some data for all years)
d$completed <- !is.na(d$ravens) | !is.na(d$wiat) | !is.na(d$verbalwm) | !is.na(d$arith)
dropouts <- d %>%
group_by(year, abacus) %>%
summarise(complete=sum(completed))
# a few more kids leave in the abacus group
qplot(year,complete,colour=factor(abacus),data=dropouts,
geom="line",group=abacus) + ylim(c(0,104))
completed.cases <- d %>%
group_by(subnum) %>%
summarise(completed = sum(completed))
complete.subnums <- completed.cases$subnum[completed.cases$completed==4] # all four years
d <- filter(d, subnum %in% complete.subnums)
## exclusions on computer/2AFC tasks for those kids who didn't understand
### ANS ###
# policy: exclude 3 sds above the full population mean
d$ans[d$ans > mean(d$ans,na.rm=T) + 3*sd(d$ans,na.rm=T)] <- NA
### MENTAL ROTATION ###
# check the sum scoring compared with the non-sum
## SUM wins hands down
mrot.corrs <- data.frame(year=factor(rep(c("0-1","1-2","2-3"),4)),
measure=factor(c(rep("shapes",6),rep("letters",6))),
method=factor(rep(c(rep("unpenalized",3),rep("penalized",3)),2)),
corrs=c(cor.test(d$mental.rot.shapes.prop[d$year==0],d$mental.rot.shapes.prop[d$year==1])$estimate,
cor.test(d$mental.rot.shapes.prop[d$year==1],d$mental.rot.shapes.prop[d$year==2])$estimate,
cor.test(d$mental.rot.shapes.prop[d$year==2],d$mental.rot.shapes.prop[d$year==3])$estimate,
#vs
cor.test(d$mental.rot.shapes.sum[d$year==0],d$mental.rot.shapes.sum[d$year==1])$estimate,
cor.test(d$mental.rot.shapes.sum[d$year==1],d$mental.rot.shapes.sum[d$year==2])$estimate,
cor.test(d$mental.rot.shapes.sum[d$year==2],d$mental.rot.shapes.sum[d$year==3])$estimate,
#letters
cor.test(d$mental.rot.letters.prop[d$year==0],d$mental.rot.letters.prop[d$year==1])$estimate,
cor.test(d$mental.rot.letters.prop[d$year==1],d$mental.rot.letters.prop[d$year==2])$estimate,
cor.test(d$mental.rot.letters.prop[d$year==2],d$mental.rot.letters.prop[d$year==3])$estimate,
#vs
cor.test(d$mental.rot.letters.sum[d$year==0],d$mental.rot.letters.sum[d$year==1])$estimate,
cor.test(d$mental.rot.letters.sum[d$year==1],d$mental.rot.letters.sum[d$year==2])$estimate,
cor.test(d$mental.rot.letters.sum[d$year==2],d$mental.rot.letters.sum[d$year==3])$estimate))
qplot(year,corrs,colour=measure,linetype=method,
group=measure:method,
geom=c("point","line"),
data=mrot.corrs) + ylim(c(0,1)) +
xlab("Year pair") + ylab("Correlation coefficient")
# policy: if you're significantly below chance, exclude
x <- rbinom(100000,48,.5)
score <- x - (48 - x)
d$mental.rot.letters.sum[d$mental.rot.letters.sum < quantile(score,c(.05))] <- NA
d$mental.rot.shapes.sum[d$mental.rot.shapes.sum < quantile(score,c(.05))] <- NA
d$mental.rot <- (d$mental.rot.letters.sum + d$mental.rot.shapes.sum)/96
d <- d[,!names(d) %in% c("mental.rot.letters.prop","mental.rot.letters.sum",
"mental.rot.shapes.prop","mental.rot.shapes.sum")]
# commutativity
# check sum
commute.corrs <- data.frame(year=factor(rep(c("0-1","1-2","2-3"),2)),
method=factor(c(rep("unpenalized",3),rep("penalized",3))),
corrs=c(cor.test(d$commute.prop[d$year==0],d$commute.prop[d$year==1])$estimate,
cor.test(d$commute.prop[d$year==1],d$commute.prop[d$year==2])$estimate,
cor.test(d$commute.prop[d$year==2],d$commute.prop[d$year==3])$estimate,
#vs
cor.test(d$commute.sum[d$year==0],d$commute.sum[d$year==1])$estimate,
cor.test(d$commute.sum[d$year==1],d$commute.sum[d$year==2])$estimate,
cor.test(d$commute.sum[d$year==2],d$commute.sum[d$year==3])$estimate))
# policy: exclude if you don't have two questions right (first two are check questions)#d$commute[d$commute < 3/26] <- NA
# done by hand via Jess
# also, this measure is not useful
d <- d[,!names(d) %in% c("commute.sum","commute.prop")]
# ravens
# policy: exclude if you got not a single question correct
d$ravens[d$ravens == 0] <- NA
# WIAT
# policy: exclude if you got not a single question correct
d$wiat[d$wiat == 0] <- NA
# PLACE VAL - reliability computation
pv.corrs <- data.frame(year=factor(c("0-1","1-2","2-3")),
corrs=c(cor.test(d$placeval[d$year==0],d$placeval[d$year==1])$estimate,
cor.test(d$placeval[d$year==1],d$placeval[d$year==2])$estimate,
cor.test(d$placeval[d$year==2],d$placeval[d$year==3])$estimate))
## write out
write.csv(d,"data/zenith all data complete cases.csv", row.names=FALSE)
## compute the missing data proportion ------------------
library(tidyr)
gd <- d %>%
select(-english, -math, -science, -computer, -music,
-art, -phys.ed, -attendance, -completed, -wholegroupsums, -class) %>%
gather(measure, value, placeval, wiat, woodcock, arith,
ravens, verbalwm, spatialwm, ans, mental.rot) %>%
group_by(variable) %>%
summarise(prop.na = mean(is.na(value)))