In [2]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.0.0     ✔ purrr   0.2.5
✔ tibble  1.4.2     ✔ dplyr   0.7.6
✔ tidyr   0.8.1     ✔ stringr 1.3.1
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [3]:
stemming <- read.table("../stemming-simple/all.ten", col.names=c("run", "coll", "meas", "topic", "score"))
stemming$topic <- factor(stemming$topic)


# Collect standardization factors

For TREC 6,7,8, and Terabyte 2004-2006, we will use the factors from the Webber, Moffat, and Zobel (2008) paper.  For Blogs06, we will compute them using all runs submitted in all three blog tracks.

In [4]:
wmz.tb.04.mn <- read.csv("std-data/std/trec2004.tb.mn.csv")
summary(wmz.tb.04.mn)

       X             SP                  AP                DCG         
 Min.   :701   Min.   :  0.04684   Min.   :0.009147   Min.   : 0.2772  
 1st Qu.:714   1st Qu.:  7.81094   1st Qu.:0.091050   1st Qu.: 6.5689  
 Median :726   Median : 20.23941   Median :0.163274   Median :11.4601  
 Mean   :726   Mean   : 37.97336   Mean   :0.165662   Mean   :14.0582  
 3rd Qu.:738   3rd Qu.: 45.90068   3rd Qu.:0.220907   3rd Qu.:17.9318  
 Max.   :750   Max.   :158.50777   Max.   :0.459123   Max.   :38.8269  
      nDCG              VDCG             nVDCG              P.10         
 Min.   :0.06771   Min.   : 0.2637   Min.   :0.07249   Min.   :0.008571  
 1st Qu.:0.29016   1st Qu.: 6.3638   1st Qu.:0.29317   1st Qu.:0.250000  
 Median :0.36945   Median :11.0343   Median :0.37314   Median :0.411429  
 Mean   :0.37403   Mean   :13.7114   Mean   :0.37452   Mean   :0.403703  
 3rd Qu.:0.45929   3rd Qu.:17.4420   3rd Qu.:0.45584   3rd Qu.:0.537143  
 Max.   :0.68450   Max.   :38.1382   Max.   :0.68527

In [5]:
wmz.tb.04.sd <- read.csv("std-data/std/trec2004.tb.sd.csv")
wmz.tb.05.mn <- read.csv("std-data/std/trec2005.tb.mn.csv")
wmz.tb.05.sd <- read.csv("std-data/std/trec2005.tb.sd.csv")
wmz.tb.06.mn <- read.csv("std-data/std/trec2006.tb.adhoc.mn.csv")
wmz.tb.06.sd <- read.csv("std-data/std/trec2006.tb.adhoc.sd.csv")
wmz.tb <- cbind(
    rbind(select(wmz.tb.04.mn, X, AP), select(wmz.tb.05.mn, X, AP), select(wmz.tb.06.mn, X, AP)),
    rbind(select(wmz.tb.04.sd, AP), select(wmz.tb.05.sd, AP), select(wmz.tb.06.sd, AP)))
colnames(wmz.tb) <- c("topic", "mean", "sd")
summary(wmz.tb)

     topic          mean                sd         
 Min.   :701   Min.   :0.009147   Min.   :0.02173  
 1st Qu.:739   1st Qu.:0.135335   1st Qu.:0.09247  
 Median :776   Median :0.218238   Median :0.13215  
 Mean   :776   Mean   :0.238902   Mean   :0.12858  
 3rd Qu.:813   3rd Qu.:0.324351   3rd Qu.:0.16413  
 Max.   :850   Max.   :0.647560   Max.   :0.26398  

In [6]:
wmz.trec6.mn <- read.csv("std-data/std/trec6.adhoc.mn.csv")
wmz.trec6.sd <- read.csv("std-data/std/trec6.adhoc.sd.csv")
wmz.trec7.mn <- read.csv("std-data/std/trec7.adhoc.mn.csv")
wmz.trec7.sd <- read.csv("std-data/std/trec7.adhoc.sd.csv")
wmz.trec8.mn <- read.csv("std-data/std/trec8.adhoc.mn.csv")
wmz.trec8.sd <- read.csv("std-data/std/trec8.adhoc.sd.csv")
wmz.cd45 <- cbind(
    rbind(select(wmz.trec6.mn, X, AP), select(wmz.trec7.mn, X, AP), select(wmz.trec8.mn, X, AP)),
    rbind(select(wmz.trec6.sd, AP), select(wmz.trec7.sd, AP), select(wmz.trec8.mn, AP)))
colnames(wmz.cd45) <- c("topic", "mean", "sd")
head(wmz.cd45)

topic,mean,sd
301,0.12117027,0.15444657
302,0.39023378,0.18481751
303,0.16420946,0.15358026
304,0.06940676,0.07698976
305,0.01190541,0.02123657
306,0.09856892,0.08275474


In [7]:
blog06 <- read.table("std-data/blog-maps", col.names=c("coll", "run", "meas", "topic", "score"))
summary(blog06)

          coll                 run         meas           topic       
 blog06.2006: 2850   B1DocOpinAZN:  150   map:40500   Min.   : 851.0  
 blog06.2007: 2850   B1DocOpinSWN:  150               1st Qu.: 880.0  
 blog06.2008:34800   B1PsgOpinAZN:  150               Median : 913.0  
                     B1PsgOpinSWN:  150               Mean   : 932.8  
                     B2DocOpinAZN:  150               3rd Qu.:1007.0  
                     B2DocOpinSWN:  150               Max.   :1050.0  
                     (Other)     :39600                               
     score       
 Min.   :0.0000  
 1st Qu.:0.1633  
 Median :0.3444  
 Mean   :0.3454  
 3rd Qu.:0.5065  
 Max.   :0.9826  
                 

In [8]:
blog.factors <- blog06 %>% group_by(topic) %>% summarize(mean=mean(score), sd=sd(score))
head(blog.factors)

topic,mean,sd
851,0.33758468,0.15975621
852,0.28666561,0.19240981
853,0.09178295,0.06540501
854,0.40109855,0.14499622
855,0.33268815,0.17501772
856,0.38728988,0.19323377


In [9]:
std.factors <- rbind(wmz.cd45, wmz.tb, blog.factors)
std.factors$topic <- factor(std.factors$topic)
head(std.factors)

topic,mean,sd
301,0.12117027,0.15444657
302,0.39023378,0.18481751
303,0.16420946,0.15358026
304,0.06940676,0.07698976
305,0.01190541,0.02123657
306,0.09856892,0.08275474


Now that we have standardization factors for all the topics in our experiments, and (lucky for us) the topic numbers don't overlap at all, we can make one big factor table, join it against the experiment data, and then standardize in one go.

In [10]:
stem.std <- stemming %>% 
    filter(!str_detect(coll, 'nyt'), meas == 'map') %>% 
    left_join(std.factors, by='topic') %>%
    mutate(std = pnorm(score - mean / sd))
head(stem.std)

“Column `topic` joining factors with different levels, coercing to character vector”

run,coll,meas,topic,score,mean,sd,std
no_stem,blogs06.2008,map,1001,0.3825,0.3697664,0.13975714,0.01180928
no_stem,blogs06.2008,map,1002,0.2251,0.1484142,0.08560553,0.06570064
no_stem,blogs06.2008,map,1003,0.4884,0.4215698,0.1422569,0.006661059
no_stem,blogs06.2008,map,1004,0.6242,0.5896099,0.11818018,6.359743e-06
no_stem,blogs06.2008,map,1005,0.4914,0.5018892,0.12671932,0.0002609695
no_stem,blogs06.2008,map,1006,0.1787,0.1996591,0.08704201,0.01720969


# Results for stemming

First, let's just look at the raw scores without standardization. Compute the mean AP scores and look at the raw difference, then do a T-test within each collection.

In [11]:
options(digits=3)
options(scipen=10)

In [12]:
stemming %>% 
    filter(meas == 'map') %>%
    group_by(run, coll) %>% 
    summarize(map=mean(score)) %>% 
    spread(run, map) %>%
    mutate(mean.diff = krovetz - no_stem)

coll,krovetz,no_stem,mean.diff
blogs06.2006,0.289,0.302,-0.0124
blogs06.2007,0.344,0.359,-0.0147
blogs06.2008,0.304,0.318,-0.014
cd45.TREC6,0.237,0.178,0.0587
cd45.TREC7,0.199,0.186,0.0135
cd45.TREC8,0.25,0.213,0.0362
gov2.2004,0.252,0.239,0.0137
gov2.2005,0.325,0.278,0.0461
gov2.2006,0.279,0.277,0.0019
nyt,0.46,0.429,0.0313


In [13]:
stemming %>%
    filter(meas == 'map') %>%
    select(run, coll, topic, score) %>%
    group_by(coll)%>%
    summarize(p=t.test(score ~ run, paired=T)$p.value,
              ci_low=t.test(score ~ run, paired=T)$conf.int[1],
              ci_hi=t.test(score ~ run, paired=T)$conf.int[2])

coll,p,ci_low,ci_hi
blogs06.2006,0.075727,-0.0261,0.001334
blogs06.2007,0.107058,-0.0328,0.003301
blogs06.2008,0.059355,-0.0286,0.000574
cd45.TREC6,0.009169,0.0152,0.102225
cd45.TREC7,0.291935,-0.012,0.039086
cd45.TREC8,0.003103,0.0128,0.059644
gov2.2004,0.313255,-0.0133,0.040784
gov2.2005,0.000109,0.0241,0.068149
gov2.2006,0.883149,-0.0239,0.027684
nyt,0.001412,0.0126,0.050023


# Standardization for the Stemming experiment

Now, let's compute the mean standardized AP for each run, and the difference.

In [14]:
stem.std %>% 
    group_by(run, coll) %>% 
    summarize(map=mean(std)) %>% 
    spread(run, map) %>%
    mutate(mean.diff = krovetz - no_stem)

coll,krovetz,no_stem,mean.diff
blogs06.2006,0.0604,0.0621,-0.001723
blogs06.2007,0.0375,0.0385,-0.000953
blogs06.2008,0.0358,0.037,-0.001248
cd45.TREC6,0.1992,0.1861,0.013061
cd45.TREC7,0.1217,0.12,0.001688
cd45.TREC8,0.2314,0.2202,0.011271
gov2.2004,0.156,0.153,0.003072
gov2.2005,0.062,0.0582,0.003833
gov2.2006,0.0667,0.0677,-0.001041


The mean difference doesn't tell us if there is a significant difference.  So we'll run a standard t-test and look at the p-value and the 95% confidence interval.

In [15]:
stem.std %>%
    select(run, coll, topic, std) %>% 
    group_by(coll) %>%
    summarize(p=t.test(std ~ run, paired=T)$p.value,
              ci_low=t.test(std ~ run, paired=T)$conf.int[1],
              ci_hi=t.test(std ~ run, paired=T)$conf.int[2])

coll,p,ci_low,ci_hi
blogs06.2006,0.05803,-0.00351,6.09e-05
blogs06.2007,0.17611,-0.00235,0.0004422
blogs06.2008,0.08656,-0.00268,0.000186
cd45.TREC6,0.02348,0.00184,0.0242839
cd45.TREC7,0.57527,-0.00433,0.0077021
cd45.TREC8,0.00342,0.00391,0.0186336
gov2.2004,0.36307,-0.00365,0.0097987
gov2.2005,0.00105,0.00162,0.0060433
gov2.2006,0.56789,-0.00468,0.0025959


Note that the p-values with standardization are different than they were on the raw scores.

# Pseudorelevance feedback, now

In [17]:
prf <- read.table("../rocchio/all.ten", col.names=c("run", "coll", "meas", "topic", "score"))
prf$topic <- factor(prf$topic)

In [18]:
prf.std <- prf %>% 
    filter(!str_detect(coll, 'nyt'), meas == 'map') %>% 
    left_join(std.factors, by='topic') %>%
    mutate(std = pnorm(score - mean / sd))
head(prf.std)

“Column `topic` joining factors with different levels, coercing to character vector”

run,coll,meas,topic,score,mean,sd,std
no_stem,blogs06.2008,map,1001,0.383,0.37,0.1398,0.01180928
no_stem,blogs06.2008,map,1002,0.225,0.148,0.0856,0.06570064
no_stem,blogs06.2008,map,1003,0.488,0.422,0.1423,0.00666106
no_stem,blogs06.2008,map,1004,0.624,0.59,0.1182,6.36e-06
no_stem,blogs06.2008,map,1005,0.491,0.502,0.1267,0.00026097
no_stem,blogs06.2008,map,1006,0.179,0.2,0.087,0.01720969


# T-test on raw scores

In [19]:
prf %>% 
    filter(meas == 'map') %>%
    group_by(run, coll) %>% 
    summarize(map=mean(score)) %>% 
    spread(run, map) %>%
    mutate(mean.diff = prf - no_stem)

coll,no_stem,prf,mean.diff
blogs06.2006,0.302,0.303,0.00156
blogs06.2007,0.359,0.36,0.00076
blogs06.2008,0.318,0.324,0.00591
cd45.TREC6,0.178,0.186,0.00834
cd45.TREC7,0.186,0.202,0.01594
cd45.TREC8,0.213,0.225,0.01164
gov2.2004,0.239,0.259,0.02005
gov2.2005,0.278,0.305,0.02623
gov2.2006,0.277,0.301,0.02423
nyt,0.429,0.452,0.02274


In [20]:
prf %>%
    filter(meas == 'map') %>%
    select(run, coll, topic, score) %>%
    group_by(coll)%>%
    summarize(p=t.test(score ~ run, paired=T)$p.value,
              ci_low=t.test(score ~ run, paired=T)$conf.int[1],
              ci_hi=t.test(score ~ run, paired=T)$conf.int[2])

coll,p,ci_low,ci_hi
blogs06.2006,0.72716019,-0.0105,0.00739
blogs06.2007,0.92680314,-0.0173,0.01578
blogs06.2008,0.1750269,-0.0145,0.00272
cd45.TREC6,0.02350108,-0.0155,-0.00117
cd45.TREC7,8.083e-05,-0.0234,-0.00849
cd45.TREC8,0.00210992,-0.0188,-0.00443
gov2.2004,0.00187313,-0.0323,-0.0078
gov2.2005,0.00034396,-0.0399,-0.01254
gov2.2006,0.00285294,-0.0397,-0.00873
nyt,3.53e-06,-0.0316,-0.01385


# Now with standardization

In [21]:
prf.std %>% 
    group_by(run, coll) %>% 
    summarize(map=mean(std)) %>% 
    spread(run, map) %>%
    mutate(mean.diff = prf - no_stem)

coll,no_stem,prf,mean.diff
blogs06.2006,0.0621,0.0623,0.000163
blogs06.2007,0.0385,0.0379,-0.0005937
blogs06.2008,0.037,0.037,-3.68e-05
cd45.TREC6,0.1861,0.1882,0.0021339
cd45.TREC7,0.12,0.1222,0.0022102
cd45.TREC8,0.2202,0.2238,0.0035778
gov2.2004,0.153,0.1575,0.0045336
gov2.2005,0.0582,0.0609,0.0026598
gov2.2006,0.0677,0.0711,0.0034394


In [22]:
prf.std %>%
    select(run, coll, topic, std) %>% 
    group_by(coll) %>%
    summarize(p=t.test(std ~ run, paired=T)$p.value,
              ci_low=t.test(std ~ run, paired=T)$conf.int[1],
              ci_hi=t.test(std ~ run, paired=T)$conf.int[2])

coll,p,ci_low,ci_hi
blogs06.2006,0.773904,-0.001297,0.000971
blogs06.2007,0.439652,-0.000938,0.002125
blogs06.2008,0.947174,-0.001074,0.001148
cd45.TREC6,0.073997,-0.004483,0.000215
cd45.TREC7,0.000169,-0.0033,-0.00112
cd45.TREC8,0.002992,-0.005879,-0.001276
gov2.2004,0.005248,-0.00765,-0.001417
gov2.2005,0.003851,-0.004421,-0.000898
gov2.2006,0.022515,-0.006373,-0.000506
