In [2]:
library(data.table)
library(IRanges)

In [3]:
library(parallel)
# Let mclapply() use every core the machine reports.
# detectCores() is documented to return NA when the count cannot be
# determined; mclapply() rejects that, so fall back to a single worker.
options(mc.cores = max(1L, detectCores(), na.rm = TRUE))

In [4]:
# Twelve allele names, two per locus (A, B, C, DRB1, DQB1, DPB1).
# NOTE(review): presumably the sample's known typing, kept for comparison
# with the calls below -- confirm; nothing in the visible code reads it.
real <- c(
    'A*02:01',    'A*03:01',
    'B*13:02',    'B*51:01',
    'C*01:02',    'C*06:02',
    'DRB1*07:01', 'DRB1*11:01',
    'DQB1*02:AB', 'DQB1*03:01',
    'DPB1*02:01', 'DPB1*02:01'
)

In [5]:
# Load the hit table into a data.table.
# NOTE(review): judging by the .m8 extension and the 12 columns reported by
# fread, this is presumably a headerless BLAST-style tabular file -- confirm.
dt <- fread('187521910.m8')

Read 16.1% of 5846112 rowsRead 35.1% of 5846112 rowsRead 54.1% of 5846112 rowsRead 73.0% of 5846112 rowsRead 92.0% of 5846112 rowsRead 5846112 rows and 12 (of 12) columns from 0.421 GB file in 00:00:07


In [6]:
# Give the 12 columns short names.
# NOTE(review): the names look like the usual tabular-hit order (query, target,
# % identity, alignment length, mismatches, gaps, query from/to,
# target from/to, e-value, score) -- confirm against the aligner's output spec.
setnames(dt, c('q', 't', 'iden', 'len', 'mis', 'gap', 'qf', 'qt', 'tf', 'tt', 'e', 'score'))
#dt <- dt[iden == 100]
# gene = target id with everything from the first '*' onwards removed.
# invisible() stops the notebook from echoing the table; the result is not
# parked in a global named `tt`, because `tt` is also a column of dt.
invisible(dt[, gene := sub('\\*.+', '', t)])
# Key by read id so the per-read grouping below works on sorted data.
setkey(dt, q)

In [7]:
# Split hits by the character that precedes a '-' in the target id:
#   nexpr : target id contains a non-digit directly followed by '-'
#   dt    : target id contains a digit directly followed by '-'
#           (e.g. ids shaped like 'A*02:01-365')
nexpr <- dt[grepl('\\D-', t)]
dt <- dt[grepl('\\d-', t)]
# Per-read (by q) summary, written back onto every hit of that read:
#   LEN    longest alignment length among the read's hits
#   GOOD   comma-joined distinct genes hit at that longest length
#   GOOD.N number of hits at the longest length with 100 identity
#   BAD.N  number of hits that are shorter than the longest or below 100 identity
#   N      total number of hits for the read
# max(len) is computed once per group and reused; invisible() suppresses the
# notebook echo without creating a global that shares its name with column `tt`.
invisible(dt[, c('LEN', 'GOOD', 'GOOD.N', 'BAD.N', 'N') := {
    best <- max(len)
    at.best <- len == best
    list(
        best,
        paste(unique(gene[at.best]), collapse = ','),
        sum(at.best & iden == 100),
        sum(len < best | iden < 100),
        .N)
}, by = q])

In [8]:
# "good" hits: 100 identity and as long as the read's longest alignment.
good <- dt[iden == 100 & len == LEN]
# Keep only reads that have at least one good hit.
dt <- dt[q %in% good[['q']]]
# "bad" hits: imperfect (shorter than LEN or below 100 identity) hits whose
# gene equals the read's GOOD string.
bad <- dt[(iden < 100 | len < LEN) & gene == GOOD]
# Key both tables by target id for the per-target aggregations that follow.
setkeyv(good, 't')
setkeyv(bad, 't')

In [9]:
# Per target id, from the good hits only:
#   nreads  number of good hits
#   cov     number of target positions covered by at least one good hit
#           (hit intervals tf..tt are merged with reduce() before summing widths)
score.cov <- good[, .(
    nreads = .N,
    cov = sum(width(reduce(IRanges(tf, tt))))
), keyby = t]
# total = the integer after the last '-' in the target id.
# invisible() suppresses the notebook echo without creating a global named
# `tt`, which would shadow-by-name the `tt` column used just above.
invisible(score.cov[, total := as.integer(sub('.+-', '', t))])

In [None]:
# Same per-target summary as for the good hits, but over the bad hits:
#   nreads.bad  number of bad hits
#   cov.bad     number of target positions covered by at least one bad hit
score.bad <- bad[, list(
    nreads.bad = .N,
    cov.bad = sum(width(reduce(IRanges(start = tf, end = tt))))
), keyby = 't']
# Keyed join on t: one row per score.cov target, with the bad-hit columns
# attached (NA where a target has no bad hits).
score.cov <- score.bad[score.cov]

In [10]:
# Count the hits in BAD against `to.rescue` that come from reads which do NOT
# also have a hit in GOOD against `rescue.by`.
#
#   to.rescue  target id(s) whose BAD hits are counted
#   rescue.by  target id(s); reads with a GOOD hit to any of these are excluded
#   BAD, GOOD  hit tables with columns `t` (target id) and `q` (read id);
#              default to the globals `bad` and `good`
#
# Returns a single integer: the number of remaining BAD rows.
rescue <- function(to.rescue, rescue.by, BAD = bad, GOOD = good){
    # Reads already accounted for by a good hit to the partner target(s).
    explained <- unique(GOOD[t %in% rescue.by, q])
    BAD[t %in% to.rescue & !(q %in% explained), .N]
}
# Spot check on one pair of targets: how many bad hits to `b` remain once
# reads with a good hit to `a` are set aside.
a <- 'A*02:01-365'
b <- 'A*03:01-365'
rescue(to.rescue = b, rescue.by = a)

In [11]:
# mapped = fraction of the target covered by good hits (cov / total).
# invisible() suppresses the notebook echo without leaving a global named
# `tt`, which is also a column name in the hit tables.
invisible(score.cov[, mapped := cov / total])
# Best-covered targets first; show the top of the table.
score.cov <- score.cov[order(-mapped)]
head(score.cov)

Unnamed: 0,t,nreads,cov,total,mapped
1,C*06:51-181,49,181,181,1.0
2,DOA*01:01-250,201,250,250,1.0
3,DRB1*11:136-89,25,89,89,1.0
4,A*02:20-273,84,272,273,0.996337
5,A*02:327-273,97,272,273,0.996337
6,DRA*01:01-254,215,253,254,0.996063


In [12]:
# Score every unordered pair of candidate targets for gene G.
#
#   G  gene name, compared exactly against the part of the target id before
#      the first '*' (the same rule used to build the `gene` column)
#
# Reads the globals `score.cov`, `good` and `bad`, and calls `rescue()`.
# Pairs are evaluated in parallel with mclapply().
#
# Returns a data.table with one row per pair:
#   t1, t2                      the two target ids
#   h1, h2                      rescue(t1, t2) and rescue(t2, t1)
#   hopeless                    h1 + h2
#   nreads{1,2}, cov{1,2}, mapped{1,2}   the score.cov values of t1 / t2
#   score                       nreads1 + nreads2 - 10 * hopeless
# sorted by descending score (ties ordered by t2, then t1).
# Stops with an error when fewer than two candidates exist for G.
#
# NOTE: this definition masks graphics::pairs for the rest of the session.
pairs <- function(G){
    # Candidates: targets of exactly this gene whose mapped fraction is within
    # 2% of the gene's best, or whose coverage is within 5% of the gene's best.
    scoreA <- score.cov[sub('\\*.+', '', t) == G]
    scoreA <- scoreA[mapped/max(mapped) >= 0.98 | cov/max(cov) >= 0.95][order(-cov)]
    if (nrow(scoreA) < 2L) {
        stop(sprintf("pairs('%s'): need at least 2 candidate targets, found %d",
                     G, nrow(scoreA)), call. = FALSE)
    }

    # Restrict the hit tables to what the pairwise rescue() calls can touch.
    goodA <- good[t %in% scoreA$t]
    badA <- bad[gene == G & q %in% goodA$q]

    todo <- combn(scoreA$t, 2, simplify = FALSE)
    cand <- rbindlist(mclapply(todo, function(p){
        h1 <- rescue(p[1], p[2], badA, goodA)
        h2 <- rescue(p[2], p[1], badA, goodA)
        data.table(t1 = p[1], t2 = p[2], h1 = h1, h2 = h2, hopeless = h1 + h2)
    }))

    # Fix the order used to break score ties: by t2, then t1.
    setkey(cand, t2, t1)

    # Attach each member's per-target statistics by id lookup rather than by
    # relying on two tables happening to be in the same row order.
    s1 <- scoreA[match(cand$t1, scoreA$t), .(nreads, cov, mapped)]
    s2 <- scoreA[match(cand$t2, scoreA$t), .(nreads, cov, mapped)]
    cand[, c('nreads1', 'cov1', 'mapped1') := s1]
    cand[, c('nreads2', 'cov2', 'mapped2') := s2]

    # Reward supporting reads; penalise unexplained bad hits tenfold.
    cand[, score := nreads1 + nreads2 - 10 * hopeless]
    cand[order(-score)]
}

In [13]:
# First rows (head() default) of the pair table for gene 'A', highest score first.
head(pairs('A'))

Unnamed: 0,t1,t2,h1,h2,hopeless,nreads1,cov1,mapped1,nreads2,cov2,mapped2,score
1,A*02:01-365,A*03:01-365,4,4,8,107,297,0.8136986,147,296,0.8109589,174
2,A*02:01-365,A*03:26-365,4,4,8,107,297,0.8136986,146,294,0.8054795,173
3,A*03:01-365,A*02:134-337,4,4,8,147,296,0.8109589,104,293,0.8694362,171
4,A*03:01-365,A*02:294-273,5,4,9,147,296,0.8109589,114,271,0.992674,171
5,A*03:26-365,A*02:134-337,4,4,8,146,294,0.8054795,104,293,0.8694362,170
6,A*03:26-365,A*02:294-273,5,4,9,146,294,0.8054795,114,271,0.992674,170


In [14]:
# First rows (head() default) of the pair table for gene 'B', highest score first.
head(pairs('B'))

Unnamed: 0,t1,t2,h1,h2,hopeless,nreads1,cov1,mapped1,nreads2,cov2,mapped2,score
1,B*51:30-273,B*13:69-273,0,0,0,88,268,0.981685,85,265,0.970696,173
2,B*51:01-362,B*13:69-273,0,0,0,80,297,0.820442,85,265,0.970696,165
3,B*51:193-362,B*13:69-273,0,0,0,80,297,0.820442,85,265,0.970696,165
4,B*51:96-337,B*13:69-273,0,0,0,79,295,0.8753709,85,265,0.970696,164
5,B*51:187-362,B*13:69-273,0,0,0,78,294,0.8121547,85,265,0.970696,163
6,B*51:188-362,B*13:69-273,0,0,0,78,289,0.7983425,85,265,0.970696,163


In [15]:
# First rows (head() default) of the pair table for gene 'DRB1', highest score first.
head(pairs('DRB1'))

Unnamed: 0,t1,t2,h1,h2,hopeless,nreads1,cov1,mapped1,nreads2,cov2,mapped2,score
1,DRB1*07:01-266,DRB1*11:01-266,0,1,1,65,253,0.9511278,69,251,0.943609,124
2,DRB1*07:01-266,DRB1*11:04-266,0,1,1,65,253,0.9511278,67,242,0.9097744,122
3,DRB1*07:01-266,DRB1*14:141-266,0,0,0,65,253,0.9511278,53,251,0.943609,118
4,DRB1*11:01-266,DRB1*11:04-266,1,1,2,69,251,0.943609,67,242,0.9097744,116
5,DRB1*11:01-266,DRB1*14:141-266,1,0,1,69,251,0.943609,53,251,0.943609,112
6,DRB1*14:141-266,DRB1*11:04-266,0,1,1,53,251,0.943609,67,242,0.9097744,110


In [16]:
# First rows (head() default) of the pair table for gene 'DQB1', highest score first.
head(pairs('DQB1'))

Unnamed: 0,t1,t2,h1,h2,hopeless,nreads1,cov1,mapped1,nreads2,cov2,mapped2,score
1,DQB1*03:01-261,DQB1*03:116-261,0,0,0,54,240,0.9195402,46,235,0.9003831,100
2,DQB1*03:01-261,DQB1*03:19-183,0,0,0,54,240,0.9195402,15,172,0.9398907,69
3,DQB1*03:01-261,DQB1*03:191-183,0,0,0,54,240,0.9195402,15,172,0.9398907,69
4,DQB1*03:01-261,DQB1*03:29-183,0,0,0,54,240,0.9195402,15,172,0.9398907,69
5,DQB1*03:01-261,DQB1*03:165-173,0,0,0,54,240,0.9195402,14,162,0.9364162,68
6,DQB1*03:01-261,DQB1*03:120-89,0,0,0,54,240,0.9195402,8,85,0.9550562,62


In [17]:
# First rows (head() default) of the pair table for gene 'DPB1', highest score first.
head(pairs('DPB1'))

Unnamed: 0,t1,t2,h1,h2,hopeless,nreads1,cov1,mapped1,nreads2,cov2,mapped2,score
1,DPB1*02:01-258,DPB1*02:02-258,0,0,0,187,215,0.8333333,148,208,0.8062016,335
2,DPB1*02:01-258,DPB1*105:01-258,1,1,2,187,215,0.8333333,153,214,0.8294574,320
3,DPB1*02:01-258,DPB1*04:02-258,1,1,2,187,215,0.8333333,115,205,0.7945736,282
4,DPB1*02:01-258,DPB1*141:01-181,1,1,2,187,215,0.8333333,114,180,0.9944751,281
5,DPB1*105:01-258,DPB1*141:01-181,1,1,2,153,214,0.8294574,114,180,0.9944751,247
6,DPB1*04:02-258,DPB1*141:01-181,1,1,2,115,205,0.7945736,114,180,0.9944751,209


In [18]:
# First rows (head() default) of the pair table for gene 'C', highest score first.
head(pairs('C'))

Unnamed: 0,t1,t2,h1,h2,hopeless,nreads1,cov1,mapped1,nreads2,cov2,mapped2,score
1,C*06:73-273,C*06:02-366,0,0,0,105,262,0.959707,98,261,0.7131148,203
2,C*06:73-273,C*06:108-273,0,0,0,105,262,0.959707,98,261,0.956044,203
3,C*06:73-273,C*06:110-298,0,0,0,105,262,0.959707,98,261,0.8758389,203
4,C*06:73-273,C*06:55-273,0,0,0,105,262,0.959707,98,261,0.956044,203
5,C*06:73-273,C*06:83-365,0,0,0,105,262,0.959707,98,261,0.7150685,203
6,C*06:73-273,C*06:30-273,0,0,0,105,262,0.959707,94,259,0.9487179,199
