In [1]:
suppressWarnings(library(tidyverse))
suppressWarnings(library(magrittr)) # https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.1
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.1.1     [32m✔[39m [34mforcats[39m 0.3.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract



In [2]:
# Build `domains`: one row per (protein entry, domain occurrence).
# Each input row carries a comma-separated list of domains written as
# "Type(start-stop)"; the list is exploded into rows and each item is split
# into DomainType, DomainStart and DomainStop (both numeric).
domains <- read.table(
        "../_cloud/smartdomssummary.txt",
        sep = '\t',
        col.names = c('Entry', 'Gene', 'DomainsStr'),
        stringsAsFactors = FALSE
    ) %>%
    # Optional filter for debugging on a couple of genes:
    #subset(Gene %in% c('PIK3R1', 'SOS1')) %>%
    separate_rows(DomainsStr, sep = ',') %>%
    mutate(DomainsStr = trimws(DomainsStr)) %>%
    # Keep the original "Type(start-stop)" string alongside the parsed pieces.
    separate(col = DomainsStr, into = c('DomainType', 'DomainStartStop'), sep = '\\(', remove = FALSE) %>%
    separate(col = DomainStartStop, into = c('DomainStart', 'DomainStop'), sep = '\\-', remove = TRUE) %>%
    mutate(
        DomainStart = as.numeric(DomainStart),
        # Drop the last character (the closing parenthesis) before converting.
        DomainStop = as.numeric(substr(DomainStop, start = 1, stop = nchar(DomainStop) - 1))
    )
head(domains)

Entry,Gene,DomainsStr,DomainType,DomainStart,DomainStop
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
FN3K_HUMAN,FN3K,Pfam:Fructosamin_kin(2-309),Pfam:Fructosamin_kin,2,309
FN3K_HUMAN,FN3K,Pfam:APH(28-279),Pfam:APH,28,279
FNBP4_HUMAN,FNBP4,WW(215-248),WW,215,248
FNBP4_HUMAN,FNBP4,WW(596-629),WW,596,629
FND3A_HUMAN,FNDC3A,FN3(266-358),FN3,266,358
FND3A_HUMAN,FNDC3A,FN3(371-452),FN3,371,452


In [3]:
# Look up the annotated domain(s) that cover a position within a gene.
#
# Wrapped in Vectorize(), so both arguments may be vectors and the function can
# be applied to whole columns (e.g. inside mutate()).
#
# Args:
#   gene: value compared for equality against the Gene column of the global
#         `domains` table.
#   pos:  position tested against the inclusive range [DomainStart, DomainStop].
#
# Returns:
#   For each (gene, pos) pair, the DomainsStr values of every matching row
#   joined by toString() (", "-separated), or '' when no row matches.
GetDomains <- Vectorize(function(gene, pos) {
    hits <- subset(domains, (Gene == gene) & (DomainStart <= pos) & (pos <= DomainStop))
    # Use nrow(), not length(): length() of a data frame is its number of
    # columns, which is never 0 here, so it cannot detect "no matching rows".
    if (nrow(hits) > 0) {
        toString(hits$DomainsStr)
    } else {
        ''
    }
})

# Load the mutation table and add a `domain` column holding the domain(s)
# from `domains` that overlap each row's mut_position in its gene.
GAFs.GEPs.domains <- mutate(
    read.table("MSB2013_GAFs_GEPs.tsv", sep = '\t', header = TRUE, stringsAsFactors = FALSE),
    domain = GetDomains(gene, mut_position)
)
head(GAFs.GEPs.domains)

sample_id,mut_position,wt_residue,mut_residue,count,status,active_region,Freq,gene,PTM_position,residue,kinase,active_region_p,domain
<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<int>,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>
TCGA-23-1124-01A,1750,S,I,1,"DI,N2",25,1740,AKAP13,1754,S,,0.001596489,
TCGA-23-1124-01A,1750,S,I,1,"DI,N2",25,1740,AKAP13,1747,T,,0.001596489,
TCGA-23-1124-01A,1750,S,I,1,"DI,N2",25,1740,AKAP13,1750,S,,0.001596489,
TCGA-23-1124-01A,1750,S,I,1,"DI,N2",25,1761,AKAP13,1754,S,,0.001596489,
TCGA-23-1124-01A,1750,S,I,1,"DI,N2",25,1761,AKAP13,1747,T,,0.001596489,
TCGA-23-1124-01A,1750,S,I,1,"DI,N2",25,1761,AKAP13,1750,S,,0.001596489,


In [4]:
# Save the domain-annotated table as tab-separated text, without quoting
# values and without a row-name column.
write.table(
    GAFs.GEPs.domains,
    file = 'MSB2013_GAFs_GEPs_domains.tsv',
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
)