# Make personalized HLA references for each cohort

Amber Shen

March 2022

In [1]:
#if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("rtracklayer")

suppressPackageStartupMessages({
    library(rtracklayer)
    library(Biostrings)
    library(Matrix)
    library(dplyr)
    library(ggplot2)
    library(stats)
    library(stringr)
    library(tidyverse)
})

**Functions to make personalized genomes and annotations**

In [3]:
# Converts one HLA allele name of the form HLA_X*ii:jj:kk into X-ii-jj:
# the gene (between the first '_' and the '*'), then the first and second
# ':'-separated fields after the '*'. Any further fields are dropped.
# A piece that is absent from the input shows up as the text 'NA' in the result.
format_allele = function(allele) {
    after_prefix = str_split(allele, '_')[[1]][2]
    gene = str_split(after_prefix, '\\*')[[1]][1]
    # split the part after '*' once and pick both fields from it
    fields = str_split(str_split(allele, '\\*')[[1]][2], ':')[[1]]
    paste(gene, fields[1], fields[2], sep='-')
}

# Makes the genome .fa file for one sample
# personalized_alleles: .csv file containing personalized HLA alleles imputed for a sample;
#     needs an 'ID' column of HLA_X*ii:jj:kk names. The sample name is the file name
#     up to the first '_alleles'.
# genome_out: Directory for output. It is pasted directly in front of the file
#     names, so it must end with '/'.
# sequence_db: sequence set the alleles are taken from (indexed by position).
#     Defaults to the top-level variable `database`.
# db_names: X-ii-jj names parallel to sequence_db.
#     Defaults to the top-level variable `formatted_database_names`.
# Writes <genome_out><sample>_genome.fa and appends one line per allele that has
# no match in db_names to <genome_out>missing_alleles.csv. Returns NULL invisibly.
make_genome = function(personalized_alleles, genome_out,
                       sequence_db = database,
                       db_names = formatted_database_names) {
    sample = sub('_alleles.*$', '', basename(personalized_alleles))
    alleles = read.csv(personalized_alleles)
    if (!'ID' %in% names(alleles)) {
        stop('No ID column in ', personalized_alleles, call. = FALSE)
    }

    # HLA_X*ii:jj:kk to X-ii-jj for every row; vapply copes with a file that
    # has a header but no rows (1:nrow() would iterate over 1, 0 and fail)
    to_match = vapply(as.character(alleles[['ID']]), format_allele,
                      character(1), USE.NAMES=FALSE)

    # position of the first database sequence with the same name, NA if none
    idx = match(to_match, unlist(db_names, use.names=FALSE))
    not_found = is.na(idx)

    # NOTE(review): an allele listed twice in the file is written twice under
    # the same sequence name -- confirm downstream tools accept that.
    Biostrings::writeXStringSet(sequence_db[idx[!not_found]],
                                paste0(genome_out, sample, '_genome.fa')) # save file

    # saves missing alleles (same 'message \n' line format as before);
    # the file is only touched when something is actually missing
    if (any(not_found)) {
        cat(paste0('WARNING: ', to_match[not_found], ' not found for sample ', sample, ' \n'),
            sep='', file=paste0(genome_out, 'missing_alleles.csv'), append=TRUE)
    }
    invisible(NULL)
}

# Makes the .gtf annotation for one sample's personalized genome.
# Every sequence in the genome FASTA becomes one 'exon' record from position 1
# to the end of the sequence (source 'IMGTHLA', strand '+').
# sample: sample name; the genome is read from <genome_dir><sample>_genome.fa
# annot_out: path of the .gtf file to write
# unique: TRUE  -> gene_id and gene_name are the full sequence name
#         FALSE -> gene_id and gene_name are the part of the name before the first '*'
#         transcript_id is the full sequence name in both cases
#         (Joyce changed "name" to "allele" for transcript_id 5/12/22)
# genome_dir: directory holding the genome FASTA files, ending with '/'.
#     The default is built from a top-level variable `out`, which is where this
#     function previously always looked; pass it explicitly to avoid depending
#     on that global.
make_annotation = function(sample, annot_out, unique, genome_dir = paste0(out, 'genomes/')) {
    genome = Biostrings::readDNAStringSet(paste0(genome_dir, sample, '_genome.fa'))
    alleles = as.character(names(genome))
    n = length(alleles)

    if (unique) {
        gene = alleles
    } else {
        gene = sub('\\*.*$', '', alleles) # nonunique case
    }
    # sprintf (unlike paste0) yields character(0) for an empty genome
    attribute = sprintf('transcript_id "%s"; gene_id "%s"; gene_name "%s";',
                        alleles, gene, gene)

    # one row per sequence, built in one go instead of growing the table row by row
    annot = data.frame(seqname=alleles,
                       source=rep('IMGTHLA', n),
                       feature=rep('exon', n),
                       start=rep(1L, n),
                       end=width(genome),
                       score=rep('.', n),
                       strand=rep('+', n),
                       frame=rep('.', n),
                       attribute=attribute,
                       stringsAsFactors=FALSE)
    write.table(annot, file=annot_out, sep='\t', quote = FALSE, col.names=FALSE, row.names=FALSE)
}

# Makes genomes and annotations for every allele file of a cohort.
# personalized_alleles_path: directory of per-sample allele files; every entry
#     returned by list.files() is treated as one sample's file
# sequence_database_path: currently unused -- make_genome takes its sequences
#     from the top-level `database` / `formatted_database_names`. Kept so
#     existing calls keep working.
# out: output directory. Gets the sub-directories genomes/,
#     unique_annotations/ and nonunique_annotations/.
# Returns NULL invisibly.
make_references = function(personalized_alleles_path, sequence_database_path, out) {
    # every path below is built by pasting onto `out`
    if (!endsWith(out, '/')) {
        out = paste0(out, '/')
    }
    genome_dir = paste0(out, 'genomes/')
    unique_dir = paste0(out, 'unique_annotations/')
    nonunique_dir = paste0(out, 'nonunique_annotations/')
    # recursive: a missing parent directory would otherwise fail silently here
    for (d in c(out, genome_dir, unique_dir, nonunique_dir)) {
        dir.create(d, showWarnings = FALSE, recursive = TRUE)
    }

    # Get list of personalized allele files
    personalized_alleles = list.files(personalized_alleles_path, full.names=TRUE)
    if (length(personalized_alleles) == 0) {
        warning('No allele files found in ', personalized_alleles_path, call. = FALSE)
    }

    # make genomes
    for (allele_file in personalized_alleles) {
        make_genome(allele_file, genome_dir)
    }

    # make annotations; sample = file name up to the first '_alleles'
    samples = sub('_alleles.*$', '', basename(personalized_alleles))
    for (sample in samples) {
        # genome_dir is passed explicitly so the genomes written above are the
        # ones read back, whatever `out` is at the top level
        make_annotation(sample, paste0(unique_dir, sample, '_annotation.gtf'),
                        unique=TRUE, genome_dir=genome_dir)
        make_annotation(sample, paste0(nonunique_dir, sample, '_annotation.gtf'),
                        unique=FALSE, genome_dir=genome_dir)
    }
    invisible(NULL)
}

Read in IMGT allele database

cat /data/srlab1/jkang/hla/schla/IMGTHLA/alignments_FINAL/*.fa > '/data/srlab1/amber_joyce/scHLA/IMGTHLA_all_alleles_FINAL.fa'

In [4]:
# FASTA of all alleles: the per-gene files from
# /data/srlab1/jkang/hla/schla/IMGTHLA/alignments_FINAL concatenated into one
sequence_database_path = '/data/srlab1/amber_joyce/scHLA/IMGTHLA_all_alleles_FINAL.fa'
database = Biostrings::readDNAStringSet(sequence_database_path)
# X-ii-jj name of every database sequence, parallel to `database`.
# vapply returns a plain character vector (not a list) and stops if
# format_allele ever returns something other than a single string.
formatted_database_names = vapply(names(database), format_allele,
                                  character(1), USE.NAMES=FALSE)

In [5]:
# Examine the database: show the sequence set, then the first and last
# few formatted (X-ii-jj) names
database
head(formatted_database_names)
tail(formatted_database_names)

DNAStringSet object of length 26573:
        width seq                                           names               
    [1]  4626 AAGGGGAGAGGAGGGCCTGAA...ATGAGAACCTTCCAGAGTCC [output truncated]

The outputs from a previous version of the HLA imputation were moved from personalized_references to personalized_references_old.

## AMP2RA

In [14]:
# Build the AMP2RA references: one genome FASTA plus a unique and a non-unique
# GTF for every file in personalized_alleles_path, written under `out`.
# `out` ends with '/' because the output paths are built by pasting onto it.
personalized_alleles_path = '/data/srlab/ssg34/scHLA/data/AMPGENO/RA_updated_alleles/' 
out = '/data/srlab2/jkang/scHLA/personalized_final/AMP2RA_NewPanel/personalized_references/'
make_references(personalized_alleles_path, sequence_database_path, out)

## Smillie2019

In [8]:
# Smillie2019 has two input directories (ccdg_broad and Helmsley_merged);
# both runs write into the same output directory.
personalized_alleles_path = '/data/srlab/ssg34/scHLA/data/Smillie2019/ccdg_broad/updated_newref_alleles'
out = '/data/srlab2/jkang/scHLA/personalized_final/Smillie2019_NewPanel/personalized_references/'
make_references(personalized_alleles_path, sequence_database_path, out)

# second input directory, same `out` as above
personalized_alleles_path = '/data/srlab/ssg34/scHLA/data/Smillie2019/Helmsley_merged/updated_newref_alleles'
out = '/data/srlab2/jkang/scHLA/personalized_final/Smillie2019_NewPanel/personalized_references/'
make_references(personalized_alleles_path, sequence_database_path, out)

## Randolph2021

In [6]:
# Build the Randolph2021 references from the allele files in
# personalized_alleles_path, written under `out`
personalized_alleles_path = '/data/srlab/ssg34/scHLA/data/Randolph2021/updated_newref_alleles'
out = '/data/srlab2/jkang/scHLA/personalized_final/Randolph2021_NewPanel/personalized_references/'
make_references(personalized_alleles_path, sequence_database_path, out)

## OneK1K

In [9]:
# Build the OneK1K references from the allele files in
# personalized_alleles_path, written under `out`
personalized_alleles_path = '/data/srlab/ssg34/scHLA/data/1K1K/updated_newref_alleles'
out = '/data/srlab2/jkang/scHLA/personalized_final/OneK1K_NewPanel/personalized_references/'
make_references(personalized_alleles_path, sequence_database_path, out) # Takes 5 mins

## All done!

In [9]:
# Record the R version, platform and package versions used for this run
sessionInfo()

R version 4.0.5 (2021-03-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server release 6.5 (Santiago)

Matrix products: default
BLAS/LAPACK: /PHShome/jbk37/anaconda3/envs/hla_new/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] forcats_0.5.1        purrr_0.3.4          readr_2.1.2         
 [4] tidyr_1.2.0          tibble_3.1.6         tidyverse_1.3.1     
 [7] stringr_1.4.0        ggplot2_3.3.5        dplyr_1.0.8         
[10] Matrix_1.4-0         Biostrings_2