# Preparing Reference Fasta for Searches
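Converts Karl's search FASTA files into tab-delimited reference tables (`Protein:UID`, header, category, sequence) used for peptide assignment.
Reference tables are generated for the following searches: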
* B721 RNA
* B721 RPF
* B721 RibORF + Price
* Pan Sample
* Pan Sample + Mel 11 Variants
* Pan Sample + CLL 5 Variants

#### Import Libraries & Setup Environment

In [1]:
import os
import subprocess

#### Define Functions

In [2]:
def writeLine(index, header, category, sequence, out):
    """Write one tab-delimited reference row to *out*.

    The row has four columns: a ``Protein:NNNNNN`` identifier built from
    *index* (zero-padded to six digits), followed by *header*, *category*
    and *sequence*, and is terminated by a newline.

    Args:
        index: Integer used to build the ``Protein:`` identifier.
        header: Header text for the entry.
        category: Category label for the entry.
        sequence: Sequence text for the entry.
        out: Object with a ``write`` method that receives the row.
    """
    uid = 'Protein:' + format(index, '06d')
    columns = (uid, header, category, sequence)
    out.write('\t'.join(columns) + '\n')

In [3]:
def getCategory(line):
    """Classify a FASTA header line by its first three characters.

    Args:
        line: A header line, including the leading ``>``. A trailing
            newline is allowed.

    Returns:
        ``'UCSC'`` for ``>uc``; ``'Contaminant'`` for ``>sp`` or ``>tr``;
        ``'nuORF'`` for ``>EN``, ``>T0``, ``>T1``, ``>T2``, ``>T3`` or
        ``>TC``; ``'smORF'`` for ``>sm``. Any other prefix yields
        ``'[ERROR]:'`` followed by the header text (the line without its
        leading ``>`` and without the trailing line ending).
    """
    start = line[0:3]
    if start == '>uc':
        return 'UCSC'
    if start in {'>sp', '>tr'}:
        return 'Contaminant'
    if start in {'>EN', '>T0', '>T1', '>T2', '>T3', '>TC'}:
        return 'nuORF'
    if start == '>sm':
        return 'smORF'
    # The header text is derived from ``line`` itself: this function has no
    # ``header`` variable, so referring to one here would raise NameError
    # instead of reporting the unrecognised entry.
    return '[ERROR]:' + line[1:].rstrip('\r\n')

In [4]:
def parseFasta(fasta, out):
    """Convert a FASTA file into a tab-delimited reference table.

    The output starts with the column line
    ``Protein:UID<TAB>header<TAB>category<TAB>sequence`` followed by one
    row per FASTA record, numbered from 1 in file order. Each record's
    category comes from ``getCategory`` and each row is written with
    ``writeLine``.

    Args:
        fasta: Path of the FASTA file to read.
        out: Path of the reference table to write (overwritten).

    Notes:
        * Line endings are stripped explicitly, so the final line keeps its
          last character even when the file does not end with a newline,
          and ``\\r\\n`` files do not leak ``\\r`` into the table.
        * Sequence text that appears before the first ``>`` header belongs
          to no record and is ignored.
        * A FASTA file without any records produces a table containing only
          the column line.
    """
    # ``with`` guarantees both handles are closed even if parsing fails.
    with open(fasta, 'r') as fastaFile, open(out, 'w') as outFile:
        outFile.write('Protein:UID\theader\tcategory\tsequence\n')
        header = None  # None means "no record started yet"
        category = ''
        chunks = []
        index = 0
        for rawLine in fastaFile:
            line = rawLine.rstrip('\r\n')
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if header is not None:
                    index += 1
                    writeLine(index, header, category, ''.join(chunks), outFile)
                header = line[1:]
                category = getCategory(line)
                chunks = []
            elif header is not None:
                chunks.append(line)
        # Flush the last record; skipped when the file held no records.
        if header is not None:
            index += 1
            writeLine(index, header, category, ''.join(chunks), outFile)

In [5]:
def addVariants(ref, fasta, out):
    """Copy a reference table and append variant records from a FASTA file.

    Every line of *ref* is copied to *out* unchanged. Each record in
    *fasta* is then appended with ``writeLine`` using the category
    ``'Variant'``; numbering continues from the identifier found in the
    first column of the last non-blank line of *ref*.

    Args:
        ref: Path of an existing reference table.
        fasta: Path of the FASTA file holding the variant records.
        out: Path of the combined table to write (overwritten). It must
            differ from *ref*, because *out* is truncated before *ref* is
            read.

    Raises:
        ValueError: If the last non-blank line of *ref* is neither the
            column line nor a row whose identifier ends in an integer.

    Notes:
        * A reference that is empty or holds only the column line starts
          the variant numbering at 1.
        * Line endings are stripped explicitly, so a final line without a
          trailing newline keeps its last character.
        * Sequence text before the first ``>`` header is ignored, and a
          FASTA file without records appends nothing.
    """
    # ``with`` closes every handle, including the variant FASTA, on all paths.
    with open(out, 'w') as outFile:
        lastUid = None
        with open(ref, 'r') as refFile:
            for line in refFile:
                outFile.write(line)
                if line.strip():
                    lastUid = line.split('\t')[0]
        if lastUid is None or lastUid.strip() == 'Protein:UID':
            # No data rows yet: numbering starts from scratch.
            index = 0
        else:
            index = int(lastUid[lastUid.find(':') + 1:])

        category = 'Variant'
        header = None  # None means "no record started yet"
        chunks = []
        with open(fasta, 'r') as fastaFile:
            for rawLine in fastaFile:
                line = rawLine.rstrip('\r\n')
                if line.startswith('>'):
                    # A new header closes the record collected so far.
                    if header is not None:
                        index += 1
                        writeLine(index, header, category, ''.join(chunks), outFile)
                    header = line[1:]
                    chunks = []
                elif header is not None:
                    chunks.append(line)
        # Flush the last record; skipped when the FASTA held no records.
        if header is not None:
            index += 1
            writeLine(index, header, category, ''.join(chunks), outFile)

## B721 RNA

In [6]:
# Build the B721 RNA reference table from its search FASTA.
fasta = '../../data/fasta/PA_ucsc_proteomenr_264contams.B721.RNA.gencode.mit.553smORFs.fasta'
out = '../../data/ref/B721.RNA.ref'
parseFasta(fasta=fasta, out=out)

## B721 RPF

In [7]:
# Build the B721 RPF reference table from its search FASTA.
fasta = '../../data/fasta/PA_ucsc_proteomenr_264contams.B721.RPF.gencode.mit.553smORFs.fasta'
out = '../../data/ref/B721.RPF.ref'
parseFasta(fasta=fasta, out=out)

## B721 RibORF + Price

In [8]:
# Build the B721 RibORF + Price reference table from its search FASTA.
fasta = '../../data/fasta/PM_ucsc_proteomenr_264contams.B721.RibORF.Price.prot.553smORFs.fasta'
out = '../../data/ref/B721.ref'
parseFasta(fasta=fasta, out=out)

## Pan Sample

In [9]:
# Build the Pan Sample reference table from its search FASTA.
fasta = '../../data/fasta/PM_ucsc_proteomenr_264contams.PanSample.RibORF.Price.553smORFs3.fasta'
out = '../../data/ref/PanSample.ref'
parseFasta(fasta=fasta, out=out)

## Pan Sample + Mel 11 Variants

In [10]:
# Extend the Pan Sample reference with the Mel 11 variants in two passes:
# SNVs first (into a temporary table), then indels on top of that result.
ref = '../../data/ref/PanSample.ref'
fasta1 = '../../data/fasta/mel11.PanSample.snvs.fasta'
fasta2 = '../../data/fasta/mel11.PanSample.indels.fasta'
out1 = '../../data/ref/temp.ref'
out2 = '../../data/ref/MEL.ref'
addVariants(ref, fasta1, out1)
try:
    addVariants(out1, fasta2, out2)
finally:
    # Remove the intermediate table even if the second pass fails.
    # os.remove replaces the external `rm` call: it works on every platform
    # and raises on failure instead of ignoring a non-zero exit status.
    os.remove(out1)

## Pan Sample + CLL 5 Variants

In [11]:
# Extend the Pan Sample reference with the CLL 5 variants.
ref = '../../data/ref/PanSample.ref'
fasta = '../../data/fasta/cll_jn.PanSample.variants.fasta'
out = '../../data/ref/CLL.ref'
addVariants(ref=ref, fasta=fasta, out=out)