# Add ORF Type

#### Import Libraries & Setup Environment

In [1]:
import pandas
import numpy
# ORF-type labels that nuORFType / variantORFType report as a
# 'canonical:*' category instead of returning the label verbatim.
CANONICAL = {
    'canonical', 'canonical_extended', 'canonical_truncated',
    'CDS', 'Trunc', 'Variant',
}

#### Define Functions

In [2]:
def nuORFID(header):
    """Return the ORF identifier embedded in a ' | '-delimited header.

    The second ' | '-separated field of ``header`` is taken and every '~'
    inside it is replaced by '|'.

    Raises:
        IndexError: if ``header`` contains no ' | ' separator.
    """
    fields = header.split(' | ')
    # Rebuild the ID with '|' wherever the header field used '~'.
    return '|'.join(fields[1].split('~'))

In [3]:
def ucscType(header, ucscMap):
    """Classify a UCSC entry by whether its header appears in ``ucscMap``.

    Returns 'canonical:both' when ``header`` is a key of ``ucscMap`` and
    'canonical:ucsc' otherwise.
    """
    return 'canonical:both' if header in ucscMap else 'canonical:ucsc'

In [4]:
def nuORFType(header, ucscMap):
    """Classify a nuORF entry from its header.

    The ORF ID is extracted with ``nuORFID``. If that ID is a key of
    ``ucscMap`` the result is 'canonical:both'. Otherwise the last
    '|'-separated field of the ID is the type label: labels listed in
    ``CANONICAL`` are reported as 'canonical:gencode', any other label is
    returned unchanged.
    """
    orf_id = nuORFID(header)
    if orf_id in ucscMap:
        return 'canonical:both'
    # The type label is whatever follows the final '|' of the ID.
    label = orf_id.rsplit('|', 1)[-1]
    return 'canonical:gencode' if label in CANONICAL else label

In [5]:
def variantORFType(header):
    """Classify a variant entry from its '|'-delimited header.

    The third '|'-separated field is the type label. Labels listed in
    ``CANONICAL`` are reported as 'canonical:variant'; any other label is
    returned unchanged.

    Raises:
        IndexError: if ``header`` has fewer than three '|'-separated fields.
    """
    label = header.split('|')[2]
    return 'canonical:variant' if label in CANONICAL else label

In [6]:
def parseUcscMap(path):
    """Load a tab-separated ID map file into a two-way lookup dict.

    Each line is expected to hold an ID in column one and a
    comma-separated list of IDs in column two (presumably UCSC IDs and
    nuORF IDs respectively -- confirm against the map files).

    For every listed column-two ID the returned dict maps that ID to the
    column-one ID. The column-one ID itself maps to the *last* listed
    column-two ID, because each assignment overwrites the previous one.

    Lines with an empty or missing second column, and empty entries inside
    the comma-separated list, are skipped.

    Args:
        path: path of the map file to read.

    Returns:
        dict mapping IDs in both directions as described above.
    """
    ucscMap = dict()
    # Context manager guarantees the file handle is closed.
    with open(path, 'r') as handle:
        for line in handle:
            # Strip only the newline: slicing off the last character would
            # corrupt the final ID when the file lacks a trailing newline.
            fields = line.rstrip('\n').split('\t')
            # Blank / single-column lines carry no mapping.
            if len(fields) < 2 or fields[1] == '':
                continue
            for nu in fields[1].split(','):
                if nu == '':
                    # Stray comma (e.g. "a,,b" or "a,") -- not a real ID.
                    continue
                ucscMap[fields[0]] = nu
                ucscMap[nu] = fields[0]
    return ucscMap

In [7]:
def getType(row, ucscMap):
    """Compute the ORF type for one row of a reference table.

    The row's 'category' value selects the classifier:

    * 'UCSC'        -> ``ucscType(header, ucscMap)``
    * 'Contaminant' -> the literal 'Contaminant'
    * 'nuORF'       -> ``nuORFType(header, ucscMap)``
    * 'Variant'     -> ``variantORFType(header)``
    * anything else -> ``numpy.nan``

    The 'header' value is only read for categories that need it.
    """
    category = row.loc['category']

    # Categories that need no header parsing are resolved first.
    if category == 'Contaminant':
        return 'Contaminant'
    if category not in ('UCSC', 'nuORF', 'Variant'):
        return numpy.nan

    header = row.loc['header']
    if category == 'UCSC':
        return ucscType(header, ucscMap)
    if category == 'nuORF':
        return nuORFType(header, ucscMap)
    return variantORFType(header)

In [8]:
def updateRef(ref, ucscMap, out):
    """Add an 'orfType' column to a reference table and write it out.

    Args:
        ref: path of the tab-separated reference table (first line is the
            header row).
        ucscMap: path of the map file handed to ``parseUcscMap``.
        out: path the annotated table is written to, tab-separated with a
            header row and without the index.
    """
    table = pandas.read_csv(ref, sep='\t', header=0, index_col=False)
    lookup = parseUcscMap(ucscMap)
    # Row-wise classification; getType receives (row, lookup).
    table['orfType'] = table.apply(getType, axis=1, args=(lookup,))
    table.to_csv(out, sep='\t', header=True, index=False)

## Pan Sample

In [9]:
# Annotate the PanSample reference using the UCSC map.
# `out` is the same path as `ref`, so the input file is overwritten in place.
ref = '../../data/ref/PanSample.ref'
ucscMap = '../../data/map/ucsc.map'
out = '../../data/ref/PanSample.ref'
updateRef(ref, ucscMap, out)

## Variants

### CLL 5283

In [10]:
# Annotate the CLL.5283 reference using the UCSC map.
# `out` is the same path as `ref`, so the input file is overwritten in place.
ref = '../../data/ref/CLL.5283.ref'
ucscMap = '../../data/map/ucsc.map'
out = '../../data/ref/CLL.5283.ref'
updateRef(ref, ucscMap, out)

### Mel 11

In [11]:
# Annotate the MEL.11 reference using the UCSC map.
# `out` is the same path as `ref`, so the input file is overwritten in place.
ref = '../../data/ref/MEL.11.ref'
ucscMap = '../../data/map/ucsc.map'
out = '../../data/ref/MEL.11.ref'
updateRef(ref, ucscMap, out)

### Mel 2

In [12]:
# Annotate the MEL.2 reference using the UCSC map.
# `out` is the same path as `ref`, so the input file is overwritten in place.
ref = '../../data/ref/MEL.2.ref'
ucscMap = '../../data/map/ucsc.map'
out = '../../data/ref/MEL.2.ref'
updateRef(ref, ucscMap, out)

### Mel 5

In [13]:
# Annotate the MEL.5 reference using the UCSC map.
# `out` is the same path as `ref`, so the input file is overwritten in place.
ref = '../../data/ref/MEL.5.ref'
ucscMap = '../../data/map/ucsc.map'
out = '../../data/ref/MEL.5.ref'
updateRef(ref, ucscMap, out)

### GBM 7

In [14]:
# Annotate the GBM.7 reference using the UCSC map.
# `out` is the same path as `ref`, so the input file is overwritten in place.
ref = '../../data/ref/GBM.7.ref'
ucscMap = '../../data/map/ucsc.map'
out = '../../data/ref/GBM.7.ref'
updateRef(ref, ucscMap, out)

## Missing Pan Sample

In [15]:
# Annotate the MissingCanonical reference. Unlike the cells above, this one
# passes missing.map (not ucsc.map) as the lookup file.
# `out` is the same path as `ref`, so the input file is overwritten in place.
ref = '../../data/ref/MissingCanonical.ref'
missingMap = '../../data/map/missing.map'
out = '../../data/ref/MissingCanonical.ref'
updateRef(ref, missingMap, out)