# Generate UCSC & ORF_ID Map

#### Import Libraries & Setup Environment

In [1]:
import pandas
import numpy
from tqdm import tqdm

#### Define Functions

In [2]:
def nuORFID(header):
    header = header.split(' | ')[1]
    header = header.replace('~', '|')
    return(header)

### Generating UCSC to nuORF Map
Given a reference, generates a map of exact matches from the nuORF sequences.

In [3]:
def findMatch(sequence, df):
    df = df[df['sequence'] == sequence]
    matches = df.shape[0]
    if matches == 0:
        return(numpy.nan)
    return(','.join(list(map(
        lambda x: nuORFID(x),
        df['header'].values
    ))))

In [4]:
def ucscMatches(reference, out):
    ucsc = reference[reference['category'] == 'UCSC'].copy()
    nuORF = reference[reference['category'] == 'nuORF'].copy()
    tqdm.pandas(desc='Find Matches')
    ucsc['match'] = ucsc['sequence'].progress_apply(findMatch, args=(nuORF, ))
    ucsc = ucsc[['header', 'match']]
    ucsc.to_csv(out, sep='\t', header=False, index=False)

## Generate Pan Sample ucsc map

In [5]:
data = pandas.read_csv('../../data/ref/PanSample.ref', sep='\t', header=0, index_col=False)
ucscMatches(data, '../../data/map/ucsc.map')

Find Matches: 100%|██████████| 52788/52788 [41:40<00:00, 21.11it/s]


## Generate Pan Sample + Missing & UCSC Map

In [6]:
data= pandas.read_csv('../../data/ref/MissingCanonical.ref', sep='\t', header=0, index_col=False)
ucscMatches(data, '../../data/map/missing.map')

Find Matches: 100%|██████████| 52788/52788 [52:03<00:00, 16.90it/s]  
