Schuller et al., eIF5A Functions Globally in Translation Elongation and Termination, Molecular Cell (2017), http://dx.doi.org/10.1016/j.molcel.2017.03.003

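The polarity contribution $p_i$ of position $i$ in a gene of length $l$, where $d_i$ is the read count at position $i$ and $w_i$ is that position's normalized distance from the centre of the gene, is defined as: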

$w_i = {{2i-(l+1)}\over{l - 1}}$

$p_i = {{d_i w_i}\over{\sum_{j=1}^l d_j}}$

In [2]:
import sys
import xlrd
import pandas as pd
import numpy as np
from numpy import median
import os
import fnmatch 

In [3]:
def makecsv(inputpath, outputpath, ext):
    """Convert a tab-separated per-position count table into a filtered CSV.

    Every non-blank input line must hold at least five tab-separated cells:
    feature name, start, end, position and count.  The first seven characters
    of the name are kept as ``Name`` and the seventh character is read as the
    orientation (``'W'`` or ``'C'``).  A 1-based position relative to
    ``start`` is computed per line, and the row is written only when that
    relative position lies strictly between 0 and ``length - 15``, with
    ``length = abs(start - end)``.  The ``Length`` column of the output holds
    ``length - 15``.

    NOTE(review): the original description of this cell reads "Parse the
    input file to start at 15 bps upstream of each gene"; confirm how the
    15-position trim applied here relates to the layout of the input file.

    Args:
        inputpath: path of the tab-separated input file.
        outputpath: path of the CSV file to create (overwritten if present).
        ext: unused; kept so existing callers keep working.

    Raises:
        IndexError: if a non-blank line with a 'W'/'C' orientation has fewer
            than five cells.
        ValueError: if start, end or position of such a line is not an
            integer.
    """
    trim = 15
    header = ('Name', 'Orient', 'Start', 'End', 'Position', 'Length',
              'RelativePosition', 'Counts')

    # ``with`` closes both handles even when a malformed line raises.
    with open(inputpath, 'r') as inputf, open(outputpath, 'w') as outputf:
        outputf.write(','.join(header) + '\n')
        for line in inputf:
            string = line.rstrip()
            if not string:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            cells = string.split('\t')
            name = cells[0][:7]
            orient = cells[0][6:7]
            if orient not in ('W', 'C'):
                # Without a known orientation no relative position can be
                # computed; skip the line instead of reusing the value left
                # over from the previous line.
                continue

            start = cells[1]
            end = cells[2]
            posit = cells[3]
            riboCount = cells[4]
            length = abs(int(start) - int(end))

            if orient == 'W':
                rposit = int(posit) + 1 - int(start)
            else:
                rposit = int(start) - (int(posit) - 1)

            if 0 < rposit < length - trim:
                # The raw text of start/end/position/count is written back
                # unchanged; only the derived columns are formatted here.
                row = (name, orient, start, end, posit,
                       str(length - trim), str(rposit), riboCount)
                outputf.write(','.join(row) + '\n')

In [9]:
def calculatePolarity(i, l, densityati, density):
    """Return the polarity contribution of one position of a gene.

    The position is first mapped to a weight
    ``w = (2*i - (l + 1)) / (l - 1)``, which is -1 for ``i == 1`` and +1 for
    ``i == l``.  The result is ``densityati * w / density``.

    Args:
        i: position within the gene.
        l: length of the gene.
        densityati: density (count) observed at position ``i``.
        density: total density the contribution is normalized by.

    Returns:
        The weighted, normalized density at position ``i``.
    """
    offset_from_centre = 2 * i - (l + 1)
    span = l - 1
    weight = offset_from_centre / span
    return densityati * weight / density

def PolarityPerGene(inputpath, destdir):
    """Compute one polarity score per gene and write the result as CSV.

    Reads a CSV with the columns ``Name``, ``Length``, ``RelativePosition``
    and ``Counts``, passes every row through ``calculatePolarity`` (using the
    summed ``Counts`` of the row's gene as the total density) and sums the
    per-row values per gene.  The output has the columns ``Name``,
    ``Length``, ``Counts`` (summed per gene) and ``PolarityPerGene``; rows
    containing a missing value are dropped before writing.

    The output path is ``destdir`` + the input file name up to its first
    dot + ``"_polarity&gene.csv"``; it is printed before the computation.

    Args:
        inputpath: path of the per-position CSV to read.
        destdir: prefix of the output path; it is concatenated as-is, so it
            must already end with a path separator.
    """
    df = pd.read_csv(inputpath)

    # Strip the directory first so that dots in directory names (for example
    # "./out/x.csv") cannot truncate the stem.
    stem = os.path.basename(inputpath).split('.')[0]

    # this is the final output file
    nameofcsv = destdir + stem + "_polarity&gene.csv"
    print(nameofcsv)

    # Total counts of the gene each row belongs to, aligned row by row.
    gene_totals = df.groupby('Name')['Counts'].transform('sum')

    # calculatePolarity is plain arithmetic, so it is applied to whole
    # columns at once instead of once per row.
    df['Polarity'] = calculatePolarity(
        df['RelativePosition'], df['Length'], df['Counts'], gene_totals)

    # calculate polarity per gene
    by_gene = df.groupby('Name')
    dfcsv = pd.DataFrame({
        'Length': by_gene['Length'].last(),
        'Counts': by_gene['Counts'].sum(),
        # A gene with zero total counts has an undefined (NaN) polarity at
        # every position; min_count=1 keeps the sum NaN instead of 0.0 so
        # that the dropna below removes the gene.
        'PolarityPerGene': by_gene['Polarity'].sum(min_count=1),
    }).reset_index()

    dfcsv = dfcsv.dropna(how='any', axis=0)
    dfcsv.to_csv(nameofcsv)
   

In [11]:
# Locations of the raw tables, the intermediate CSVs and the final results.
inpath = '/Users/jiz225/Desktop/ZidLab/polarity/input/rna_35/'
midpath = '/Users/jiz225/Desktop/ZidLab/polarity/output/rna_35/'
destdir = "/Users/jiz225/Desktop/ZidLab/polarity/outputcsv/rna_35/"
extension = "*.xls"

# Keep only the directory entries matching the pattern, in listing order.
inputfolder = os.listdir(inpath)
matching_docs = fnmatch.filter(inputfolder, extension)

# change this input and output file path
inputlist = [inpath + doc for doc in matching_docs]
# The intermediate CSV is named after the part of the file name before its
# first dot.
outputlist = [midpath + doc.split('.')[0] + ".csv" for doc in matching_docs]

print('\n Number of possible input files: ' + str(len(inputlist)))
print('\n Number of possible output csv files: ' + str(len(outputlist)))

# Convert each raw table to CSV, then compute the per-gene polarity from it.
for raw_table, parsed_csv in zip(inputlist, outputlist):
    makecsv(raw_table, parsed_csv, extension)
    PolarityPerGene(parsed_csv, destdir)


 Number of possible input files: 5

 Number of possible output csv files: 5
/Users/jiz225/Desktop/ZidLab/polarity/outputcsv/rna_35/Sample_Zidp35-2_1_ATCACG_polarity&gene.csv
/Users/jiz225/Desktop/ZidLab/polarity/outputcsv/rna_35/Sample_Zidp35-2_4_TGACCA_polarity&gene.csv
/Users/jiz225/Desktop/ZidLab/polarity/outputcsv/rna_35/Sample_Zidp35-2_3_TTAGGC_polarity&gene.csv
/Users/jiz225/Desktop/ZidLab/polarity/outputcsv/rna_35/Sample_Zidp35-2_2_CGATGT_polarity&gene.csv
/Users/jiz225/Desktop/ZidLab/polarity/outputcsv/rna_35/Sample_Zidp35-2_5_ACAGTG_polarity&gene.csv
