In [8]:
# Import libraries

# The "gw3RACE_functions.py" file must be saved in the same directory as this notebook so that "gw3RACE_functions" can be imported
import gw3RACE_functions as gw
import pandas as pd

%matplotlib inline

In [28]:
# Load the tab-separated read table into df; column names are supplied explicitly
df = pd.read_csv(
    'output_Spombe/output_short.tab',
    sep='\t',
    names=[
        'read_ID', 'chr', 'start_R1', 'stop_R1', 'strand_R1',
        'gene_start', 'gene_stop', 'gene', 'coord_R2', 'cigar', 'seq_R2',
    ],
)

## 3' RNA tail derived from the CIGAR code of read R2
df['tail_fromcigar'] = df.apply(
    lambda row: gw.take_tail_fromcigar8(row['strand_R1'], row['cigar'], row['seq_R2']),
    axis=1,
)

## 3' RNA tail found with grep on the R2 sequence (intended for unmapped R2 reads only)
df['tail_fromGREP'] = df.apply(
    lambda row: gw.grep_tail_edit_onlyfromSeq(row['seq_R2']),
    axis=1,
)

## Final 3' RNA tail sequence, taken from either the CIGAR-based or the grep-based tail
df['tail_GreporCigar'] = df.apply(
    lambda row: gw.tail_fromGREPorCIGAR(row['cigar'], row['tail_fromGREP'], row['tail_fromcigar']),
    axis=1,
)

## Label telling which source ('grep' or 'cigar') the final tail came from
df['tail_from'] = df.apply(
    lambda row: gw.tail_fromGREPorCIGAR_description(
        row['cigar'], row['tail_fromGREP'], row['tail_fromcigar']
    ),
    axis=1,
)

## Length of the final tail sequence
df['tail_len'] = df['tail_GreporCigar'].apply(len)

## Tail type classification
df['tail_type'] = df.apply(
    lambda row: gw.test_tail_cigargrep8(row['tail_from'], row['strand_R1'], row['tail_GreporCigar']),
    axis=1,
)

## Coordinate of the 3' end of read R2
df['stop_R2'] = df.apply(
    lambda row: gw.stop_based_on_cigar(row['cigar'], row['strand_R1'], row['coord_R2']),
    axis=1,
)

## Distance to the transcription end site (TES), computed by gw.distance_to_TES from
## the CIGAR code, the strand, the R2 3' end and the gene coordinates
df['distance_to_TES'] = df.apply(
    lambda row: gw.distance_to_TES(
        row['cigar'], row['strand_R1'], row['stop_R2'], row['gene_start'], row['gene_stop']
    ),
    axis=1,
)

## Length of the gene sequence
df['gene_len'] = df['gene_stop'] - df['gene_start']

## Distance to TES relative to the gene length
df['rel_distance_to_TES'] = df['distance_to_TES'] / df['gene_len']

## Remove the input and intermediate columns to reduce the size of the table
df = df.drop(
    columns=[
        'chr', 'start_R1', 'stop_R1', 'strand_R1', 'gene_start', 'gene_stop',
        'coord_R2', 'cigar', 'seq_R2',
        'tail_fromcigar', 'tail_fromGREP', 'stop_R2', 'gene_len',
    ]
)
    

In [29]:
## Assign an RNA type to each read from its lower-cased gene name.
## Note: this works only for the S. pombe genome.

df['RNA_type'] = df['gene'].map(lambda gene_name: gw.test_RNA_type(gene_name.lower()))

In [None]:
## Write the detailed DataFrame to a CSV file, leaving out the index column
df.to_csv(
    path_or_buf='output_Spombe/output_detailed_DataFrame.csv',
    index=False,
)

In [30]:
# Preview the first five rows of the table
df.head(n=5)

Unnamed: 0,read_ID,gene,tail_GreporCigar,tail_from,tail_len,tail_type,distance_to_TES,rel_distance_to_TES,RNA_type
0,A00805:163:HFM2WDRX2:1:1101:1000:1626:ANTGTT,SPCC320.14,,cigar,0,no_tail,-482,-0.243557,mRNA
1,A00805:163:HFM2WDRX2:1:1101:1000:19445:CNCCTG,SPBC947.02,,cigar,0,no_tail,-729,-0.218656,mRNA
2,A00805:163:HFM2WDRX2:1:1101:1000:27085:CTTTAT,SPSNRNA.07,,cigar,0,no_tail,-1,-0.003922,snRNA
3,A00805:163:HFM2WDRX2:1:1101:10004:28682:TCACAC,SPBC1539.09c,TTTTTTTTTTTTTT,cigar,14,polyA,130,0.057018,mRNA
4,A00805:163:HFM2WDRX2:1:1101:10004:7075:TCTTCA,SPSNRNA.07,,cigar,0,no_tail,-1,-0.003922,snRNA
