In [None]:
# Import libraries

# You need to save "gw3RACE_functions.py" file in the same directory to import "gw3RACE_functions"
import gw3RACE_functions as gw
import pandas as pd

%matplotlib inline

In [None]:
def _as_int_or_none(value):
    """Convert a coordinate to int; return None when it is missing or not numeric."""
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # int() rejects NaN, infinities and strings such as '.' or '12.0';
        # fall through and retry via float before giving up.
        pass
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None


def safe_distance_to_TES(cigar, strand_R1, stop_R2, gene_start, gene_stop):
    """
    Calculate the distance from the end of read R2 to the transcription end site (TES).

    The calculation is only performed when all three coordinates are usable.
    A coordinate counts as missing when it is None, NaN (how pandas represents
    missing values once a column holds numbers) or cannot be converted to a number
    (e.g. a '.' placeholder).

    Parameters:
    - cigar (str): The CIGAR string from the alignment. Not used in the calculation;
      kept so existing callers do not need to change.
    - strand_R1 (str): The strand information for R1, expected to be '+' or '-'.
    - stop_R2 (int, float, str or None): The stop coordinate of the R2 read.
    - gene_start (int, float, str or None): The start coordinate of the gene.
    - gene_stop (int, float, str or None): The stop coordinate of the gene.

    Returns:
    int or None: The distance to TES, or None when any coordinate is missing or
    strand_R1 is neither '+' nor '-'.

    The distance depends on the strand orientation:
    - For '-' strand, the distance is gene_start minus stop_R2.
    - For '+' strand, the distance is stop_R2 minus gene_stop.
    """
    read_end = _as_int_or_none(stop_R2)
    start = _as_int_or_none(gene_start)
    stop = _as_int_or_none(gene_stop)

    # All three coordinates are required, whichever strand is used.
    if read_end is None or start is None or stop is None:
        return None

    if strand_R1 == '-':
        return start - read_end
    if strand_R1 == '+':
        return read_end - stop
    # Unknown strand: no meaningful distance can be reported.
    return None

In [None]:
# Read the tab-separated table of reads into df (the file has no header row,
# so column names are supplied here)
df = pd.read_csv('output_Spombe/output_short.tab', sep = '\t',
                 names = ['read_ID', 'chr', 'start_R1', 'stop_R1','strand_R1', 'gene_start', 'gene_stop',
                         'gene','coord_R2', 'cigar', 'seq_R2'])

## Coerce gene coordinates to numbers BEFORE they are used; values that cannot
## be parsed become NaN instead of crashing the distance calculation below
df['gene_start'] = pd.to_numeric(df['gene_start'], errors='coerce')
df['gene_stop'] = pd.to_numeric(df['gene_stop'], errors='coerce')

## Add new column including 3'RNA tails based on CIGAR code
df['tail_fromcigar'] = df.apply(
    lambda kol: gw.take_tail_fromcigar8(kol.strand_R1, kol.cigar, kol.seq_R2),
    axis=1,
)

## Add new column including 3'RNA tails using grep (only unmapped reads R2);
## rows whose seq_R2 is not a string get an empty tail
df['tail_fromGREP'] = df.apply(
    lambda kol: gw.grep_tail_edit_onlyfromSeq(kol.seq_R2) if isinstance(kol.seq_R2, str) else '',
    axis=1,
)

## Add new column including 3'RNA tail sequences based on CIGAR or grep
df['tail_GreporCigar'] = df.apply(
    lambda kol: gw.tail_fromGREPorCIGAR(kol.cigar, kol.tail_fromGREP, kol.tail_fromcigar),
    axis=1,
)

## Add new column with information 'grep' or 'cigar'
df['tail_from'] = df.apply(
    lambda kol: gw.tail_fromGREPorCIGAR_description(kol.cigar, kol.tail_fromGREP, kol.tail_fromcigar),
    axis=1,
)

## Calculate tail length
df['tail_len'] = df['tail_GreporCigar'].apply(len)

## Add new column with information about tail type
df['tail_type'] = df.apply(
    lambda kol: gw.test_tail_cigargrep8(kol.tail_from, kol.strand_R1, kol.tail_GreporCigar),
    axis=1,
)

## Add column with coordinate of 3'end of R2 reads (None when there is no CIGAR string)
df['stop_R2'] = df.apply(
    lambda kol: gw.stop_based_on_cigar(kol.cigar, kol.strand_R1, kol.coord_R2)
    if isinstance(kol.cigar, str) else None,
    axis=1,
)

## Calculate distance to transcription end site (TES) based on the 3'end of R2.
## pandas stores missing numbers as NaN, so NaN is mapped back to None here:
## safe_distance_to_TES treats None as "no value".
df['distance_to_TES'] = df.apply(
    lambda kol: safe_distance_to_TES(
        kol.cigar,
        kol.strand_R1,
        None if pd.isna(kol.stop_R2) else kol.stop_R2,
        None if pd.isna(kol.gene_start) else kol.gene_start,
        None if pd.isna(kol.gene_stop) else kol.gene_stop,
    ),
    axis=1,
)

## Calculate length of gene sequence (NaN wherever either coordinate is missing)
df['gene_len'] = df['gene_stop'] - df['gene_start']

## Calculate relative distance to TES
df['rel_distance_to_TES'] = df['distance_to_TES']/df['gene_len']

## Drop some columns to reduce the size of the table
df = df.drop(columns= [ 'chr', 'start_R1', 'stop_R1', 'strand_R1', 'gene_start',
       'gene_stop',  'coord_R2', 'cigar', 'seq_R2', 'tail_fromcigar',
       'tail_fromGREP', 'stop_R2', 'gene_len'])
    

In [None]:
## Assign RNA type - it will work only for the S. pombe genome
## Missing gene names are NaN in pandas (not None), so pd.notna is the check that
## actually skips them; those rows get None instead of being classified as 'nan'.

df['RNA_type'] = df['gene'].apply(lambda x: gw.test_RNA_type(str(x).lower()) if pd.notna(x) else None)

In [None]:
## Write the annotated table to a CSV file, leaving out the row index
df.to_csv(
    path_or_buf='output_Spombe/output_detailed_DataFrame.csv',
    index=False,
)