In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import scipy.stats as sps

In [2]:
import warnings

# SettingWithCopyWarning lived in pandas.core.common only in old pandas
# releases; it was later exposed from pandas.errors and may be absent
# altogether in newer versions. Try each location so this cell survives
# "Restart Kernel -> Run All" regardless of the installed pandas.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

# Only install the filter when the warning class actually exists.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [14]:
def path(name, type_, base_dir="~ksuma/isomirs"):
    """Build the path of an IsoMiRmap isomiR expression table.

    Parameters
    ----------
    name : str
        Name used for the ``<name>_isomir`` directory.
    type_ : str
        Table type inserted into the file name (the notebook uses "exclusive").
    base_dir : str, optional
        Directory holding the ``<name>_isomir`` folders. Defaults to the
        location originally hard-coded in this notebook, so existing calls
        are unaffected.

    Returns
    -------
    str
        ``<base_dir>/<name>_isomir/output-IsoMiRmap_v5-<type_>-isomiRs.expression.txt``
    """
    # Tolerate a trailing slash on base_dir so the result never contains "//".
    root = base_dir.rstrip("/")
    return f"{root}/{name}_isomir/output-IsoMiRmap_v5-{type_}-isomiRs.expression.txt"

def get_table(name, type_, dig):
    ''' Read one isomiR expression table and return its sequence, RPM and 'Mature' columns.

    name  -- passed to path(); also used to name the output column 'RPM <name>'.
    type_ -- passed to path() and used to build the header '## Table of <type_>-isomiRs.'.
    dig   -- index label of the row whose cells become the column names.

    Returns a DataFrame with the columns 'IsomiR sequence', 'RPM <name>' and
    'Mature', indexed 1..len(df).
    '''
    # NOTE(review): '/t' looks like a typo for a tab character. It is left
    # untouched because the next line expects the file to arrive as a single
    # column (named by the file's first line) whose cells still contain tabs --
    # confirm how read_fwf behaves on the real file before changing it.
    df = pd.read_fwf(path(name, type_), delimiter = '/t')
    # Split that single text column on tabs into the actual table columns.
    df = df['## Table of '+type_+'-isomiRs.'].str.split('\t', expand = True)
    # Use the cells of row `dig` as column names.
    df.columns = df.loc[dig]
    
    ''' Deleting tandem repeats (in case of exclusive it is not neccessary)'''
    # Keep only rows whose last column is the empty string: where() blanks every
    # other row to NaN and dropna() removes them, together with any row that has
    # a missing cell.
    df = df.where(df[df.columns[-1]] == '').dropna()
    df = df.rename(columns = {df.columns[-1] :'Repeated'})
    df = df.rename(columns = {'Mature meta-data (bracket delimited per hairpin)' :'Mature'})
    
    # Renumber the remaining rows starting from 1 (not 0).
    df.index = 1 + np.arange(0, len(df))
    # The 'RPM**' cells are strings after the split; convert them to numbers.
    df['RPM '+name] = pd.to_numeric(df['RPM**'])
    return df[['IsomiR sequence', 'RPM ' + name, 'Mature']]

def get_type(name, table, typ):
    """Add a '<name> type' column marking rows with non-zero RPM.

    Parameters
    ----------
    name : str
        Sample name; the function reads 'RPM <name>' and writes '<name> type'.
    table : pandas.DataFrame
        Frame containing the 'RPM <name>' column. It is modified in place.
    typ : object
        Label stored for rows whose RPM value is truthy; other rows get 0.

    Returns
    -------
    pandas.DataFrame
        The same ``table`` object, with the new column added.
    """
    # Walk the values positionally instead of looking up labels 0..n-1 with
    # .loc: get_table() numbers rows from 1, so label 0 does not exist there
    # and any non-default index would raise KeyError or pick the wrong row.
    table[name + ' type'] = [typ if rpm else 0 for rpm in table['RPM ' + name]]
    return table

In [4]:
# Determine the reference microRNA (extract its name).
def refer(x):
    """Return the second '&'-separated field of ``x`` if it names a human miRNA.

    Parameters
    ----------
    x : object
        Normally the 'Mature' meta-data string of one table row.

    Returns
    -------
    str or None
        The text between the first and second '&' when it contains 'hsa-';
        ``None`` when ``x`` is not a string, has no '&', or the field does
        not contain 'hsa-'.
    """
    # Explicit guards replace the former bare ``except: pass`` so that
    # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
    if not isinstance(x, str):
        return None
    fields = x.split('&')
    if len(fields) < 2:
        return None
    candidate = fields[1]
    return candidate if 'hsa-' in candidate else None

In [5]:
# Extract the isoform (isomiR) name.
def mir(x):
    """Return the isomiR name taken from the last bracket group of ``x``.

    ``x`` is split on '], [' into bracket groups. For each group the first
    ', '-separated entry is taken, the literal '&offsets' is removed, and the
    second '&'-separated field is kept.

    Parameters
    ----------
    x : object
        Normally the 'Mature' meta-data string of one table row.

    Returns
    -------
    str or None
        The field extracted from the LAST group when it contains 'hsa-';
        ``None`` when ``x`` is not a string, when any group lacks an '&', or
        when the field does not contain 'hsa-'.
    """
    # Explicit guards replace the former bare ``except: pass`` so that
    # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
    if not isinstance(x, str):
        return None

    name = None
    for group in x.split('], ['):
        first_entry = group.strip('[]').split(', ')[0]
        fields = first_entry.replace('&offsets', '').split('&')
        if len(fields) < 2:
            # A malformed group anywhere invalidates the whole value.
            return None
        name = fields[1]

    # NOTE(review): only the last group decides the result (the check sits
    # after the loop). Confirm that "last group" rather than "first match"
    # is the intended rule.
    if name is not None and 'hsa-' in name:
        return name
    return None

In [6]:
# Find isoforms with a post-transcriptionally added nucleotide.
def find_add(x):
    """Classify an isomiR name by its '|0|' marker and by parentheses.

    Parameters
    ----------
    x : str
        IsomiR name as produced by ``mir``.

    Returns
    -------
    int
        -1 if ``x`` contains '|0|' (3' isoform),
         1 if it has no '|0|' but contains '(' or ')' (5' isoform with (+1N)),
         0 otherwise (5' isoform without (+1N)).
    """
    # Plain substring tests replace re.search("\(|\)", x): the old non-raw
    # pattern relied on invalid escape sequences, which newer Python versions
    # warn about.
    if '|0|' in x:
        return -1          # 3'
    if '(' in x or ')' in x:
        return 1           # 5' with (+1N)
    return 0               # 5' without (+1N)

In [7]:
def post_transcriptional(name):
    """Summarise 5'-isomiR expression per reference miRNA for one input.

    Loads the "exclusive" isomiR table for ``name`` (column names taken from
    row 5), derives the isomiR name and the reference miRNA name from the
    'Mature' meta-data, drops rows where either is missing, and aggregates
    the 'RPM <name>' column per reference miRNA.

    Parameters
    ----------
    name : str
        Name forwarded to ``get_table``; also selects the 'RPM <name>' column.

    Returns
    -------
    pandas.DataFrame
        One row per reference miRNA with the columns 'Mature',
        "sum 5'-isomiR expr", "sum add_5'-isomiR expr",
        "max 5'-isomiR expr", "max add_5'-isomiR expr" and 'total'.
        Groups without any matching isomiR hold 0. The numeric columns keep
        a numeric dtype.
    """
    rpm = 'RPM ' + name

    # Work on explicit copies so the column assignments below never write
    # into a view of another frame.
    factor = get_table(name, "exclusive", 5).copy()
    factor['isomiR'] = factor['Mature'].apply(mir)
    factor['Mature'] = factor['Mature'].apply(refer)
    factor = factor.dropna().copy()

    # 1 for 5'-isoforms (name lacks '|0|'), 0 otherwise.
    factor['binary_5'] = factor['isomiR'].apply(lambda i: 0 if '|0|' in i else 1)

    # Find 5'-isoforms with a post-transcriptionally added nucleotide.
    factor['add'] = factor['isomiR'].apply(find_add)

    # Select the RPM column before aggregating so the string columns are not
    # summed / maximised only to be thrown away.
    grouped_5 = factor.loc[factor['binary_5'] == 1].groupby("Mature")[rpm]
    grouped_add = factor.loc[factor['add'] == 1].groupby("Mature")[rpm]
    grouped_all = factor.groupby("Mature")[rpm]

    # Total and maximum expression among 5'-isoforms, the same two figures
    # among 5'-isoforms with non-templated added nucleotides, and the total
    # over all isoforms. sort=False keeps the row order of the unsorted
    # index union.
    table = pd.concat(
        [grouped_5.sum(), grouped_add.sum(),
         grouped_5.max(), grouped_add.max(),
         grouped_all.sum()],
        axis=1,
        keys=["sum 5'-isomiR expr", "sum add_5'-isomiR expr",
              "max 5'-isomiR expr", "max add_5'-isomiR expr", 'total'],
        sort=False,
    ).fillna(0)

    # Expose the reference miRNA name as an ordinary 'Mature' column with a
    # fresh 0..n-1 row index.
    return table.rename_axis('Mature').reset_index()

In [8]:
# Build the per-reference-miRNA 5'-isomiR summary for each of the three inputs;
# each name selects the "<name>_isomir" directory the table is read from.
ELOV5 = post_transcriptional("ELOV5")
LUC = post_transcriptional("LUC")
IGFBP6 = post_transcriptional("IGFBP6")

In [9]:
# Show the ELOV5 summary ordered by the summed expression of 5'-isomiRs with added nucleotides, highest first.
ELOV5.sort_values(by = "sum add_5'-isomiR expr", ascending = False)

Unnamed: 0,Mature,sum 5'-isomiR expr,sum add_5'-isomiR expr,max 5'-isomiR expr,max add_5'-isomiR expr,total
159,hsa-miR-29a-3p,921.38,173.33,646.23,88.78,2440.51
31,hsa-miR-10a-5p,306.36,73.76,149.5,25.69,897.78
61,hsa-miR-1307-3p,188.33,59.12,86.54,19.23,824.25
72,hsa-miR-140-3p,180.85,57.3,52.95,27.27,275.28
234,hsa-miR-423-3p,234.15,56.73,117.5,14.62,1776.97
...,...,...,...,...,...,...
426,hsa-let-7f-2-3p,0.0,0.0,0.0,0.0,0.92
4,hsa-let-7b-5p,0.13,0.0,0.13,0.0,969.44
5,hsa-let-7c-3p,0.13,0.0,0.13,0.0,2.1
10,hsa-let-7e-5p,0.13,0.0,0.13,0.0,716.83


In [10]:
# Show the LUC summary ordered by the summed expression of 5'-isomiRs with added nucleotides, highest first.
LUC.sort_values(by = "sum add_5'-isomiR expr", ascending = False)

Unnamed: 0,Mature,sum 5'-isomiR expr,sum add_5'-isomiR expr,max 5'-isomiR expr,max add_5'-isomiR expr,total
203,hsa-miR-29a-3p,804.16,169.76,559.6,70.48,1890.64
34,hsa-miR-10a-5p,325.84,75.91,170.7,27.04,942.6
89,hsa-miR-140-3p,182.87,67.88,52.49,30.78,278.39
318,hsa-miR-423-3p,221.02,60.05,105.67,12.25,1595.54
217,hsa-miR-30a-3p,129.58,59.48,47.01,38.08,562.09
...,...,...,...,...,...,...
19,hsa-miR-10392-5p,0.06,0.0,0.03,0.0,0.06
639,hsa-miR-944,0.06,0.0,0.03,0.0,0.09
35,hsa-miR-10b-3p,0.03,0.0,0.03,0.0,0.03
38,hsa-miR-11401,0.03,0.0,0.03,0.0,0.35


In [11]:
# Show the IGFBP6 summary ordered by the summed expression of 5'-isomiRs with added nucleotides, highest first.
IGFBP6.sort_values(by = "sum add_5'-isomiR expr", ascending = False)

Unnamed: 0,Mature,sum 5'-isomiR expr,sum add_5'-isomiR expr,max 5'-isomiR expr,max add_5'-isomiR expr,total
196,hsa-miR-29a-3p,1125.96,200.29,799.14,98.58,2584.98
75,hsa-miR-1307-3p,326.98,96.18,155.75,30.97,1316.56
209,hsa-miR-30a-3p,99.87,45.95,35.39,31.85,359.67
308,hsa-miR-423-3p,194.47,44.74,94.64,12.32,1382.12
37,hsa-miR-10a-5p,158.05,33.61,82.32,12.62,515.72
...,...,...,...,...,...,...
634,hsa-miR-1277-3p,0.0,0.0,0.0,0.0,0.03
635,hsa-miR-128-1-5p,0.0,0.0,0.0,0.0,0.98
636,hsa-miR-1285-3p,0.0,0.0,0.0,0.0,0.03
637,hsa-miR-1287-3p,0.0,0.0,0.0,0.0,0.09
