In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import scipy.stats as sps

In [2]:
import warnings

# Silence pandas' SettingWithCopyWarning for the whole notebook.
# Newer pandas releases expose the class in pandas.errors and no longer provide
# it from pandas.core.common, so try the public location first and fall back to
# the legacy one. If neither import works the warning class is unavailable in
# this pandas build and there is nothing to filter.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [3]:
def path(name, type_):
    """Build the location of one IsoMiRmap expression table.

    Parameters
    ----------
    name : str
        Identifier placed in front of ``_isomir`` in the directory component.
    type_ : str
        Table flavour inserted into the file name (this notebook passes
        ``"exclusive"``).

    Returns
    -------
    str
        The assembled path. The leading ``~ksuma`` is returned as-is; no
        home-directory expansion is performed here.
    """
    pieces = (
        "~ksuma/isomirs/",
        name,
        "_isomir/output-IsoMiRmap_v5-",
        type_,
        "-isomiRs.expression.txt",
    )
    return "".join(pieces)

def get_table(name, type_, dig):
    """Load one IsoMiRmap expression table and reduce it to three columns.

    Parameters
    ----------
    name : str
        Identifier forwarded to ``path`` and used in the ``"RPM <name>"``
        column label.
    type_ : str
        Table flavour; forwarded to ``path`` and used to find the single text
        column named ``"## Table of <type_>-isomiRs."``.
    dig : hashable
        Row label of the tab-split table whose values become the column names.

    Returns
    -------
    pandas.DataFrame
        Columns ``'IsomiR sequence'``, ``'RPM <name>'`` (numeric) and
        ``'Mature'``, indexed 1..n.
    """
    # The file is read as one text column, which is then split on tabs.
    # NOTE(review): delimiter='/t' looks like a typo for '\t', but the parse
    # below may rely on the resulting single-column read -- confirm before
    # changing it.
    raw = pd.read_fwf(path(name, type_), delimiter = '/t')
    cells = raw['## Table of '+type_+'-isomiRs.'].str.split('\t', expand = True)
    cells.columns = cells.loc[dig]

    # Keep only rows whose last field is the empty string; every other row is
    # masked to NaN and dropped. Per the original author's note this removes
    # tandem repeats (not necessary for the "exclusive" table).
    last_field_is_empty = cells[cells.columns[-1]] == ''
    kept = cells.where(last_field_is_empty).dropna()
    kept = kept.rename(columns = {kept.columns[-1] :'Repeated'})
    kept = kept.rename(columns = {'Mature meta-data (bracket delimited per hairpin)' :'Mature'})

    # Renumber the surviving rows starting from 1.
    kept.index = np.arange(1, len(kept) + 1)

    rpm_column = 'RPM ' + name
    kept[rpm_column] = pd.to_numeric(kept['RPM**'])
    return kept[['IsomiR sequence', rpm_column, 'Mature']]

def get_type(name, table, typ):
    """Add a ``"<name> type"`` column flagging rows with a truthy RPM value.

    Parameters
    ----------
    name : str
        Identifier; the function reads ``"RPM <name>"`` and writes
        ``"<name> type"``.
    table : pandas.DataFrame
        Frame containing the ``"RPM <name>"`` column. It is modified in place.
    typ : object
        Value stored for rows whose RPM is truthy (non-zero); other rows get 0.

    Returns
    -------
    pandas.DataFrame
        The same ``table`` object, with the new column added.
    """
    rpm_values = table['RPM ' + name]
    # Walk the values positionally. Looking rows up with .loc[0..n-1] treated
    # the positions as index labels, which raised KeyError on the 1-based index
    # produced by get_table and misaligned rows on any reordered index.
    table[name + ' type'] = [typ if value else 0 for value in rpm_values]
    return table

In [4]:
# Determine the reference microRNA (extract its name)
def refer(x):
    """Return the second ``&``-separated field of ``x`` if it names an hsa miRNA.

    Parameters
    ----------
    x : object
        Expected to be a string with ``&``-separated fields. Non-string values
        (e.g. ``None`` or NaN) are accepted and yield ``None``.

    Returns
    -------
    str or None
        The second field when it contains ``'hsa-'``; ``None`` when ``x`` is
        not a string, has no second field, or that field lacks ``'hsa-'``.
    """
    # Explicit checks replace the former bare `except: pass`, which also hid
    # unrelated errors (and KeyboardInterrupt).
    if not isinstance(x, str):
        return None
    fields = x.split('&')
    if len(fields) < 2:
        return None
    candidate = fields[1]
    return candidate if 'hsa-' in candidate else None

In [5]:
# Extract the isoform (isomiR) name
def mir(x):
    """Extract an isoform label from a bracket-delimited metadata string.

    ``x`` is split on ``'], ['`` into entries. For each entry the surrounding
    brackets are stripped, only the text before the first ``', '`` is kept, the
    literal ``'&offsets'`` is removed, and the second ``&``-separated field is
    taken as the label.

    Parameters
    ----------
    x : object
        Expected to be a string; non-string values (e.g. ``None`` or NaN)
        yield ``None``.

    Returns
    -------
    str or None
        The label of the LAST entry if it contains ``'hsa-'``. ``None`` when
        ``x`` is not a string, when any entry has no second ``&`` field, or
        when the last label lacks ``'hsa-'``.
    """
    # Explicit checks replace the former bare `except: pass`, which would also
    # have hidden e.g. a missing `re` import by returning None for every row.
    if not isinstance(x, str):
        return None

    label = None
    for entry in x.split('], ['):
        head = entry.strip('[]').split(', ')[0]
        fields = re.sub(r'&offsets', '', head).split('&')
        if len(fields) < 2:
            # A malformed entry anywhere makes the whole value unusable.
            return None
        label = fields[1]

    # NOTE(review): only the label of the last entry is tested and returned,
    # exactly as before. If the intent was "first entry containing 'hsa-'",
    # this check belongs inside the loop -- confirm against the data.
    if label is not None and 'hsa-' in label:
        return label
    return None

In [19]:
def find_5_isomiRs(name):
    """Summarise 5'-isomiR expression per reference miRNA for one sample.

    Loads the "exclusive" table for ``name`` via ``get_table(name, "exclusive", 5)``,
    labels every row with its isoform name (``mir``) and its reference miRNA
    (``refer``), drops rows where either is missing, and aggregates the
    ``"RPM <name>"`` column per reference miRNA.

    Parameters
    ----------
    name : str
        Sample identifier forwarded to ``get_table``.

    Returns
    -------
    pandas.DataFrame
        One row per reference miRNA with columns ``"max 5'-isomiR expr"``,
        ``"sum 5'-isomiR expr"`` and ``'total'``. Missing values (miRNAs with
        no 5'-isomiR rows) are filled with 0.
    """
    rpm_column = 'RPM ' + name
    factor = get_table(name, "exclusive", 5)

    # Extract the isoform label for every row. This must happen before
    # 'Mature' is overwritten below, because mir() parses the original text.
    factor['isomiR'] = factor['Mature'].apply(mir)

    # Replace the raw metadata with the reference miRNA name, then drop rows
    # for which either label could not be extracted.
    factor['Mature'] = factor['Mature'].apply(refer)
    factor = factor.dropna()

    # Binary flag: 0 = canonical (label contains '|0|'), 1 = 5'-isomiR.
    factor['binary_5'] = factor['isomiR'].apply(lambda i: 0 if '|0|' in i else 1)

    # Select the RPM column before aggregating so the string columns are not
    # summed/maxed as well; the resulting Series are the same as before.
    five_prime_rpm = factor.loc[factor['binary_5'] == 1].groupby("Mature")[rpm_column]

    # Summed expression of the 5'-isomiRs belonging to each miRNA.
    expression_5 = five_prime_rpm.sum()

    # Total expression of each miRNA (all isoforms).
    expression_all = factor.groupby("Mature")[rpm_column].sum()

    # Highest expression among the 5'-isomiRs of each miRNA.
    max_expression_5 = five_prime_rpm.max()

    table = pd.DataFrame([max_expression_5, expression_5, expression_all],
             index = ["max 5'-isomiR expr", "sum 5'-isomiR expr", 'total']).T
    table = table.fillna(0)
    return table

In [20]:
# Build the per-miRNA 5'-isomiR summary table for each of the three samples.
# Each call reads that sample's "exclusive" expression file via get_table.
ELOV5 = find_5_isomiRs("ELOV5")
LUC = find_5_isomiRs("LUC")
IGFBP6 = find_5_isomiRs("IGFBP6")

In [22]:
# ELOV5 summary, highest total expression first.
ELOV5.sort_values('total', ascending=False)

Unnamed: 0_level_0,max 5'-isomiR expr,sum 5'-isomiR expr,total
Mature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hsa-miR-100-5p,25.55,51.88,41197.52
hsa-miR-21-5p,15.15,84.02,28149.42
hsa-let-7i-5p,3.56,7.90,11298.13
hsa-let-7a-5p,4.87,22.21,8326.06
hsa-miR-221-3p,0.53,2.10,7407.60
...,...,...,...
hsa-miR-4797-3p,0.00,0.00,0.13
hsa-miR-4731-5p,0.13,0.13,0.13
hsa-miR-488-3p,0.00,0.00,0.13
hsa-miR-498-5p,0.00,0.00,0.13


In [23]:
# LUC summary, highest total expression first.
LUC.sort_values('total', ascending=False)

Unnamed: 0_level_0,max 5'-isomiR expr,sum 5'-isomiR expr,total
Mature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hsa-miR-100-5p,33.67,57.58,46316.95
hsa-miR-21-5p,14.27,75.26,23045.40
hsa-let-7i-5p,2.74,7.48,11085.70
hsa-miR-222-3p,4.18,26.81,7990.74
hsa-let-7a-5p,6.80,22.25,7635.39
...,...,...,...
hsa-miR-4511,0.00,0.00,0.03
hsa-miR-4515,0.00,0.00,0.03
hsa-miR-4655-5p,0.00,0.00,0.03
hsa-miR-4658,0.00,0.00,0.03


In [24]:
# IGFBP6 summary, highest total expression first.
IGFBP6.sort_values('total', ascending=False)

Unnamed: 0_level_0,max 5'-isomiR expr,sum 5'-isomiR expr,total
Mature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
hsa-miR-100-5p,52.47,90.69,65352.13
hsa-miR-21-5p,18.89,92.41,20014.55
hsa-let-7i-5p,4.02,10.66,13408.19
hsa-let-7a-5p,5.68,22.54,7643.47
hsa-let-7f-5p,3.06,7.87,4572.20
...,...,...,...
hsa-miR-5581-5p,0.00,0.00,0.03
hsa-miR-6822-5p,0.03,0.03,0.03
hsa-miR-6821-5p,0.03,0.03,0.03
hsa-miR-4660,0.00,0.00,0.03
