In [155]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import scipy.stats as sps

### Определение изоформ с мутациями на 5'-концах
* Считаю все изоформы, которые определил isomirmap
* Возьму микроРНК, которые являются референсом для данного изомира (mirDeep)

In [156]:
def path(name, type_):
    return "~ksuma/isomirs/" + name + "_isomir/output-IsoMiRmap_v5-"+type_+"-isomiRs.expression.txt"

def get_table(name, type_, dig):
    ''' Reading the file '''
    df = pd.read_fwf(path(name, type_), delimiter = '/t')
    df = df['## Table of '+type_+'-isomiRs.'].str.split('\t', expand = True)
    df.columns = df.loc[dig]
    
    ''' Deleting tandem repeats (in case of exclusive it is not neccessary)'''
    df = df.where(df[df.columns[-1]] == '').dropna()
    df = df.rename(columns = {df.columns[-1] :'Repeated'})
    df = df.rename(columns = {'Mature meta-data (bracket delimited per hairpin)' :'Mature'})
    
    df.index = 1 + np.arange(0, len(df))
    df['RPM '+name] = pd.to_numeric(df['RPM**'])
    return df[['License Plate', 'IsomiR sequence', 'RPM ' + name, 
               'Mature']]

def get_type(name, table, typ):
    table[name+' type'] = [typ if table['RPM '+name].loc[i] else 0 for i in range(table.shape[0])]
    return table

In [157]:
ELOV5 = get_table("ELOV5", "exclusive", 5)
IGFBP6 = get_table("IGFBP6", "exclusive", 5)
LUC = get_table("LUC", "exclusive", 5)

In [158]:
IGFBP6

5,License Plate,IsomiR sequence,RPM IGFBP6,Mature
1,iso-22-BRS28UEYP,AACCCGTAGATCCGAACTTGTG,45233.22,"[MIMAT0000098&hsa-miR-100-5p&offsets|0|0, m-27..."
2,iso-23-VIV6OYINDR,TAGCTTATCAGACTGATGTTGAC,12214.16,"[MIMAT0000076&hsa-miR-21-5p&offsets|0|+1, m-1&..."
3,iso-22-XKVL7YXYQ,TGAGGTAGTAGTTTGTGCTGTT,10543.66,"[MIMAT0000415&hsa-let-7i-5p&offsets|0|0, m-49&..."
4,iso-21-BRS28UEYE,AACCCGTAGATCCGAACTTGT,6761.78,"[MIMAT0000098&hsa-miR-100-5p&offsets|0|-1, m-2..."
5,iso-22-BRS28UEYO,AACCCGTAGATCCGAACTTGTA,5719.47,[MIMAT0000098&hsa-miR-100-5p&offsets|0|-1(+1A)...
...,...,...,...,...
15797,iso-24-ZZJXZI3U1O,TTTTTCATTATTGCTCCTGACCTA,0.03,[MIMAT0004703&hsa-miR-335-3p&offsets|0|+1(+1A)...
15798,iso-21-ZZJXZI3VD,TTTTTCATTATTGCTCCTGCC,0.03,[MIMAT0004703&hsa-miR-335-3p&offsets|0|-3(+2C)...
15799,iso-20-ZZJXZI38,TTTTTCATTATTGCTCCTGG,0.03,[MIMAT0004703&hsa-miR-335-3p&offsets|0|-3(+1G)...
15800,iso-23-ZZ3RL94DDX,TTTTTCCTCCCGCTCCTAACGGA,0.03,[m-9124&15|-|44427218|44427239&offsets|-1|0]


In [159]:
def mir(x):
    try:
        for i in x.split('], ['):
            i = i.strip('[]').split(', ')[0]
            i = re.sub(r'&offsets','', i)
            k = i.split('&')[1]
        if 'hsa-' in k:
            return k
    except:
        pass

IGFBP6['IsomiR'] = IGFBP6['Mature'].apply(mir)
LUC['IsomiR'] = LUC['Mature'].apply(mir)
ELOV5['IsomiR'] = ELOV5['Mature'].apply(mir)

In [160]:
def refer(x):
    try:
        x = x.split('&')[1]
        if 'hsa-' in x:
            return x
    except:
        pass

IGFBP6['Mature'] = IGFBP6['Mature'].apply(refer)
LUC['Mature'] = LUC['Mature'].apply(refer)
ELOV5['Mature'] = ELOV5['Mature'].apply(refer)

In [161]:
IGFBP6 = IGFBP6.dropna()
LUC = LUC.dropna()
ELOV5 = ELOV5.dropna()

#### Например, метка «hsa-miR-142-5p|-2|-3» основана на обозначениях miRBase и относится к изоформе miR-142-5p, 5'-конец которой начинается на 2 нуклеотида (нт) выше 5'-конец и 3'-конец эталона miRBase заканчиваются на 3 нуклеотида выше по течению от 3'-конца эталона miRBase.

In [162]:
from pandas.core.common import SettingWithCopyWarning
import warnings
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [163]:
IGFBP6_5p = IGFBP6[['Mature', 'IsomiR', 'RPM IGFBP6']]
LUC_5p = LUC[['Mature', 'IsomiR', 'RPM LUC']]
ELOV5_5p = ELOV5[['Mature', 'IsomiR', 'RPM ELOV5']]

In [164]:
def mut_5end(x):
    if re.search('p\|\+\d', x) or re.search('p\|\-\d', x):
        return 1
    return 0

def canonical(x):
    if re.search('p\|0\|', x):
        return 1
    return 0

In [165]:
# поиск изоформ с 5'-изменениями
IGFBP6_5p['5_mutation'] = IGFBP6_5p.loc[:, 'IsomiR'].apply(mut_5end)
LUC_5p['5_mutation'] = LUC_5p.loc[:, 'IsomiR'].apply(mut_5end)
ELOV5_5p['5_mutation'] = ELOV5_5p.loc[:, 'IsomiR'].apply(mut_5end)

In [166]:
# определение экспрессии канонической формы
IGFBP6_5p['canonical'] = IGFBP6_5p.loc[:, 'IsomiR'].apply(canonical) * IGFBP6_5p['RPM IGFBP6']
LUC_5p['canonical'] = LUC_5p.loc[:, 'IsomiR'].apply(canonical) * LUC_5p['RPM LUC']
ELOV5_5p['canonical'] = ELOV5_5p.loc[:, 'IsomiR'].apply(canonical) * ELOV5_5p['RPM ELOV5']

In [175]:
# Суммы канонических форм
canon_sums = ELOV5_5p.loc[ELOV5_5p['5_mutation'] == 0].groupby("Mature").sum()
canon_sums = pd.DataFrame([canon_sums.index, canon_sums['RPM ELOV5']],
                           index = ['Mature', 'Expression_canon']).T

# поиск максимально экспресированной неканонической формы
max_noncanon = ELOV5_5p.loc[ELOV5_5p['5_mutation'] == 1].groupby("Mature").max()
max_noncanon = max_noncanon.rename(columns = {'RPM ELOV5': 'Max expression'})
max_noncanon = pd.DataFrame([max_noncanon.index, max_noncanon['Max expression']],
                           index = ['Mature', 'Max_non_canon']).T

# поиск сумм экспрессий неканонических форм
expr_noncanon = ELOV5_5p.loc[ELOV5_5p['5_mutation'] == 1].groupby("Mature").sum()
expr_noncanon = pd.DataFrame([expr_noncanon.index, expr_noncanon['RPM ELOV5']],
                            index = ['Mature', 'Expression_non_canon']).T

In [176]:
pd.concat([expr_noncanon, max_noncanon, canon_sums], axis = 0)

Unnamed: 0,Mature,Expression_non_canon,Max_non_canon,Expression_canon
0,hsa-let-7a-2-3p,2.63,,
1,hsa-let-7a-3p,4.73,,
2,hsa-let-7a-5p,22.21,,
3,hsa-let-7b-3p,0.26,,
4,hsa-let-7b-5p,0.13,,
...,...,...,...,...
663,hsa-miR-9903,,,0.26
664,hsa-miR-99a-3p,,,0.39
665,hsa-miR-99a-5p,,,822.32
666,hsa-miR-99b-3p,,,168.09
