# Setup

In [1]:
!pip install pyfaidx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyfaidx
  Downloading pyfaidx-0.7.1.tar.gz (103 kB)
[K     |████████████████████████████████| 103 kB 4.4 MB/s 
Building wheels for collected packages: pyfaidx
  Building wheel for pyfaidx (setup.py) ... [?25l[?25hdone
  Created wheel for pyfaidx: filename=pyfaidx-0.7.1-py3-none-any.whl size=27747 sha256=7a2412e8aea8603c092b6028bdd755188c966ee99a9dd11d8a8ef39e2e591f77
  Stored in directory: /root/.cache/pip/wheels/1a/d6/99/7334c4d11bfb574e6d6ea706256053b268a12f2127af1cfd40
Successfully built pyfaidx
Installing collected packages: pyfaidx
Successfully installed pyfaidx-0.7.1


In [2]:
import pandas as pd
from pyfaidx import Fasta

# Downloading raw data

Downloading original data from [Helwak et al. (2013)](https://www.sciencedirect.com/science/article/pii/S009286741300439X?via%3Dihub) - Supplementary Data S1.

In [3]:
!wget https://www.cell.com/cms/10.1016/j.cell.2013.03.043/attachment/a32ed39a-296d-47ef-bb68-6e913f08387c/mmc1.txt

--2022-10-30 10:02:11--  https://www.cell.com/cms/10.1016/j.cell.2013.03.043/attachment/a32ed39a-296d-47ef-bb68-6e913f08387c/mmc1.txt
Resolving www.cell.com (www.cell.com)... 104.18.123.114, 104.18.124.114
Connecting to www.cell.com (www.cell.com)|104.18.123.114|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4635394 (4.4M) [application/octet-stream]
Saving to: ‘mmc1.txt’


2022-10-30 10:02:12 (31.8 MB/s) - ‘mmc1.txt’ saved [4635394/4635394]



Downloading mRNA database from the Hyb pipeline ([Travis et al. (2014))](https://www.sciencedirect.com/science/article/pii/S1046202313004180?via%3Dihub). It will be used later to enlarge mRNA sequences that are shorter than 50bp.

In [4]:
!wget https://github.com/gkudla/hyb/raw/master/data/db/hOH7.fasta.gz
!gzip -d hOH7.fasta.gz

--2022-10-30 10:02:12--  https://github.com/gkudla/hyb/raw/master/data/db/hOH7.fasta.gz
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/gkudla/hyb/master/data/db/hOH7.fasta.gz [following]
--2022-10-30 10:02:13--  https://raw.githubusercontent.com/gkudla/hyb/master/data/db/hOH7.fasta.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35354165 (34M) [application/octet-stream]
Saving to: ‘hOH7.fasta.gz’


2022-10-30 10:02:16 (150 MB/s) - ‘hOH7.fasta.gz’ saved [35354165/35354165]



# Loading the data

In [5]:
df = pd.read_csv("mmc1.txt", sep='\t', skiprows=30)
df

Unnamed: 0,seq_ID,microRNA_name,miRNA_start,miRNA_end,miRNA_seq,mRNA_name,mRNA_start,mRNA_end_extended,mRNA_seq_extended,chimeras_decompressed,...,folding_energy,5'UTR,CDS,3'UTR,folding_class,conservation_score,log2_target_enrichment,CLASH_single_reads_ovlp,CLASH_cluster_ovlp,PAR_CLIP_cluster_ovlp
0,0727A-1038930_1,MIMAT0000062_MirBase_let-7a_microRNA,1,22,TGAGGTAGTAGGTTGTATAGTT,ENSG00000113328_ENST00000340828_CCNG1_mRNA,1791,1890,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,31,...,-25.1,,,1.0,III,0.210342,-0.020802,270.0,,
1,L1HS-1112536_1,MIMAT0000062_MirBase_let-7a_microRNA,1,22,TGAGGTAGTAGGTTGTATAGTT,ENSG00000100697_ENST00000343455_DICER1_mRNA,3857,3928,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,9,...,-24.4,,1.0,,II,,0.628759,24.0,,1.0
2,L2HS-818542_2,MIMAT0000062_MirBase_let-7a_microRNA,1,22,TGAGGTAGTAGGTTGTATAGTT,ENSG00000080546_ENST00000436639_SESN1_mRNA,2385,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,2,...,-22.2,,1.0,1.0,III,,0.022816,56.0,1.0,1.0
3,L2HS-1161339_2,MIMAT0000062_MirBase_let-7a_microRNA,1,22,TGAGGTAGTAGGTTGTATAGTT,ENSG00000164190_ENST00000282516_NIPBL_mRNA,6570,6623,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,2,...,-22.1,,1.0,,III,,-0.007294,7.0,,
4,L2-407944_2,MIMAT0000062_MirBase_let-7a_microRNA,1,22,TGAGGTAGTAGGTTGTATAGTT,ENSG00000138785_ENST00000340139_INTS12_mRNA,1164,1208,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,2,...,-21.9,,1.0,,III,,0.026476,6.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18509,L2HS-3158593_1,MIMAT0018349_MirBase_miR-3934_microRNA,1,22,TCAGGTGTGGAAACTGAGGCAG,ENSG00000198712_ENST00000361739_MT-CO2_mRNA,199,253,ATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTAC...,1,...,-13.5,,1.0,,V,,,593.0,,
18510,L2HS-578047_3,MIMAT0018359_MirBase_miR-3943_microRNA,1,23,TAGCCCCCAGGCTTCACTTGGCG,ENSG00000140988_ENST00000343262_RPS2_mRNA,345,388,CTTCCTGGGGGCCTCTCTCAAGGATGAGGTTTTGAAGATTATGC,4,...,-18.8,,1.0,,I,,,662.0,,
18511,L1HS-550121_2,MIMAT0018359_MirBase_miR-3943_microRNA,1,23,TAGCCCCCAGGCTTCACTTGGCG,ENSG00000080824_ENST00000216281_HSP90AA1_mRNA,666,720,AGTACGCTTGGGAGTCCTCAGCAGGGGGATCATTCACAGTGAGGAC...,2,...,-18.5,,1.0,,I,,,178.0,,
18512,L2HS-896514_2,MIMAT0018359_MirBase_miR-3943_microRNA,1,23,TAGCCCCCAGGCTTCACTTGGCG,ENSG00000135404_ENST00000257857_CD63_mRNA,614,657,TAAGTGCTGTGGGGCTGCTAACTACACAGATTGGGAGAAAATCC,2,...,-17.9,,1.0,,III,,,47.0,,


In [6]:
hOH7 = Fasta('hOH7.fasta')
chroms = dict(zip(hOH7.keys(), [hOH7[key][:].end for key in hOH7.keys()]))
dict(list(chroms.items())[:5])

{'ENSG00000211785_ENST00000390433_AE000659-15_Ig': 340,
 'ENSG00000211599_ENST00000390244_IGKV5-2_Ig': 408,
 'ENSG00000211649_ENST00000390295_IGLV7-46_Ig': 384,
 'ENSG00000211786_ENST00000390434_AE000659-2_Ig': 338,
 'ENSG00000211655_ENST00000390301_IGLV1-36_Ig': 392}

# Modifying mRNA sequences

## Enlarging the mRNA

In [7]:
genes = df[['mRNA_name', 'mRNA_start', 'mRNA_end_extended', 'microRNA_name']].copy(True)
genes['new_start'] = 0
genes['new_end'] = 0

for i, row in genes.iterrows():
    center = int(row['mRNA_start'] + (row['mRNA_end_extended'] - row['mRNA_start'])//2)

    # Taking care of the edge cases:
    # if the current center is way too to the left - move it so that there are
    # at least 25bp to the left
    if center < 25:
        center = 25
    # if the current center is way too to the right - move it so that there are
    # at least 25bp to the right
    if center + 25 >= chroms[row['mRNA_name']]:
        center = chroms[row['mRNA_name']] - 26
        
    # compute new start and end based on the new center, and so that the length
    # of the whole mRNA is 50bp
    genes.loc[i, 'new_start'] = center - 25
    genes.loc[i, 'new_end'] = center + 25
    assert (genes.loc[i, 'new_end'] - genes.loc[i, 'new_start']) == 50

genes

Unnamed: 0,mRNA_name,mRNA_start,mRNA_end_extended,microRNA_name,new_start,new_end
0,ENSG00000113328_ENST00000340828_CCNG1_mRNA,1791,1890,MIMAT0000062_MirBase_let-7a_microRNA,1815,1865
1,ENSG00000100697_ENST00000343455_DICER1_mRNA,3857,3928,MIMAT0000062_MirBase_let-7a_microRNA,3867,3917
2,ENSG00000080546_ENST00000436639_SESN1_mRNA,2385,2434,MIMAT0000062_MirBase_let-7a_microRNA,2384,2434
3,ENSG00000164190_ENST00000282516_NIPBL_mRNA,6570,6623,MIMAT0000062_MirBase_let-7a_microRNA,6571,6621
4,ENSG00000138785_ENST00000340139_INTS12_mRNA,1164,1208,MIMAT0000062_MirBase_let-7a_microRNA,1161,1211
...,...,...,...,...,...,...
18509,ENSG00000198712_ENST00000361739_MT-CO2_mRNA,199,253,MIMAT0018349_MirBase_miR-3934_microRNA,201,251
18510,ENSG00000140988_ENST00000343262_RPS2_mRNA,345,388,MIMAT0018359_MirBase_miR-3943_microRNA,341,391
18511,ENSG00000080824_ENST00000216281_HSP90AA1_mRNA,666,720,MIMAT0018359_MirBase_miR-3943_microRNA,668,718
18512,ENSG00000135404_ENST00000257857_CD63_mRNA,614,657,MIMAT0018359_MirBase_miR-3943_microRNA,610,660


## Obtaining new mRNA sequences

In [8]:
genes = genes[['mRNA_name', 'new_start', 'new_end']]
genes.head()

Unnamed: 0,mRNA_name,new_start,new_end
0,ENSG00000113328_ENST00000340828_CCNG1_mRNA,1815,1865
1,ENSG00000100697_ENST00000343455_DICER1_mRNA,3867,3917
2,ENSG00000080546_ENST00000436639_SESN1_mRNA,2384,2434
3,ENSG00000164190_ENST00000282516_NIPBL_mRNA,6571,6621
4,ENSG00000138785_ENST00000340139_INTS12_mRNA,1161,1211


In [9]:
genes['mRNA_seq'] = genes.apply(lambda x: hOH7[x['mRNA_name']][x['new_start']:x['new_end']].seq, axis=1)
genes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,mRNA_name,new_start,new_end,mRNA_seq
0,ENSG00000113328_ENST00000340828_CCNG1_mRNA,1815,1865,ATACAGAACCTACTGCCTCAAACTGAATCCCATCAAGAAAACTAGT...
1,ENSG00000100697_ENST00000343455_DICER1_mRNA,3867,3917,CGTGCAACCAACTACCTCATATTCCATTCAGAATTTATACAGTTAC...
2,ENSG00000080546_ENST00000436639_SESN1_mRNA,2384,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...
3,ENSG00000164190_ENST00000282516_NIPBL_mRNA,6571,6621,ATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAATG...
4,ENSG00000138785_ENST00000340139_INTS12_mRNA,1161,1211,ACAATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGT...
...,...,...,...,...
18509,ENSG00000198712_ENST00000361739_MT-CO2_mRNA,201,251,CTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCA...
18510,ENSG00000140988_ENST00000343262_RPS2_mRNA,341,391,TTTCTTCCTGGGGGCCTCTCTCAAGGATGAGGTTTTGAAGATTATG...
18511,ENSG00000080824_ENST00000216281_HSP90AA1_mRNA,668,718,ACGCTTGGGAGTCCTCAGCAGGGGGATCATTCACAGTGAGGACAGA...
18512,ENSG00000135404_ENST00000257857_CD63_mRNA,610,660,TTTTAAGTGCTGTGGGGCTGCTAACTACACAGATTGGGAGAAAATC...


In [12]:
dataset = pd.concat([df[['miRNA_seq', 'microRNA_name']], genes[['mRNA_seq', 'mRNA_name']]], axis=1)
dataset

Unnamed: 0,miRNA_seq,microRNA_name,mRNA_seq,mRNA_name
0,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062_MirBase_let-7a_microRNA,ATACAGAACCTACTGCCTCAAACTGAATCCCATCAAGAAAACTAGT...,ENSG00000113328_ENST00000340828_CCNG1_mRNA
1,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062_MirBase_let-7a_microRNA,CGTGCAACCAACTACCTCATATTCCATTCAGAATTTATACAGTTAC...,ENSG00000100697_ENST00000343455_DICER1_mRNA
2,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062_MirBase_let-7a_microRNA,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,ENSG00000080546_ENST00000436639_SESN1_mRNA
3,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062_MirBase_let-7a_microRNA,ATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAATG...,ENSG00000164190_ENST00000282516_NIPBL_mRNA
4,TGAGGTAGTAGGTTGTATAGTT,MIMAT0000062_MirBase_let-7a_microRNA,ACAATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGT...,ENSG00000138785_ENST00000340139_INTS12_mRNA
...,...,...,...,...
18509,TCAGGTGTGGAAACTGAGGCAG,MIMAT0018349_MirBase_miR-3934_microRNA,CTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCA...,ENSG00000198712_ENST00000361739_MT-CO2_mRNA
18510,TAGCCCCCAGGCTTCACTTGGCG,MIMAT0018359_MirBase_miR-3943_microRNA,TTTCTTCCTGGGGGCCTCTCTCAAGGATGAGGTTTTGAAGATTATG...,ENSG00000140988_ENST00000343262_RPS2_mRNA
18511,TAGCCCCCAGGCTTCACTTGGCG,MIMAT0018359_MirBase_miR-3943_microRNA,ACGCTTGGGAGTCCTCAGCAGGGGGATCATTCACAGTGAGGACAGA...,ENSG00000080824_ENST00000216281_HSP90AA1_mRNA
18512,TAGCCCCCAGGCTTCACTTGGCG,MIMAT0018359_MirBase_miR-3943_microRNA,TTTTAAGTGCTGTGGGGCTGCTAACTACACAGATTGGGAGAAAATC...,ENSG00000135404_ENST00000257857_CD63_mRNA


# Modifying miRNA sequences

In [13]:
dataset['miRNA_seq'] = dataset['miRNA_seq'].apply(lambda x: x[0:20])
dataset

Unnamed: 0,miRNA_seq,microRNA_name,mRNA_seq,mRNA_name
0,TGAGGTAGTAGGTTGTATAG,MIMAT0000062_MirBase_let-7a_microRNA,ATACAGAACCTACTGCCTCAAACTGAATCCCATCAAGAAAACTAGT...,ENSG00000113328_ENST00000340828_CCNG1_mRNA
1,TGAGGTAGTAGGTTGTATAG,MIMAT0000062_MirBase_let-7a_microRNA,CGTGCAACCAACTACCTCATATTCCATTCAGAATTTATACAGTTAC...,ENSG00000100697_ENST00000343455_DICER1_mRNA
2,TGAGGTAGTAGGTTGTATAG,MIMAT0000062_MirBase_let-7a_microRNA,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,ENSG00000080546_ENST00000436639_SESN1_mRNA
3,TGAGGTAGTAGGTTGTATAG,MIMAT0000062_MirBase_let-7a_microRNA,ATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAATG...,ENSG00000164190_ENST00000282516_NIPBL_mRNA
4,TGAGGTAGTAGGTTGTATAG,MIMAT0000062_MirBase_let-7a_microRNA,ACAATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGT...,ENSG00000138785_ENST00000340139_INTS12_mRNA
...,...,...,...,...
18509,TCAGGTGTGGAAACTGAGGC,MIMAT0018349_MirBase_miR-3934_microRNA,CTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCA...,ENSG00000198712_ENST00000361739_MT-CO2_mRNA
18510,TAGCCCCCAGGCTTCACTTG,MIMAT0018359_MirBase_miR-3943_microRNA,TTTCTTCCTGGGGGCCTCTCTCAAGGATGAGGTTTTGAAGATTATG...,ENSG00000140988_ENST00000343262_RPS2_mRNA
18511,TAGCCCCCAGGCTTCACTTG,MIMAT0018359_MirBase_miR-3943_microRNA,ACGCTTGGGAGTCCTCAGCAGGGGGATCATTCACAGTGAGGACAGA...,ENSG00000080824_ENST00000216281_HSP90AA1_mRNA
18512,TAGCCCCCAGGCTTCACTTG,MIMAT0018359_MirBase_miR-3943_microRNA,TTTTAAGTGCTGTGGGGCTGCTAACTACACAGATTGGGAGAAAATC...,ENSG00000135404_ENST00000257857_CD63_mRNA


# Finalize the positive dataset

In [14]:
# add the label
dataset['label'] = 1
# shuffle the samples
dataset = dataset.sample(frac=1, random_state=42).drop_duplicates().reset_index(drop=True)
# name the index (used when saving)
dataset.index.name = "id"

dataset

Unnamed: 0_level_0,miRNA_seq,microRNA_name,mRNA_seq,mRNA_name,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,TGTAAACATCCTCGACTGGA,MIMAT0000087_MirBase_miR-30a_microRNA,GTCGAGAAGTCAACGACTCAATGAAGAATTCCACTTATGGCGTGAA...,ENSG00000130227_ENST00000252512_XPO7_mRNA,1
1,TGAGACCTCTGGGTTCTGAG,MIMAT0003886_MirBase_miR-769-5p_microRNA,GACCTCAGAGCTCCAGGATCATCAGTAAATTTGTCATGTTATATAT...,ENSG00000159322_ENST00000311669_ADPGK_mRNA,1
2,GAGGGTTGGGTGGAGGCTCT,MIMAT0004679_MirBase_miR-296-3p_microRNA,TGCCATAATGAACCGTCCAGCCCCTGTGGAGATCTCCTATGAGAAC...,ENSG00000184007_ENST00000344035_PTP4A2_mRNA,1
3,CAAAGTGCTTACAGTGCAGG,MIMAT0000070_MirBase_miR-17_microRNA,CACCAGAATTGCCAAAGCACATATATAATAAATTAGATAAAGGGCA...,ENSG00000121879_ENST00000263967_PIK3CA_mRNA,1
4,TTAGGGCCCTGGCTCCATCT,MIMAT0005794_MirBase_miR-1296_microRNA,AGCCAAGTGGAGAAGGGTTCCTATCCCTGGCAGGTATCTCTGAAAC...,ENSG00000183378_ENST00000454689_OVCH2_mRNA,1
...,...,...,...,...,...
18508,TCAGGCTCAGTCCCCTCCCG,MIMAT0002174_MirBase_miR-484_microRNA,GTTTCTTGGGAGGAAGAAGCCTGATCCATCACCATCTGCTTGACTA...,ENSG00000147224_ENST00000372435_PRPS1_mRNA,1
18509,TGAGAACTGAATTCCATAGG,MIMAT0002809_MirBase_miR-146b-5p_microRNA,GAAGATGTTCGTCAGCCAATTCAACCTCCAGCCAGCTTCTTGAGCC...,ENSG00000009830_ENST00000261534_POMT2_mRNA,1
18510,TACCCTGTAGATCCGAATTT,MIMAT0000253_MirBase_miR-10a_microRNA,ATAAAGCAAAGACGCGCGTCTACAGGGACACAGCTGAGCCAAACTG...,ENSG00000186716_ENST00000305877_BCR_mRNA,1
18511,TGAGGTAGTAGGTTGTGTGG,MIMAT0000063_MirBase_let-7b_microRNA,ACCCCCGACCCCAACCTCTCCATTGACCAGGTTGGCGTGCCCCGCT...,ENSG00000181222_ENST00000322644_POLR2A_mRNA,1


We lost few pairs due to deduplication.

# Saving the positive dataset

In [15]:
dataset[['miRNA_seq', 'mRNA_seq', 'label']].to_csv("positive_set.csv", index=True, header=['miRNA', 'gene', 'label'])

In [16]:
!head positive_set.csv

id,miRNA,gene,label
0,TGTAAACATCCTCGACTGGA,GTCGAGAAGTCAACGACTCAATGAAGAATTCCACTTATGGCGTGAATAGC,1
1,TGAGACCTCTGGGTTCTGAG,GACCTCAGAGCTCCAGGATCATCAGTAAATTTGTCATGTTATATATTTAT,1
2,GAGGGTTGGGTGGAGGCTCT,TGCCATAATGAACCGTCCAGCCCCTGTGGAGATCTCCTATGAGAACATGC,1
3,CAAAGTGCTTACAGTGCAGG,CACCAGAATTGCCAAAGCACATATATAATAAATTAGATAAAGGGCAAATA,1
4,TTAGGGCCCTGGCTCCATCT,AGCCAAGTGGAGAAGGGTTCCTATCCCTGGCAGGTATCTCTGAAACAAAG,1
5,CAAAGAATTCTCCTTTTGGG,AATAGTAAGAGTATGTGAAGCAACTTATGACACTACTCTTGTGGAGAAAG,1
6,TGAGAACTGAATTCCATAGG,CTACAACGTTATCGTCACAGCCCATGCATTTGTAATAATCTTCTTCATAG,1
7,CGCATCCCCTAGGGCATTGG,CTCCATGTCCGGGGATGAGCTCACAGAGCTGCTGGCAGGGATCACTGGCA,1
8,AACTGGCCCTCAAAGTCCCG,GGGACTTAAAATTGGGGCCTTATGTAGATCATTACTATAGAGACTACCCA,1


In [18]:
dataset.to_csv("positive_set_extended.csv", index=True, header=['miRNA', 'miRNA_name', 'gene', 'gene_name', 'label'])

In [19]:
!head positive_set_extended.csv

id,miRNA,miRNA_name,gene,gene_name,label
0,TGTAAACATCCTCGACTGGA,MIMAT0000087_MirBase_miR-30a_microRNA,GTCGAGAAGTCAACGACTCAATGAAGAATTCCACTTATGGCGTGAATAGC,ENSG00000130227_ENST00000252512_XPO7_mRNA,1
1,TGAGACCTCTGGGTTCTGAG,MIMAT0003886_MirBase_miR-769-5p_microRNA,GACCTCAGAGCTCCAGGATCATCAGTAAATTTGTCATGTTATATATTTAT,ENSG00000159322_ENST00000311669_ADPGK_mRNA,1
2,GAGGGTTGGGTGGAGGCTCT,MIMAT0004679_MirBase_miR-296-3p_microRNA,TGCCATAATGAACCGTCCAGCCCCTGTGGAGATCTCCTATGAGAACATGC,ENSG00000184007_ENST00000344035_PTP4A2_mRNA,1
3,CAAAGTGCTTACAGTGCAGG,MIMAT0000070_MirBase_miR-17_microRNA,CACCAGAATTGCCAAAGCACATATATAATAAATTAGATAAAGGGCAAATA,ENSG00000121879_ENST00000263967_PIK3CA_mRNA,1
4,TTAGGGCCCTGGCTCCATCT,MIMAT0005794_MirBase_miR-1296_microRNA,AGCCAAGTGGAGAAGGGTTCCTATCCCTGGCAGGTATCTCTGAAACAAAG,ENSG00000183378_ENST00000454689_OVCH2_mRNA,1
5,CAAAGAATTCTCCTTTTGGG,MIMAT0000456_MirBase_miR-186_microRNA,AATAGTAAGAGTATGTGAAGCAACTTATGACACTACTCTTGTGGAGAAAG,ENSG00000112245_ENST00000370651_PTP4A1_mRNA,1
6,TGAGAAC

# Creating the negative dataset

The negative dataset is created by matching real target sequences with random miRNAs from the same experiment excluding the original positive set ones.

There are 396 and 17937 unique miRNAs and mRNAs respectively. Making all posible pairs, we would get ```396*17937=7103052``` pairs. Helwak et al. (2013) identified 18392 positive pairs. We can look at the additional 7084660 pairs as the negative ones.



In [20]:
print("Number of unique miRNAs:", len(dataset['miRNA_seq'].unique()))
print("Number of unique mRNAs:", len(dataset['mRNA_seq'].unique()))
print("Number of positive miRNA:mRNA pairs:", len(dataset))

Number of unique miRNAs: 396
Number of unique mRNAs: 17937
Number of positive miRNA:mRNA pairs: 18513


In [23]:
dataset = dataset[['miRNA_seq', 'mRNA_seq']]

In [24]:
from itertools import product
all_dset = pd.DataFrame(list(product(dataset['miRNA_seq'].unique(), dataset['mRNA_seq'].unique())), columns=['miRNA_seq', 'mRNA_seq'])

In [25]:
negative_dset = all_dset.merge(dataset, on=['miRNA_seq', 'mRNA_seq'], how='left', indicator=True)
negative_dset = negative_dset.loc[negative_dset._merge=='left_only',negative_dset.columns!='_merge']
negative_dset['label'] = 0
negative_dset = negative_dset.sample(frac=1, random_state=42).drop_duplicates().reset_index(drop=True)
negative_dset.index.name = "id"
negative_dset

Unnamed: 0_level_0,miRNA_seq,mRNA_seq,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,AGAGGTAGTAGGTTGCATAG,AATAGTGTCTACTATCTCAACCCTCCAAAATTTGCAGAGTGTTGGG...,0
1,TAAATCCCATGGTGCCTTCT,AATGAGAAGGCACAGGAGACCCAGCACTGGAGTCAAATGGCATTTT...,0
2,TAAAGTGCTGACAGTGCAGA,ATGTATGTGCCAGCAAGCCAGAGGTGACTGTGCGGCTGAATGTCCA...,0
3,CTATACAATCTATTGCCTTC,TAGAAAGGTGCCTCAGGGATGATGATCATGGCTTGATGGAAGAATC...,0
4,TTGAAAGGCTATTTCTTGGT,TGTAGAGTAAACCTGAGAGCTTAGAGATGTATACGTTTCCACTGCT...,0
...,...,...,...
7084655,TGGCTCAGTTCAGCAGGAAC,ATGGGGAGTTCTCTGGGCCAGGCCACATTCACATTCCCCTCCCCCT...,0
7084656,TGGGGAGCTGAGGCTCTGGG,GTTCTTTGGCCCAGGGAAAGAATTTTTTAATGAGCAAATTTTCATC...,0
7084657,TCTCCCAACCCTTGTACCAG,GAAAACCAACATGAAACACCAAATAGTGTGTGTGAATCTTCTGGCG...,0
7084658,AAAAACCACAATTACTTTTG,CAGTGGGGGCAGTGCTGAAGGAATAAGCAATTCTGTGTGGGGACTG...,0


# Saving the negative dataset

In [26]:
negative_dset.to_csv("negative_set.csv", index=True, header=['miRNA', 'gene', 'label'])

In [27]:
!head negative_set.csv

id,miRNA,gene,label
0,AGAGGTAGTAGGTTGCATAG,AATAGTGTCTACTATCTCAACCCTCCAAAATTTGCAGAGTGTTGGGACTG,0
1,TAAATCCCATGGTGCCTTCT,AATGAGAAGGCACAGGAGACCCAGCACTGGAGTCAAATGGCATTTTACTT,0
2,TAAAGTGCTGACAGTGCAGA,ATGTATGTGCCAGCAAGCCAGAGGTGACTGTGCGGCTGAATGTCCATAAA,0
3,CTATACAATCTATTGCCTTC,TAGAAAGGTGCCTCAGGGATGATGATCATGGCTTGATGGAAGAATCCCAG,0
4,TTGAAAGGCTATTTCTTGGT,TGTAGAGTAAACCTGAGAGCTTAGAGATGTATACGTTTCCACTGCTGGAA,0
5,ACTGCCCCAGGTGCTGCTGG,CTTGCGCTGGACCTGGTTCTTAGCCCTTGGGCACTGCACCCTGTTTAACA,0
6,TCAGGTGTGGAAACTGAGGC,GTGGAATTCCCAGAGGCCCGAATCTATGAGGAGACACTCAACGTCCTACT,0
7,AGTTTTGCATAGTTGCACTA,TCAAGGCTCAGCTCAGGGAGCTGAATATTACGGCAGCTAAGGAAATTGAA,0
8,TGCTGGATCAGTGGTTCGAG,TTTCAGCCACAGCCTGTTCAACCTCAGCAAGGTTATATTCCTCCAATGGC,0
