# Setup

In [1]:
from pathlib import Path
from urllib.request import urlretrieve
import pandas as pd

# Download data

The data we are using for this experiment comes from the [miRBind paper](https://doi.org/10.3390/genes13122323) (and originaly from the Helwak CLASH experiment). We have a 20nt miRNA sequences, 50nt gene sequences (lengths might slightly differ) and labels if these sequences interact together (1) or not (0).

In [3]:
url = "https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/main/Datasets/train_set_1_10_CLASH2013_paper.tsv"
clash_data = Path("train_set_1_10_CLASH2013_paper.tsv")

if not clash_data.is_file():
    print("Data file not present, downloading it.")
    urlretrieve(url, clash_data.name)
else:
    print("Data file already present.")

Data file already present.


In [4]:
df = pd.read_csv(clash_data, sep='\t')
df

Unnamed: 0,miRNA,gene,label
0,TGTAAACATCCTCGACTGGA,GTCGAGAAGTCAACGACTCAATGAAGAATTCCACTTATGGCGTGAA...,1
1,TGAGACCTCTGGGTTCTGAG,GACCTCAGAGCTCCAGGATCATCAGTAAATTTGTCATGTTATATAT...,1
2,GAGGGTTGGGTGGAGGCTCT,TGCCATAATGAACCGTCCAGCCCCTGTGGAGATCTCCTATGAGAAC...,1
3,CAAAGTGCTTACAGTGCAGG,CACCAGAATTGCCAAAGCACATATATAATAAATTAGATAAAGGGCA...,1
4,TTAGGGCCCTGGCTCCATCT,AGCCAAGTGGAGAAGGGTTCCTATCCCTGGCAGGTATCTCTGAAAC...,1
...,...,...,...
169307,CAAATTCGTATCTAGGGGAA,GCAGGTAGGTTTGGCTAGGGGGAAATGTTTAACTTGTTCTGAAAGA...,0
169308,GGGCTCACATCACCCCAT,TGGTTTGGTGGGGCTGCGGCCACTTAAAACCTCCCGATCTCTTTTT...,0
169309,ACCCTATCAATATTGTCTCT,CTATGCCGACCTGGAAGCTGTTAATCGAGCCAGGCGCCCGTTAGCA...,0
169310,TGTGTCACTCGATGACCACT,CGTTGGGGACGTGCTGGTGTTAACCAAACCGTTAGGAACCCAGGTT...,0


# Select sequences for an experiment

In [5]:
df.miRNA.value_counts()

miRNA
TATTGCACTTGTCCCGGCCT    1252
TCCGAGCCTGGGTCTCCCTC    1168
TCAGGCTCAGTCCCCTCCCG    1096
TGAGGTAGTAGGTTGTGTGG     885
AAAAGCTGGGTTGAGAGGGC     880
                        ... 
CCCTGAGACCCTAACCTTAA     352
AAAAGTAATTGCGGTCTTTG     352
CCAAAACTGCAGTTACTTTT     345
ACCCGTCCCGTTCGTCCCCG     334
TCAGCACCAGGATATTGTTG     329
Name: count, Length: 396, dtype: int64

In [6]:
miRNA = "TATTGCACTTGTCCCGGCCT"

In [7]:
df[df["miRNA"] == miRNA]["label"].value_counts()

label
1    895
0    357
Name: count, dtype: int64

In [8]:
df_example = pd.concat([df[df["miRNA"] == miRNA][:300], 
                        df[df["miRNA"] == miRNA][-300:]], 
                       ignore_index=True)
df_example

Unnamed: 0,miRNA,gene,label
0,TATTGCACTTGTCCCGGCCT,AGCACTGCCGCCGGGGACTGCTCAGCAACCACACCGGCAGCCCGCG...,1
1,TATTGCACTTGTCCCGGCCT,CTTCTCGGAGACGGTGCGCATCATCAACCGCAAGGTGAAGCCGCGG...,1
2,TATTGCACTTGTCCCGGCCT,GGGCTGGGCAAAGAATGTGCAAAAGTCTTCTATGCTGCGGGTGCTA...,1
3,TATTGCACTTGTCCCGGCCT,TCCACTAGAAGGCTGGGACAGCACCGGTGATTACTGTCTTTCCTGC...,1
4,TATTGCACTTGTCCCGGCCT,AGTTCACAGGCTTTGTGGACATGTGTGTGCAGCATATCCCTTCTCC...,1
...,...,...,...
595,TATTGCACTTGTCCCGGCCT,CCCATTTTGTTGTTCAGGTCAACAGCAAAATGCCTGCACCATGACT...,0
596,TATTGCACTTGTCCCGGCCT,ATATAAATCTCCTGCTACCTCACCCATTAGTAGTAATTCTCACAGG...,0
597,TATTGCACTTGTCCCGGCCT,GTAAATGTCTGTTTTTCATAATTGCTCTTTATATTGTGTGTTATCT...,0
598,TATTGCACTTGTCCCGGCCT,GTCCAGCAGGTTTCTGCCCTGACATTCTCTTGTCTGCTATTCCCAG...,0


# Save sequences for RoseTTAFold2NA

I have decided to do embedding computation in this more complicated way so I can store the structure and precomputed alignements in case I will want to recompute the embeddings. Prediction is faster than MSA computation. So if I will change the embedding extraction, I will have the MSA already pre-computed.

# Compute embeddings

In [12]:
!mkdir miRNA -p

In [20]:
!mkdir miRNA/example_ago_miRNA -p

Saving selected miRNA sequence to a fasta file.

In [21]:
with open("miRNA/example_ago_miRNA/miRNA.fa", "w") as handle:
    handle.write(">miRNA\n")
    handle.write(miRNA)

Saving Human AGO2 protein sequence to a fasta file.

In [13]:
!wget https://rest.uniprot.org/uniprotkb/Q9UKV8.fasta
!mv Q9UKV8.fasta miRNA/example_ago_miRNA/AGO2_human.fa

--2023-10-31 15:07:40--  https://rest.uniprot.org/uniprotkb/Q9UKV8.fasta
Resolving rest.uniprot.org (rest.uniprot.org)... 193.62.193.81
Connecting to rest.uniprot.org (rest.uniprot.org)|193.62.193.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘Q9UKV8.fasta’

Q9UKV8.fasta            [ <=>                ]     958  --.-KB/s    in 0s      

2023-10-31 15:07:40 (140 MB/s) - ‘Q9UKV8.fasta’ saved [958]



In [11]:
import os

lenght = len(df[df["miRNA"] == miRNA])

for i in range(300):
    
    pos_dir_name = "miRNA/example_pos_" + str(i)
    #os.mkdir(pos_dir_name)
    if not os.path.exists(pos_dir_name):
        os.makedirs(pos_dir_name)
    pos_mRNA = df[df["miRNA"] == miRNA].iloc[i]["gene"]
    with open(pos_dir_name + "/gene.fa", "w") as handle:
        handle.write(">pos mRNA\n")
        handle.write(pos_mRNA)
    
    neg_dir_name = "miRNA/example_neg_" + str(i)
    #os.mkdir(neg_dir_name)
    if not os.path.exists(neg_dir_name):
        os.makedirs(neg_dir_name)
    neg_mRNA = df[df["miRNA"] == miRNA].iloc[lenght - i - 1]["gene"]
    with open(neg_dir_name + "/gene.fa", "w") as handle:
        handle.write(">neg mRNA\n")
        handle.write(neg_mRNA)
    
    

# Cleanup

In [None]:
os.remove(clash_data)