In [14]:
import os
import polars as ps

# Load MSA data

In [7]:
# Directory containing the MSA files.
data_dir = '/media/datasets/rna-folding/MSA'

# os.listdir() returns entries in arbitrary, filesystem-dependent order, so
# sort them: positional lookups such as msa_files[1] then select the same
# file on every run and on every machine.
msa_files = [
    os.path.join(data_dir, file_name)
    for file_name in sorted(os.listdir(data_dir))
]

# Number of directory entries found.
len(msa_files)

856

In [12]:
# Inspect one alignment: read the file at index 1 of the listing in full
# and echo its raw text.
sample_msa_path = msa_files[1]
with open(sample_msa_path, mode='r') as msa_handle:
    msa_data = msa_handle.read()

print(msa_data)

>query
UGGGGAGCCGAAAGGCGAAGAACC
>URS000080E0AF_1_48_f/1-24
GGGGGAGCCGAAAGGCGAAGAACC
>URS00009EA8F6_1132_1203_f/25-48
UGGGGAGCCGAAAGGCGAAGUUUC
>XM_033466160.1_3272_3338_f/22-45
AGAGGAGCCGAAAGGCGAAGAAGC
>URS0000AF63A8_1507_1573_f/24-47
AGGGGAGCCGAAAGGCGAAGCAGC
>MZ344584.1_1623_1689_r/22-45
UGUGGAGCCGAAAGGCGAAGAAGG
>URS0000CAAFC5_1567_1633_f/24-47
CGGGGAGCCGAAAGGCGAAGUUUC
>URS00000CC4DB_1630_1696_f/24-47
AGGGGAGCCGAAAGGCGAAGUUUC
>LR535839.1_10004896_10004962_f/24-47
AGGGGAGCCGAAAGGCGAAGCUCU
>AM425884.2_15686_15752_f/17-43
UGGGUAGCCGAAAGGCGAAGAACC
>OV277874.1_24654480_24654546_r/25-46
UGGGGAGCCGAAAGGCGA--AACU
>URS0002154534_1534_1600_f/24-48
AGGGGAGCCGAAAGGCGAAGCACC
>CP074140.1_582900_582966_r/21-44
UUUCGAGCCGAAAGGCGAAGAACA
>CP009526.1_816365_816431_f/22-45
AACGGAGCCGAAAGGCGAAGAAUU
>URS000216084C_2429_2494_f/24-47
AGGGGAGCCGAAAGGCGAAAAGUG
>URS0000D7B92E_363_429_f/25-46
UGGGGAGCCGAAAGGCGAA--AGU
>OW121692.1_9681225_9681291_r/22-45
CACGGAGCCGAAAGGCGAAGAAAA
>URS0002170A02_2417_2481_f/23-46
GUG

# Training data

**target_id** - (string) An arbitrary identifier. In train_sequences.csv, this is formatted as pdb_id_chain_id, where pdb_id is the id of the entry in the Protein Data Bank and chain_id is the chain id of the monomer in the pdb file.

**sequence** - (string) The RNA sequence. For test_sequences.csv, this is guaranteed to be a string of A, C, G, and U. For some entries in train_sequences.csv, other characters may appear.

**temporal_cutoff** - (string) The date in yyyy-mm-dd format that the sequence was published. See Additional Notes.

**description** - (string) Details of the origins of the sequence. For a few targets, additional information on small molecule ligands bound to the RNA is included. You don't need to make predictions for these ligand coordinates.

**all_sequences** - (string) FASTA-formatted sequences of all molecular chains present in the experimentally solved structure. In a few cases this may include multiple copies of the target RNA (look for the word "Chains" in the header) and/or partners like other RNAs or proteins or DNA. You don't need to make predictions for all these molecules; if you do, just submit predictions for sequence. Some entries are blank. **Basically: the additional RNA chains and other molecules that interacted with the primary RNA sequence to form the 3D structure.**

In [44]:
# Load the training sequences table and preview its first five rows.
train_sequences_path = '/media/datasets/rna-folding/train_sequences.csv'
training_data = ps.read_csv(train_sequences_path)
training_data.head(n=5)

target_id,sequence,temporal_cutoff,description,all_sequences
str,str,str,str,str
"""1SCL_A""","""GGGUGCUCAGUACGAGAGGAACCGCACCC""","""1995-01-26""","""THE SARCIN-RICIN LOOP, A MODUL…",""">1SCL_1|Chain A|RNA SARCIN-RIC…"
"""1RNK_A""","""GGCGCAGUGGGCUAGCGCCACUCAAAAGGC…","""1995-02-27""","""THE STRUCTURE OF AN RNA PSEUDO…",""">1RNK_1|Chain A|RNA PSEUDOKNOT…"
"""1RHT_A""","""GGGACUGACGAUCACGCAGUCUAU""","""1995-06-03""","""24-MER RNA HAIRPIN COAT PROTEI…",""">1RHT_1|Chain A|RNA (5'-R(P*GP…"
"""1HLX_A""","""GGGAUAACUUCGGUUGUCCC""","""1995-09-15""","""P1 HELIX NUCLEIC ACIDS (DNA/RN…",""">1HLX_1|Chain A|RNA (5'-R(*GP*…"
"""1HMH_E""","""GGCGACCCUGAUGAGGCCGAAAGGCCGAAA…","""1995-12-07""","""THREE-DIMENSIONAL STRUCTURE OF…",""">1HMH_1|Chains A, C, E|HAMMERH…"


In [45]:
# Show the full, untruncated all_sequences text for the first row.
training_data.get_column('all_sequences')[0]

'>1SCL_1|Chain A|RNA SARCIN-RICIN LOOP|Rattus norvegicus (10116)\nGGGUGCUCAGUACGAGAGGAACCGCACCC\n'

# Training labels

**ID** - (string) Identifier combining the target_id and the residue number, separated by _. Note: residue numbers use one-based indexing.

**resname** - (character) The RNA nucleotide (A, C, G, or U) for the residue.

**resid** - (integer) residue number.

**x_1,y_1,z_1,x_2,y_2,z_2**  - (float) Coordinates (in Angstroms) of the C1' atom for each experimental RNA structure. There is typically one structure for the RNA sequence, and train_labels.csv curates one structure for each training sequence. However, in some targets the experimental method has captured more than one conformation, and each will be used as a potential reference for scoring your predictions. validation_labels.csv has examples of targets with multiple reference structures (x_2,y_2,z_2, etc.).

In [28]:
# Load the per-residue training labels table.
train_labels_path = '/media/datasets/rna-folding/train_labels.csv'
training_labels = ps.read_csv(train_labels_path)

In [29]:
# Bare expression so the notebook renders the labels frame as the cell output.
training_labels

ID,resname,resid,x_1,y_1,z_1
str,str,i64,f64,f64,f64
"""1SCL_A_1""","""G""",1,13.76,-25.974001,0.102
"""1SCL_A_2""","""G""",2,9.31,-29.638,2.669
"""1SCL_A_3""","""G""",3,5.529,-27.813,5.878
"""1SCL_A_4""","""U""",4,2.678,-24.900999,9.793
"""1SCL_A_5""","""G""",5,1.827,-20.136,11.793
…,…,…,…,…,…
"""8Z1F_T_82""","""U""",82,,,
"""8Z1F_T_83""","""C""",83,,,
"""8Z1F_T_84""","""A""",84,,,
"""8Z1F_T_85""","""U""",85,,,
