In [None]:
# Directory the notebook works in: the next cell changes into it, so the bpRNA
# checkout and the parquet files used below are all resolved relative to it.
DATA_DIR = '../datamount'

In [None]:
# Make DATA_DIR the working directory. Every later path in this notebook
# (bpRNA/bpRNA.pl, the *.parquet files, the a*.dbn / a*.st scratch files) is relative.
cd {DATA_DIR}

In [None]:
# Install the Perl modules Set::Object and Graph if needed (presumably the
# dependencies of bpRNA.pl -- confirm against the bpRNA README).
# `yes ''` pipes empty lines into cpan so interactive prompts take their defaults.
!yes '' | cpan -i Set::Object
!yes '' | cpan -i Graph

In [None]:
# Clone bpRNA into the current directory; bpRNA/bpRNA.pl from this checkout is the
# script run by get_predicted_loop_type below.
# See https://github.com/hendrixlab/bpRNA for more details.
# Note: git refuses to clone into an existing non-empty bpRNA/ directory, so this
# cell fails harmlessly when re-run.
!git clone https://github.com/hendrixlab/bpRNA.git

In [1]:
# Tell Perl where the CPAN modules installed above can be found.
# The path is derived from the current user's home directory (~/perl5 is the usual
# per-user cpan/local::lib location) instead of hard-coding one machine's account.
# Edit perl_lib_path if your modules are installed somewhere else.
import os
perl_lib_path = os.path.join(os.path.expanduser('~'), 'perl5', 'lib', 'perl5')
os.environ['PERL5LIB'] = perl_lib_path

In [2]:
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
def get_predicted_loop_type(sequence, structure, debug=False, pid=0):
    """Run bpRNA on one sequence/structure pair and return its loop-type line.

    The pair is written to ``a{pid}.dbn`` in the current working directory,
    ``bpRNA/bpRNA.pl`` is run on that file with perl, and the resulting
    ``a{pid}.st`` file is read back.

    Args:
        sequence: Sequence string, written as the first line of the .dbn file.
        structure: Structure string, written as the second line of the .dbn file.
        debug: If true, print the inputs and the extracted line.
        pid: Suffix used in the scratch file names so that concurrent callers
            do not overwrite each other's files.

    Returns:
        The sixth line (index 5) of the generated .st file, without its
        trailing newline.

    Raises:
        subprocess.CalledProcessError: perl exited with a non-zero status.
        FileNotFoundError: bpRNA did not produce the expected .st file.
        IndexError: the .st file has fewer than six lines.
    """
    import subprocess  # local import: keeps this cell independent of the import cells

    dbn_path = f'a{pid}.dbn'
    st_path = f'a{pid}.st'

    with open(dbn_path, 'w') as file:
        file.write(sequence + '\n')
        file.write(structure + '\n')

    # Remove output left over from a previous call with the same pid. Otherwise a
    # failed bpRNA run would go unnoticed and the previous sequence's annotation
    # would be returned for this one.
    if os.path.exists(st_path):
        os.remove(st_path)

    # Argument list (no shell) and check=True so a perl failure is raised.
    subprocess.run(['perl', 'bpRNA/bpRNA.pl', dbn_path], check=True)

    # Context manager so the handle is closed even across many thousands of calls.
    with open(st_path) as file:
        result = [l.strip('\n') for l in file]

    if debug:
        print(sequence)
        print(structure)
        print(result[5])

    return result[5]

In [4]:
import shutil
from joblib import Parallel, delayed
import multiprocessing as mp
from multiprocessing import cpu_count
import threading

def split_dataframe(df, chunk_size = 10000):
    """Split ``df`` into consecutive row slices of at most ``chunk_size`` rows.

    Args:
        df: Object supporting ``len()`` and positional slicing ``df[a:b]``
            (a pandas DataFrame in this notebook).
        chunk_size: Maximum number of rows per chunk; must be positive.

    Returns:
        List of slices in original row order. A non-empty input never yields an
        empty chunk, including when its length is an exact multiple of
        ``chunk_size``. An empty input yields a single empty chunk, so callers
        always receive at least one element.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    # max(..., 1) keeps the "one empty chunk for empty input" behaviour, while
    # stepping by chunk_size avoids the trailing empty slice on exact multiples.
    return [
        df[start:start + chunk_size]
        for start in range(0, max(len(df), 1), chunk_size)
    ]

def get_predicted_loop_type_chunk(chunk):
    """Annotate every row of ``chunk`` with ``get_predicted_loop_type``.

    Args:
        chunk: Object exposing ``sequence`` and ``mfe_eternafold`` as iterables
            of equal length (a DataFrame slice in this notebook).

    Returns:
        List with one result per row, in row order.

    The scratch files written by ``get_predicted_loop_type`` are deleted when
    the chunk finishes, whether it succeeded or raised.
    """
    seq = chunk.sequence
    mfe = chunk.mfe_eternafold
    #mfe = chunk.mfe_contrafold
    # threading.get_ident() is only unique within one process. joblib runs the
    # chunks in separate worker processes, so the process id is included to keep
    # two workers from sharing (and clobbering) the same scratch files.
    worker_id = f'{os.getpid()}_{threading.get_ident()}'
    result = []
    try:
        for s, m in tqdm(zip(seq, mfe), total=len(seq)):
            result.append(get_predicted_loop_type(s, m, False, worker_id))
    finally:
        # An empty chunk, or a failure before bpRNA wrote its output, leaves no
        # file behind, so a missing file is not an error here.
        for extension in ('dbn', 'st'):
            try:
                os.remove(f'a{worker_id}.{extension}')
            except FileNotFoundError:
                pass
    return result

In [5]:
# Load the training table; get_predicted_loop_type_chunk reads its
# `sequence` and `mfe_eternafold` columns.
train_df = pd.read_parquet('train_data_new.parquet')

# Annotate 10000-row chunks in parallel, one joblib job per CPU core.
result = Parallel(n_jobs=cpu_count())(
    delayed(get_predicted_loop_type_chunk)(chunk)
    for chunk in split_dataframe(train_df, 10000)
)

# joblib returns the per-chunk lists in submission order, so concatenating them
# lines the labels up with the rows of train_df.
train_df['looptype_eternafold'] = np.concatenate(result, axis=0)

# The augmented table is written back over the input file.
train_df.to_parquet('train_data_new.parquet')
del train_df

10000it [06:21, 26.23it/s]
10000it [06:21, 26.22it/s]
10000it [06:22, 26.18it/s]
10000it [06:21, 26.19it/s]
10000it [06:22, 26.17it/s]
10000it [06:22, 26.18it/s]
10000it [06:22, 26.17it/s]
10000it [06:22, 26.17it/s]
10000it [06:22, 26.15it/s]
10000it [06:21, 26.20it/s]
10000it [06:22, 26.13it/s]
10000it [06:22, 26.15it/s]
10000it [06:21, 26.19it/s]
10000it [06:21, 26.18it/s]
10000it [06:22, 26.14it/s]
10000it [06:22, 26.12it/s]
10000it [06:22, 26.12it/s]
10000it [06:23, 26.10it/s]
10000it [06:22, 26.13it/s]
10000it [06:22, 26.15it/s]
10000it [06:22, 26.13it/s]
10000it [06:23, 26.10it/s]
10000it [06:23, 26.10it/s]
10000it [06:24, 26.04it/s]
10000it [06:23, 26.08it/s]
10000it [06:22, 26.12it/s]
10000it [06:22, 26.11it/s]
10000it [06:23, 26.06it/s]
10000it [06:23, 26.07it/s]
10000it [06:23, 26.08it/s]
10000it [06:23, 26.06it/s]
10000it [06:23, 26.04it/s]
10000it [06:18, 26.42it/s]
10000it [06:20, 26.25it/s]
10000it [06:18, 26.40it/s]
10000it [06:20, 26.28it/s]
10000it [06:20, 26.29it/s]
1

In [6]:
# Load the test sequences; get_predicted_loop_type_chunk reads their
# `sequence` and `mfe_eternafold` columns.
test_df = pd.read_parquet('test_sequences_new.parquet')

# Annotate 10000-row chunks in parallel, one joblib job per CPU core.
result = Parallel(n_jobs=cpu_count())(
    delayed(get_predicted_loop_type_chunk)(chunk)
    for chunk in split_dataframe(test_df, 10000)
)

# joblib returns the per-chunk lists in submission order, so concatenating them
# lines the labels up with the rows of test_df.
test_df['looptype_eternafold'] = np.concatenate(result, axis=0)

# The augmented table is written back over the input file.
test_df.to_parquet('test_sequences_new.parquet')
del test_df

10000it [06:23, 26.07it/s]
10000it [06:23, 26.06it/s]
10000it [06:24, 26.04it/s]
10000it [06:24, 26.04it/s]
10000it [06:24, 26.03it/s]
10000it [06:24, 26.03it/s]
10000it [06:24, 26.02it/s]
10000it [06:24, 26.02it/s]
10000it [06:24, 26.01it/s]
10000it [06:24, 25.99it/s]
10000it [06:24, 25.98it/s]
10000it [06:25, 25.97it/s]
10000it [06:25, 25.97it/s]
10000it [06:25, 25.97it/s]
10000it [06:25, 25.96it/s]
10000it [06:25, 25.96it/s]
10000it [06:25, 25.96it/s]
10000it [06:25, 25.94it/s]
10000it [06:25, 25.94it/s]
10000it [06:25, 25.94it/s]
10000it [06:25, 25.93it/s]
10000it [06:25, 25.94it/s]
10000it [06:25, 25.94it/s]
10000it [06:25, 25.92it/s]
10000it [06:25, 25.93it/s]
10000it [06:25, 25.91it/s]
10000it [06:25, 25.91it/s]
10000it [06:25, 25.91it/s]
10000it [06:26, 25.91it/s]
10000it [06:26, 25.90it/s]
10000it [06:26, 25.85it/s]
10000it [06:26, 25.86it/s]
10000it [06:25, 25.94it/s]
10000it [06:26, 25.87it/s]
10000it [06:27, 25.79it/s]
10000it [06:28, 25.72it/s]
10000it [06:29, 25.69it/s]
1