In [None]:
# %pip install scikit-learn

In [60]:
from nupack import *
import random
import numpy as np
import pandas as pd

# Generates the input data
def generate_random_sequences(length, num_sequences, seed=None):
    """Generate random DNA sequences over the alphabet A, C, G, T.

    Every base is drawn independently with ``choice`` from the four-letter
    alphabet.

    Args:
        length: Number of bases in each generated sequence.
        num_sequences: How many sequences to generate.
        seed: Optional seed for reproducible output. When given, a private
            ``random.Random(seed)`` generator is used, so the same seed always
            yields the same sequences and the global ``random`` state is left
            untouched. When ``None`` (the default), the module-level
            ``random`` generator is used.

    Returns:
        A list of ``num_sequences`` strings, each ``length`` characters long.
    """
    bases = ['A', 'C', 'G', 'T']

    # The random module and random.Random instances expose the same choice()
    # API, so either can serve as the source of randomness.
    rng = random if seed is None else random.Random(seed)

    return [
        ''.join(rng.choice(bases) for _ in range(length))
        for _ in range(num_sequences)
    ]

def mfe_set(sequence_list):
    """Return the MFE energy computed by NUPACK for each sequence.

    Args:
        sequence_list: Iterable of sequence strings. Each one is passed to
            ``mfe`` on its own, as a single-strand list.

    Returns:
        A list holding, for every input sequence in order, the ``energy``
        attribute of the first entry returned by ``mfe``.
    """
    # A single DNA model at 37 degrees Celsius is shared by all calculations.
    dna_model = Model(material='dna', celsius=37)

    return [
        mfe([strand], model=dna_model)[0].energy
        for strand in sequence_list
    ]

# Dataset configuration, named so the size and seed are easy to find and tune.
SEQUENCE_LENGTH = 20
NUM_SEQUENCES = 100000
RANDOM_SEED = 0

# Seed the global generator so a fresh Restart & Run All reproduces the same
# sequences (generate_random_sequences draws from the `random` module).
random.seed(RANDOM_SEED)

dataset_X = generate_random_sequences(SEQUENCE_LENGTH, NUM_SEQUENCES)
dataset_Y = mfe_set(dataset_X)


In [61]:
from sklearn.model_selection import train_test_split

# transform the sequence for dataset_X into one-hot encoding
def one_hot_encoding(sequence_list):
    """One-hot encode DNA sequences.

    Each base maps to a 4-element vector in the order A, C, G, T:
    ``A -> [1, 0, 0, 0]``, ``C -> [0, 1, 0, 0]``, ``G -> [0, 0, 1, 0]``,
    ``T -> [0, 0, 0, 1]``.

    Args:
        sequence_list: Iterable of sequence strings made of the uppercase
            characters A, C, G and T.

    Returns:
        A list with one entry per sequence; each entry is a list of
        4-element lists, one per base.

    Raises:
        ValueError: If a sequence contains a character other than A, C, G
            or T. Silently skipping such characters would make the encoded
            sequences differ in length and fail much later, when they are
            stacked into a rectangular array.
    """
    base_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    one_hot_list = []
    for sequence in sequence_list:
        one_hot = []
        for position, base in enumerate(sequence):
            index = base_to_index.get(base)
            if index is None:
                raise ValueError(
                    f"Unsupported base {base!r} at position {position} "
                    f"in sequence {sequence!r}; expected one of A, C, G, T"
                )
            # Build a fresh vector per base so rows never share a list object.
            vector = [0, 0, 0, 0]
            vector[index] = 1
            one_hot.append(vector)
        one_hot_list.append(one_hot)
    return one_hot_list



In [62]:
# Combine dataset_X and dataset_Y into pandas DataFrames and write them to CSV.
# Two files are produced: one with one-hot encoded features, one with the raw sequences.
# NOTE: dataset_X and dataset_Y are rebound from lists to DataFrames below, so
# this cell expects the freshly generated lists each time it is run.

# one-hot encoding: flatten each sequence's per-base vectors into a single row
dataset_X_one_hot = np.array(one_hot_encoding(dataset_X))
dataset_X_one_hot = pd.DataFrame(
    dataset_X_one_hot.reshape(len(dataset_X_one_hot), -1)
)

# The energies become a single-column DataFrame reused as the 'mfe' column of both tables.
dataset_Y = pd.DataFrame(np.array(dataset_Y))

dataset_X_one_hot['mfe'] = dataset_Y
dataset_X_one_hot.to_csv('dataset_one_hot.csv', index=False)

# without one-hot encoding
dataset_X = pd.DataFrame(dataset_X)
dataset_X['mfe'] = dataset_Y
dataset_X.to_csv('dataset.csv', index=False)
