In [None]:
# Copyright 2022 The Authors. All Rights Reserved.
#
# GNU General Public License v3.0
# Permissions of this strongest copyleft license are conditioned on 
# making available complete source code of licensed works and modifications, 
# which include larger works using a licensed work, under the same license. 
# Copyright and license notices must be preserved. 
# Contributors provide an express grant of patent rights. 
# When a modified version is used to provide a service over a network, 
# the complete source code of the modified version must be made available.
# ==============================================================================

# Title: Extracting MFEs from RNAfold
# Author: Hyunjin Shim
# Date created: 20220105
# Email: jinenstar@gmail.com

# Extracting MFEs from RNAfold output file

In [1]:
### packages

# Data
import os
from pathlib import Path 
from typing import Dict, List, Tuple
import glob
import numpy as np
import pandas as pd
import re

# Biopython
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Math
import random
import statistics

# Plot
import matplotlib.pyplot as plt

In [2]:
### functions

# extract information from fasta files
def extract_dataset_info(records: List[SeqRecord]) -> Dict:
    """Collect the IDs and sequence strings of parsed records.

    Args:
        records: parsed records; each one must expose ``.id`` and ``.seq``.

    Returns:
        A dict with two parallel lists in input order:
        ``"ID"`` holds each record's ``id`` and ``"Seq"`` holds ``str(record.seq)``.
    """
    ids = []
    sequences = []
    for record in records:
        ids.append(record.id)
        sequences.append(str(record.seq))
    return {"ID": ids, "Seq": sequences}

In [29]:
# location of raw data file
# NOTE(review): absolute, machine-specific path; it has to be edited by hand to
# switch data sets (the output file names further down are not derived from it).
datapath = Path("/Users/jinenstar/Desktop/Pro_AE_CRISPR/Data/Simulated_5tps_curated/100_tp1/")

# read fasta file with SeqIO module for convenience
# layout: repeat_info[class_name][file_stem] -> {"ID": [...], "Seq": [...]}
repeat_info = {"Class1": {}, "Class2": {}}

# load test sequences: every "<class_name>*.txt" file in datapath.
# sorted() makes the load order reproducible (glob order is arbitrary); the
# output cells take the record IDs from the first file stored per class.
for class_name in repeat_info:
    for f in sorted(datapath.glob(class_name + "*.txt")):
        records = list(SeqIO.parse(str(f), "fasta"))
        key = f.stem
        repeat_info[class_name][key] = extract_dataset_info(records)

In [4]:
# sanity check: number of Class1 files that were loaded into repeat_info
# (repeat_info has one entry per matching file stem)
len(repeat_info["Class1"])

100

In [5]:
# peek at the first record of one Class1 file to see what the parsed "Seq" text looks like
# NOTE(review): the key is a hard-coded file stem; this raises KeyError when that
# file is not among the ones loaded from datapath
s=repeat_info["Class1"]["Class1_5_tps_real_IE_curated copy 1_output"]["Seq"][0]
s

'UGGUUUGUCCCGUCUGGAGCCGGGAACCC.(((...(((((........)))))))).(-9.00).(((...(((((........)))))))).[-9.43].(((...(((((........)))))))).{-9.00d=1.19}frequencyofmfestructureinensemble0.494375;ensemblediversity2.04'

In [36]:
#Class1: extract MFEs from 100 simulations and store them in a list for averaging
# Each record's "Seq" text carries three energies, as in the sample record shown
# earlier in this notebook: "(-9.00)", "[-9.43]" and "{-9.00 d=1.19}".
# Per the RNAfold output format, the round-bracket value is the MFE, the
# square-bracket value the ensemble free energy and the curly-brace value the
# centroid-structure energy. Taking the third numeric token therefore returned
# the centroid energy, so the round-bracket value is matched explicitly instead.
mfe_pattern = re.compile(r"\(\s*([-+]?\d*\.?\d+)\s*\)")

# one inner list of MFEs (floats, in record order) per loaded Class1 file
MFE_Class1_all = []
for key, run in repeat_info["Class1"].items():
    MFE_only = []
    for s in run["Seq"]:
        mfe_match = mfe_pattern.search(s)
        if mfe_match is None:
            # fail loudly rather than silently misaligning the per-record averages
            raise ValueError("no '(MFE)' field found in a record of " + str(key))
        MFE_only.append(float(mfe_match.group(1)))
    MFE_Class1_all.append(MFE_only)
#     with open(key + '.fa', 'w') as f:
#         for i in range(len(MFE_only)):
#             f.write(">" + str(repeat_info[key]["ID"][i]) + "\n" + str(MFE_only[i]) + "\n")      

In [37]:
#MFE_Class1_all

In [38]:
# position-wise mean across all Class1 files: zip(*...) regroups the per-file
# lists by record position (stopping at the shortest list)
result1 = list(map(statistics.mean, zip(*MFE_Class1_all)))

In [39]:
#result1

In [40]:
# Write the averaged values as FASTA-like records: a ">" + ID line followed by the mean.
# The IDs come from the first Class1 file stored in repeat_info; the lookup is
# done once here instead of rebuilding the key list for every record.
# NOTE(review): the file name says "t5" while the data-loading cell of this
# notebook points at ".../100_tp1/" - confirm the label matches the loaded data.
class1_runs = repeat_info["Class1"]
record_ids = next(iter(class1_runs.values()))["ID"] if class1_runs else []
with open("t5_100_mean_Class1_IE.fa", 'w') as f:
    for record_id, mean_mfe in zip(record_ids, result1):
        f.write(">" + str(record_id) + "\n" + str(mean_mfe) + "\n")

In [30]:
#Class2: extract MFEs from 100 simulations and store them in a list for averaging
# Each record's "Seq" text carries three energies, as in the sample record shown
# earlier in this notebook: "(-9.00)", "[-9.43]" and "{-9.00 d=1.19}".
# Per the RNAfold output format, the round-bracket value is the MFE, the
# square-bracket value the ensemble free energy and the curly-brace value the
# centroid-structure energy. Taking the third numeric token therefore returned
# the centroid energy, so the round-bracket value is matched explicitly instead.
mfe_pattern = re.compile(r"\(\s*([-+]?\d*\.?\d+)\s*\)")

# one inner list of MFEs (floats, in record order) per loaded Class2 file
MFE_Class2_all = []
for key, run in repeat_info["Class2"].items():
    MFE_only = []
    for s in run["Seq"]:
        mfe_match = mfe_pattern.search(s)
        if mfe_match is None:
            # fail loudly rather than silently misaligning the per-record averages
            raise ValueError("no '(MFE)' field found in a record of " + str(key))
        MFE_only.append(float(mfe_match.group(1)))
    MFE_Class2_all.append(MFE_only)

In [31]:
#MFE_Class2_all

In [32]:
# position-wise mean across all Class2 files: zip(*...) regroups the per-file
# lists by record position (stopping at the shortest list)
result2 = list(map(statistics.mean, zip(*MFE_Class2_all)))

In [33]:
#result2

In [34]:
# Write the averaged values as FASTA-like records: a ">" + ID line followed by the mean.
# The IDs come from the first Class2 file stored in repeat_info; the lookup is
# done once here instead of rebuilding the key list for every record.
class2_runs = repeat_info["Class2"]
record_ids = next(iter(class2_runs.values()))["ID"] if class2_runs else []
with open("t1_100_mean_Class2_IIA.fa", 'w') as f:
    for record_id, mean_mfe in zip(record_ids, result2):
        f.write(">" + str(record_id) + "\n" + str(mean_mfe) + "\n")

### Extra: exploratory scratch cells (single-file extraction, grouping and plotting)

In [None]:
# location of raw data file
# NOTE(review): absolute, machine-specific path
datapath = Path("/Users/jinenstar/Desktop/Pro_AE_CRISPR/Sim_CRISPR/Data")

# read fasta file with SeqIO module for convenience
file = 'RNAfold_Gmix4.txt'

# NOTE: from here on repeat_info is a flat {"ID": [...], "Seq": [...]} dict; it
# replaces the nested {"Class1": {...}, "Class2": {...}} dict built earlier, so
# the Class1/Class2 cells cannot be re-run after this cell without reloading.
repeat_info = {}

# load test sequences
# glob() with a literal file name yields that file if it exists and nothing
# otherwise, so a missing file leaves repeat_info empty instead of raising
for f in datapath.glob(file):
    records = list(SeqIO.parse(str(f), "fasta"))
    repeat_info = extract_dataset_info(records)

In [None]:
# all numeric tokens of every record, kept for interactive inspection
l = [re.findall(r"[-+]?\d*\.\d+|\d+", s) for s in repeat_info["Seq"]]

# The MFE is the value printed in round brackets, e.g. "(-9.00)". The third
# numeric token used before is the curly-brace value "{-9.00 d=1.19}", which per
# the RNAfold output format is the centroid-structure energy, not the MFE.
mfe_pattern = re.compile(r"\(\s*([-+]?\d*\.?\d+)\s*\)")

# values stay strings here (as before), so they can be written out verbatim
MFE_only = []
for s in repeat_info["Seq"]:
    mfe_match = mfe_pattern.search(s)
    if mfe_match is None:
        raise ValueError("no '(MFE)' field found in record: " + s[:40])
    MFE_only.append(mfe_match.group(1))



In [None]:
# NOTE: this len() is evaluated but not displayed (only a cell's last expression
# is shown); on the dict returned by extract_dataset_info it counts the keys
# ("ID", "Seq"), not the records
len(repeat_info)
# raw text of the first record, displayed for inspection
s=repeat_info["Seq"][0]
s

In [None]:
# l[0][2]

In [None]:
# MFE_only = [None] * len(l)

# for i in range(len(l)):
#     MFE_only[i] = l[i][2]

In [None]:
#MFE_only

In [None]:
# with open(file + '.fa', 'w') as f:
#     for i in range(len(MFE_only)):
#         f.write(">" + str(repeat_info["ID"][i]) + "\n" + str(MFE_only[i]) + "\n")      

In [None]:
# NOTE(review): d_final is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - presumably a DataFrame with 'ID' and
# 'MFE' columns left over from an earlier session; confirm and restore its definition.
# (The bare name below is not displayed, because it is not the cell's last expression.)
d_final
# unique IDs in first-seen order (dict.fromkeys drops duplicates, keeps insertion order)
ID = list( dict.fromkeys(d_final['ID']) )

In [None]:
# rows of d_final whose 'ID' equals the first unique ID -> their 'MFE' entries ->
# entry labelled 0 -> its element at position 2
# NOTE(review): presumably d_final is a pandas DataFrame, in which case [0] is a
# label lookup and only works if one of the selected rows has index label 0 - confirm
d_final[d_final['ID']==ID[0]]['MFE'][0][2]

In [None]:
# group the 'MFE' entries of d_final by ID: one dict entry per unique ID,
# keyed by the ID formatted as a string
d_final_dct = {
    '%s' % seq_id: d_final[d_final['ID']==seq_id]['MFE']
    for seq_id in ID
}

In [None]:
# NOTE: only the cell's last expression is displayed, so this keys() call is
# evaluated but its result is not shown
d_final_dct.keys()
# the per-ID values may differ in length, presumably like this toy example:
#dict( A = np.array([1,2]), B = np.array([1,2,3,4]) )
d_final_dct.values()

In [None]:
# one column per ID; wrapping each value in pd.Series lets columns of unequal
# length share a DataFrame (missing cells become NaN)
# NOTE(review): if the values are Series sliced out of d_final they keep their
# original row labels, so the columns are aligned by label, not by position - confirm
pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in d_final_dct.items() ]))

In [None]:
#multiple line plots
# NOTE(review): d_final0 is not defined anywhere in the visible notebook; it must
# provide an 'MFE' entry for this cell to run - confirm where it comes from.
# x runs 1..N so it has exactly one position per 'MFE' value (the previous
# range(1, len(...)) was one element short, which matplotlib rejects).
plt.plot(range(1, len(d_final0['MFE']) + 1), 'MFE', data=d_final0)

#show legend
plt.legend()

#show graph
# pyplot is imported as plt; the former plot.show() raised NameError
plt.show()