In [98]:
import pandas as pd
import h5py
import numpy as np
import json


class OLIGO:
    """
    """
    def __init__(self, name):
        self.name = name
        self.group = name
        self.type = ""
        self.target_genome = {"fa":[], "gtf":[]}
        self.negative_genome = {"fa":[], "gtf":[]}
        self.negative_tools = {"tools":"", "version":"", "params":""}
        self.temperature = {"low":"", "high":""}
        self.temperature_tools = {"tools":"", "version":"", "params":""}
        self.secondary_structure = {"low":"", "high":""}
        self.secondary_structure_tools = {"tools":"", "version":"", "params":""}
        self.location = {"fa":"", "gtf":""}
        self.date = ""
        self.author = ""
        self.email = ""
        self.other_info = ""
        self.data = pd.DataFrame()
    

    def tsv_read(self, header, data):

        with open(header, "r") as fh:
            for line in fh:
                line_list = line.strip().split("\t")
                feature = line_list[0]
                value = line_list[1]
                len_value = len(value.split(";"))
                if feature == "type":
                    self.type = value
                elif feature == "target_genome":
                    if len_value > 0:
                        self.target_genome["fa"] = value.split(";")[0].split(",")
                    if len_value > 1:
                        self.target_genome["gtf"] = value.split(";")[1].split(",")
                elif feature == "negative_genome":
                    if len_value > 0:
                        self.negative_genome["fa"] = value.split(";")[0].split(",")
                    if len_value > 1:
                        self.negative_genome["gtf"] = value.split(";")[1].split(",")
                elif feature == "negative_tools":
                    if len_value > 0:
                        self.negative_tools["tools"] = value.split(";")[0]
                    if len_value > 1:
                        self.negative_tools["version"] = value.split(";")[1]
                    if len_value > 2:
                        self.negative_tools["params"] = value.split(";")[2]
                elif feature == "temperature":
                    if len_value > 0:
                        if value.split(";")[0]:
                            self.temperature["low"] = float(eval(value.split(";")[0]))
                    if len_value > 1:
                        self.temperature["high"] = float(eval(value.split(";")[1]))
                elif feature == "temperature_tools":
                    if len_value > 0:
                        self.temperature_tools["tools"] = value.split(";")[0]
                    if len_value > 1:
                        self.temperature_tools["version"] = value.split(";")[1]
                    if len_value > 2:
                        self.temperature_tools["params"] = value.split(";")[2]
                elif feature == "secondary_structure":
                    if len_value > 0:
                        if value.split(";")[0]:
                            self.secondary_structure["low"] = float(eval(value.split(";")[0]))
                    if len_value > 1:
                        self.secondary_structure["high"] = float(eval(value.split(";")[1]))
                elif feature == "secondary_structure_tools":
                    if len_value > 0:
                        self.secondary_structure_tools["tools"] = value.split(";")[0]
                    if len_value > 1:
                        self.secondary_structure_tools["version"] = value.split(";")[1]
                    if len_value > 2:
                        self.secondary_structure_tools["params"] = value.split(";")[2]
                elif feature == "location":
                    if len_value > 0:
                        self.location["fa"] = value.split(";")[0]
                    if len_value > 1:
                        self.location["gtf"] = value.split(";")[1]
                elif feature == "date":
                    self.date = value
                elif feature == "author":
                    self.author = value
                elif feature == "email":
                    self.email = value
                elif feature == "other_info":
                    self.other_info = value
                else:
                    print(f"no {feature} in oligo format!")
        
        df_data = pd.read_csv(data, sep="\t")

        self.data = df_data

        return
    

    def to_oligo(self, name=None, out_dir="./"):

        if name:
            file_name = name
        else:
            file_name = self.name

        with h5py.File(f"{out_dir}/{file_name}.oligo", "w") as f:
            
            group = f.create_group(self.group)
            dataset = group.create_dataset(self.name, data=self.data.astype("str").values)
            dataset.attrs["clomuns"] = json.dumps(dict(self.data.dtypes))
            dataset.attrs["name"] = self.name
            dataset.attrs["type"] = self.type
            dataset.attrs["target_genome"] = json.dumps(self.target_genome)
            dataset.attrs["negative_genome"] = json.dumps(self.negative_genome)
            dataset.attrs["negative_tools"] = json.dumps(self.negative_tools)
            dataset.attrs["temperature"] = json.dumps(self.temperature)
            dataset.attrs["temperature_tools"] = json.dumps(self.temperature_tools)
            dataset.attrs["secondary_structure"] = json.dumps(self.secondary_structure)
            dataset.attrs["secondary_structure_tools"] = json.dumps(self.secondary_structure_tools)
            dataset.attrs["location"] = json.dumps(self.location)
            dataset.attrs["date"] = self.date
            dataset.attrs["author"] = self.author
            dataset.attrs["email"] = self.email
            dataset.attrs["other_info"] = self.other_info
            dataset.attrs["clomuns"] = json.dumps({col: str(dtype) for col, dtype in self.data.dtypes.items()})
        return



In [99]:
data = "../sample_data/TSV/data.tsv"

In [100]:
header = "../sample_data/TSV/header.tsv"

In [101]:
oligo = OLIGO("test")

In [102]:
oligo.tsv_read(header, data)

no feature in oligo format!


In [103]:
oligo.data

Unnamed: 0,name,oligo,temperature,secondary_structure,chr,start
0,seq1,AACCGGTGATCGTATAGCTAGTCGTA,40,0.11,chr1,10
1,seq2,AACCGGTGACTGGATAGCTAGTCGTA,42,0.15,chr2,150
2,seq3,AACCGGTGATCGTATCCGTAGTCGTA,44,0.17,chr1,200
3,seq4,AACCGGTTGCCGTATAGCTAGTCGTA,36,0.12,chr4,10
4,seq5,AACCGGTGATCGTAAAGCTAGTCGTA,39,0.22,chr1,1000
5,seq6,AACCGTTAGTCGTATAGCTAGTCGTA,38,0.27,chr3,5000
6,seq7,AACCGGTGATCGTATCCGAAGTCGTA,41,0.03,chr2,2500
7,seq8,AACCGGTGGGACTATAGCTAGTCGTA,43,0.17,chr1,900
8,seq9,AACCGGTGATCGTATATTGGGTCGTA,45,0.08,chr4,4000


In [104]:
oligo.to_oligo()

TypeError: Object of type ObjectDType is not JSON serializable

In [78]:
data = {
    'name': ['Seq1', 'Seq2', 'Seq3'],
    'oligo': ['AATTGG', 'CCGGTT', 'TTAACC'],
    'temperature': [40, 45, 50],
    'secondary_structure': [0.11, 0.12, 0.13],
    'chr': ['chr1', 'chr2', 'chr3'],
    'start': [10, 20, 30]
}

In [79]:
df = pd.DataFrame(data)

In [80]:
dtypes_dict = {col: str(dtype) for col, dtype in df.dtypes.items()}