# matching.py

import re
from collections import defaultdict
from os import path
from string import whitespace
from typing import List
from futils import timeit
import logging
import sbol2

from Bio import Seq, SeqIO, SeqRecord, pairwise2


def read_file(filename: str):
    """
    Return the whole content of a text file

    The file is opened in text mode ("r") and closed again before returning.

    :param filename: path of the file to read (string)
    :returns: everything the file contains (string)

    """
    with open(filename, "r") as handle:
        return handle.read()


def file_to_seqrec(f: str):
    """
    Build a SeqRecord from a plain text sequence file

    The file's base name without its last extension is used as the id,
    name and description of the record.

    :param f: path of the text file to read (string)

    :returns: SeqRecord wrapping the file content

    """
    stem, _extension = path.splitext(path.basename(f))
    raw_text = read_file(f)
    # Only leading and trailing whitespace is removed; whitespace inside
    # the text (e.g. line breaks between sequence lines) is kept as is.
    sequence = Seq.Seq(raw_text.strip(whitespace))
    return SeqRecord.SeqRecord(sequence, id=stem, name=stem, description=stem)


def sbol_to_seqrec(f: str):
    """
    Build a SeqRecord from an SBOL file

    The file's base name without its last extension is used as the id,
    name and description of the record.

    :param f: path of the SBOL file to read (string)

    :returns: SeqRecord holding the upper-cased elements of the first
              sequence found in the document
    """
    stem = path.splitext(path.basename(f))[0]
    document = sbol2.Document()
    document.read(f)
    # Only the first sequence of the document is used.
    # TODO do this properly if doc has multiple elements (?)
    # TODO get name from SBOL document
    elements = document.sequences[0].elements.upper()
    return SeqRecord.SeqRecord(
        Seq.Seq(elements), id=stem, name=stem, description=stem
    )


class Part:
    """
    Part class

    A named part of a given type whose sequence is loaded from
    ``<repository>/parts/<part_type>/<filename>``.
    """

    def __init__(self, part: dict, part_type: str, repository: str):
        """
        Constructor from part dictionary (see template)

        :param part: dictionary providing the "name", "filetype" and
                     "filename" keys
        :param part_type: type of the part; also the sub-directory of
                          ``<repository>/parts`` the file is read from
        :param repository: root directory of the parts repository
        :raises KeyError: if one of the required keys is missing from part
        :raises ValueError: if the filetype is not "genbank", "text" or "sbol"
        """
        self.name = part["name"]
        self.filetype = part["filetype"]
        self.filename = part["filename"]
        self.type = part_type
        filepath = f"{repository}/parts/{self.type}/{self.filename}"
        if self.filetype == "genbank":
            self.sequence = SeqIO.read(filepath, self.filetype)
        elif self.filetype == "text":
            self.sequence = file_to_seqrec(filepath)
        elif self.filetype == "sbol":
            self.sequence = sbol_to_seqrec(filepath)
        else:
            # The message lists every filetype handled above, SBOL included.
            raise ValueError(
                f"Unrecognized filetype {self.filetype!r}. "
                "Only Genbank, Text and SBOL files are handled."
            )

    def __repr__(self):
        """
        Representation of the Part class
        """
        return f"{self.__class__.__name__}({self.name}, {self.type}, {self.filename}, {self.filetype})"


class Trace:
    """
    Trace class

    Holds the data of one trace file read with ``SeqIO.read(filename, "abi")``:
    the four per-base signal channels, the peak locations, the quality values
    and the sequence stored in the file.
    """

    # Order in which the per-base peak intensities are stacked when the
    # strongest channel is looked up in compute_seq_from_trace().
    _PEAK_ORDER = ("G", "A", "T", "C")

    def __init__(self, filename: str):
        """
        Constructor from trace filename

        :param filename: path of the trace file
        :raises ValueError: if the base order stored in the file is not a
                            permutation of G, A, T and C
        """
        basename = path.basename(filename)
        # name is the text before the first dot, filetype the text between
        # the first and the second dot ("" when the name has no dot at all).
        stem, _, remainder = basename.partition(".")
        self.name = stem
        self.filetype = remainder.split(".")[0]
        self.filename = basename
        record = SeqIO.read(filename, "abi")
        self.record = record
        raw = record.annotations["abif_raw"]
        self.trace_binary = raw["PCON2"]
        # Each PCON2 value is shifted by 33 and turned into one character
        # (presumably a Phred+33 quality string -- TODO confirm).
        self.trace_string = "".join(chr(value + 33) for value in raw["PCON2"])
        self.phred_quality = record.letter_annotations["phred_quality"]
        self.sample = raw["SMPL1"].decode()  # Sample name
        self.sequence = raw["PBAS2"].decode()  # Sequence as generated by BaseCaller
        self.baseorder = tuple(raw["FWO_1"].decode())  # Base order
        if sorted(self.baseorder) != sorted(self._PEAK_ORDER):
            raise ValueError(
                f"Unexpected base order {''.join(self.baseorder)!r} in {basename}"
            )
        # We assign the self.G, self.T, self.A, self.C attributes according to
        # the base order from the ABIF file. setattr is used rather than exec
        # so that text coming from the file is never executed as code.
        channelsorder = ("DATA9", "DATA10", "DATA11", "DATA12")
        for base, channel in zip(self.baseorder, channelsorder):
            setattr(self, base, raw[channel])
        self.ploc = raw["PLOC1"]  # Peak location array
        self.peak_G = [self.G[peak] for peak in self.ploc]
        self.peak_A = [self.A[peak] for peak in self.ploc]
        self.peak_T = [self.T[peak] for peak in self.ploc]
        self.peak_C = [self.C[peak] for peak in self.ploc]

    def compute_seq_from_trace(self):
        """
        Compute the sequence of the most probable nucleotides
        from the 4 channels from the trace file (the one with the strongest signal)
        for each peak location.

        On a tie the first base in G, A, T, C order wins.

        :returns: one base letter per peak location (string)
        """
        calls = []
        for i in range(len(self.ploc)):
            intensities = (
                self.peak_G[i],
                self.peak_A[i],
                self.peak_T[i],
                self.peak_C[i],
            )
            strongest = intensities.index(max(intensities))
            # The tuple above is in G, A, T, C order, so the winning index has
            # to be translated with that same order and not with
            # self.baseorder, which follows the file's channel order.
            calls.append(self._PEAK_ORDER[strongest])
        return "".join(calls)

    def compute_probability(self):
        """
        Compute probability of bases at peak location

        For every peak each base intensity is divided by the sum of the four
        intensities. When the four intensities sum to zero the probabilities
        of that peak are all 0.0 instead of raising ZeroDivisionError.

        :returns: list of 4-tuples of probabilities, one tuple per peak
                  location; tuples are in baseorder
        """
        peaks = {"G": self.peak_G, "A": self.peak_A, "T": self.peak_T, "C": self.peak_C}
        # Peak lists rearranged once so that they follow self.baseorder.
        ordered = [peaks[self.baseorder[j]] for j in range(4)]
        result = []
        for i in range(len(self.ploc)):
            total = self.peak_G[i] + self.peak_A[i] + self.peak_T[i] + self.peak_C[i]
            if total:
                result.append(tuple(channel[i] / total for channel in ordered))
            else:
                result.append((0.0, 0.0, 0.0, 0.0))
        return result

    def compute_seq_from_probability(self):
        """
        Call compute probability from trace and return sequence string

        :returns: for each peak, the base of self.baseorder with the highest
                  probability (the first one on a tie)
        """
        return "".join(
            self.baseorder[probabilities.index(max(probabilities))]
            for probabilities in self.compute_probability()
        )

    def __repr__(self):
        """
        Representation of the Trace object
        """
        return (
            f"{self.__class__.__name__}({self.name}, {self.filename}, {self.filetype})"
        )


class Sequence:
    """
    Sequence class

    A sequence read from a text file, together with helpers scoring the
    amount of N bases it contains and a loader for the matching trace file.
    """

    def __init__(self, filename: str):
        """
        Constructor from sequence file

        :param filename: path of the text file holding the sequence
        """
        basename = path.basename(filename)
        self.dirname = path.dirname(filename)
        # name is the text before the first dot, filetype the text between
        # the first and the second dot ("" when the name has no dot at all).
        stem, _, remainder = basename.partition(".")
        self.name = stem
        self.filetype = remainder.split(".")[0]
        self.filename = basename
        # Only leading and trailing whitespace is removed from the file content.
        self.sequence = Seq.Seq(read_file(filename).strip(whitespace))
        # The trace is not loaded here; call get_trace() to obtain it.
        self.length = len(self.sequence)

    def __repr__(self):
        """
        Representation
        """
        return (
            f"{self.__class__.__name__}({self.name}, {self.filename}, {self.filetype})"
        )

    def count_n(self) -> int:
        """
        Count the number of N bases in the sequence

        :returns: number of "N" characters (int)
        """
        return self.sequence.count("N")

    def normalized_n_score(self) -> float:
        """
        Return the normalized score of N bases counts in the sequence

        :returns: N count divided by the sequence length, 0 for an empty
                  sequence (float)
        """
        return self.count_n() / self.length if self.length else 0

    def count_max_n_cluster(self) -> int:
        """
        Return the largest number of N bases cluster in the sequence

        ex: given a sequence "NNNACGCANNNNANAANNNNNN"
            the function will return 6

        A cluster is a run of at least two consecutive N; an isolated N is
        not counted, and 0 is returned when there is no cluster.

        :returns: largest number of N bases cluster (int)
        """
        runs = re.findall(r"N{2,}", str(self.sequence))
        return max((len(run) for run in runs), default=0)

    def normalized_max_n_score(self) -> float:
        """
        Return the normalized score of the largest N bases cluster in the sequence

        :returns: size of the largest N cluster divided by the sequence
                  length, 0 for an empty sequence (float)
        """
        return self.count_max_n_cluster() / self.length if self.length else 0

    def get_trace(self) -> "Trace":
        """
        Return a Trace object associated to the sequence

        The trace file is looked up next to the sequence file, as
        ``<name>.ab1``.

        :returns: Trace object (Trace)
        :raises OSError: if the trace file does not exist
        """
        trace_file = path.join(self.dirname, self.name + ".ab1")
        if not path.exists(trace_file):
            raise OSError("Trace file not found", trace_file)
        return Trace(trace_file)


class Library:
    """
    Library class

    A named collection of Part objects that all share one type and one
    repository.
    """

    def __init__(self, library_template: dict):
        """
        Constructor from library template

        :param library_template: dictionary providing the "type", "name",
                                 "repository" and "parts" keys; every entry
                                 of "parts" is handed to Part
        """
        self.parts = []
        self.type = library_template["type"]
        self.name = library_template["name"]
        self.repository = library_template["repository"]
        self.parts.extend(
            Part(entry, self.type, self.repository)
            for entry in library_template["parts"]
        )

    def __repr__(self):
        """
        Representation of the Library object
        """
        class_name = self.__class__.__name__
        return f"{class_name}({self.name}, {self.type}, {self.parts})"


class PartCandidate:
    """
    PartCandidate class

    One scored match of a part against a sequence.
    """

    def __init__(self, part):
        """
        Constructor from an indexable candidate record

        :param part: record read by position: 0 name, 1 score, 2 start,
                     3 length, 4 end, 5 alignment
        """
        (
            self.name,
            self.score,
            self.start,
            self.length,
            self.end,
            self.alignment,
        ) = (part[0], part[1], part[2], part[3], part[4], part[5])

    def __repr__(self):
        """
        Representation of the PartCandidate object
        """
        class_name = self.__class__.__name__
        return f"{class_name}({self.name}, {self.score}, {self.start}, {self.length}, {self.end}, <alignments>)"


@timeit
def match_part(
    sequence: Sequence, part: Part, threshold: float = 0.5, direction53=True
) -> List[PartCandidate]:
    """
    Align a part against a sequence and keep the alignments whose
    normalized score is strictly above the threshold

    :param sequence: Sequence object to search in
    :param part: Part object to look for
    :param threshold: minimal normalized score, exclusive (Default value = 0.5)
    :param direction53: when true the part sequence is aligned as is,
                        otherwise its reverse complement is aligned
                        (Default value = True)
    :returns: list of PartCandidate, one per alignment kept

    """
    logging.info(part.name)
    part_seq = part.sequence.seq
    part_length = len(part_seq)
    # TODO: confirm the strand orientation handling below
    query = part_seq if direction53 else part_seq.reverse_complement()
    # Local alignment with match 1, mismatch -1, gap open -2, gap extend -1
    alignments = pairwise2.align.localms(query, sequence.sequence, 1, -1, -2, -1)
    # Scores are normalized by the length of the part
    scored = ((hit, hit.score / part_length) for hit in alignments)
    return [
        PartCandidate((part.name, ratio, hit.start, part_length, hit.end, hit))
        for hit, ratio in scored
        if ratio > threshold
    ]


def match_part_probability_trace(
    sequence: Sequence, part: Part, threshold: float = 0.5
) -> List[PartCandidate]:
    """
    Match part to the computed probability sequence of a sequence (from trace file) and return candidates that score above the threshold

    The reverse complement of the part is aligned against the sequence
    recomputed from the trace probabilities.

    :param sequence: Sequence object; its ``trace`` attribute is used when
                     present, otherwise the trace is loaded with
                     ``sequence.get_trace()``
    :param part: Part object to look for
    :param threshold: minimal normalized score, exclusive (Default value = 0.5)
    :returns: list of PartCandidate, one per alignment kept
    :raises OSError: if the trace has to be loaded and its file is missing

    """
    candidates = []
    part_length = len(part.sequence.seq)
    part_rc = part.sequence.seq.reverse_complement()
    # Sequence.__init__ does not set a trace attribute, so fall back to
    # loading the trace instead of failing with AttributeError.
    trace = getattr(sequence, "trace", None)
    if trace is None:
        trace = sequence.get_trace()
    # Compute probability sequence from Trace
    seq = Seq.Seq(trace.compute_seq_from_probability())
    # Local alignment with match 1, mismatch -1, gap open -2, gap extend -1
    alignments = pairwise2.align.localms(part_rc, seq, 1, -1, -2, -1)
    for alignment in alignments:
        score = alignment.score / part_length  # normalize score
        if score > threshold:
            candidates.append(
                PartCandidate(
                    (
                        part.name,
                        score,
                        alignment.start,
                        part_length,
                        alignment.end,
                        alignment,
                    )
                )
            )
    return candidates


def match_library(
    sequence: Sequence, library: Library, threshold: float = 0.1, direction53=True
) -> List[List[PartCandidate]]:
    """
    Match library of parts to a sequence and return candidates that score above the threshold.

    :param sequence: Sequence object
    :param library: Library object
    :param threshold: minimal normalized score passed to match_part
                      (Default value = 0.1)
    :param direction53: orientation flag passed to match_part
                        (Default value = True)
    :returns: one list of PartCandidate per part that produced at least one
              candidate, in the order of library.parts; parts without any
              candidate are left out

    """
    library_candidates = []
    for part in library.parts:
        part_candidates = match_part(sequence, part, threshold, direction53)
        # Candidates stay grouped per part; empty groups are dropped.
        if part_candidates:
            library_candidates.append(part_candidates)
    return library_candidates


def match_library_proba(
    sequence: Sequence, library: Library, threshold: float = 0.1
) -> List[List[PartCandidate]]:
    """
    Match library of parts to the probability sequence computed from the
    trace of a sequence and return candidates that score above the threshold.

    :param sequence: Sequence object
    :param library: Library object
    :param threshold: minimal normalized score passed to
                      match_part_probability_trace (Default value = 0.1)
    :returns: one list of PartCandidate per part that produced at least one
              candidate, in the order of library.parts; parts without any
              candidate are left out

    """
    library_candidates = []
    for part in library.parts:
        part_candidates = match_part_probability_trace(sequence, part, threshold)
        # Candidates stay grouped per part; empty groups are dropped.
        if part_candidates:
            library_candidates.append(part_candidates)
    return library_candidates


def get_score(part: "PartCandidate") -> float:
    """
    Return the score of a candidate (key function used by get_top_scores())

    :param part: object exposing a ``score`` attribute
    :returns: the candidate's score
    """
    return part.score


def get_top_scores(partcandidates_list: List["PartCandidate"]) -> "PartCandidate":
    """
    Get the candidate with the highest score

    :param partcandidates_list: List of PartCandidates
    :returns: the candidate with the highest score; the first one
              encountered when several share it
    :raises IndexError: if partcandidates_list is empty

    """
    # max() picks the highest score; sorting ascending and taking the first
    # element would return the lowest one.
    top = max(partcandidates_list, key=get_score, default=None)
    if top is None:
        raise IndexError("get_top_scores() needs at least one candidate")
    return top