In [72]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
from typing import List, Dict, Union

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd
from Bio.Seq import Seq
from typing import List, Dict, Union



def open_gff3_files(path:str= '') -> List[List[str]]:
    '''
    Opens and reads a GFF3 file and returns its feature records as a list of lists.

    Comment/directive lines (those starting with '#', e.g. the
    '##gff-version 3' header) and blank lines are skipped wherever they occur,
    so a file without a header does not lose its first record and stray
    comment or empty lines do not end up as malformed records. The line
    terminator is removed before splitting, so the last column carries no
    trailing newline.

    Parameters:
    -----------
    path: str
        The path to the GFF3 file.

    Returns:
    --------
    List[List[str]]
        One inner list per feature line, holding its tab-separated columns.
        Empty if the file contains no feature lines.
    '''
    records = []
    with open(path, 'r') as infile:
        for raw_line in infile:
            line = raw_line.rstrip('\r\n')
            # Only feature lines carry tab-separated columns.
            if not line.strip() or line.startswith('#'):
                continue
            records.append(line.split('\t'))

    return records



def tidy_up_gff(lst_of_gff:list) -> list:
    """
    Turn already-split GFF rows into one dictionary per signal peptide.

    Parameters:
    lst_of_gff (list): GFF rows, each given as a list of its column values.

    Returns:
    list_of_peptides (list): One dictionary per input row, in input order, with the keys
        'gene' (first 19 characters of column 1),
        'start_pos' (column 4 converted to int, minus one),
        'end_pos' (column 5 converted to int, plus one) and
        'signal_peptide_likelyhood' (column 6, left as found in the row).
    """
    return [
        {
            'gene': row[0][:19],
            'start_pos': int(row[3]) - 1,
            'end_pos': int(row[4]) + 1,
            'signal_peptide_likelyhood': row[5],
        }
        for row in lst_of_gff
    ]




def dict_of_signal_peptides(path: str = '') -> List[Dict[str, Union[str, int]]]:
    """
    Load a GFF3 file and return its signal peptide entries as dictionaries.

    The file is read with open_gff3_files and the resulting rows are converted
    with tidy_up_gff.

    Args:
        path (str): Path to the GFF3 file. Default is an empty string.

    Returns:
        list: A list of dictionaries, one per row, each with the keys:
            - 'gene' (str): Gene name of the signal peptide.
            - 'start_pos' (int): Start position of the signal peptide in the protein sequence.
            - 'end_pos' (int): End position of the signal peptide in the protein sequence.
            - 'signal_peptide_likelyhood' (str): The likelihood of the sequence being a signal peptide.
    """
    return tidy_up_gff(open_gff3_files(path))


def read_gff_to_pd(path:str= '') -> pd.DataFrame:
    """
    Load the signal peptide entries of a GFF3 file into a pandas DataFrame.

    The file is read with open_gff3_files, converted to dictionaries with
    tidy_up_gff, and those dictionaries become the rows of the DataFrame.

    Parameters:
    -----------
    path : str
        The path to the GFF3 file.

    Returns:
    --------
    df : pandas.DataFrame
        A DataFrame with columns 'gene', 'start_pos', 'end_pos', and 'signal_peptide_likelyhood'.
    """
    peptide_records = tidy_up_gff(open_gff3_files(path))
    return pd.DataFrame.from_records(peptide_records)



def get_signal_peptides_cross_ref_with_genome(list_of_peptides: List[dict], all_proteins: List[SeqRecord]) -> List[SeqRecord]:
    """
    Cut the predicted signal peptide region out of every matching protein record.

    Each peptide entry is compared against every protein record; a record
    matches when the entry's 'gene' value occurs as a substring of the
    record's id. For every match the slice [start_pos:end_pos] of the protein
    sequence is wrapped in a new SeqRecord.

    Parameters
    ----------
    list_of_peptides : list
        Dictionaries with at least the keys 'gene', 'start_pos' and 'end_pos'.
    all_proteins : list
        SeqRecord objects containing protein sequences.

    Returns
    -------
    list
        One SeqRecord per (peptide, matching protein) pair, ordered by peptide
        and then by protein. Each keeps the id and name of the protein record
        and carries the description "signal_peptide predicted by signalP".
    """
    matched_records = []

    for peptide in list_of_peptides:
        for protein in all_proteins:
            # Substring match: the gene name only has to appear inside the id.
            if peptide['gene'] not in protein.id:
                continue

            region = protein.seq[peptide['start_pos']:peptide['end_pos']]
            matched_records.append(
                SeqRecord(
                    Seq(region),
                    id=protein.id,
                    name=protein.name,
                    description="signal_peptide predicted by signalP",
                )
            )

    return matched_records


def add_dunder_tail(peptide:str , max_lenght:int=22 ):
    '''Pads a peptide with trailing '-' characters up to max_lenght.

    Parameters:
    -----------
    peptide: str
        The peptide sequence to pad.
    max_lenght: int
        The length to pad to. Default is 22.

    Returns:
    --------
    The peptide followed by enough '-' characters to reach max_lenght, or the
    peptide unchanged if it is already max_lenght or longer.
    '''
    missing = max_lenght - len(peptide)
    if missing <= 0:
        # Already long enough: nothing to append.
        return peptide

    return peptide + '-' * missing


In [86]:
# One-letter amino acid codes, one list element per letter (20 in total).
aa1 = list("ACDEFGHIKLMNPQRSTVWY")
len(aa1)

20

In [5]:
# Template signal peptide sequence (20 residues) used as the starting point
# for the combinatorial variants built in the following cells.
# NOTE(review): the name suggests this is the top-ranked signal peptide - confirm its origin.
top_sp = 'MMVAWWSLFLYGLQVAAPAL'

In [6]:
from teemi.design.combinatorial_design import get_combinatorial_list

In [9]:
# One single-element list per residue of the template: each inner list holds
# the residues allowed at that position.
list_sp = [[residue] for residue in top_sp]
list_sp

[['M'],
 ['M'],
 ['V'],
 ['A'],
 ['W'],
 ['W'],
 ['S'],
 ['L'],
 ['F'],
 ['L'],
 ['Y'],
 ['G'],
 ['L'],
 ['Q'],
 ['V'],
 ['A'],
 ['A'],
 ['P'],
 ['A'],
 ['L']]

In [15]:
# Open up the last three positions to every amino acid in aa1.
# All three positions reference the same aa1 list object.
for position in (-3, -2, -1):
    list_sp[position] = aa1
list_sp


[['M'],
 ['M'],
 ['V'],
 ['A'],
 ['W'],
 ['W'],
 ['S'],
 ['L'],
 ['F'],
 ['L'],
 ['Y'],
 ['G'],
 ['L'],
 ['Q'],
 ['V'],
 ['A'],
 ['A'],
 ['A',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'K',
  'L',
  'M',
  'N',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'V',
  'W',
  'Y'],
 ['A',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'K',
  'L',
  'M',
  'N',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'V',
  'W',
  'Y'],
 ['A',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'K',
  'L',
  'M',
  'N',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'V',
  'W',
  'Y']]

In [19]:
# Expand the per-position choices into the full list of variants.
# NOTE(review): get_combinatorial_list comes from teemi and is not shown here;
# the recorded length of 8000 (= 20**3) is consistent with one entry per
# combination of the three variable positions - confirm against the teemi docs.
all_combinations = get_combinatorial_list(list_sp)
len(all_combinations)

8000

In [24]:
# Collapse every combination (a sequence of one-letter residues) into a single
# amino acid string, keeping the order of all_combinations.
all_combinations_as_str = [''.join(sp) for sp in all_combinations]

In [26]:
# Sanity check: one string per combination.
len(all_combinations_as_str)

8000

In [58]:
# Coding DNA sequence that is translated to protein below.
RFP = Seq('GCCTCCTCCGAGGACGTCATCAAGGAGTTCATGCGCTTCAAGGTGCGCATGGAGGGCTCCGTGAACGGCCACGAGTTCGAGATCGAGGGCGAGGGCGAGGGCCGCCCCTACGAGGGCACCCAGACCGCCAAGCTGAAGGTGACCAAGGGCGGCCCCCTGCCCTTCGCCTGGGACATCCTGTCCCCTCAGTTCCAGTACGGCTCCAAGGCCTACGTGAAGCACCCCGCCGACATCCCCGACTACTTGAAGCTGTCCTTCCCCGAGGGCTTCAAGTGGGAGCGCGTGATGAACTTCGAGGACGGCGGCGTGGTGACCGTGACCCAGGACTCCTCCCTGCAGGACGGCGAGTTCATCTACAAGGTGAAGCTGCGCGGCACCAACTTCCCCTCCGACGGCCCCGTAATGCAGAAGAAGACCATGGGCTGGGAGGCCTCCACCGAGCGGATGTACCCCGAGGACGGCGCCCTGAAGGGCGAGATCAAGATGAGGCTGAAGCTGAAGGACGGCGGCCACTACGACGCCGAGGTCAAGACCACCTACATGGCCAAGAAGCCCGTGCAGCTGCCCGGCGCCTACAAGACCGACATCAAGCTGGACATCACCTCCCACAACGAGGACTACACCATCGTGGAACAGTACGAGCGCGCCGAGGGCCGCCACTCCACCGGCGCCCATCATCATCATCATCATTAA')
# RFP is rebound here: from this line on it holds the translated protein, not the DNA.
# The translation keeps the final stop codon as a trailing '*' (see the printed output).
RFP = RFP.translate()
print(RFP)

ASSEDVIKEFMRFKVRMEGSVNGHEFEIEGEGEGRPYEGTQTAKLKVTKGGPLPFAWDILSPQFQYGSKAYVKHPADIPDYLKLSFPEGFKWERVMNFEDGGVVTVTQDSSLQDGEFIYKVKLRGTNFPSDGPVMQKKTMGWEASTERMYPEDGALKGEIKMRLKLKDGGHYDAEVKTTYMAKKPVQLPGAYKTDIKLDITSHNEDYTIVEQYERAEGRHSTGAHHHHHH*


In [65]:
# Fuse every signal peptide variant to the RFP protein and wrap the fusion in
# a SeqRecord ready for FASTA export.
# - The peptide string is converted to a Seq before it is handed to SeqRecord,
#   which expects a Seq object rather than a plain str.
# - The position of the variant in all_combinations_as_str is used as both id
#   and name, stored as a string in both places; the description is left empty.
# NOTE(review): RFP as translated above still ends with the stop symbol '*',
# so every fusion sequence ends with '*' as well - confirm this is intended
# for the downstream prediction tool.
list_of_all_sp_combinations = [
    SeqRecord(Seq(peptide) + RFP, id=str(i), name=str(i), description="")
    for i, peptide in enumerate(all_combinations_as_str)
]


In [66]:
# Length of the first fusion record (signal peptide variant followed by RFP).
len(list_of_all_sp_combinations[0])

251

In [67]:
# Split the records into two batches: the first 5000 and everything after.
# NOTE(review): the batch size of 5000 is presumably a limit of the prediction
# tool the FASTA files are submitted to - confirm.
partition1, partition2 = (
    list_of_all_sp_combinations[:5000],
    list_of_all_sp_combinations[5000:],
)


In [68]:
# Write the first batch as FASTA; SeqIO.write opens and closes the file itself
# when it is given a path instead of a handle.
SeqIO.write(
    partition1,
    "/Users/lucaslevassor/projects/Signal_peptide_project/data/13_all_sp_combinations_with_last_three_aa_mutated/first_partition.fasta",
    "fasta",
)


In [69]:
# Write the second batch as FASTA; SeqIO.write opens and closes the file itself
# when it is given a path instead of a handle.
SeqIO.write(
    partition2,
    "/Users/lucaslevassor/projects/Signal_peptide_project/data/13_all_sp_combinations_with_last_three_aa_mutated/second_partition.fasta",
    "fasta",
)

In [79]:
# Load the prediction results (GFF3) for each of the two FASTA batches.
first = read_gff_to_pd('/Users/lucaslevassor/projects/Signal_peptide_project/data/13_all_sp_combinations_with_last_three_aa_mutated/first_partition.gff3')
second = read_gff_to_pd('/Users/lucaslevassor/projects/Signal_peptide_project/data/13_all_sp_combinations_with_last_three_aa_mutated/second_partition.gff3')

# Stack both result tables. The index is not reset, so row labels from
# `first` and `second` repeat in the combined frame.
all_sps = pd.concat([first, second])
# Copy the combined table to the system clipboard (e.g. for pasting into a spreadsheet).
all_sps.to_clipboard()

Interestingly, most of the synthetic signal peptides are predicted to be signal peptides.

In [85]:
# Report how many of the generated variants were predicted to be signal
# peptides, as a count and as a percentage of all combinations.
n_total = len(all_combinations)
n_predicted = len(all_sps)
print(f'Out of all {n_total} combinations, the number of predicted peptides are : {n_predicted}')
print(f'This is {n_predicted/n_total*100}% of the solution space')

Out of all 8000 combinations, the number of predicted peptides are : 7338
This is 91.725%
