In [1]:
%%capture 
!pip install h2o==3.40.0.1

In [2]:
import numpy as np
import pandas as pd
import h2o

In [3]:
def generate_artificial_peptide(list_of_probabilities: np.ndarray, amino_acids: np.ndarray, max_length=22) -> str:
    """
    Generate an artificial peptide based on a list of probabilities and amino acids.
    
    Parameters:
    ----------
    list_of_probabilities : numpy.ndarray
        2-D array of probability of amino acids in the peptide
    amino_acids : numpy.ndarray
        1-D array of amino acids.
        
    Returns:
    -------
    str
        Generated artificial peptide
        
    Notes:
    ------
    The length of the probability array should be same as the length of the peptide.
    """
    out_str = ''
    for i in range(len(list_of_probabilities)):
        # make synthetic signal peptide
        artificial_amino_acid = list(np.random.choice(amino_acids, 1, p=list_of_probabilities[i]))

        if artificial_amino_acid == ['-']: 
            break

        out_str += artificial_amino_acid[0]
    return out_str

def add_dunder_tail(peptide:str , max_lenght:int=22 ): 
    '''Adds a tail if a peptide is shorter than the specified max_len.
    '''
    if len(peptide) < max_lenght: 
        difference = max_lenght - len(peptide)
        sequence = peptide + ('-'*difference)
    else: 
        sequence = peptide
        
    return sequence      

def generate_artificial_peptides(list_of_probabilities: np.ndarray, amino_acids: np.ndarray, n_peptides: int, max_len=50) -> pd.DataFrame:
    """
    Generate a dataframe of artificial peptides based on a list of probabilities and amino acids.
    
    Parameters:
    ----------
    list_of_probabilities : numpy.ndarray
        2-D array of probability of amino acids in the peptide
    amino_acids : numpy.ndarray
        1-D array of amino acids.
    n_peptides : int
        Number of peptides to generate
        
    Returns:
    -------
    pd.DataFrame
        Dataframe of generated artificial peptides with 'sequence' as column
        
    Notes:
    ------
    The length of the probability array should be same as the length of the peptide.
    """
    artificial_peptides = []
    lengths = [] 
    for i in range(n_peptides): 
        peptide = generate_artificial_peptide(list_of_probabilities,amino_acids, max_length=max_len)
        if len(peptide) <= max_len:
            peptide_w_tail = add_dunder_tail(peptide, max_lenght = max_len)
        else: 
            continue
        
        # save
        lengths.append(len(peptide))                                     
        artificial_peptides.append(peptide_w_tail)

    df = pd.DataFrame(artificial_peptides, columns =['sequence'])
    df['length'] = lengths
    return df


def split_peptides_sequences(df_signalPP: pd.DataFrame) -> pd.DataFrame:
    '''
    Splits the amino acid sequences into individual amino acids for each position.

    Parameters:
    -----------
    df_signalPP: pandas.DataFrame
        A DataFrame containing the amino acid sequences.

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing the split amino acid sequences.
    '''
    # Initialize an empty list to store the split sequences.
    peptides_split = []

    # Split each amino acid sequence into individual amino acids.
    for k, v in df_signalPP.iterrows():
        sequence = []
        for seq in v['sequence']:
            sequence.append(seq)
        peptides_split.append(sequence)

    # Convert the list of split sequences into a DataFrame and fill NaN values with '-'.
    new_peptides = pd.DataFrame(peptides_split)
    new_peptides = new_peptides.fillna('-')

    return new_peptides


def signal_peptide_predictor(list_of_probabilities: list, amino_acids: str, 
                            n_peptides: int, 
                            number_of_iterations: int, 
                            best_model: h2o.estimators, 
                            training_column_name:str = 'MM_N_peptide_abundance', 
                            max_len_of_signal_peptides:int = 30,
                            length_of_return_df = 500, 
                            one_hot_encode:bool = True) -> pd.DataFrame:
    '''
    Predicts the best signal peptides from a given number of iterations.

    Parameters:
    -----------
    list_of_probabilities: list
        A list of probabilities.
    amino_acids: str
        A string containing the amino acids used to generate the peptides.
    n_peptides: int
        The number of peptides to generate.
    number_of_iterations: int
        The number of iterations to run the predictor.
    best_model: h2o.estimators
        The trained model to use for prediction.
    training_column_name : str
        The name of the column that the model has been trained on.
    max_len_of_signal_peptides : int
        The maximum lenght of the signal peptides you want to be generated 

    Returns:
    --------
    pandas.DataFrame
        A DataFrame containing the predicted signal peptides.
    '''
    # Initialize an empty DataFrame to store the predicted peptides.
    data = pd.DataFrame()
    
    # Generate and predict peptides for each iteration.
    for i in range(0, number_of_iterations):
        # Generate new peptides.
        new_TO_NATURE_peptides = generate_artificial_peptides(list_of_probabilities, amino_acids, n_peptides=n_peptides, max_len=max_len_of_signal_peptides)
        
        # Split the peptides into sequences.
        new_TO_NATURE_peptides = split_peptides_sequences(new_TO_NATURE_peptides)

        if one_hot_encode:
            new_TO_NATURE_peptides = one_hot_encode_aa(new_TO_NATURE_peptides)
            # Convert the DataFrame to an H2OFrame and make the columns categorical.
            df_test = h2o.H2OFrame(pd.concat([new_TO_NATURE_peptides], axis='columns'))
        else:
            df_test = h2o.H2OFrame(pd.concat([new_TO_NATURE_peptides], axis='columns'))
            for column in df_test.columns:
                if column != training_column_name:
                    df_test[column] = df_test[column].asfactor()

        # Make predictions on the test data.
        predicted = best_model.predict(df_test).as_data_frame()
        new_TO_NATURE_peptides['predictions'] = predicted['predict'].to_list()

        # Concatenate the new predictions with the existing DataFrame.
        if len(data) == 0:
            data = new_TO_NATURE_peptides.copy()
        else:
            data = pd.concat([data, new_TO_NATURE_peptides], axis=0)
            data = data.sort_values('predictions', ascending=False)
            data = data[0:length_of_return_df]
    
    return data


def one_hot_encode_aa(df):
    """One-hot encode amino acid sequences in a pandas dataframe"""

    # Define amino acid alphabet
    aa_alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
                   'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-']

    # Determine maximum sequence length in input dataframe
    max_seq_len = df.applymap(len).max().max()

    # Initialize one-hot encoding dataframe
    encoding_df = pd.DataFrame()

    # Loop over columns in input dataframe and encode each amino acid sequence
    for col in df.columns:
        # Extract amino acid sequences from current column
        seqs = df[col].values

        # Initialize 2D array to hold one-hot encoding for current column
        encoding_matrix = np.zeros((len(seqs), max_seq_len * len(aa_alphabet)))

        # Loop over amino acid sequences and one-hot encode each one
        for i, seq in enumerate(seqs):
            # Get one-hot encoding vector for current amino acid sequence
            encoding_vec = np.zeros((max_seq_len, len(aa_alphabet)))
            for j, aa in enumerate(seq):
                encoding_vec[j, aa_alphabet.index(aa)] = 1

            # Flatten encoding vector and add it to encoding matrix for current column
            encoding_matrix[i, :] = encoding_vec.flatten()

        # Convert encoding matrix to dataframe and add it to one-hot encoding dataframe
        col_encoding_df = pd.DataFrame(encoding_matrix)
        col_encoding_df.columns = [f'{col}_{j}' for j in range(max_seq_len * len(aa_alphabet))]
        encoding_df = pd.concat([encoding_df, col_encoding_df], axis=1)

    return encoding_df


def one_hot_decode_aa(df, max_length = 22):
    # Define amino acid alphabet
    aa_alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
                    'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '-']
    sps = []
    peptide = ''
    cols = list(df.columns)

    for index, row in df.iterrows():
        for col in cols: 
            if int(row[col]) ==1: 
                aa_index = int(col.split('_')[1])
                aa_acid = aa_alphabet[aa_index]
                peptide += aa_acid
                if len(peptide) == max_length: 
                    sps.append(peptide)
                    peptide = ''

    df['decoded_sequence'] = sps

    return df

# Gerating artificial Signal Peptides (SPs)

## 0 Introduction 
To avoid the combinatorial explosion that arises from the large number of possible amino acid sequences, an algorithm must be developed to narrow down the search space and identify the sequences that are most likely to function as signal peptides. This can be accomplished through a variety of computational methods, such as bioinformatics, machine learning, and statistical analysis.

One common approach is to use bioinformatics methods to analyze large sets of data on known signal peptides and identify patterns or features that are associated with signal peptide function. These features can then be used to predict the function of novel sequences.

Machine learning algorithms can also be used to predict signal peptides. These algorithms can be trained on large sets of data on known signal peptides, and can then be used to predict the function of novel sequences. Common machine learning algorithms used for this purpose include decision trees, random forests, and neural networks.

Another approach is to use statistical analysis to identify the regions of the peptide sequences that are most likely to function as signal peptides. This can be done by analyzing the frequency and distribution of different amino acids in known signal peptides and identifying those that are over-represented or under-represented in these sequences.

In summary, by developing an algorithm, we can narrow down the search space and identify the sequences that are most likely to function as signal peptides, thus avoiding combinatorial explosion. The algorithm we are showcasing here is based on a combination of bioinformatics, machine learning and statistical analysis.

### Agenda
- Use AutoML predictions and synthetic signal peptide generation algorotihm to get a novel list of potential signal peptides

## 1 Amino acid probability matrix

Lets import our df_pwn that was made in a previous notebook:

In [4]:
df_pwn = pd.read_csv('../data/02_all_signal_peptides/df_pwn_for_signal_peptides_found_in_supernatant.csv', index_col = False)
df_pwn

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,Y,-
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.070312,0.000000,0.003906,0.003906,0.035156,0.000000,0.062500,0.035156,0.234375,0.078125,...,0.011719,0.031250,0.070312,0.214844,0.031250,0.007812,0.046875,0.015625,0.015625,0.000000
2,0.042969,0.000000,0.003906,0.007812,0.179688,0.058594,0.000000,0.050781,0.015625,0.210938,...,0.023438,0.027344,0.023438,0.066406,0.140625,0.050781,0.066406,0.007812,0.015625,0.000000
3,0.070312,0.000000,0.003906,0.000000,0.097656,0.035156,0.015625,0.046875,0.062500,0.167969,...,0.031250,0.042969,0.058594,0.042969,0.179688,0.085938,0.015625,0.011719,0.019531,0.000000
4,0.109375,0.007812,0.007812,0.007812,0.035156,0.027344,0.015625,0.066406,0.066406,0.136719,...,0.046875,0.035156,0.046875,0.015625,0.167969,0.078125,0.046875,0.019531,0.046875,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,0.003906,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.003906,0.000000,0.000000,0.000000,0.000000,0.992188
66,0.003906,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.996094
67,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.003906,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.996094
68,0.003906,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.996094


In [5]:
amino_acids = list(df_pwn.columns.values)
amino_acids

['A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y',
 '-']

In [6]:
list_of_probabilities = []
for i in range(len(df_pwn)): 
    list_of_probabilities.append(df_pwn.loc[i, :].values.tolist())
list_of_probabilities[1]

[0.0703125,
 0.0,
 0.00390625,
 0.00390625,
 0.03515625,
 0.0,
 0.0625,
 0.03515625,
 0.234375,
 0.078125,
 0.03125,
 0.01171875,
 0.03125,
 0.0703125,
 0.21484375,
 0.03125,
 0.0078125,
 0.046875,
 0.015625,
 0.015625,
 0.0]

In [7]:
len(list_of_probabilities)

70

In [8]:
list_of_probabilities_random = list_of_probabilities.copy()

In [9]:
# GEnerating random signal peptides with equal distribution
for i in range(0,len(list_of_probabilities_random)): 
    for j in range(len(list_of_probabilities_random[i])):
        list_of_probabilities_random[i][j] = (100/21)/100

In [10]:
list_of_probabilities_random

[[0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616],
 [0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.047619047619047616,
  0.04761904761

In [11]:
generate_artificial_peptides

<function __main__.generate_artificial_peptides(list_of_probabilities: numpy.ndarray, amino_acids: numpy.ndarray, n_peptides: int, max_len=50) -> pandas.core.frame.DataFrame>

In [12]:
# random_peptides = generate_artificial_peptides(list_of_probabilities_random,amino_acids, 
#                                                 n_peptides=50000, 
#                                                 max_len=22)


# random_peptides_15_22 = random_peptides[random_peptides["length"] >= 15]

# random_peptides_15_22.to_csv('../data/05_best_signal_peptides/random_peptides_for_ML_training/random_peptides_for_ML_training_15_22.csv')
# random_peptides_15_22


## 2 Load best model from AutoML

In [13]:
h2o.init(ip="localhost", min_mem_size_GB=8)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_361"; Java(TM) SE Runtime Environment (build 1.8.0_361-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from /Users/lucaslevassor/opt/anaconda3/envs/constrain/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/2f/lw3sfzbs7l7f_q1knzbtbwrr0000gp/T/tmp0a150b2x
  JVM stdout: /var/folders/2f/lw3sfzbs7l7f_q1knzbtbwrr0000gp/T/tmp0a150b2x/h2o_lucaslevassor_started_from_python.out
  JVM stderr: /var/folders/2f/lw3sfzbs7l7f_q1knzbtbwrr0000gp/T/tmp0a150b2x/h2o_lucaslevassor_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,19 secs
H2O_cluster_timezone:,Europe/Copenhagen
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.1
H2O_cluster_version_age:,2 months and 22 days
H2O_cluster_name:,H2O_from_python_lucaslevassor_djoxyd
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.566 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [14]:
best_model = h2o.load_model("../data/04_ML_models/DeepLearning_grid_1_AutoML_1_20230426_90434_model_9")
best_model

Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,409,Input,0.0,,,,,,,,,
,2,20,RectifierDropout,10.0,0.0,0.0,0.0618324,0.0982837,0.0,0.0686823,0.3475422,-0.6929061,0.4269514
,3,1,Linear,,0.0,0.0,9.4e-06,2.69e-05,0.0,0.0055326,0.004399,-0.0648037,0.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
mae,0.0037686,0.0053795,0.0184182,0.0059225,0.002701,0.0012151,0.0016255,0.0034445,0.0010129,0.0013611,0.0008457,0.0011395
mean_residual_deviance,0.0015035,0.0040994,0.0130892,0.0015734,0.0002002,3.2e-06,7.39e-05,5.17e-05,9.5e-06,1.16e-05,8.3e-06,1.42e-05
mse,0.0015035,0.0040994,0.0130892,0.0015734,0.0002002,3.2e-06,7.39e-05,5.17e-05,9.5e-06,1.16e-05,8.3e-06,1.42e-05
r2,0.0103269,0.1004046,0.0172692,-0.0225876,-0.0213516,0.2831413,-0.0287704,0.0367059,-0.0717703,-0.0267836,-0.0447277,-0.0178563
residual_deviance,0.0015035,0.0040994,0.0130892,0.0015734,0.0002002,3.2e-06,7.39e-05,5.17e-05,9.5e-06,1.16e-05,8.3e-06,1.42e-05
rmse,0.0198944,0.0350831,0.1144081,0.0396665,0.0141486,0.0017875,0.0085957,0.0071891,0.0030854,0.0034085,0.0028803,0.0037739
rmsle,0.0156568,0.0242164,0.078839,0.034205,0.0133928,0.0017813,0.0083187,0.0070099,0.0030615,0.0033764,0.0028576,0.0037264

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2
,2023-04-26 09:31:39,0.000 sec,,0.0,0,0.0,,,,
,2023-04-26 09:31:39,25 min 20.495 sec,55263 obs/sec,10.0,1,7350.0,0.042531,0.0018089,0.0150107,-0.167728
,2023-04-26 09:31:44,25 min 25.552 sec,66650 obs/sec,470.0,47,345450.0,0.0394075,0.001553,0.0029266,-0.0025075
,2023-04-26 09:31:49,25 min 30.585 sec,69115 obs/sec,960.0,96,705600.0,0.0373536,0.0013953,0.0032806,0.099269
,2023-04-26 09:31:54,25 min 35.649 sec,71742 obs/sec,1490.0,149,1095150.0,0.0370015,0.0013691,0.0034579,0.1161693
,2023-04-26 09:31:55,25 min 36.722 sec,72010 obs/sec,1600.0,160,1176000.0,0.0371662,0.0013813,0.0045196,0.1082848

variable,relative_importance,scaled_importance,percentage
21_3,1.0,1.0,0.0097765
8_0,0.5474159,0.5474159,0.0053518
13_5,0.5444318,0.5444318,0.0053227
16_20,0.5432656,0.5432656,0.0053113
6_15,0.5304022,0.5304022,0.0051855
13_0,0.4856204,0.4856204,0.0047477
1_14,0.4661863,0.4661863,0.0045577
13_13,0.4627634,0.4627634,0.0045242
4_15,0.4551175,0.4551175,0.0044495
4_9,0.4394741,0.4394741,0.0042965


## 3 Signal peptide predictor algorithm

The signal peptide predictor algorithm works by: 

    1. Generating artificial signal peptides - based on the PWM.
    2. It splits the artificial signal peptides into a dataframe adding '_' to the positions that are empty. 
    3. Initiates the H2O dataframe and makes all values except the feature column categorical
    4. Predicts the columns - based on a previously made ML model
    5. Makes a new dataframe with the best predictions, sorts it and slices out top100 
    6. The loop will continue until the specified number is reached. 
 

Note: Minimum number of iterations is 2.

In [15]:
signal_peptide_predictor

<function __main__.signal_peptide_predictor(list_of_probabilities: list, amino_acids: str, n_peptides: int, number_of_iterations: int, best_model: <module 'h2o.estimators' from '/Users/lucaslevassor/opt/anaconda3/envs/constrain/lib/python3.8/site-packages/h2o/estimators/__init__.py'>, training_column_name: str = 'MM_N_peptide_abundance', max_len_of_signal_peptides: int = 30, length_of_return_df=500, one_hot_encode: bool = True) -> pandas.core.frame.DataFrame>

In [16]:
%%capture
run = True

if run: 
    best_predicted_SPs = signal_peptide_predictor(list_of_probabilities, 
                                                        amino_acids, n_peptides =  5000,  
                                                        number_of_iterations = 10000, 
                                                        best_model = best_model, 
                                                        max_len_of_signal_peptides=22,
                                                        one_hot_encode = True,
                                                        length_of_return_df = 5000,
                                                        training_column_name = 'MM_N_peptide_abundance')

H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: Total input file size of 5.9 MB is much larger than total cluster memory of Zero  , please use either a larger cluster or smaller data.
  Request: POST /3/Parse
    data: {'destination_frame': 'Key_Frame__upload_b380c5285f8da7aa2997bdba57cd4972.hex', 'parse_type': 'CSV', 'separator': '44', 'check_header': '1', 'number_columns': '462', 'chunk_size': '4194304', 'delete_on_done': 'True', 'blocking': 'False', 'column_types': '["Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric"]', 'single_quotes': 'False', 'escapechar': '0', 'column_names': '["0_0","0_1","0_2","0_3","0_4","0_5","0_6","0_7","0_8","0_9","0_10","0_11","0_12","0_13","0_14","0_15","0_16","0_17","0_18","0_19","0_20","1_0","1_1","1_2","1_3","1_4","1_5","1_6","1_7","1_8","1_9","1_10","1_11","1_12","1_13","1_14","1_15","1_16","1_17","1_18","1_19","1_20","2_0","2_1","2_2","2_3","2_4","2_5","2_6","2_7","2_8","2_9","2_10","2_11","2_12","2_13","2_14","2_15","2_16","2_17","2_18","2_19","2_20","3_0","3_1","3_2","3_3","3_4","3_5","3_6","3_7","3_8","3_9","3_10","3_11","3_12","3_13","3_14","3_15","3_16","3_17","3_18","3_19","3_20","4_0","4_1","4_2","4_3","4_4","4_5","4_6","4_7","4_8","4_9","4_10","4_11","4_12","4_13","4_14","4_15","4_16","4_17","4_18","4_19","4_20","5_0","5_1","5_2","5_3","5_4","5_5","5_6","5_7","5_8","5_9","5_10","5_11","5_12","5_13","5_14","5_15","5_16","5_17","5_18","5_19","5_20","6_0","6_1","6_2","6_3","6_4","6_5","6_6","6_7","6_8","6_9","6_10","6_11","6_12","6_13","6_14","6_15","6_16","6_17","6_18","6_19","6_20","7_0","7_1","7_2","7_3","7_4","7_5","7_6","7_7","7_8","7_9","7_10","7_11","7_12","7_13","7_14","7_15","7_16","7_17","7_18","7_19","7_20","8_0","8_1","8_2","8_3","8_4","8_5","8_6","8_7","8_8","8_9","8_10","8_11","8_12","8_13","8_14","8_15","8_16","8_17","8_18","8_19","8_20","9_0","9_1","9_2","9_3","9_4","9_5","9_6","9_7","9_8","9_9","9_10","9_11","9_12","9_13","9_14","9_15","9_16","9_17","9_18","9_19","9_20","10_0","10_1","10_2","10_3","10_4","10_5","10_6","10_7","10_8","10_9","10_10","10_11","10_12","10_13","10_14","10_15","10_16","10_17","10_18","10_19","10_20","11_0","11_1","11_2","11_3","11_4","11_5","11_6","11_7","11_8","11_9","11_10","11_11","11_12","11_13","11_14","11_15","11_16","11_17","11_18","11_19","11_20","12_0","12_1","12_2","12_3","12_4","12_5","12_6","12_7","12_8","12_9","12_10","12_11","12_12","12_13","12_14","12_15","12_16","12_17","12_18","12_19","12_20","13_0","13_1","13_2","13_3","13_4","13_5","13_6","13_7","13_8","13_9","13_10","13_11","13_12","13_13","13_14","13_15","13_16","13_17","13_18","13_19","13_20","14_0","14_1","14_2","14_3","14_4","14_5","14_6","14_7","14_8","14_9","14_10","14_11","14_12","14_13","14_14","14_15","14_16","14_17","14_18","14_19","14_20","15_0","15_1","15_2","15_3","15_4","15_5","15_6","15_7","15_8","15_9","15_10","15_11","15_12","15_13","15_14","15_15","15_16","15_17","15_18","15_19","15_20","16_0","16_1","16_2","16_3","16_4","16_5","16_6","16_7","16_8","16_9","16_10","16_11","16_12","16_13","16_14","16_15","16_16","16_17","16_18","16_19","16_20","17_0","17_1","17_2","17_3","17_4","17_5","17_6","17_7","17_8","17_9","17_10","17_11","17_12","17_13","17_14","17_15","17_16","17_17","17_18","17_19","17_20","18_0","18_1","18_2","18_3","18_4","18_5","18_6","18_7","18_8","18_9","18_10","18_11","18_12","18_13","18_14","18_15","18_16","18_17","18_18","18_19","18_20","19_0","19_1","19_2","19_3","19_4","19_5","19_6","19_7","19_8","19_9","19_10","19_11","19_12","19_13","19_14","19_15","19_16","19_17","19_18","19_19","19_20","20_0","20_1","20_2","20_3","20_4","20_5","20_6","20_7","20_8","20_9","20_10","20_11","20_12","20_13","20_14","20_15","20_16","20_17","20_18","20_19","20_20","21_0","21_1","21_2","21_3","21_4","21_5","21_6","21_7","21_8","21_9","21_10","21_11","21_12","21_13","21_14","21_15","21_16","21_17","21_18","21_19","21_20"]', 'source_frames': '["upload_b380c5285f8da7aa2997bdba57cd4972"]'}


In [None]:
decoded_df = one_hot_decode_aa(best_predicted_SPs)

In [None]:
decoded_df

Unnamed: 0,0_0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,...,21_13,21_14,21_15,21_16,21_17,21_18,21_19,21_20,predictions,decoded_sequence
6909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108413,MMSANVWTLAYLVVSNAAALRE
8894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098545,MVNKVVGSTACAAKSVTTPYVE
6114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094471,MKFLALWLASYGAQATPTSLSE
6545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094046,MHLAPLPSTYLFTQSLATPTAE
5433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093624,MKTASWLATLVLATRLMTLLRE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063457,MQLKELTVAIYLSIALPAFDAE
4484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063449,MRGHLSSTGTCYLQFPLGATLE
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063448,MRLFGGLAALLGLASSISLQDE
6583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063445,MRVTIRAFAATSLLATASLTAE


In [None]:
best_predicted_SPs.to_csv('../data/05_best_signal_peptides/model_22_one_hot_encoded/encoded_i10000_npeptides5000.csv', index=False)
decoded_df.to_csv('../data/05_best_signal_peptides/model_22_one_hot_encoded/decoded_i10000_npeptides5000.csv', index=False)



In [None]:
#best_predicted_SPs = pd.read_csv('../data/05_best_signal_peptides/model_22_/100_i/best_signal_peptides_i_100.csv', index_col=False)

In [None]:
best_predicted_SPs.head(50)


Unnamed: 0,0_0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,...,21_13,21_14,21_15,21_16,21_17,21_18,21_19,21_20,predictions,decoded_sequence
6909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108413,MMSANVWTLAYLVVSNAAALRE
8894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098545,MVNKVVGSTACAAKSVTTPYVE
6114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094471,MKFLALWLASYGAQATPTSLSE
6545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094046,MHLAPLPSTYLFTQSLATPTAE
5433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093624,MKTASWLATLVLATRLMTLLRE
3254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093499,MKGSCIESTACIAISAAIPVTE
2526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093122,MRLAGLLDSLLLVFAAATPLAE
2742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09297,MKSRSSLATKSTLGSALTPLVE
8072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092966,MFTNLLLTSLPGAQSAAAYLGE
9575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092918,MTSKAFFFAALSVFSAAVPLRE


In [None]:
from Bio.SeqRecord import SeqRecord
predicted_signal_peptide_sequences = best_predicted_SPs['decoded_sequence'].to_list()
predicted_signal_peptide_sequences
#predicted_signal_peptide_sequences = [SeqRecord(peptide) for peptide in predicted_signal_peptide_sequences]

['MMSANVWTLAYLVVSNAAALRE',
 'MVNKVVGSTACAAKSVTTPYVE',
 'MKFLALWLASYGAQATPTSLSE',
 'MHLAPLPSTYLFTQSLATPTAE',
 'MKTASWLATLVLATRLMTLLRE',
 'MKGSCIESTACIAISAAIPVTE',
 'MRLAGLLDSLLLVFAAATPLAE',
 'MKSRSSLATKSTLGSALTPLVE',
 'MFTNLLLTSLPGAQSAAAYLGE',
 'MTSKAFFFAALSVFSAAVPLRE',
 'MKIAAMSLFMYGLLVWAQLTAE',
 'MKVFAAVLLLLLPTSALVALRE',
 'MFILAATMFVDTYLSASTLLAE',
 'MKTNSISSTLALAVSLESSLAE',
 'MKTKAALAVAYICQASAAAPAE',
 'MMFSYLLAALTAVTSYAVPLPE',
 'MLFSLSFSLAFSLVVAAAPLTE',
 'MFSSILSSLRSLLQAAEPALAE',
 'MKKLNLAAQAALLKSALAPYAE',
 'MRGNYAFSTLFALGSLAAHADE',
 'MPSTTSFTTLSLLQVPSAPAAE',
 'MKGFSAVDTASMKSALAAALVE',
 'MAPRAVSALQCAGQWVASNPDE',
 'MMIANLIALGPLLVPSAAYARE',
 'MLMNYTSTTATGCTPTPTAAAE',
 'MVRSAIALQGYLAQRAATAYLE',
 'MRRAHSTSLATLLQSSEPAAAE',
 'MATLASWAMSLAATVFVHNCVE',
 'MRGTYLTTLAAAALSGAAPQVE',
 'MKLFAALATLTIGVVPSTSLVE',
 'MRLFAKLTSLYLPQCVVVALTE',
 'MMLNAIISTLLLLLASWFAAAE',
 'MVRSSAALALLGATLLVAPLAE',
 'MKKNHTESSGLASASSFTFWHE',
 'MFGSNGLTSLTALVFLFTCARE',
 'MRNAPWLFLAATASPVAAPPAE',
 'MKIQAFFGMASFLVASETSLTE',
 

In [None]:
from Bio.Seq import Seq 

In [None]:
# add gfp
GFP = 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'
predicted_signal_peptide_sequences = [Seq(peptide+GFP)  for peptide in predicted_signal_peptide_sequences]


In [None]:
# Make to SeqRecord
predicted_signal_peptide_sequences = [SeqRecord(peptide)  for peptide in predicted_signal_peptide_sequences]
#predicted_signal_peptide_sequences = [predicted_signal_peptide_sequences[i].id = i  for peptide in range(len(predicted_signal_peptide_sequences))]

In [None]:
predicted_signal_peptide_sequences

[SeqRecord(seq=Seq('MMSANVWTLAYLVVSNAAALREMSKGEELFTGVVPILVELDGDVNGHKFSVSGE...LYK'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MVNKVVGSTACAAKSVTTPYVEMSKGEELFTGVVPILVELDGDVNGHKFSVSGE...LYK'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MKFLALWLASYGAQATPTSLSEMSKGEELFTGVVPILVELDGDVNGHKFSVSGE...LYK'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MHLAPLPSTYLFTQSLATPTAEMSKGEELFTGVVPILVELDGDVNGHKFSVSGE...LYK'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MKTASWLATLVLATRLMTLLREMSKGEELFTGVVPILVELDGDVNGHKFSVSGE...LYK'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MKGSCIESTACIAISAAIPVTEMSKGEELFTGVVPILVELDGDVNGHKFSVSGE...LYK'), id='<unknown id>', name='<unknown na

In [None]:
for i in range(len(predicted_signal_peptide_sequences)):
    predicted_signal_peptide_sequences[i].id = str(i+1)
    predicted_signal_peptide_sequences[i].name = f'sp{i+1:00}'

In [None]:
predicted_signal_peptide_sequences[i]

SeqRecord(seq=Seq('MRRLALCLLLMLSVGVRADQHEMSKGEELFTGVVPILVELDGDVNGHKFSVSGE...LYK'), id='5000', name='sp5000', description='<unknown description>', dbxrefs=[])

In [None]:
#for lenght in predicted_signal_peptide_sequences:
#    print(len(lenght))

In [None]:
from Bio import SeqIO

with open(f"../data/05_best_signal_peptides/best_signal_peptides_as_fasta/signal_peptides1.fasta", "w") as output_handle:
    SeqIO.write(predicted_signal_peptide_sequences, output_handle, "fasta")

In [None]:
predicted_signal_peptide_sequences

[SeqRecord(seq='MMSANVWTLAYLVVSNAAALRE', id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq='MVNKVVGSTACAAKSVTTPYVE', id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq='MKFLALWLASYGAQATPTSLSE', id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq='MHLAPLPSTYLFTQSLATPTAE', id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq='MKTASWLATLVLATRLMTLLRE', id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq='MKGSCIESTACIAISAAIPVTE', id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq='MRLAGLLDSLLLVFAAATPLAE', id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq='MKSRSSLATKSTLGSALTPLVE', id='<unknown id>', name='<un

In [None]:
# new_TO_NATURE_peptides = generate_artificial_peptides(list_of_probabilities, 
#                                                         amino_acids, 
#                                                         n_peptides=1000, 
#                                                         max_len=22)

# new_TO_NATURE_peptides_split = split_peptides_sequences(new_TO_NATURE_peptides)
# one_hot_peptides = one_hot_encode_aa(new_TO_NATURE_peptides_split)
# df_w_decoded_seqs = one_hot_decode_aa(one_hot_peptides)
# df_w_decoded_seqs

In [None]:
# decode_sequences = list(df_w_decoded_seqs['decoded_sequence'])
# ref_seqs =  list(new_TO_NATURE_peptides['sequence'])



# count = 0
# for seq in decode_sequences: 
#     if seq in ref_seqs: 
#         count += 1

# if count == len(decode_sequences): 
#     print('successful')


### Get the best peptide sequence

In [None]:
# list_of_aa = []
# for i in range(0,len(best_predicted_SPs)):
#     #signal_peptide = "".join(best_predicted_SPs.iloc[i][:-1].tolist())
#     peptide = best_predicted_SPs.iloc[i][:-1].tolist()

#     #peptide_wo_dash = [aa for aa in peptide if aa != '-']
#     #if '-' in peptide: 
#     list_of_aa.append(peptide)
    
# len(list_of_aa)

In [None]:
#list_of_aa.append(aa1)

In [None]:
#cols = [str(i) for i in range(0,22)]

In [None]:
#best_predicted_SPs['sequence'] = best_predicted_SPs[cols].agg(''.join, axis=1)
#best_predicted_SPs

In [None]:
#best_predicted_SPs = best_predicted_SPs.replace(to_replace = '-', value ='')
#best_predicted_SPs['sequence'] =  [string.replace("-", "") for string in list(best_predicted_SPs['sequence'])]

KeyError: 'sequence'

In [None]:
lenght_of_signal_peptides = []
for k,v in best_predicted_SPs.iterrows(): 
    #print(v['sequence'])
    lenght_of_signal_peptides.append(len(v['sequence']))
best_predicted_SPs['length'] = lenght_of_signal_peptides

In [None]:
best_predicted_SPs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,predictions,sequence,length
0,M,K,F,S,A,I,S,V,L,A,...,L,S,A,G,Q,Q,A,0.032961,MKFSAISVLACIGTSLSAGQQA,22
1,M,M,F,R,Y,I,F,L,G,V,...,A,A,T,A,A,V,A,0.032511,MMFRYIFLGVLALTAAATAAVA,22
2,M,L,F,L,A,A,L,T,L,A,...,G,A,A,A,L,I,P,0.032439,MLFLAALTLALCGYPGAAALIP,22
3,M,K,F,G,A,L,A,S,S,G,...,L,A,A,S,L,V,T,0.030866,MKFGALASSGAGALALAASLVT,22
4,M,R,V,S,A,A,L,T,I,G,...,Y,S,A,F,L,T,E,0.030141,MRVSAALTIGAAMLNYSAFLTE,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,M,H,F,L,A,T,V,S,L,L,...,A,V,A,A,L,H,A,0.019090,MHFLATVSLLCLASAAVAALHA,22
96,M,A,I,Q,A,P,F,S,W,G,...,R,A,A,D,L,A,A,0.019026,MAIQAPFSWGGPLAVRAADLAA,22
97,M,R,F,S,K,A,V,F,S,A,...,A,A,A,A,P,I,V,0.018995,MRFSKAVFSAGAGLSAAAAPIV,22
98,M,K,N,P,S,F,L,P,T,A,...,L,A,F,A,Q,N,A,0.018968,MKNPSFLPTACMLIALAFAQNA,22


In [None]:
SPs_22 = best_predicted_SPs[(best_predicted_SPs[['length']]<23).all(axis=1)]
SPs_22

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,predictions,sequence,length
0,M,L,R,F,S,A,S,P,L,G,...,,,,,,,,0.194617,MLRFSASPLGYKGQAAAADQ,20
1,M,M,R,F,S,W,I,A,F,L,...,,,,,,,,0.185066,MMRFSWIAFLSLLVATAP,18
2,M,V,A,A,S,A,S,F,L,I,...,,,,,,,,0.183458,MVAASASFLILSLVAYLAAQV,21
3,M,V,R,H,S,A,I,F,W,A,...,,,,,,,,0.183349,MVRHSAIFWAAALQVTLP,18
4,M,L,R,F,S,L,F,F,G,A,...,,,,,,,,0.178592,MLRFSLFFGACILQFHAP,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,M,L,R,F,A,L,L,L,F,L,...,,,,,,,,0.142601,MLRFALLLFLCSLLSAAA,18
95,M,V,S,V,I,L,I,G,F,L,...,,,,,,,,0.142263,MVSVILIGFLLGLYSTGATV,20
96,M,M,R,F,S,S,L,L,F,L,...,,,,,,,,0.142099,MMRFSSLLFLNMLLIAPAALI,21
97,M,L,R,F,M,V,S,A,M,A,...,,,,,,,,0.141988,MLRFMVSAMASALYALLASL,20


In [None]:
#SPs_22.to_csv('../data/05_best_signal_peptides/10_000/best_signal_peptides_i_10_000_w_sps_22.csv', index=False)


In [None]:
h2o.shutdown()

  h2o.shutdown()


H2O session _sid_a966 closed.
