# Setup

Libraries import

In [None]:
import os
import random

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

Set the paths of the input datasets

In [None]:
# Root folder of the project; every other path below is derived from it.
project_dir = "/path/to/project/root"
data_dir = f"{project_dir}/data"

# Input CSV files of the initial CARMEN dataset (training split, test split
# and the table with the protein sequences).
df_train_file = f"{data_dir}/datasets/carmen_initial_dataset/training_set.csv"
df_test_file = f"{data_dir}/datasets/carmen_initial_dataset/test_set.csv"
protein_sequence_file = f"{data_dir}/datasets/carmen_initial_dataset/protein_sequences.csv"

Select experiment type and configuration

In [None]:
# Available experiments: name -> (data_balanced, n_false_peps).
#   data_balanced: when True, duplicated peptides are dropped from the training
#                  and test sets before the false peptides are generated.
#   n_false_peps:  value passed as `num_false` to the false-peptide generator
#                  (false peptides drawn for every true peptide / HLA pair).
EXPERIMENT_PRESETS = {
    "carmen_balanced_1_1": (True, 1),
    "carmen_balanced_1_5": (True, 5),
    "carmen_unbalanced_1_1": (False, 1),
    "carmen_unbalanced_1_5": (False, 5),
}

# Change only this line to switch experiment: the two settings are looked up
# from the table, so they can never get out of sync with the experiment name.
exp_name = "carmen_balanced_1_1"
data_balanced, n_false_peps = EXPERIMENT_PRESETS[exp_name]

In [None]:
# Every generated file is written to a dataset folder named after the experiment.
output_dir = data_dir + f"/datasets/{exp_name}"
output_test_file = output_dir + "/test_set.csv"
output_train_file = output_dir + "/training_set.csv"

# exist_ok=True keeps the cell re-runnable and avoids the race between checking
# for the folder and creating it.
os.makedirs(output_dir, exist_ok=True)

# Step 1 - Loading the training and the test set

Import the training and the test dataset from the CARMEN dataset

In [None]:
# Read both splits with the same (default) CSV options.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (df_train_file, df_test_file)
)

Adding protein sequences

In [None]:
# The sequence table identifies proteins in a `name` column; rename it so it
# can be used as the join key with the peptide tables.
protein_sequence = pd.read_csv(protein_sequence_file).rename(columns={'name': 'Protein_id'})

# Left joins: every peptide row is kept, even when its protein has no entry
# in the sequence table.
df_train = df_train.merge(protein_sequence, how="left", on=["Protein_id"])
df_test = df_test.merge(protein_sequence, how="left", on=["Protein_id"])

Cleaning step

Delete duplicated peptides within each dataset, keeping the first occurrence. This step is applied only when `data_balanced` is `True`.

In [None]:
if data_balanced:
    # Balanced experiments keep a single row (the first one) per peptide.
    # The sizes before and after are printed, first for the training set and
    # then for the test set.
    print(f'Dataset size with repetition --> {len(df_train)}')
    df_train = df_train.drop_duplicates(subset='peptide', ignore_index=True)
    print(f'Dataset size without repetition --> {len(df_train)}')

    print(f'Dataset size with repetition --> {len(df_test)}')
    df_test = df_test.drop_duplicates(subset='peptide', ignore_index=True)
    print(f'Dataset size without repetition --> {len(df_test)}')

In [None]:
# Anti-join: keep only the training rows whose peptide does not appear in the
# test set, then restore the original column names and drop exact duplicates.
df_train = (
    df_train
    .merge(df_test, how='outer', on=['peptide'], indicator=True)
    # "left_only" marks the peptides found in the training set only.
    .query('_merge == "left_only"')
    # The merge suffixed the columns shared by both frames; keep the training side.
    .rename(columns={
        'HLA_x': 'HLA',
        'Netmhcpan_binder_x': 'Netmhcpan_binder',
        'Protein_id_x': 'Protein_id',
        'sequence_x': 'sequence',
    })
    .drop(columns=['HLA_y', '_merge', 'Netmhcpan_binder_y', 'Protein_id_y', 'sequence_y'])
    .drop_duplicates(keep='first', ignore_index=True)
)

# Step 2 - Fake peptides generation

In [5]:
# Distinct peptides of each split (np.unique also returns them sorted).
unique_train = np.unique(df_train["peptide"])
unique_test = np.unique(df_test["peptide"])

# Union of the two splits; the second np.unique removes any peptide that is
# present in both arrays.
peptides = np.concatenate([unique_train, unique_test], axis=None)
UNIQUE_PEPS = np.unique(peptides)

Save all the unique peptides in a dictionary

In [6]:
# Lookup table of every known (true) peptide. Only key membership is used when
# candidates are validated; the '1' value is a placeholder.
dictio_pepts = dict.fromkeys(UNIQUE_PEPS, '1')

Function: `false_possible_indicies`

Purpose:
* Generates random start and end indices for a potential false peptide sequence within a given sequence.
* Tries to avoid indices that overlap with the first occurrence of the true peptide in the sequence.

Parameters:
* `sequence`: The input sequence.
* `true_peptide`: The true peptide sequence within the input sequence.

Returns:
* A tuple containing the start and end indices of the potential false peptide sequence.

In [None]:
def false_possible_indicies(sequence, true_peptide):
    """Draw a random window of ``len(true_peptide)`` residues inside ``sequence``.

    The window is sampled uniformly among the start positions that do not
    overlap the first occurrence of ``true_peptide`` in ``sequence``. The
    previous implementation re-drew only once after an overlap, so an
    overlapping window could still be returned, and it could never select the
    last window of the sequence.

    Best-effort fallback: when no non-overlapping window exists (the sequence
    is too short around the peptide) or the peptide is not found in the
    sequence, any window that fits in the sequence is returned.

    Parameters:
        sequence: The protein sequence (string) to sample from.
        true_peptide: The true peptide; only its first occurrence in
            ``sequence`` is excluded.

    Returns:
        A ``(false_start, false_end)`` tuple such that
        ``sequence[false_start:false_end]`` has the length of ``true_peptide``.

    Raises:
        ValueError: If ``true_peptide`` is longer than ``sequence``.
    """
    len_peptide = len(true_peptide)

    # Last start position whose window still fits inside the sequence.
    last_start = len(sequence) - len_peptide
    if last_start < 0:
        raise ValueError("true_peptide is longer than sequence")

    start_index = sequence.find(true_peptide)
    end_index = start_index + len_peptide

    # Non-overlapping starts form two contiguous runs:
    #   left  -> [0, start_index - len_peptide]   (window ends before the peptide)
    #   right -> [end_index, last_start]          (window starts after the peptide)
    n_left = n_right = 0
    if start_index >= 0 and len_peptide > 0:
        n_left = max(0, start_index - len_peptide + 1)
        n_right = max(0, last_start - end_index + 1)

    n_valid = n_left + n_right
    if n_valid == 0:
        # Nothing to avoid (peptide absent / empty) or no room to avoid it.
        false_start = random.randint(0, last_start)
    else:
        # Map a uniform draw over both runs back to a start position.
        pick = random.randrange(n_valid)
        false_start = pick if pick < n_left else end_index + (pick - n_left)

    return false_start, false_start + len_peptide

Function: `create_false_string`

Purpose:
* Creates a false peptide sequence based on the provided start and end indices.
* Checks if the generated false peptide contains invalid characters ('U', 'X', '*') or if it's already present in a dictionary of known peptides.

Parameters:
* `sequence`: The input sequence.
* `false_start`: The starting index of the potential false peptide.
* `false_end`: The ending index of the potential false peptide.

Returns:
* A tuple containing the generated false peptide string and a flag indicating its validity:
  * 0: The false peptide is invalid (contains invalid characters or is already in the dictionary).
  * 1: The false peptide is valid.

In [None]:
def create_false_string(sequence, false_start, false_end):
    """Extract a candidate false peptide and tell whether it can be used.

    Parameters:
        sequence: The protein sequence the candidate is cut from.
        false_start: Start index of the candidate (inclusive).
        false_end: End index of the candidate (exclusive).

    Returns:
        A ``(candidate, flag)`` tuple. ``flag`` is 0 when the candidate
        contains one of the characters 'U', 'X' or '*', or when it is a key of
        the module-level ``dictio_pepts`` table; otherwise ``flag`` is 1.
    """
    candidate = "".join(sequence[false_start:false_end])

    # Reject candidates with an invalid character before looking at the table.
    if any(symbol in candidate for symbol in "UX*"):
        return candidate, 0

    # A candidate equal to a known peptide is not a valid false example.
    return candidate, int(candidate not in dictio_pepts)

Function: `false_peptide`

Purpose:
* Generates a valid false peptide sequence within a given sequence, ensuring it doesn't overlap with the true peptide and doesn't contain invalid characters or exist in a known peptide dictionary.

Parameters:
* `sequence`: The input sequence.
* `true_peptide`: The true peptide sequence within the input sequence.

Returns:
* A valid false peptide sequence.

In [None]:
def false_peptide(sequence, true_peptide):
    """Return a false peptide cut from ``sequence``.

    Candidate windows are drawn with ``false_possible_indicies`` and checked
    with ``create_false_string`` until one is flagged as valid (flag == 1).
    The loop has no attempt limit.

    Parameters:
        sequence: The protein sequence to sample from.
        true_peptide: The true peptide found in ``sequence``.

    Returns:
        The first candidate string flagged as valid.
    """
    while True:
        window_start, window_end = false_possible_indicies(sequence, true_peptide)
        candidate, is_valid = create_false_string(sequence, window_start, window_end)
        if is_valid == 1:
            return candidate

Function: `create_dataframe_with_false_example`

Purpose:
* Augments an existing DataFrame with additional rows representing false peptide examples.
* For each peptide in the original DataFrame, it generates a specified number of false peptide sequences based on the given sequence and true peptide.
* Assigns appropriate labels, HLA values, and NetMHCpan binder predictions to these new rows.

Parameters:
* `df`: The original DataFrame containing peptide, sequence, HLA, and NetMHCpan_binder information.
* `num_false`: The number of false peptide examples to generate for each original peptide.

Returns:
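* A new DataFrame with the columns `peptide`, `label`, `HLA` and `Netmhcpan_binder`, containing the generated false peptides (label 0) and the true peptides (label 1), with one row per HLA. Rows whose protein sequence is missing or not longer than 50 characters are skipped.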


In [None]:
def create_dataframe_with_false_example(df, num_false):
    """Build a labelled dataset of true peptides and generated false peptides.

    Only rows whose ``sequence`` is a string longer than 50 characters are
    used. The ``HLA`` value of such a row is split on ','; for every resulting
    HLA, ``num_false`` false peptides are drawn with ``false_peptide`` (label
    0, binder 'F') and the true peptide is added once (label 1, binder taken
    from ``Netmhcpan_binder``). When ``num_false`` is 0 nothing is emitted,
    true peptides included.

    Parameters:
        df: DataFrame with the columns ``peptide``, ``sequence``, ``HLA`` and
            ``Netmhcpan_binder``.
        num_false: Number of false peptides generated for each true peptide
            and HLA.

    Returns:
        A DataFrame with the columns ``peptide``, ``label``, ``HLA`` and
        ``Netmhcpan_binder``. For every used input row the false examples
        come first, followed by the true examples.
    """
    peptides = []
    labels = []
    hla_values = []
    binders = []

    # Evaluated once: also validates num_false before any row is processed.
    repeats = range(num_false)

    # Iterate the four columns in lockstep instead of building a row Series
    # with df.iloc[...] four times per row.
    rows = zip(df['peptide'], df['sequence'], df['HLA'], df['Netmhcpan_binder'])
    for peptide, sequence, hla_value, pan in tqdm(rows, total=len(df)):
        # Rows without a usable protein sequence yield neither false nor true
        # examples; the same holds for every row when no false peptide is requested.
        if not repeats or not (isinstance(sequence, str) and len(sequence) > 50):
            continue

        hlas = hla_value.split(',')

        # False examples: one independent draw per repetition and per HLA.
        for _ in repeats:
            for hla in hlas:
                peptides.append(false_peptide(sequence, peptide))
                labels.append(0)
                hla_values.append(hla)
                binders.append('F')

        # True examples: the original peptide, once per HLA.
        for hla in hlas:
            peptides.append(peptide)
            labels.append(1)
            hla_values.append(hla)
            binders.append(pan)

    return pd.DataFrame(data={
        'peptide': peptides,
        'label': labels,
        'HLA': hla_values,
        'Netmhcpan_binder': binders,
    })

# Step 3 - Call the functions and generate the datasets

In [None]:
# The false peptides are drawn with Python's `random` module: seed it right
# before the generation so that re-running the notebook rebuilds the same
# datasets.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

df_train_with_false = create_dataframe_with_false_example(df_train, n_false_peps)
df_test_with_false = create_dataframe_with_false_example(df_test, n_false_peps)

Deletion of common peptides in both datasets

All records containing the pair `<peptide, label>` also present in the test set are removed from the train set

In [None]:
# Anti-join on the (peptide, label) pair: keep only the training rows whose
# pair does not appear in the test set, then restore the column names and drop
# exact duplicates.
df_train_with_false = (
    df_train_with_false
    .merge(df_test_with_false, how='outer', on=['peptide', 'label'], indicator=True)
    # "left_only" marks the pairs found in the training set only.
    .query('_merge == "left_only"')
    # The merge suffixed the columns shared by both frames; keep the training side.
    .rename(columns={'HLA_x': 'HLA', 'Netmhcpan_binder_x': 'Netmhcpan_binder'})
    .drop(columns=['HLA_y', '_merge', 'Netmhcpan_binder_y'])
    .drop_duplicates(keep='first', ignore_index=True)
)

Save the new datasets

In [14]:
# Write the training set first, then the test set, as UTF-8 CSV files without
# the index column.
for dataset, destination in (
    (df_train_with_false, output_train_file),
    (df_test_with_false, output_test_file),
):
    dataset.to_csv(destination, encoding='utf-8', index=False)