# Train / test data split

This notebook performs two important tasks:
1. Generates negative training examples by randomly mispairing TCRs and epitopes from the positive training examples collated in the previous notebook.
2. Splits the positive and negative training examples into train and test subsets, ensuring randomisation and mutual exclusivity by TCR.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Set a random seed for reproducibility
seed = 3

# Read in the collated dataframe from the previous notebook as the set of positive training examples
path = 'data/input/collated/collated.csv'
positives = pd.read_csv(path, sep=',', dtype=str)

In [3]:
# Drop the 'source' column, which is not needed at this stage
positives.drop('source', axis=1, inplace=True)

In [4]:
positives

Unnamed: 0,cdr3a,cdr3b,epitope,hla,v_a,j_a,v_b,j_b
0,IVVRSSNTGKLI,ASSQDRDTQY,VMAPRTLIL,HLA-E*01:01,TRAV26-1*01,TRAJ37*01,TRBV14*01,TRBJ2-3*01
1,IVVRSSNTGKLI,ASSQDRDTQY,VMAPRTLIL,HLA-E*01:03,TRAV26-1*01,TRAJ37*01,TRBV14*01,TRBJ2-3*01
2,AVRPLLDGTYIPT,ASSYLGNTGELF,SLLMWITQC,HLA-A*02:01,TRAV21*01,TRAJ6*01,TRBV6-5*01,TRBJ2-2*01
3,IVWGGYQKVT,ASRYRDDSYNEQF,EENLLDFVRF,HLA-B*44:02,TRAV26-1*01,TRAJ13*01,TRBV7-9*01,TRBJ2-1*01
4,AVTTDSWGKLQ,ASRPGLAGGRPEQY,LLFGYPVFV,HLA-A*02:01,TRAV12-2*01,TRAJ24*02,TRBV6-5*01,TRBJ2-7*01
...,...,...,...,...,...,...,...,...
24032,CARSTDSWGKLQF,CASTPEGSYNEQFF,LLWNGPMAV,HLA-A*02:01,TRAV24*01,TRAJ24*01,TRBV7-6*01,TRBJ2-1*01
24033,CWSPFGNEKLTF,CASSLGQGSYEQYF,GTSGSPIVNR,HLA-A*11:01,TRAV8-2*01,TRAJ48*01,TRBV11-2*01,TRBJ2-7*01
24034,CWSPFGNEKLTF,CASSLGQGSYEQYF,GTSGSPIINR,HLA-A*11:01,TRAV8-2*01,TRAJ48*01,TRBV11-2*01,TRBJ2-7*01
24035,SSGNQFYF,CASSQQTGTIGGYTF,NLVPMVATV,HLA-A*02,TRAV21*01,TRAJ49*01,TRBV6-5*01,TRBJ1-2*01


### Create the negative set

In [5]:
# Collect a list of TCRs from the positive training examples
tcr = positives.drop('epitope', axis=1)

In [6]:
# Collect a list of epitopes from the positive training examples
negepitopes = pd.DataFrame(positives['epitope'].unique(), columns=['epitope'])
negepitopes

Unnamed: 0,epitope
0,VMAPRTLIL
1,SLLMWITQC
2,EENLLDFVRF
3,LLFGYPVFV
4,MLWGYLQYV
...,...
206,LPEPLPQGQLGAY
207,LPEPLPQGQLTGY
208,EFFWDANDIY
209,CPSQEPMSIYVY


In [7]:
# Determine the number of times to reuse each epitope to achieve a balanced dataset
n = round(len(tcr)/len(negepitopes))

negatives = pd.DataFrame(columns=positives.columns)

# Randomly mispair the TCRs and epitopes to generate the negative training examples
for i in range(n):
    batch = tcr.sample(negepitopes.size, random_state=seed+i)
    batch.reset_index(inplace=True, drop=True)
    batch['epitope'] = negepitopes['epitope']
    
    negatives = negatives.append(batch, ignore_index=True)

In [8]:
# Remove any negative examples generated that already exist in the positive set
intersection = pd.merge(positives, negatives, how='inner', on=['cdr3a', 'cdr3b', 'epitope'], suffixes=['', '_del'])
intersection.drop(intersection.columns[intersection.columns.str.contains('_del')], axis=1, inplace=True)
negatives = pd.concat([negatives, intersection, intersection]).drop_duplicates(keep=False)

In [9]:
# Add labels
positives['y'] = 1.
negatives['y'] = 0.

# Union the negatives and positives
data = positives.append(negatives, ignore_index=True)

# Shuffle the data
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

In [10]:
display(data)

Unnamed: 0,cdr3a,cdr3b,epitope,hla,v_a,j_a,v_b,j_b,y
0,CAVSPYNFNKFYF,CASSLAGFQETQYF,FLRGRAYGL,HLA-A*03:01,TRAV20*01,TRAJ21*01,TRBV11-2*01,TRBJ2-5*01,0.0
1,CAGPAGARLMF,CASSLYVNNGYTF,AVFDRKSDAK,HLA-A*11:01,TRAV35*01,TRAJ31*01,TRBV7-9*01,TRBJ1-2*01,1.0
2,CAVSDHSNNARLMF,CASSLGGTNEQFF,NLVPMVASV,HLA-B*08:01,TRAV8-4*01,TRAJ31*01,TRBV4-1*01,TRBJ2-1*01,0.0
3,CVVRSGGYNKLIF,CAITGENTGELFF,KLGGALQAK,HLA-A*03:01,TRAV10*01,TRAJ4*01,TRBV10-3*01,TRBJ2-2*01,1.0
4,CAMREFYNTDKLIF,CASSQDDHGSNEQYF,RAKFKQLL,HLA-B*08:01,TRAV14/DV4*01,TRAJ34*01,TRBV4-3*01,TRBJ2-7*01,1.0
...,...,...,...,...,...,...,...,...,...
47804,CAATRYNNNDMRF,CASTREGAGWSAQHF,YLSNIIPAL,HLA-A*03:01,TRAV13-1*01,TRAJ43*01,TRBV12-3*01,TRBJ1-5*01,0.0
47805,CAVKGTGGFKTIF,CASRSQGALYSNQPQHF,HSKKKCDEI,HLA-B*08:01,TRAV1-1*01,TRAJ9*01,TRBV28*01,TRBJ1-5*01,0.0
47806,CAVTPRNQFYF,CASSQDRVGAGANVLTF,KLGGALQAK,HLA-A*03:01,TRAV8-6*01,TRAJ49*01,TRBV4-1*01,TRBJ2-6*01,1.0
47807,CVVNEKEGNMLTF,CSVGEGLAYEQYF,LLWNGPMAV,HLA-A*02:01,TRAV12-1,TRAJ39,TRBV29-01,TRBJ2-7,1.0


In [11]:
# Split the train and test data ensuring that they do not include any of the same TCRs
tcr.drop_duplicates(inplace=True)
train_tcrs, test_tcrs = train_test_split(tcr, test_size=0.10, random_state=seed)

train = pd.merge(train_tcrs, data, how='inner')
test = pd.merge(test_tcrs, data, how='inner')

In [12]:
# Output the train and test sets
path = 'data/input/collated/train.csv'
train.to_csv(path, index=False)

path = 'data/input/collated/test.csv'
test.to_csv(path, index=False)