# Compute reduced alphabet 

In [1]:
import pandas as pd
import pyrepseq as prs
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Import data

In [2]:
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DATA_PATH = '/home/jhenderson/Documents/Projects/data_sets/tcr_sequences/CI_CRUK_datathon/'

# Background sequences, later passed as `back` to the scoring functions.
back = pd.read_csv(DATA_PATH + 'raw_data/olga_preprocessed.csv')

# Epitope-annotated sequences; only rows complete in the four columns below are kept.
# NOTE(review): this frame is called `test` although it is read from 'train.csv'.
test = pd.read_csv(DATA_PATH + 'processed_data/train.csv')
test = test[['CDR3A', 'CDR3B', 'epitope', 'Assays']].dropna().reset_index(drop=True)

tcr_info = pd.read_csv(DATA_PATH + 'raw_data/tcr_info_specific.csv')
# `columns=` is required: a bare mapping is applied to the index labels, which
# would leave the 'Epitope' column name unchanged.
tcr_info = tcr_info.rename(columns={'Epitope': 'epitope'})

# One-letter codes of the 20 residues considered when building reduced alphabets.
aminoacids = 'ACDEFGHIKLMNPQRSTVWY'

## Clean data

In [3]:
# Keep only the rows whose 'Assays' text contains 'mer'.
test = test.loc[test['Assays'].str.contains('mer')]

In [4]:
# Store CDR3 lengths as nullable integers so that missing sequences stay <NA>.
back.loc[:, 'CDR3A_length'] = back['CDR3A'].str.len().astype('Int64')
back.loc[:, 'CDR3B_length'] = back['CDR3B'].str.len().astype('Int64')

# Alpha chain: keep lengths 9-18 (inclusive); rows with a missing CDR3A pass through.
back = back[back['CDR3A_length'].isna() | back['CDR3A_length'].between(9, 18)]

# Beta chain: keep lengths 11-18 (inclusive); rows with a missing CDR3B pass through.
back = back[back['CDR3B_length'].isna() | back['CDR3B_length'].between(11, 18)]

In [5]:
# Store CDR3 lengths as nullable integers so that missing sequences stay <NA>.
tcr_info.loc[:, 'CDR3A_length'] = tcr_info['CDR3A'].str.len().astype('Int64')
tcr_info.loc[:, 'CDR3B_length'] = tcr_info['CDR3B'].str.len().astype('Int64')

# Alpha chain: keep lengths 9-18 (inclusive); rows with a missing CDR3A pass through.
tcr_info = tcr_info[tcr_info['CDR3A_length'].isna() | tcr_info['CDR3A_length'].between(9, 18)]

# Beta chain: keep lengths 11-18 (inclusive); rows with a missing CDR3B pass through.
tcr_info = tcr_info[tcr_info['CDR3B_length'].isna() | tcr_info['CDR3B_length'].between(11, 18)]

In [6]:
def get_middle_five_left(aa_string):
    """Return the central five characters of ``aa_string``, biased left for even lengths.

    For an odd length the window is centred on the middle character. An even
    length has no exact centre, so the window takes three characters from the
    left half and two from the right half.

    Parameters
    ----------
    aa_string : str or missing value
        Sequence to trim. Anything ``pd.isna`` reports as missing is returned
        unchanged.

    Returns
    -------
    str or missing value
        At most five characters; a string shorter than five characters is
        returned whole.
    """
    if pd.isna(aa_string):
        return aa_string

    length = len(aa_string)
    mid = length // 2
    # Even lengths shift the window one position to the left.
    start = mid - 3 if length % 2 == 0 else mid - 2
    # Clamp at zero: a negative start would wrap around and slice from the
    # end of a short string instead of from its beginning.
    start = max(start, 0)
    return aa_string[start:start + 5]


def get_middle_five_right(aa_string):
    """Return the central five characters of ``aa_string``, biased right for even lengths.

    The window always starts two characters before index ``len(aa_string) // 2``.
    For an odd length that centres it on the middle character; for an even
    length it takes two characters from the left half and three from the
    right half.

    Parameters
    ----------
    aa_string : str or missing value
        Sequence to trim. Anything ``pd.isna`` reports as missing is returned
        unchanged.

    Returns
    -------
    str or missing value
        At most five characters; a string shorter than five characters is
        returned whole.
    """
    if pd.isna(aa_string):
        return aa_string

    # Clamp at zero: a negative start would wrap around and slice from the
    # end of a short string instead of from its beginning.
    start = max(len(aa_string) // 2 - 2, 0)
    return aa_string[start:start + 5]

In [7]:
# Replace each background CDR3 by its central five residues (left-biased for even lengths).
back['CDR3A'] = back['CDR3A'].map(get_middle_five_left)
back['CDR3B'] = back['CDR3B'].map(get_middle_five_left)

In [8]:
# Replace each CDR3 in tcr_info by its central five residues (left-biased for even lengths).
tcr_info['CDR3A'] = tcr_info['CDR3A'].map(get_middle_five_left)
tcr_info['CDR3B'] = tcr_info['CDR3B'].map(get_middle_five_left)

## Translation evaluation

In [9]:
def evaluate_translation(translation, spc, back, translate_epitope=False):
    """Score a reduced alphabet on a background frame and an epitope-annotated frame.

    Both frames are copied, their 'CDR3A' and 'CDR3B' columns are re-encoded
    with ``translation``, and the score is assembled from four calls to
    ``prs.renyi2_entropy``: the two background values are added and the two
    values computed on ``spc`` together with an epitope column are subtracted.

    Parameters
    ----------
    translation : dict
        Character mapping in the form accepted by ``str.maketrans``.
    spc : pandas.DataFrame
        Frame with 'CDR3A', 'CDR3B' and 'epitope' columns.
    back : pandas.DataFrame
        Frame with 'CDR3A' and 'CDR3B' columns.
    translate_epitope : bool, default False
        When true, the 'epitope' column is re-encoded as well and the encoded
        column is used in place of the raw 'epitope' column.

    Returns
    -------
    The combined score described above.

    Notes
    -----
    The third argument of ``prs.renyi2_entropy`` is presumably the column to
    condition on - confirm against the pyrepseq documentation.
    """
    table = str.maketrans(translation)

    def reduce_sequence(sequence):
        return sequence.translate(table)

    # Work on copies so the caller's frames do not gain the helper columns.
    back = back.copy()
    spc = spc.copy()
    for frame in (back, spc):
        frame['translation_alpha'] = frame['CDR3A'].apply(reduce_sequence)
        frame['translation_beta'] = frame['CDR3B'].apply(reduce_sequence)

    if translate_epitope:
        spc['translation_epitope'] = spc['epitope'].apply(reduce_sequence)
        epitope_column = 'translation_epitope'
    else:
        epitope_column = 'epitope'

    return (
        prs.renyi2_entropy(back, 'translation_alpha')
        + prs.renyi2_entropy(back, 'translation_beta')
        - prs.renyi2_entropy(spc, 'translation_alpha', epitope_column)
        - prs.renyi2_entropy(spc, 'translation_beta', epitope_column)
    )
        

In [10]:
def make_translation_from_list(aa_to_be_one):
    """Build a two-letter translation of the module-level ``aminoacids`` alphabet.

    Parameters
    ----------
    aa_to_be_one : container of str
        Residues that should be encoded as '1'.

    Returns
    -------
    dict
        One entry per character of ``aminoacids``: '1' if the character is in
        ``aa_to_be_one``, otherwise '0'.
    """
    translation = {}
    for residue in aminoacids:
        if residue in aa_to_be_one:
            translation[residue] = '1'
        else:
            translation[residue] = '0'
    return translation

In [11]:
def greedy_algorithm(back, spc, translate_epitope=False):
    """Greedily grow the set of residues encoded as '1' while the score improves.

    Each round tries adding every not-yet-selected residue of ``aminoacids``
    to the current set, scores the resulting alphabet with
    ``evaluate_translation`` and keeps the best candidate. The search stops as
    soon as the best candidate of a round does not beat the current score.

    Parameters
    ----------
    back : pandas.DataFrame
        Background frame, forwarded to ``evaluate_translation``.
    spc : pandas.DataFrame
        Epitope-annotated frame, forwarded to ``evaluate_translation``.
    translate_epitope : bool, default False
        Forwarded to ``evaluate_translation``.

    Returns
    -------
    tuple
        ``(selected_residues, best_score)``: the residues in the order they
        were added, and the score of that set (``-inf`` if nothing was added).
    """
    best_amino_acids_to_be_one = []
    best_score = -np.inf

    # At most one residue is added per round, so the alphabet size bounds the
    # number of rounds (previously a hard-coded 20).
    for _ in range(len(aminoacids)):
        best_amino_acid_to_be_one = ""
        best_local_score = -np.inf

        for aa in aminoacids:
            if aa in best_amino_acids_to_be_one:
                continue

            translation = make_translation_from_list(best_amino_acids_to_be_one + [aa])
            score = evaluate_translation(translation, spc, back, translate_epitope)

            # Strict '>' keeps the earliest candidate when scores tie.
            if score > best_local_score:
                best_amino_acid_to_be_one = aa
                best_local_score = score

        if best_local_score <= best_score:
            print("Locally optimal set found")
            return best_amino_acids_to_be_one, best_score

        print(f"Improvement found, new score: {best_local_score:.1f} bits")
        best_score = best_local_score
        best_amino_acids_to_be_one.append(best_amino_acid_to_be_one)

    return best_amino_acids_to_be_one, best_score

## Training on training data

### Not translating the epitope

In [16]:
# Greedy search scored against the untranslated 'epitope' column (translate_epitope defaults to False).
best_1, score = greedy_algorithm(back, test)
# Display: residues encoded as '1', the remaining residues (encoded as '0'), and the final score.
best_1, [aa for aa in aminoacids if aa not in best_1], score

Improvement found, new score: 4.0 bits
Improvement found, new score: 5.6 bits
Improvement found, new score: 6.6 bits
Improvement found, new score: 6.9 bits
Improvement found, new score: 7.0 bits
Locally optimal set found


(['G', 'S', 'R', 'T', 'E'],
 ['A', 'C', 'D', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'V', 'W', 'Y'],
 np.float64(7.031338940358852))

In [18]:
# Encode every sequence column with the alphabet from `best_1`
# (residues in `best_1` become '1', all other residues become '0').
optimal_alphabet_1 = make_translation_from_list(best_1)
translation_table_1 = str.maketrans(optimal_alphabet_1)

back['translation_alpha'] = back['CDR3A'].map(lambda seq: seq.translate(translation_table_1))
back['translation_beta'] = back['CDR3B'].map(lambda seq: seq.translate(translation_table_1))

test['translation_alpha'] = test['CDR3A'].map(lambda seq: seq.translate(translation_table_1))
test['translation_beta'] = test['CDR3B'].map(lambda seq: seq.translate(translation_table_1))
test['translation_epitope'] = test['epitope'].map(lambda seq: seq.translate(translation_table_1))

# Score of this alphabet when the epitope is encoded with it as well.
(
    prs.renyi2_entropy(back, 'translation_alpha')
    + prs.renyi2_entropy(back, 'translation_beta')
    - prs.renyi2_entropy(test, 'translation_alpha', 'translation_epitope')
    - prs.renyi2_entropy(test, 'translation_beta', 'translation_epitope')
)

np.float64(2.786727214329384)

In [19]:
# Frequency of each encoded epitope string currently stored in `test`.
test['translation_epitope'].value_counts()

translation_epitope
11000    24495
01000     6695
00000     5278
01010     3166
10010     1188
10000     1090
00110      935
01001      934
10100      912
00001      843
00100      758
00010      515
01110      468
00011      299
01100      275
11100      208
01111      127
11001      108
10111      108
00101       91
11101       91
11010       84
10101       73
10011       72
01011       59
10110       51
10001       45
00111       20
01101       16
11110       10
11111        8
11011        2
Name: count, dtype: int64

### Translating the epitope in training

In [20]:
# Repeat the greedy search, this time scoring against the encoded epitope column.
best_2, score = greedy_algorithm(back, test, translate_epitope=True)

Improvement found, new score: 2.8 bits
Improvement found, new score: 3.4 bits
Improvement found, new score: 4.6 bits
Locally optimal set found


In [25]:
# Display: residues encoded as '1', the remaining residues, and the current value of `score`.
best_2, [aa for aa in aminoacids if aa not in best_2], score

(['G', 'S', 'R'],
 ['A',
  'C',
  'D',
  'E',
  'F',
  'H',
  'I',
  'K',
  'L',
  'M',
  'N',
  'P',
  'Q',
  'T',
  'V',
  'W',
  'Y'],
 np.float64(4.649510063613695))

In [26]:
# Frequency of each untranslated epitope string.
test['epitope'].value_counts()

epitope
GGALQ    24201
LGFVF     4734
DRKSD     2984
KFKQL     2459
TDFSV     1083
         ...  
WDHNP        1
TESLH        1
SYTPV        1
SSHLF        1
TAMDI        1
Name: count, Length: 946, dtype: int64

In [27]:
# NOTE(review): the execution counts show this cell (In [27]) ran after the In [24]
# cell that appears further down, so the stored output reflects `translation_table_2`.
# On a top-to-bottom run it would instead show the `translation_table_1` encoding.
test['translation_epitope'].value_counts()

translation_epitope
11000    24365
00000     7943
01000     5666
01010     3308
10000     1812
00100     1615
00010     1499
01001      839
00001      485
00110      408
10100      209
11100      180
00111      124
01100       94
11001       75
10010       70
10011       61
00011       55
10101       51
10110       50
10001       33
00101       21
01101       17
11101       14
11010       14
10111        6
11111        2
01110        2
11011        2
11110        2
01111        1
01011        1
Name: count, dtype: int64

In [24]:
# Encode every sequence column with the alphabet from `best_2`
# (residues in `best_2` become '1', all other residues become '0').
optimal_alphabet_2 = make_translation_from_list(best_2)
translation_table_2 = str.maketrans(optimal_alphabet_2)

back['translation_alpha'] = back['CDR3A'].map(lambda seq: seq.translate(translation_table_2))
back['translation_beta'] = back['CDR3B'].map(lambda seq: seq.translate(translation_table_2))

test['translation_alpha'] = test['CDR3A'].map(lambda seq: seq.translate(translation_table_2))
test['translation_beta'] = test['CDR3B'].map(lambda seq: seq.translate(translation_table_2))
test['translation_epitope'] = test['epitope'].map(lambda seq: seq.translate(translation_table_2))

# Score of this alphabet when the epitope is encoded with it as well.
(
    prs.renyi2_entropy(back, 'translation_alpha')
    + prs.renyi2_entropy(back, 'translation_beta')
    - prs.renyi2_entropy(test, 'translation_alpha', 'translation_epitope')
    - prs.renyi2_entropy(test, 'translation_beta', 'translation_epitope')
)

np.float64(4.649510063613695)

## Training on Minervina and Dash