# Compute a reduced amino-acid alphabet

In [7]:
import pandas as pd
import pyrepseq as prs
import numpy as np

## Import data

In [37]:
# Directory containing the input CSV files (absolute, machine-specific path).
DATA_PATH = '/home/jhenderson/Documents/Projects/data_sets/tcr_sequences/CI_CRUK_datathon/'

# Background sequences; the scoring code below reads its 'CDR3A' and 'CDR3B' columns.
back = pd.read_csv(f'{DATA_PATH}olga_preprocessed.csv')

# Epitope-annotated sequences; 'TCR species', 'CDR3A', 'CDR3B' and 'epitope' are used below.
spc = pd.read_csv(f'{DATA_PATH}tcrictionary_tabular.csv')

# Loaded alongside the other tables; not referenced in the cells shown here.
tcr_info_scp = pd.read_csv(f'{DATA_PATH}tcr_info_specific.csv')

In [38]:
# Keep human TCRs only, restricted to the paired CDR3 columns and the epitope label,
# and drop any row missing one of those three values.
# NOTE: this overwrites `spc` and drops 'TCR species', so re-running this cell without
# reloading the data raises a KeyError.
spc = spc.loc[
    spc['TCR species'] == 'HomoSapiens',
    ['CDR3A', 'CDR3B', 'epitope'],
].dropna()

## Translation evaluation

In [40]:
# One-letter codes of the 20 standard amino acids; this is the alphabet being reduced.
aminoacids = 'ACDEFGHIKLMNPQRSTVWY'

In [41]:
def evaluate_translation(translation, spc, back):
    """Score a character translation applied to CDR3 sequences.

    Each CDR3 sequence is rewritten through ``translation`` and the score is::

        renyi2_entropy(back, alpha) + renyi2_entropy(back, beta)
            - renyi2_entropy(spc, alpha + "_" + beta, 'epitope')

    where every term is computed by ``prs.renyi2_entropy``. ``'epitope'`` is
    passed as the third positional argument (presumably the grouping column;
    confirm against the pyrepseq documentation).

    Parameters
    ----------
    translation : dict
        Character mapping in any form accepted by ``str.maketrans`` when
        called with a single argument (e.g. ``{'A': '1', 'C': '0', ...}``).
    spc : pandas.DataFrame
        Must provide the columns 'CDR3A', 'CDR3B' and 'epitope'.
    back : pandas.DataFrame
        Must provide the columns 'CDR3A' and 'CDR3B'.

    Returns
    -------
    The combined score, as returned by the arithmetic on
    ``prs.renyi2_entropy`` results.

    Notes
    -----
    Neither ``spc`` nor ``back`` is modified. The translated sequences live in
    temporary copies, so repeated calls (e.g. from a search loop) do not leave
    stale ``translation_*`` columns in the caller's frames.
    """
    translation_table = str.maketrans(translation)

    def translate(sequences):
        # Per-entry str.translate: a non-string entry (e.g. NaN) raises instead
        # of being carried silently into the entropy estimate.
        return sequences.apply(lambda seq: seq.translate(translation_table))

    # assign() returns new frames, leaving the caller's DataFrames untouched.
    back_translated = back.assign(
        translation_alpha=translate(back['CDR3A']),
        translation_beta=translate(back['CDR3B']),
    )
    spc_translated = spc.assign(
        translation_full=translate(spc['CDR3A']) + "_" + translate(spc['CDR3B']),
    )

    return (
        prs.renyi2_entropy(back_translated, 'translation_alpha')
        + prs.renyi2_entropy(back_translated, 'translation_beta')
        - prs.renyi2_entropy(spc_translated, 'translation_full', 'epitope')
    )

In [42]:
def make_translation_from_list(aa_to_be_one):
    """Build a binary translation over the ``aminoacids`` alphabet.

    Every character of the module-level ``aminoacids`` string becomes a key;
    its value is '1' when the character is contained in ``aa_to_be_one`` and
    '0' otherwise.
    """
    mapping = {}
    for residue in aminoacids:
        if residue in aa_to_be_one:
            mapping[residue] = '1'
        else:
            mapping[residue] = '0'
    return mapping

## Greedy approach

In [33]:
def greedy_algorithm():
    """Greedily grow the set of residues that are translated to '1'.

    Starting from the empty set, each round tries adding every character of
    the module-level ``aminoacids`` that is not yet selected, scores the
    resulting translation with ``evaluate_translation(translation, spc, back)``
    (using the module-level ``spc`` and ``back``), and keeps the single best
    addition. The search stops as soon as the best addition does not strictly
    improve on the current score, or once every character has been selected.

    Candidates with equal scores are resolved in favor of the one that comes
    first in ``aminoacids``.

    Returns
    -------
    tuple
        ``(selected, score)``: the selected characters in the order they were
        added, and the score reached with that selection.
    """
    best_amino_acids_to_be_one = []
    best_score = -np.inf

    # Each round adds exactly one residue, so the alphabet size bounds the
    # number of rounds (previously a hard-coded 20).
    for _ in range(len(aminoacids)):
        best_amino_acid_to_be_one = ""
        best_local_score = -np.inf

        for aa in aminoacids:
            if aa in best_amino_acids_to_be_one:
                continue

            translation = make_translation_from_list(best_amino_acids_to_be_one + [aa])
            score = evaluate_translation(translation, spc, back)

            # Strict comparison: the first of several equally good candidates wins.
            if score > best_local_score:
                best_amino_acid_to_be_one = aa
                best_local_score = score

        # No single addition improves the score: the current set is a local optimum.
        if best_local_score <= best_score:
            print("Locally optimal set found")
            return best_amino_acids_to_be_one, best_score

        print(f"Improvement found, new score: {best_local_score:.1f} bits")
        best_score = best_local_score
        best_amino_acids_to_be_one.append(best_amino_acid_to_be_one)

    return best_amino_acids_to_be_one, best_score

In [34]:
# Run the greedy search: `best` is the list of residues translated to '1', `score` the score reached.
best, score = greedy_algorithm()

Improvement found, new score: 12.4 bits
Improvement found, new score: 15.8 bits
Improvement found, new score: 18.2 bits
Improvement found, new score: 20.0 bits
Improvement found, new score: 21.0 bits
Improvement found, new score: 21.6 bits
Improvement found, new score: 21.7 bits
Improvement found, new score: 21.8 bits
Locally optimal set found


In [35]:
# Display the two classes of the binary alphabet (residues mapped to '1', then all others) and the score.
best, [aa for aa in aminoacids if aa not in best], score

(['G', 'S', 'Y', 'A', 'E', 'I', 'K', 'W'],
 ['C', 'D', 'F', 'H', 'L', 'M', 'N', 'P', 'Q', 'R', 'T', 'V'],
 np.float64(21.761432200096724))

In [43]:
# Character -> '0'/'1' mapping corresponding to the residues selected by the greedy search.
optimal_alphabet = make_translation_from_list(best)