# PAIRWISE DISTANCE
from collections import namedtuple

import numpy as np
import pandas as pd
import tcrdist as td
from tcrdist import mappers
from tcrdist.repertoire import TCRrep
# Read the tab-separated clones file into a DataFrame.
tcrdist_clone_fn = 'mouse_pairseqs_v1_parsed_seqs_probs_mq20_clones.tsv'
tcrdist_clone_df = pd.read_csv(tcrdist_clone_fn, sep="\t")

# Translate the clones table through the mapping supplied by tcrdist.mappers
# (presumably legacy clone-file column names -> tcrdist2 names; verify
# against the mappers module).
mapping = mappers.tcrdist_clone_df_to_tcrdist2_mapping
tcrdist2_df = mappers.generic_pandas_mapper(
    df=tcrdist_clone_df,
    mapping=mapping,
)

# Build the repertoire object and fill in the V-gene-derived CDR columns for
# both chains, alpha first and then beta.
tr = TCRrep(cell_df=tcrdist2_df, organism="mouse")
for _chain in ("alpha", "beta"):
    tr.infer_cdrs_from_v_gene(chain=_chain, imgt_aligned=True)

# Columns handed to the repertoire as its index columns before deduplicate()
# is called (presumably the columns that define a unique clone -- confirm
# against the TCRrep documentation).
tr.index_cols = [
    'clone_id', 'subject', 'epitope',
    'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',
    'cdr3_a_aa', 'cdr3_b_aa',
    'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
    'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
    'cdr3_b_nucseq', 'cdr3_a_nucseq',
    'va_countreps', 'ja_countreps',
    'vb_countreps', 'jb_countreps',
    'va_gene', 'vb_gene',
    'ja_gene', 'jb_gene',
]
tr.deduplicate()

# Run the legacy alpha/beta distance computation; the per-chain and paired
# distance attributes are read from the repertoire immediately afterwards.
tr._tcrdist_legacy_method_alpha_beta()
distA = tr.dist_a
distB = tr.dist_b

# Sanity check: the paired distance equals alpha + beta, element for element.
assert np.all((distA + distB) - tr.paired_tcrdist == 0)
# K NEAREST NEIGHBORS
# For every clone, look up its k nearest neighbours (by paired distance) among
# clones that come from a *different* subject, and record the neighbours'
# epitopes next to the clone's own observed epitope.
pr = namedtuple("perf", ["observed", "predicted", "dist"])

# NOTE: the spelling "obsereved" is kept on purpose -- it is a module-level
# name that code outside this section may reference.
obsereved = tr.clone_df.epitope.to_list()

# Pull the columns out once as NumPy arrays so that boolean masks and
# positional indices all refer to the same row order as the distance matrix
# (assumes paired_tcrdist rows/columns follow clone_df row order -- the
# original code relied on the same assumption).
subjects = tr.clone_df.subject.to_numpy()
epitopes = tr.clone_df.epitope.to_numpy()

performance = []
k = 5
# Iterate by position rather than by index label: `i` is used to index the
# distance matrix and the `obsereved` list, both of which are positional.
for i, subject in enumerate(subjects):
    # Hold out every clone from the same subject (this also drops clone i itself).
    ind = subjects != subject
    # Distances from clone i to the held-in clones only.
    distances = tr.paired_tcrdist[i, ind]
    # Epitopes must be restricted with the SAME mask: the argsort positions
    # below index into the held-in subset, not into the full clone table.
    candidate_epitopes = epitopes[ind]
    # Positions (within the subset) ordered from smallest to largest distance.
    sorted_indices = np.argsort(distances)
    nearest = sorted_indices[:k]
    # Epitopes and distances of the k nearest neighbours; fewer than k are
    # returned when the held-in subset is smaller than k.
    predicted = candidate_epitopes[nearest].tolist()
    predicted_distance = distances[nearest]
    performance.append(pr(obsereved[i], predicted, predicted_distance))

# Display a few records (only has a visible effect in an interactive session).
performance[1:10]
# Illustrative output showing the record format. These values were recorded
# before the neighbour-epitope lookup was restricted to the held-in subset, so
# the predicted labels from a fresh run will differ.
# [perf(observed='PA', predicted=['PA', 'm139', 'NP', 'PB1', 'M45'], dist=array([132., 150., 153., 165., 171.])),
# perf(observed='PA', predicted=['m139', 'PA', 'PA', 'PA', 'PA'], dist=array([36., 42., 48., 60., 66.])),
# perf(observed='PA', predicted=['PA', 'PA', 'PA', 'PA', 'PA'], dist=array([33., 38., 42., 42., 54.])),
# perf(observed='PA', predicted=['NP', 'PA', 'PA', 'PA', 'PA'], dist=array([84., 84., 90., 90., 96.])),
# perf(observed='PA', predicted=['PA', 'M38', 'PA', 'PA', 'PA'], dist=array([84., 84., 84., 84., 84.])),
# perf(observed='PA', predicted=['PA', 'PA', 'PA', 'PA', 'PA'], dist=array([24., 30., 32., 42., 45.])),
# perf(observed='PA', predicted=['PA', 'M45', 'PA', 'PA', 'PA'], dist=array([ 0., 53., 54., 56., 56.])),
# perf(observed='PA', predicted=['NP', 'PA', 'PA', 'PA', 'PA'], dist=array([116., 116., 116., 126., 126.])),
# perf(observed='PA', predicted=['PA', 'PA', 'PA', 'PA', 'PA'], dist=array([15., 32., 33., 51., 51.]))]