# Instructions for tcrdist2

2019-06-19 

# Recommended Installation Method

the development version of tcrdist2 can be installed via pip:

```bash
pip install git+https://github.com/kmayerb/tcrdist2.git@API2
```

however, it is highly recommended that you run **tcrdist2**
within a [python virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Using a virtual env isolates the program's dependencies so that installing legacy packages for 
python (2.7.11) -- numpy (1.10.1), scipy (0.16.0), scikit-learn (0.17.1), and matplotlib (1.4.3) --
does not interfere with any of your other ongoing python projects. Setting up a virtual env takes less than
5 minutes using the commands below. To configure your machine to run **tcrdist2** using the correct dependencies,use the [*requirements.txt*](https://github.com/kmayerb/tcrdist2/blob/API2/requirements.txt)
file provided in the tcrdist2 github repository. We assume that you have a working version of condas installed or know how to install python 2.7.11. 

```bash
conda create -n py27v python=2.7.11 pip virtualenv
conda activate py27v
virtualenv venv
conda deactivate
conda deactivate
source ./venv/bin/activate
pip install -r requirements.txt
pip install git+https://github.com/kmayerb/tcrdist2.git@API2
```

- Using condas, install a base python interpretor (Python version 2.7.11) with pip and virtualenv. 
- Activate it: **conda activate py27v**
- Make a virtual env that will contain all of tcrdists dependencies: **virtualenv venv**
- Deactivate condas env (twice to deactivate py27v and base) : **conda deactivate**
- Source venv : **source ./venv/bin/activate.**
- pip install all tcrdists dependencies **pip install -r requirements.txt**
- pip install tcrdist2
- OPTIONAL: Install Blast Tools (see section below)

# Usage

In [52]:
import pandas as pd
import tcrdist as td
from tcrdist import pairwise

## single tcr sequence example

In [3]:
betaNT = 'CGGGGGGGGTACCNTTGNTTAGGTCCTCTACACGGTTAACCTGGTCCCCGAACCGAAGGTCAATAGGGCCTGTATACTGCTGGCACAGAAGTACACAGCTGAGTCCCTGGGTTCTGAGGGCTGGATCTTCAGAGTGGAGTCANN'
betaQuals = '12.12.12.12.12.22.9.8.6.6.6.8.3.0.3.10.3.0.3.10.10.11.20.25.30.37.37.29.27.14.14.15.27.30.41.47.36.50.50.50.42.42.57.57.43.47.53.47.47.47.47.47.47.50.54.57.57.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.57.57.57.57.59.59.59.57.57.57.57.57.57.57.57.59.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.59.59.59.59.59.57.57.57.59.57.57.43.37.28.28.21.28.23.37.28.30.15.19.17.15.21.20.25.3.0.0'
chain = td.processing.processNT(organism = 'human', chain = 'B', nuc = betaNT, quals = betaQuals, use_parasail = True)

In [4]:
pd.DataFrame(chain)

Unnamed: 0,b_good_hits,b_status,cdr3b,cdr3b_nucseq,cdr3b_plus,cdr3b_quals,jb_alignlen,jb_bitscore_gap,jb_blast_hits,jb_countreps,...,vb_bitscore_gap,vb_blast_hits,vb_countreps,vb_evalue,vb_gene,vb_genes,vb_mismatches,vb_mm,vb_rep,vb_reps
0,"[(TRBV12-3*01, 64, 1.60381089055e-28), (TRBV12...",OK,CASSIQALLTF,tgtgccagcagtatacaggccctattgaccttc,CASSIQALLTF-tgtgccagcagtatacaggccctattgaccttc,57.57.57.57.57.57.59.59.59.57.57.57.57.68.68.6...,9,20,TRBJ1-2*01:29;TRBJ1-6*02:9;TRBJ1-6*01:9;TRBJ1-...,TRBJ1-2,...,0,TRBV12-3*01:64;TRBV12-4*01:64;TRBV12-4*02:60;T...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,1,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01
1,"[(TRBJ1-2*01, 29, 2.54366564738e-13), (TRBJ1-6...",OK,CASSIQALLTF,tgtgccagcagtatacaggccctattgaccttc,CASSIQALLTF-tgtgccagcagtatacaggccctattgaccttc,57.57.57.57.57.57.59.59.59.57.57.57.57.68.68.6...,9,20,TRBJ1-2*01:29;TRBJ1-6*02:9;TRBJ1-6*01:9;TRBJ1-...,TRBJ1-2,...,0,TRBV12-3*01:64;TRBV12-4*01:64;TRBV12-4*02:60;T...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,17,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01


## multiple tcr sequences example

### readPairedSequences - Parasail

In [76]:
psDfp = td.processing.readPairedSequences(paired_seqs_file = "tcrdist/datasets/test_human_pairseqs.tsv", 
                                         organism = "human", use_parasail = True);

RESULTS BASED ON PARASAIL



In [95]:
for i in range(1, len(psDfp.columns)):
    print(psDfp.columns[i])
    print("\t" , psDfp.iloc[0,i])

epitope
('\t', 'pp65')
subject
('\t', 'human_subject0010')
a_good_hits
('\t', [[('TRAV35*01', 262, 1.6400071495711529e-114), ('TRAV35*02', 261, 4.4580016332221818e-114)], [('TRAJ42*01', 53, 9.6026800545086212e-24), ('TRAJ30*01', 16, 1.1253517471925914e-07), ('TRAJ40*01', 14, 8.3152871910356661e-07), ('TRAJ32*01', 13, 2.2603294069810513e-06), ('TRAJ32*02', 13, 2.2603294069810513e-06), ('TRAJ29*01', 12, 6.144212353328203e-06), ('TRAJ6*01', 11, 1.6701700790245646e-05), ('TRAJ4*01', 11, 1.6701700790245646e-05), ('TRAJ34*01', 11, 1.6701700790245646e-05), ('TRAJ53*01', 11, 1.6701700790245646e-05), ('TRAJ56*01', 11, 1.6701700790245646e-05), ('TRAJ37*01', 10, 4.5399929762484834e-05), ('TRAJ31*01', 10, 4.5399929762484834e-05), ('TRAJ17*01', 10, 4.5399929762484834e-05), ('TRAJ14*01', 10, 4.5399929762484834e-05), ('TRAJ57*01', 10, 4.5399929762484834e-05), ('TRAJ3*01', 10, 4.5399929762484834e-05), ('TRAJ18*01', 10, 4.5399929762484834e-05), ('TRAJ50*01', 9, 0.00012340980408667953), ('TRAJ17*01', 9,

### readPairedSequences - BLAST (to compare with tcrdist1)

In [8]:
psDfb = td.processing.readPairedSequences(paired_seqs_file = "tcrdist/datasets/test_human_pairseqs.tsv", 
                                         organism = "human", use_parasail = False);


RESULTS BASED ON BLAST


In [12]:
psDfp[['cdr3a','cdr3b']] == psDfb[['cdr3a','cdr3b']]

Unnamed: 0,cdr3a,cdr3b
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True
5,True,True
6,True,True
7,True,True
8,True,True
9,True,True


In [43]:
psDfp.head()

Unnamed: 0,id,epitope,subject,a_good_hits,a_status,cdr3a,cdr3a_nucseq,cdr3a_plus,cdr3a_quals,ja_alignlen,...,vb_countreps,vb_evalue,vb_gene,vb_genes,vb_mismatches,vb_mm,vb_rep,vb_reps,organism,TCRID
0,human_tcr0001,pp65,human_subject0010,"[[(TRAV35*01, 262, 1.64000714957e-114), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.68.68.68.68.68.62.62.62.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ17*01;TRAJ29*01;TRA...
1,human_tcr0002,pp65,human_subject0010,"[[(TRAV35*01, 263, 6.03324913701e-115), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.68.68.68.68.68.62.62.68.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01|TRB...
2,human_tcr0003,pp65,human_subject0010,"[[(TRAV35*01, 263, 6.03324913701e-115), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.62.68.68.68.68.62.62.62.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ13*01;TRAJ29*01;TRA...
3,human_tcr0004,pp65,human_subject0010,"[[(TRAV35*02, 266, 3.00377787265e-116), (TRAV3...",OK,CAGPRETSYDKVIF,tgtgctgggccccgtgaaacctcctacgacaaggtgatattt,CAGPRETSYDKVIF-tgtgctgggccccgtgaaacctcctacgaca...,68.68.62.62.62.62.62.62.68.68.68.68.68.68.68.6...,10,...,TRBV12-3;TRBV12-4,5.900091e-29,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGPRETSYDKVIF|TRAJ14*01|TRBV12-3*01...
4,human_tcr0005,pp65,human_subject0010,"[[(TRAV35*01, 267, 1.10502812519e-116), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.68.68.68.68.68.62.62.62.62.62.62.68.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01|TRB...


### computeProbs

In [73]:
probDfp = td.processing.computeProbs(psDfp)

In [15]:
probDfp.head()

Unnamed: 0_level_0,a_indels,a_nucseq_prob,a_protseq_prob,b_indels,b_nucseq_prob,b_protseq_prob,cdr3a_new_nucseq,cdr3a_protseq_masked,cdr3a_protseq_prob,cdr3b_new_nucseq,cdr3b_protseq_masked,cdr3b_protseq_prob,ja_rep_prob,jb_rep_prob,va_rep_prob,vb_rep_prob
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,+3-14,1.549461e-09,1.681512e-08,+10-26,1.241938e-16,7.976867e-14,agc,---qA-------,3.311309e-05,at+++++ccctattg,----I-ALL--,2.170112e-11,0.023313,0.090738,0.021782,0.04051
1,+3-14,1.549461e-09,1.681512e-08,+10-26,1.241938e-16,7.976867e-14,agc,---qA-------,3.311309e-05,at+++++ccctattg,----I-ALL--,2.170112e-11,0.023313,0.090738,0.021782,0.04051
2,+3-14,1.549461e-09,1.681512e-08,+10-26,1.241938e-16,7.976867e-14,agc,---qA-------,3.311309e-05,at+++++ccctattg,----I-ALL--,2.170112e-11,0.023313,0.090738,0.021782,0.04051
3,+10-3,6.057827e-16,7.693606e-14,+3-16,5.379021e-10,6.872967e-09,gggccccgtg,--GPRe--------,1.713533e-07,c+++gt,----say-----,1.869795e-06,2.1e-05,0.090738,0.021782,0.04051
4,+3-14,1.549461e-09,1.681512e-08,+10-26,1.241938e-16,7.976867e-14,agc,---qA-------,3.311309e-05,at+++++ccctattg,----I-ALL--,2.170112e-11,0.023313,0.090738,0.021782,0.04051


In [13]:
psDf = psDf.join(probDf)

### identifyClones()

In [13]:
clonesDf = td.processing.identifyClones(psDfp)

In [19]:
clonesDf.head()

Unnamed: 0,TCRID,a_good_hits,a_indels,a_nucseq_prob,a_protseq_prob,a_status,b_good_hits,b_indels,b_nucseq_prob,b_protseq_prob,...,vb_countreps,vb_evalue,vb_gene,vb_genes,vb_mismatches,vb_mm,vb_rep,vb_rep_prob,vb_reps,CLONEID
0,TRAV5*01|CAETRSRDYKLSF|TRAJ20*01|TRBV4-1*01|CA...,"[[(TRAV5*01, 408, 1e-116)], [(TRAJ20*01, 98, 9...",+11-11,6.062788e-14,9.921389e-11,OK,"[[(TRBV4-1*01, 96, 8e-23), (TRBV4-1*02, 94, 3e...",+5-12,9.249081e-13,2.580797e-10,...,TRBV4-1,8.000000000000001e-23,TRBV4-1*01,TRBV4-1*01,1,"[1, 10]",TRBV4-1*01,0.02151,TRBV4-1*01,TRAV5*01|CAETRSRDYKLSF|TRAJ20*01|TRBV4-1*01|CA...
1,TRAV35*01|CAGQASQGNLIF|TRAJ42*01|TRBV12-3*01;T...,"[[(TRAV35*01, 515, 1e-149), (TRAV35*02, 513, 1...",+3-14,2.727014e-09,2.95942e-08,OK,"[[(TRBV12-4*01, 127, 2e-32), (TRBV12-3*01, 127...",+10-26,1.29947e-16,8.346391e-14,...,TRBV12-3;TRBV12-4,2e-32,TRBV12-4*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-4*01,0.04229,TRBV12-3*01;TRBV12-4*01,TRAV35*01|CAGQASQGNLIF|TRAJ42*01|TRBV12-3*01;T...
2,TRAV35*01|CAGPRETSYDKVIF|TRAJ50*01|TRBV12-3*01...,"[[(TRAV35*02, 527, 1e-152), (TRAV35*01, 527, 1...",+10-3,2.8982e-13,3.680792e-11,OK,"[[(TRBV12-4*01, 129, 6e-33), (TRBV12-3*01, 129...",+3-16,5.628201e-10,7.191353e-09,...,TRBV12-3;TRBV12-4,6e-33,TRBV12-4*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-4*01,0.04229,TRBV12-3*01;TRBV12-4*01,TRAV35*01|CAGPRETSYDKVIF|TRAJ50*01|TRBV12-3*01...
3,TRAV22*01|CAVADTGNQFYF|TRAJ49*01|TRBV12-3*01;T...,"[[(TRAV22*01, 385, 1e-109)], [(TRAJ49*01, 107,...",+3-5,1.704553e-08,8.07159e-08,OK,"[[(TRBV12-4*01, 117, 2e-29), (TRBV12-3*01, 117...",+10-27,1.31149e-16,1.976313e-13,...,TRBV12-3;TRBV12-4,2.0000000000000002e-29,TRBV12-4*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-4*01,0.04229,TRBV12-3*01;TRBV12-4*01,TRAV22*01|CAVADTGNQFYF|TRAJ49*01|TRBV12-3*01;T...


In [None]:
for i,cl in enumerate(psDfp.columns):
    print(cl, psDfp.iloc[0, i])
    print("\n")

In [46]:
psDfp

Unnamed: 0,id,epitope,subject,a_good_hits,a_status,cdr3a,cdr3a_nucseq,cdr3a_plus,cdr3a_quals,ja_alignlen,...,vb_countreps,vb_evalue,vb_gene,vb_genes,vb_mismatches,vb_mm,vb_rep,vb_reps,organism,TCRID
0,human_tcr0001,pp65,human_subject0010,"[[(TRAV35*01, 262, 1.64000714957e-114), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.68.68.68.68.68.62.62.62.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ17*01;TRAJ29*01;TRA...
1,human_tcr0002,pp65,human_subject0010,"[[(TRAV35*01, 263, 6.03324913701e-115), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.68.68.68.68.68.62.62.68.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01|TRB...
2,human_tcr0003,pp65,human_subject0010,"[[(TRAV35*01, 263, 6.03324913701e-115), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.62.68.68.68.68.62.62.62.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ13*01;TRAJ29*01;TRA...
3,human_tcr0004,pp65,human_subject0010,"[[(TRAV35*02, 266, 3.00377787265e-116), (TRAV3...",OK,CAGPRETSYDKVIF,tgtgctgggccccgtgaaacctcctacgacaaggtgatattt,CAGPRETSYDKVIF-tgtgctgggccccgtgaaacctcctacgaca...,68.68.62.62.62.62.62.62.68.68.68.68.68.68.68.6...,10,...,TRBV12-3;TRBV12-4,5.900091e-29,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGPRETSYDKVIF|TRAJ14*01|TRBV12-3*01...
4,human_tcr0005,pp65,human_subject0010,"[[(TRAV35*01, 267, 1.10502812519e-116), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.68.68.68.68.68.62.62.62.62.62.62.68.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01|TRB...
5,human_tcr0006,pp65,human_subject0010,"[[(TRAV35*01, 267, 1.10502812519e-116), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.62.68.68.68.68.62.62.68.68.68.68.68.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01|TRB...
6,human_tcr0007,pp65,human_subject0010,"[[(TRAV35*01, 259, 3.2940424157e-113), (TRAV35...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.68.68.68.68.68.57.62.62.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01|TRB...
7,human_tcr0008,pp65,human_subject0010,"[[(TRAV5*01, 206, 3.43033652793e-90)], [(TRAJ2...",OK,CAETRSRDYKLSF,tgtgcagagacccgcagtagggactacaagctcagcttt,CAETRSRDYKLSF-tgtgcagagacccgcagtagggactacaagct...,68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.6...,10,...,TRBV6-6,1.425164e-21,TRBV4-1*01,TRBV6-6*05,1,"[1, 10]",TRBV4-1*01,TRBV6-6*01,human,TRAV5*01|CAETRSRDYKLSF|TRAJ26*01|TRBV6-6*01|CA...
8,human_tcr0009,pp65,human_subject0010,"[[(TRAV35*01, 267, 1.10502812519e-116), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.59.68.68.68.68.68.68.68.68.68.68.68.68.6...,10,...,TRBV12-3;TRBV12-4,4.3596100000000005e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,0,"[0, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01|TRB...
9,human_tcr0010,pp65,human_subject0010,"[[(TRAV35*01, 263, 6.03324913701e-115), (TRAV3...",OK,CAGQASQGNLIF,tgtgctgggcaagcaagccaaggaaatctcatcttt,CAGQASQGNLIF-tgtgctgggcaagcaagccaaggaaatctcatcttt,68.68.62.68.68.68.68.62.62.62.62.62.62.62.68.6...,10,...,TRBV12-3;TRBV12-4,1.6038110000000001e-28,TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,1,"[1, 17]",TRBV12-3*01,TRBV12-3*01;TRBV12-4*01,human,TRAV35*01|CAGQASQGNLIF|TRAJ29*01;TRAJ45*01;TRA...


In [50]:
psDfp[['cdr3a','cdr3b']].head()

Unnamed: 0,cdr3a,cdr3b
0,CAGQASQGNLIF,CASSIQALLTF
1,CAGQASQGNLIF,CASSIQALLTF
2,CAGQASQGNLIF,CASSIQALLTF
3,CAGPRETSYDKVIF,CASSSAYYGYTF
4,CAGQASQGNLIF,CASSIQALLTF


In [65]:
psDfp['cdr3a'].tolist()

['CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAGPRETSYDKVIF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAETRSRDYKLSF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAGQASQGNLIF',
 'CAVADTGNQFYF',
 'CLVGSMDSNYQLIW',
 'CAVPKGSQGNLIF',
 'CAVSDSGTGNQFYF',
 'CAGPFGRLMF',
 'CAGPDGSSNTGKLIF']

In [69]:
pw = td.pairwise.apply_pairwise_distance(psDfp['cdr3a'].tolist())
pw_long = td.pairwise.unpack_dd_to_kkv(pw)
pd.DataFrame(pw_long)

Unnamed: 0,key1,key2,value
0,CAETRSRDYKLSF,CAETRSRDYKLSF,0.0
1,CAETRSRDYKLSF,CAGPDGSSNTGKLIF,0.6
2,CAETRSRDYKLSF,CAGPFGRLMF,0.615385
3,CAETRSRDYKLSF,CAGQASQGNLIF,0.615385
4,CAETRSRDYKLSF,CAVADTGNQFYF,0.666667
5,CAETRSRDYKLSF,CAVPKGSQGNLIF,0.642857
6,CAETRSRDYKLSF,CAVSDSGTGNQFYF,0.714286
7,CAETRSRDYKLSF,CLVGSMDSNYQLIW,0.666667
8,CAGPDGSSNTGKLIF,CAGPDGSSNTGKLIF,0.0
9,CAGPFGRLMF,CAGPDGSSNTGKLIF,0.533333


In [75]:
pdDfp.columns

NameError: name 'pdDfp' is not defined