### Clean and slim the repertoire files

In [26]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [27]:
# Set HLA of interest - change dynamically
hla = 'A*02'
hla_name = hla.replace('*','')

# Generate file names

os.chdir("/Users/ishaharris/Projects/TCR/TCR-Isha/data/Repertoires")

metadata = pd.read_csv("/Users/ishaharris/Projects/TCR/TCR-Isha/data/Repertoires/Cohort01_whole_metadata.tsv", sep="\t") 


metadata_annotat = metadata[metadata['sample_tags'].str.contains(f'HLA-{hla}',case=False, regex = False)]

metadata_annotat = metadata_annotat[metadata_annotat['sample_tags'].str.contains(r'\bcytomegalovirus|CMV\b', case=False, na=False)]
metadata_annotat = metadata_annotat.reset_index(drop=True)

#Extract pos and neg patient IDs
pos_names = metadata_annotat[metadata_annotat['sample_tags'].str.contains(r'\bcytomegalovirus \+|CMV \+\b', case=False, na=False)]
neg_names = metadata_annotat[metadata_annotat['sample_tags'].str.contains(r'\bcytomegalovirus -|CMV -\b', case=False, na=False)]

#Turn into file names
file_names = [name + '.tsv' for name in metadata_annotat['sample_name'].tolist()]
pos_names = [name + '.tsv' for name in pos_names['sample_name'].tolist()]
neg_names = [name + '.tsv' for name in neg_names['sample_name'].tolist()]


metadata_annotat.loc[10,'sample_tags']

print(f'Number of files with CMV and HLA annotation: {len(file_names)}')
print(f'Number of files with CMV +: {len(pos_names)}')
print(f'Number of files with CMV -: {len(neg_names)}')


Number of files with CMV and HLA annotation: 292
Number of files with CMV +: 136
Number of files with CMV -: 156


In [28]:
#!/usr/bin/env python3
import pandas as pd
from pathlib import Path

# === USER SETTINGS ===
INPUT_DIR    = Path("/Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01")      # your raw .tsv files
OUTPUT_DIR   = Path("/Volumes/IshaVerbat/Isha/TCR/filtered_for_tcrdist")    # where slim TSVs & metadata CSV will go
CHAIN        = "beta"                          # always "beta"
ID_COLUMN    = "sample_name"                   # the column that uniquely IDs each file/patient
DATA_COLUMNS = [
    "rearrangement",
    "amino_acid",           # CDR3 sequence
    "v_gene",               # V gene
    "d_gene",               # D gene
    "j_gene",               # J gene
    "frequency",            # clone freq
    "productive_frequency", # productive freq
]
METADATA_COLUMNS = [
    "total_templates",
    "productive_templates",
    "total_reads",
    "total_productive_reads",
    "fraction_productive"
    
    # add more if you need them
]
# =====================

print("📄 Found the following TSV files for processing:")

tsv_files = []

for fname in file_names:
    print(f" Checking file: {fname}")
    
    if not fname.endswith(".tsv"):
        print(f" Skipped (not .tsv): {fname}")
        continue
    if fname.startswith(".") or fname.startswith("._"):
        print(f" Skipped (hidden or Apple metadata): {fname}")
        continue

    full_path = INPUT_DIR / fname
    print(f"  ✅ Accepted: {full_path}")
    tsv_files.append(full_path)


print("Processing the following TSV files")
for f in tsv_files:
    print(f"  {f.name}")
print(f"\n{len(tsv_files)} files found in {INPUT_DIR}.")

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
meta_csv = OUTPUT_DIR / "tcrdist_combined_metadata.csv"

# Load any existing metadata so we can append new rows
if meta_csv.exists():
    metadata_df = pd.read_csv(meta_csv, index_col=ID_COLUMN)
    completed_ids = set(metadata_df.index)
else:
    metadata_df = pd.DataFrame(
        columns=METADATA_COLUMNS,
    )
    metadata_df.index.name = ID_COLUMN
    completed_ids = set()

# How to rename columns for tcrdist3 β-chain
rename_map = {
    "amino_acid":           "cdr3_b_aa",
    "v_gene":               "v_b_gene",
    "j_gene":               "j_b_gene",
    # d_gene, frequency, productive_frequency stay unchanged
}

# Process files one by one
for idx, in_path in enumerate(tsv_files, start=1):
    print(f"\n[{idx}/{len(tsv_files)}] Processing {in_path.name}...")
    
    # Read just the sample ID first to check if we’ve already processed this
    try:
        sample_id = pd.read_csv(in_path, sep="\t", usecols=[ID_COLUMN], nrows=1)[ID_COLUMN][0]
    except Exception as e:
        print(f"Could not read sample ID from {in_path.name}: {e}")
        continue

    if sample_id in completed_ids:
        print(f"Skipping {in_path.name} (already processed)")
        continue

    try:
        # Load only relevant columns
        usecols = [ID_COLUMN] + DATA_COLUMNS + METADATA_COLUMNS
        df = pd.read_csv(in_path, sep="\t", usecols=usecols, dtype=str, low_memory=False)
        print(f'Read {df.shape[0]} rows and {df.shape[1]} columns from {in_path.name}')

        # Extract metadata
        meta = {ID_COLUMN: sample_id}
        for col in METADATA_COLUMNS:
            meta[col] = df[col].iat[0] if df[col].nunique(dropna=False) == 1 else df[col].iat[0]
            df.drop(columns=col, inplace=True)

        # Remove sample ID from table and store metadata
        df.drop(columns=[ID_COLUMN], inplace=True)
        df.rename(columns=rename_map, inplace=True)
        df["frequency"] = df["frequency"].astype(float)
        df["productive_frequency"] = df["productive_frequency"].astype(float)

        # Save slimmed TSV
        out_tsv = OUTPUT_DIR / in_path.name.replace(".tsv", ".slim.tsv")
        df.to_csv(out_tsv, sep="\t", index=False)

        # Append metadata
        metadata_df.loc[sample_id] = meta
        metadata_df.to_csv(meta_csv)
        print(f"📝 Processed {in_path.name} → {out_tsv.name}")

    except Exception as e:
        print(f"Error processing {in_path.name}: {e}")
        continue

print(f"\n Done. Slim files and combined metadata written to: {OUTPUT_DIR}")


📄 Found the following TSV files for processing:
 Checking file: P00492.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00492.tsv
 Checking file: P00412.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00412.tsv
 Checking file: P00416.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00416.tsv
 Checking file: P00037.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00037.tsv
 Checking file: P00346.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00346.tsv
 Checking file: P00353.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00353.tsv
 Checking file: P00278.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00278.tsv
 Checking file: P00260.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00260.tsv
 Checking file: P00208.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00208.tsv
 Checking file: P00143.tsv
  ✅ Acc

🔍 Checking file: P00492.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00492.tsv
🔍 Checking file: P00412.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00412.tsv
🔍 Checking file: P00416.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00416.tsv
🔍 Checking file: P00037.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00037.tsv
🔍 Checking file: P00346.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00346.tsv
🔍 Checking file: P00353.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00353.tsv
🔍 Checking file: P00278.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00278.tsv
🔍 Checking file: P00260.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00260.tsv
🔍 Checking file: P00208.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/All_Emerson_Cohort01/P00208.tsv
🔍 Checking file: P00143.tsv
  ✅ Accepted: /Volumes/IshaVerbat/Isha/TCR/Al

### Load in highconf

In [46]:
# Load high conf data

highconf_file_name = 'VDJ_INFO_AC02_high_conf_CMV_TCRs.csv'

highconf = pd.read_csv(f'/Users/ishaharris/Projects/TCR/TCR-Isha/data/High_Confidence_CMV_TCR/{highconf_file_name}')

highconf

# ------------------ COMMENTED OUT BECAUSE VDJ INFO FILES HAVE AMINO_ACID == ONLY TRB --------------------
# Add a column with just the TRB sequence 

# highconf_trb = highconf.loc[:, 'cdr3s_aa']
# highconf_trb=highconf_trb.apply(lambda x: re.search(r'TRB:([^;]+)', x).group(1) if 'TRB:' in x else None)
# len(highconf_trb)

# highconf['TRB'] = highconf_trb



Unnamed: 0,clonotype_id,amino_acid,count,v_call,d_call,j_call
0,AC02_clonotype9,CASSPYTGTGYGYTF,110.0,TRBV6-5,,TRBJ1-2
1,AC02_clonotype9,CASSPYTGTGYGYTF,85.0,TRBV6-5,,TRBJ1-2
2,AC02_clonotype1,CASSSVNEQFF,74.0,TRBV12-4,,TRBJ2-1
3,AC02_clonotype1,CASSSVNEQFF,71.0,TRBV12-4,,TRBJ2-1
4,AC02_clonotype5,CASSSVNEQFF,44.0,TRBV12-4,,TRBJ2-1
...,...,...,...,...,...,...
4121,AC02_clonotype27,CASSPYTGTGYGYTF,184.0,TRBV6-5,,TRBJ1-2
4122,AC02_clonotype1,CASSSVNEQFF,31.0,TRBV12-4,,TRBJ2-1
4123,AC02_clonotype8,CASSSVNEQFF,52.0,TRBV12-4,,TRBJ2-1
4124,AC02_clonotype8,CASSSVNEQFF,48.0,TRBV12-4,,TRBJ2-1


In [34]:
# Load a sample repertoire
rep = pd.read_csv('/Volumes/IshaVerbat/Isha/TCR/filtered_for_tcrdist/P00021.slim.tsv', sep="\t")

# count 
rep

Unnamed: 0,rearrangement,cdr3_b_aa,frequency,productive_frequency,v_b_gene,d_gene,j_b_gene
0,GCCTTGTTGCTGGGGGACTCGGCCCTGTATCTCTGTGCCAGCAGCC...,CASSPRGARSLETQYF,8.532311e-07,0.000001,TCRBV05-05,unresolved,TCRBJ02-05
1,AACGCCTTGTTGCTGGGGGACTCGGCCCTGTATCTCTGTGCCAGCA...,CASSPGTGAWQPQHF,1.023877e-06,0.000001,TCRBV05-05,TCRBD01-01,TCRBJ01-05
2,CTGAATGTGAACGCCTTGTTGCTGGGGGACTCGGCCCTGTATCTCT...,CASSRDGYGYTF,1.023877e-06,0.000001,TCRBV05-05,unresolved,TCRBJ01-02
3,GTTGCTGGGGGACTCGGCCCTGTATCTCTGTGCCAGCAGCTGGGGG...,,1.023877e-06,,TCRBV05-05,TCRBD02-01,TCRBJ01-05
4,AATGTGAACGCCTTGTTGCTGGGGGACTCGGCCCTGTATCTCTGTG...,CASSLYTNTEAFF,1.194524e-06,0.000001,TCRBV05-05,TCRBD01-01,TCRBJ01-01
...,...,...,...,...,...,...,...
187249,CTGAAGATCCAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCT...,CASSSSAREQYF,1.692811e-04,0.000202,unresolved,TCRBD02-01,TCRBJ02-07
187250,GTGACCAGTGCCCATCCTGAAGACAGCAGCTTCTACATCTGCAGGG...,CRAR*QSSGNTIYF,2.214988e-04,,unresolved,TCRBD01-01,TCRBJ01-03
187251,CAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCAGCA...,CASSLGGTLNTEAFF,2.517032e-04,0.000300,unresolved,unresolved,TCRBJ01-01
187252,TAGAGTCTGCCATCCCCAACCAGACAGCTCTTTACTTCTGTGCCAC...,,3.288353e-04,,unresolved,TCRBD01-01,TCRBJ01-02


### Write TCRdist calculator

In [None]:
#!/usr/bin/env python
"""
Compute TCRdist between a panel of high‑confidence CMV‑associated TRB clonotypes and each patient's bulk TCRB repertoire.
The script mirrors the logic of the previous Levenshtein implementation but leverages **tcrdist3** for a biochemically‑weighted distance.

Dependencies
------------
    pip install tcrdist3 pandas numpy

Input
-----
* A TSV (or CSV) file containing the CMV‑associated search set (≈200 rows).
* A directory of TSV/CSV files – one per patient – already filtered to unique productive TRB clonotypes (≈3×10⁵ rows each).

Both inputs must expose **cdr3_b_aa**, **v_b_gene**, **j_b_gene** columns (see `prepare_df`).

Output
------
`tcrdist_output.csv` – One row per patient with a histogram of TCRdist values (0‥MAX_DIST).

Notes
-----
* The tcrdist scale is *not* the same as edit distance. A single amino‑acid substitution in CDR3 is ≈ 12 distance units when TRBV and TRBJ genes match. A radius of 36 therefore captures most functional "one‑to‑two‑sub" neighbours (Dash et al., 2017).
* The computation is executed as a sparse **rectangular** distance matrix; only distances ≤ `RADIUS` are kept in memory.
* Adapt `MAX_DIST`, `RADIUS`, and path variables as needed.
"""

import os
import re
import multiprocessing
import numpy as np
import pandas as pd
from tcrdist.repertoire import TCRrep
from tcrdist.breadth import get_safe_chunk

print(' - Loaded libraries.')

# ──────────────────────────────────────────────────────────────────────────────
# User‑configurable parameters
# ──────────────────────────────────────────────────────────────────────────────
MAX_DIST   = 50        # highest TCRdist bucket to tally (inclusive)
RADIUS     = MAX_DIST  # distances > RADIUS are discarded by tcrdist3
HIGHCONF   = highconf   # search‑set file
PATIENT_DIR = "/Volumes/IshaVerbat/Isha/TCR/filtered_for_tcrdist"  # directory with patient files
OUTFILE    = "tcrdist_output.csv"
N_CPUS     = min(multiprocessing.cpu_count(), 8)

# set wd
os.chdir("/Users/ishaharris/Projects/TCR/TCR-Isha/data/tcrdist")

MAX_FILES = 4

print(' - Set parameters.')

# ──────────────────────────────────────────────────────────────────────────────
# Helper functions
# ──────────────────────────────────────────────────────────────────────────────
GENE_PAT = re.compile(r"[*!].*$")


def _normalize_gene(s: str) -> str:
    """Strip prefixes / allele info to IMGT-like gene names understood by tcrdist3,
    remove leading zeros from gene numbers (e.g. TRBV05 → TRBV5),
    and from any numeric subfields (e.g. TRBV5-05 → TRBV5-5)."""
    if pd.isna(s):
        return np.nan

    # Standardize prefix
    s = str(s).replace("TCR", "TR")  # e.g. TCRBV05-05 → TRBV05-05

    # Remove leading zeros from primary gene number (e.g. TRBV05 → TRBV5)
    s = re.sub(
        r"^([A-Z]+)(0+)(\d+)",
        lambda m: f"{m.group(1)}{int(m.group(2) + m.group(3))}",
        s,
    )

    # Remove leading zeros from any subfield after hyphens (e.g. -05 → -5)
    s = re.sub(
        r"-(0+)(\d+)",
        lambda m: f"-{int(m.group(1) + m.group(2))}",
        s,
    )

    # Normalize allele information to *01
    if "*" in s:
        s = re.sub(r"[*!].*$", "*01", s)
    else:
        s = s + "*01"

    return s


def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """Bring an arbitrary clonotype table to tcrdist3‑ready shape."""
    # Remove d_gene column
    if "d_gene" in df.columns:
        df.drop(columns=["d_gene"], inplace=True)
    
    rename_map = {
        "amino_acid": "cdr3_b_aa",
        "cdr3s_aa": "cdr3_b_aa",
        "v_call": "v_b_gene",
        "j_call": "j_b_gene",
    }

    df = df.rename(columns=rename_map)
    need = ["cdr3_b_aa", "v_b_gene", "j_b_gene"]
    df = df[need].copy()
    df.dropna(inplace=True)

    df["v_b_gene"] = df["v_b_gene"].apply(_normalize_gene)
    df["j_b_gene"] = df["j_b_gene"].apply(_normalize_gene)
  

    df = df.drop_duplicates().reset_index(drop=True)
    return df

print('Defined functions.')

# ──────────────────────────────────────────────────────────────────────────────
# Load the CMV‑associated search set once
# ──────────────────────────────────────────────────────────────────────────────
print("Loading high‑confidence CMV search set …")
high_raw  = highconf
high_df   = prepare_df(high_raw)
print(high_df.head())

print(f" → {len(high_df):,} unique TRB clonotypes ready for search.")

tr_search = TCRrep(cell_df=high_df, organism="human", chains=["beta"], compute_distances=False)

tr_search.cpus = N_CPUS

# ──────────────────────────────────────────────────────────────────────────────
# Prepare output frame – one row per patient
# ──────────────────────────────────────────────────────────────────────────────
cols = ["patient_id"] + [f"dist_{d}" for d in range(MAX_DIST + 1)]
results = pd.DataFrame(columns=cols)

# ──────────────────────────────────────────────────────────────────────────────
# Main loop over patient repertoires
# ──────────────────────────────────────────────────────────────────────────────
file_names = sorted(fn for fn in os.listdir(PATIENT_DIR) if fn.endswith(('.tsv', '.csv')) and fn.startswith('P'))

for idx, fname in enumerate(file_names, 1):
    if idx > MAX_FILES:
        print(f"Reached maximum number of files to process ({MAX_FILES}).")
        break
    patient_id = os.path.splitext(fname)[0]
    print(f"Processing ({idx}/{len(file_names)}): {patient_id}")

    # Load & clean bulk repertoire
    rep_raw = pd.read_csv(os.path.join(PATIENT_DIR, fname), sep="\t", low_memory=False)
    rep_df  = prepare_df(rep_raw)

    if rep_df.empty:
        print(f" ! No usable sequences in {patient_id} – skipping.")
        continue

    tr_bulk = TCRrep(cell_df=rep_df, organism="human", chains=["beta"], compute_distances=False)



    # Compute sparse rectangular distances (rows=search, cols=bulk)
    chunk_size = get_safe_chunk(tr_search.clone_df.shape[0], tr_bulk.clone_df.shape[0])
    tr_search.compute_sparse_rect_distances(
        df=tr_search.clone_df,
        df2=tr_bulk.clone_df,
        radius=RADIUS,
        chunk_size=chunk_size,
    )

    # Histogram distances ≤ RADIUS
    bins = np.zeros(MAX_DIST + 1, dtype=int)
    for d in tr_search.rw_beta.data.astype(int):
        bins[d] += 1

    # Append to results
    row = {"patient_id": patient_id, **{f"dist_{d}": bins[d] for d in range(MAX_DIST + 1)}}
    results = pd.concat([results, pd.DataFrame([row])], ignore_index=True)

    # Persist after each patient
    results.to_csv(OUTFILE, index=False)
    print(f"   ↳ saved ➜ {OUTFILE}")

    # Free distance matrix to keep RAM stable before next iteration
    del tr_search.rw_beta

print("✔︎ All patient repertoires processed.")


 - Loaded libraries.
 - Set parameters.
Defined functions.
Loading high‑confidence CMV search set …
         cdr3_b_aa     v_b_gene    j_b_gene
0  CASSPYTGTGYGYTF   TRBV6-5*01  TRBJ1-2*01
1      CASSSVNEQFF  TRBV12-4*01  TRBJ2-1*01
2    CASGPVDTDTQYF  TRBV12-5*01  TRBJ2-3*01
3     CASSSLTYEQYF   TRBV5-6*01  TRBJ2-7*01
4    CASSFQGYTEAFF    TRBV28*01  TRBJ1-1*01
 → 216 unique TRB clonotypes ready for search.



  self._validate_cell_df()
  self._validate_db_file()


Processing (1/292): P00002.slim



  self._validate_cell_df()
  self._validate_db_file()
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_ge

213


  0%|          | 0/2 [00:00<?, ?it/s]

   ↳ saved ➜ tcrdist_output.csv
Processing (2/292): P00003.slim



  self._validate_cell_df()
  self._validate_db_file()
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_ge

128


  self.deduplicate()


  0%|          | 0/2 [00:00<?, ?it/s]

   ↳ saved ➜ tcrdist_output.csv
Processing (3/292): P00004.slim



  self._validate_cell_df()
  self._validate_db_file()
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_ge

100


  self.deduplicate()


  0%|          | 0/3 [00:00<?, ?it/s]

   ↳ saved ➜ tcrdist_output.csv
Processing (4/292): P00006.slim



  self._validate_cell_df()
  self._validate_db_file()
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_ge

98


  self.deduplicate()


  0%|          | 0/3 [00:00<?, ?it/s]

   ↳ saved ➜ tcrdist_output.csv
Reached maximum number of files to process (4).
✔︎ All patient repertoires processed.


### Tester area

In [91]:
tr_search = TCRrep(cell_df=high_df, organism="human", chains=["beta"], compute_distances=False)
tr_search.compute_distances()
print("clone_df shape:", tr_search.clone_df.shape)  # should be ≈ 200


  self._validate_db_file()
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = 

ValueError: zero-size array to reduction operation maximum which has no identity

### Try to write it myself

In [173]:
PATIENT_DIR = "/Volumes/IshaVerbat/Isha/TCR/filtered_for_tcrdist"  # directory with patient files
# Prepare df function

def _normalize_gene(s: str) -> str:
    """Strip prefixes / allele info to IMGT-like gene names understood by tcrdist3,
    remove leading zeros from gene numbers (e.g. TRBV05 → TRBV5),
    and from any numeric subfields (e.g. TRBV5-05 → TRBV5-5)."""
    if pd.isna(s):
        return np.nan

    # Standardize prefix
    s = str(s).replace("TCR", "TR")  # e.g. TCRBV05-05 → TRBV05-05

    # Remove leading zeros from primary gene number (e.g. TRBV05 → TRBV5)
    s = re.sub(
        r"^([A-Z]+)(0+)(\d+)",
        lambda m: f"{m.group(1)}{int(m.group(2) + m.group(3))}",
        s,
    )

    # Remove leading zeros from any subfield after hyphens (e.g. -05 → -5)
    s = re.sub(
        r"-(0+)(\d+)",
        lambda m: f"-{int(m.group(1) + m.group(2))}",
        s,
    )

    # Normalize allele information to *01
    if "*" in s:
        s = re.sub(r"[*!].*$", "*01", s)
    else:
        s = s + "*01"

    return s

def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """Bring an arbitrary clonotype table to tcrdist3‑ready shape."""
    # Remove d_gene column
    if "d_gene" in df.columns:
        df.drop(columns=["d_gene"], inplace=True)
    
    rename_map = {
        "amino_acid": "cdr3_b_aa",
        "cdr3s_aa": "cdr3_b_aa",
        "v_call": "v_b_gene",
        "j_call": "j_b_gene",
    }

    df = df.rename(columns=rename_map)
    need = ["cdr3_b_aa", "v_b_gene", "j_b_gene"]
    df = df[need].copy()
    df.dropna(inplace=True)

    df["v_b_gene"] = df["v_b_gene"].apply(_normalize_gene)
    df["j_b_gene"] = df["j_b_gene"].apply(_normalize_gene)
  

    df = df.drop_duplicates().reset_index(drop=True)
    return df
    


    
rep_raw = pd.read_csv(os.path.join(PATIENT_DIR, fname), sep="\t", low_memory=False)
df_bulk  = prepare_df(rep_raw)

tr_bulk = TCRrep(cell_df = df_bulk,
                 organism = "human",
                 chains = ["beta"],
                 #db_file = db,
                 compute_distances = False)

t = tr_bulk.show_incomplete()

df_search = tr_bulk.clone_df.copy()




  self._validate_cell_df()
  self._validate_db_file()
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_gene_to_reference_seq2(gene = v,
  f0 = lambda v : self._map_ge

In [184]:
import os
print(os.getcwd())


/Users/ishaharris/Projects/TCR/TCR-Isha/data/Repertoires


In [171]:
import re
import pandas as pd
import numpy as np

def _normalize_gene(s: str) -> str:
    """Strip prefixes / allele info to IMGT-like gene names understood by tcrdist3,
    remove leading zeros from gene numbers (e.g. TRBV05 → TRBV5),
    and from any numeric subfields (e.g. TRBV5-05 → TRBV5-5)."""
    if pd.isna(s):
        return np.nan

    # Standardize prefix
    s = str(s).replace("TCR", "TR")  # e.g. TCRBV05-05 → TRBV05-05

    # Remove leading zeros from primary gene number (e.g. TRBV05 → TRBV5)
    s = re.sub(
        r"^([A-Z]+)(0+)(\d+)",
        lambda m: f"{m.group(1)}{int(m.group(2) + m.group(3))}",
        s,
    )

    # Remove leading zeros from any subfield after hyphens (e.g. -05 → -5)
    s = re.sub(
        r"-(0+)(\d+)",
        lambda m: f"-{int(m.group(1) + m.group(2))}",
        s,
    )

    # Normalize allele information to *01
    if "*" in s:
        s = re.sub(r"[*!].*$", "*01", s)
    else:
        s = s + "*01"

    return s


In [172]:
_normalize_gene('TCRBV05-05')

'TRBV5-5*01'

In [152]:
df_search

Unnamed: 0,cdr3_b_aa,v_b_gene,j_b_gene,cdr1_b_aa,cdr2_b_aa,pmhc_b_aa,count,clone_id


### Updating v gene info

In [218]:
# df = pd.read_json("/Users/ishaharris/Projects/TCR/TCR-Isha/data/tcrdist/imgt.202312-3.sv8.json")
# df

# import json
# import pandas as pd

# Load the JSON data
with open ("/Users/ishaharris/Projects/TCR/TCR-Isha/data/tcrdist/imgt.202312-3.sv8.json") as f:
    data = json.load(f)

# Convert to a DataFrame
df = pd.DataFrame(data)

# Confirm structure
print(df.columns)
print(df.shape)
print(df.head())


Index(['taxonId', 'speciesNames', 'genes', 'meta', 'sequenceFragments'], dtype='object')
(23, 5)


   taxonId           speciesNames  \
0     7955            [zebrafish]   
1     8022        [rainbow_trout]   
2     8030               [salmon]   
3     9258             [platypus]   
4     9541  [macaca fascicularis]   

                                               genes  \
0  [{'baseSequence': 'file://zebrafish_V_IGH.fast...   
1  [{'baseSequence': 'file://rainbow_trout_V_IGH....   
2  [{'baseSequence': 'file://salmon_V_IGH.fasta#I...   
3  [{'baseSequence': 'file://platypus_V_IGH.fasta...   
4  [{'baseSequence': 'file://macacaFascicularis_V...   

                                                meta  \
0  {'comments': ['Imported from: /Volumes/Data/Pr...   
1  {'comments': ['Imported from: /Volumes/Data/Pr...   
2  {'comments': ['Imported from: /Volumes/Data/Pr...   
3  {'comments': ['Imported from: /Volumes/Data/Pr...   
4  {'comments': ['Imported from: /Volumes/Data/Pr...   

                                   sequenceFragments  
0  [{'uri': 'file://zebrafish_C_IGI.fasta#IGIC1*

In [219]:
# Explode the genes column
df_expanded = df.explode('genes').reset_index(drop=True)

# Normalize the dictionary in each gene row
genes_flat = pd.json_normalize(df_expanded['genes'])

# Merge flattened gene info back into the main df
result = pd.concat([df_expanded.drop(columns=['genes']), genes_flat], axis=1)


In [220]:
aliases = ['homosapiens', 'hs', 'hsa', 'human']

human_genes = result[result['speciesNames'].apply(
    lambda names: any(alias.lower() in [n.lower() for n in names] for alias in aliases)
)]


In [222]:
human_genes

#save human_genes to a csv
human_genes.to_csv('/Users/ishaharris/Projects/TCR/TCR-Isha/data/tcrdist/human_genes.csv', index=False)

In [223]:
# save human_genes to a pickle
human_genes.to_pickle('/Users/ishaharris/Projects/TCR/TCR-Isha/data/tcrdist/human_genes.pkl')

In [224]:
# read human_genes pickle
human_genes = pd.read_pickle('/Users/ishaharris/Projects/TCR/TCR-Isha/data/tcrdist/human_genes.pkl')

In [225]:
human_genes

Unnamed: 0,taxonId,speciesNames,meta,sequenceFragments,baseSequence,name,geneType,isFunctional,chains,type,...,anchorPoints.CDR3Begin,anchorPoints.VEnd,anchorPoints.DBegin,anchorPoints.DEnd,anchorPoints.JBegin,anchorPoints.FR4Begin,anchorPoints.FR4End,anchorPoints.CBegin,anchorPoints.CExon1End,anchorPoints.CEnd
2552,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_V_IGH.fasta#IGHV1-2*01,IGHV1-2*01,V,True,[IGH],primary,...,285.0,296.0,,,,,,,,
2553,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_V_IGH.fasta#IGHV1-2*02,IGHV1-2*02,V,True,[IGH],primary,...,285.0,296.0,,,,,,,,
2554,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_V_IGH.fasta#IGHV1-2*04,IGHV1-2*04,V,True,[IGH],primary,...,285.0,296.0,,,,,,,,
2555,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_V_IGH.fasta#IGHV1-2*05,IGHV1-2*05,V,True,[IGH],primary,...,285.0,296.0,,,,,,,,
2556,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_V_IGH.fasta#IGHV1-2*06,IGHV1-2*06,V,True,[IGH],primary,...,285.0,296.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3827,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_C_TRG.fasta#TRGC2*04,TRGC2*04,C,True,[TRG],primary,...,,,,,,,,0.0,330.0,330.0
3828,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_C_TRG.fasta#TRGC2*05,TRGC2*05,C,True,[TRG],primary,...,,,,,,,,0.0,330.0,330.0
3829,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_C_TRG.fasta#TRGC2*06,TRGC2*06,C,True,[TRG],primary,...,,,,,,,,0.0,330.0,330.0
3830,9606,"[homosapiens, hs, hsa, human]",{'comments': ['Imported from: /Volumes/Data/Pr...,"[{'uri': 'file://human_C_IGH.fasta#IGHA1*01', ...",file://human_C_TRG.fasta#TRGC2*07,TRGC2*07,C,True,[TRG],primary,...,,,,,,,,0.0,330.0,330.0
