In [13]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict

In [60]:
# Donor labels "donor1" .. "donor4", numbered from 1.
donors = ["donor" + str(idx) for idx in range(1, 5)]
donors

['donor1', 'donor2', 'donor3', 'donor4']

In [None]:
# Build one paired TRA/TRB clonotype table per donor from the consensus
# annotation JSON files, write each table to CSV, and stack all donors into
# a single DataFrame bound to `dfs`.
DATA_DIR = "/Users/kkim14/tcr_structure_features/data/10X"
TCR_CHAINS = {'TRA', 'TRB'}
CONTIG_COLUMNS = ['clonotype', 'aa', 'chain', 'cdr3', 'umi_count']


def _contig_chains(contig):
    """Return the set of chain labels found in one contig's annotations."""
    return {a['feature']['chain'] for a in contig['annotations']}


dfs = []
donors = [f"donor{i}" for i in range(1,5)]

for donor in donors:
    # Use a context manager so the file handle is closed after parsing.
    with open(f"{DATA_DIR}/raw/vdj_v1_hs_aggregated_{donor}_consensus_annotations.json") as handle:
        contigs_raw = json.load(handle)

    # Group contigs by clonotype, keeping only contigs whose annotated chains
    # are all TRA/TRB. The check is made on *each* contig (previously it always
    # inspected the first record, so the filter was all-or-nothing). Contigs
    # with no annotated chain are skipped because no chain can be assigned.
    clones = defaultdict(list)
    for contig in contigs_raw:
        chains = _contig_chains(contig)
        if chains and chains <= TCR_CHAINS:
            clones[contig.get("clonotype")].append(contig)
    orilen = len(clones)  # clonotypes before the pairing filter (kept for inspection)

    # Drop clonotypes whose contigs cover fewer than two distinct chains.
    singles = [
        clone
        for clone, contigs in clones.items()
        if len(set().union(*(_contig_chains(c) for c in contigs))) < 2
    ]
    for clone in singles:
        del clones[clone]

    filterlen = len(clones)  # clonotypes after the pairing filter (kept for inspection)

    # One row per remaining contig. `min` makes the chain label deterministic
    # should a contig ever carry both TRA and TRB annotations.
    records = [
        {
            'clonotype': clone,
            'aa': contig['aa_sequence'],
            'chain': min(_contig_chains(contig)),
            'cdr3': contig['cdr3'],
            'umi_count': contig['umi_count'],
        }
        for clone, contigs in clones.items()
        for contig in contigs
    ]
    # Explicit columns keep the chain filters below valid when no record survives.
    contig_table = pd.DataFrame(records, columns=CONTIG_COLUMNS)

    # Inner merge on clonotype: every TRA row is paired with every TRB row of
    # the same clonotype, so a clonotype can yield several rows.
    tra_rows = contig_table[contig_table['chain'] == 'TRA']
    trb_rows = contig_table[contig_table['chain'] == 'TRB']
    df = (
        tra_rows.merge(trb_rows, on='clonotype', suffixes=("_a", "_b"))
        .rename(columns={'aa_a': 'tra_aa', 'aa_b': 'trb_aa'})
        .drop(columns=['chain_a', 'chain_b'])
    )

    # Number the pairings within each clonotype (1-based) and append that
    # number to the clonotype label, e.g. "<clonotype>-1".
    df['clonotype_count'] = df.groupby('clonotype').cumcount() + 1
    df['clonotype'] = df['clonotype'].astype(str) + "-" + df['clonotype_count'].astype(str)

    # Prefix with the donor so labels stay unique once donors are concatenated.
    df.insert(0, "donor_clonotype", donor + "_" + df['clonotype'].astype(str))

    df.to_csv(f"{DATA_DIR}/{donor}_consensus_clonotypes.csv", index=False)
    dfs.append(df)

# NOTE: `dfs` is rebound from a list of per-donor frames to the combined frame;
# later cells rely on this name.
dfs = pd.concat(dfs)
    

In [74]:
# Collapse the combined table to one row per distinct (tra_aa, trb_aa) pair,
# collecting the donor_clonotype labels that share that pair into a list, and
# label each row "10x_<n>" where <n> is its 0-based position after grouping.
df = (
    dfs.groupby(['tra_aa', 'trb_aa'])['donor_clonotype']
    .apply(list)
    .reset_index()
    # Second reset turns the fresh positional index into a 'tcr_id' column.
    .reset_index(names=['tcr_id'])
    .assign(tcr_id=lambda frame: "10x_" + frame['tcr_id'].astype(str))
    .set_index("tcr_id")
)

df

Unnamed: 0_level_0,tra_aa,trb_aa,donor_clonotype
tcr_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10x_0,MACPGFLWALVISTCLEFSMAQTVTQSQPEMSAQEAETVTLSCTYD...,MGTSLLCWMALCLLGADHADTGVSQDPRHKITKRGQNVTFRCDPIS...,[donor4_clonotype6556-1]
10x_1,MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYD...,MGTRLLCWAALCLLGAELTEAGVAQSPRYKIIEKRQSVAFWCNPIS...,[donor2_clonotype5866-1]
10x_2,MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYD...,MGSRLLCWVLLCLLGAGPVKAGVTQTPRYLIKTRGQQVTLSCSPIS...,[donor1_clonotype28036-1]
10x_3,MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYD...,MSNQVLCCVVLCLLGANTVDGGITQSPKYLFRKEGQNVTLSCEQNL...,[donor1_clonotype28036-2]
10x_4,MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYD...,MGFRLLCCVAFCLLGAGPVDSGVTQTPKHLITATGQRVTLRCSPRS...,[donor1_clonotype10991-1]
...,...,...,...
10x_88363,MYTYSSGNKEDGRFTAQVDKSSKYISLFIRDSQPSDSATYLCAMSA...,MDTWLVCWAIFSLLKAGLTEPEVTQTPSHQVTQMGQEVILRCVPIS...,[donor1_clonotype30032-1]
10x_88364,MYYCAFLNAGGTSYGKLTFGQGTILTVHPNIQNPDPAVYQLRD,MVSRLLSLVSLCLLGAKHIEAGVTQFPSHSVIEKGQTVTLRCDPIS...,[donor1_clonotype11386-1]
10x_88365,MYYCALPTGGYQKVTFGIGTKLQVIPNIQNPDPAVYQLRD,MGPGLLHWMALCLLGTGHGDAMVIQNPRYQVTQFGKPVTLSCSQTL...,[donor1_clonotype28896-1]
10x_88366,MYYCALRGGSTLGRLYFGRGTQLTVWPDIQNPDPAVYQLRD,MGTRLLCWAALCLLGAELTEAGVAQSPRYKIIEKRQSVAFWCNPIS...,[donor4_clonotype14958-1]


In [75]:
# Persist the pair table twice: as CSV (index written as the first column)
# and as JSON keyed by the index label, pretty-printed with 4-space indent.
df.to_csv(path_or_buf="/Users/kkim14/tcr_structure_features/data/10X/all_tcr_seqs.csv")
df.to_json(
    path_or_buf="/Users/kkim14/tcr_structure_features/data/10X/all_tcr_seqs.json",
    orient="index",
    indent=4,
)