# Coherence analysis for OAS database

## full paired dataset, human only

total sequences in paired dataset: 1'946'101 (no clustering but full OAS db)

In [3]:
# used julia version 1.10.2
using CSV
using DataFrames
using StatsBase

In [4]:
# Load the CSV file into a DataFrame
file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/full_extraction_for_coherence_paired_data_header.csv" 
df = CSV.read(file_path, DataFrame);

In [5]:
# Filter out rows where BType is "Unsorted-B-Cells"
filtered_df = filter(row -> row.BType != "Unsorted-B-Cells", df);

In [6]:
# Extract up to the first two segments (e.g., "IGKV2-30" from "IGKV2-30*01")
filtered_df[!, :general_v_gene_heavy] = replace.(filtered_df.v_call_heavy, r"(^[^*]+?)(?:\*.*)?$" => s"\1"); # the regex removes any part of the string after (and including) the first *, keeping only the portion before it.
filtered_df[!, :general_v_gene_light] = replace.(filtered_df.v_call_light, r"(^[^*]+?)(?:\*.*)?$" => s"\1");

In [7]:
# extract names of the BTypes, remove the "/" and replace it with "_" to avoid confusion with file paths
replace!(filtered_df.BType, "CD27-memory-and-Plasmablast/Plasma-B-Cells" => "CD27-memory-and-Plasmablast_Plasma-B-Cells")
replace!(filtered_df.BType, "Plasmablast/Plasma-B-Cells" => "Plasmablast_Plasma-B-Cells")

unique_btypes = unique(filtered_df.BType)

println("unique BTypes: ", unique_btypes)

unique BTypes: ["Memory-B-Cells", "Plasma-B-Cells", "Naive-B-Cells", "RV+B-Cells", "double-nagative-B-cells", "Plasmablast", "CD27-memory-and-Plasmablast_Plasma-B-Cells", "ASC", "Plasmablast_Plasma-B-Cells"]


In [8]:
# Group by BType
grouped_by_btype = groupby(filtered_df, :BType)


Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,general_v_gene_heavy,general_v_gene_light
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String,String,String7,String,String
1,GGCTTTCTGAGAGTCATGGGCCTCCTGTGCAAGAACATGAAGCACCTGTGGTTTTTCCTCCTGCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTTAAGCCTTTGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTTATTACATCAGCGATGGTTACTTCTGGGCCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGCTTGGGGCTCTCTATCATACTGGGACCACCTACTACAACCCGTCCCTCAAGCGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAACTTTCCCTGAGGCTGAACTCTGTGACCGCCGCAGACACGGCCGTGTATTATTGTGCGAGAGACCCAACCCCGGGAACGCCGGTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV4-38-2*02,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTTAAGCCTTTGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTTATTACATCAGCGATGGTTACTTCTGGGCCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGCTTGGGGCTCTCTATCATACTGGGACCACCTACTACAACCCGTCCCTCAAGCGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAACTTTCCCTGAGGCTGAACTCTGTGACCGCCGCAGACACGGCCGTGTATTATTGTGCGAGAGACCCAACCCCGGGAACGCCGGTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPLETLSLTCTVSGYYISDGYFWAWIRQPPGKGLEWLGALYHTGTTYYNPSLKRRVTISVDTSKNQLSLRLNSVTAADTAVYYCARDPTPGTPVDYWGQGTLVTVSS,QVQLQESGPGLVKPSETLSLTCTVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXGTXXDYWGQGTLVTVSS,ARDPTPGTPVDY,GGGAGGAATCAGTCCCACTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTGCCAGGTGTGACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCGGGCAAGTCAGGGGATTAGAGATGATTTAGGCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCGCCTGATCTATGCTGCATCCAATTTGCAAAGTGGGGTCCCATCGAGGTTCAGCGGCAGTGGATCTGGGACAGAATTCACTCTCACAATCAGCAGCCTGCAGCCTGAAGATTCTGCAACTTATTACTGTCTACAGCATAATATTTACCCTCGGACGTTCGGCCAAGGGACCAGGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-17*01,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCGGGCAAGTCAGGGGATTAGAGATGATTTAGGCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCGCCTGATCTATGCTGCATCCAATTTGCAAAGTGGGGTCCCATCGAGGTTCAGCGGCAGTGGATCTGGGACAGAATTCACTCTCACAATCAGCAGCCTGCAGCCTGAAGATTCTGCAACTTATTACTGTCTACAGCATAATATTTACCCTCGGACGTTCGGCCAAGGGACCAGGGTGGAAATCAAAC,DIQMTQSPSSLSASVGDRVTITCRASQGIRDDLGWYQQKPGKAPKRLIYAASNLQSGVPSRFSGSGSGTEFTLTISSLQPEDSATYYCLQHNIYPRTFGQGTRVEIK,DIQMTQSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKRLIYAASSLQSGVPSRFSGSGSGTEFTLTISSLQPEDFATYYCLQHNSYPRTFGQGTKVEIK,LQHNIYPRT,QVQLQESGPGLVKPLETLSLTCTVSGYYISDGYFWAWIRQPPGKGLEWLGALYHTGTTYYNPSLKRRVTISVDTSKNQLSLRLNSVTAADTAVYYCARDPTPGTPVDYWGQGTLVTVSS[SEP]DIQMTQSPSSLSASVGDRVTITCRASQGIRDDLGWYQQKPGKAPKRLIYAASNLQSGVPSRFSGSGSGTEFTLTISSLQPEDSATYYCLQHNIYPRTFGQGTRVEIK,Memory-B-Cells,,human,IGHV4-38-2,IGKV1-17
2,ATAAGGGAAATGCTTTCTGAGAGTCATGGACCTCCTGTGCAAGAACATGAAGCACCTGTGGTTCTTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAGTAGTTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCTATTATAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAATACAAAGGATAGCAGCAGCTGGTACAACCCTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCCCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV4-39*07,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAGTAGTTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCTATTATAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAATACAAAGGATAGCAGCAGCTGGTACAACCCTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARIQRIAAAGTTLDYWGQGTLVTVSS,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXIAAAGTXXDYWGQGTLVTVSS,ARIQRIAAAGTTLDY,GAGCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTATACAGCTCCAACAATAAGAACTACTTAGCTTGGTACCAGCAGAAACCAGGACAGCCTCCTAAGCTGCTCATTTACTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAGTATTATAGTACTCCCTTATTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTATACAGCTCCAACAATAAGAACTACTTAGCTTGGTACCAGCAGAAACCAGGACAGCCTCCTAAGCTGCTCATTTACTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAGTATTATAGTACTCCCTTATTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAAC,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLFTFGPGTKVDIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPXFTFGPGTKVDIK,QQYYSTPLFT,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARIQRIAAAGTTLDYWGQGTLVTVSS[SEP]DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLFTFGPGTKVDIK,Memory-B-Cells,,human,IGHV4-39,IGKV4-1
3,AGTGCTTTCTGAGAGTCATGGACCTCCTGCACAAGAACATGAAACACCTGTGGTTCTTCCTCCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTACAGCAGTGGGGCGCAGGACTATTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCAGTGTCTATGGTGGGACCTTCAGTGGTTACTACTGGACCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAATGGATTGGGGAAATCAATCATAGTGGAAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTCTATTACTGTGCGAGACGGGCCGGGGGATATTGTGATAGTTCCGCCTGCTCGACCTACTGGTATCTCGATGTCTGGGGCCGTGGCACCCTGGTCACTGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV4-34*01,CAGGTGCAGCTACAGCAGTGGGGCGCAGGACTATTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCAGTGTCTATGGTGGGACCTTCAGTGGTTACTACTGGACCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAATGGATTGGGGAAATCAATCATAGTGGAAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTCTATTACTGTGCGAGACGGGCCGGGGGATATTGTGATAGTTCCGCCTGCTCGACCTACTGGTATCTCGATGTCTGGGGCCGTGGCACCCTGGTCACTGTCTCCTCAG,QVQLQQWGAGLLKPSETLSLTCSVYGGTFSGYYWTWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARRAGGYCDSSACSTYWYLDVWGRGTLVTVSS,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXGYCSSTSCXXYWYFDLWGRGTLVTVSS,ARRAGGYCDSSACSTYWYLDV,GCTCTGCTTCAGCTGTGGGCACAAGAGGCAGCACTCAGGACAATCTCCAGCATGGCCTGGTCTCCTCTCCTCCTCACTCTCCTCGCTCACTGCACAGGGTCCTGGGCCCAGTCTGTGCTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGCAGAGGGTCACCATCTCCTGCACTGGGAGCAGCTCCAACATCGGGGCAGGTTATGATGTACACTGGTACCGGCAACTTCCAGGAACAGCCCCCAAACTCCTCATTTCTGGCAATAGTAATCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCATCCCTGGCCATCACTGGGCTCCAGGCTGAGGATGAGGCTGATTTTTACTGCCAGTCCTATGACACCAGCCTGAGTAGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-40*01,CAGTCTGTGCTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGCAGAGGGTCACCATCTCCTGCACTGGGAGCAGCTCCAACATCGGGGCAGGTTATGATGTACACTGGTACCGGCAACTTCCAGGAACAGCCCCCAAACTCCTCATTTCTGGCAATAGTAATCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCATCCCTGGCCATCACTGGGCTCCAGGCTGAGGATGAGGCTGATTTTTACTGCCAGTCCTATGACACCAGCCTGAGTAGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYRQLPGTAPKLLISGNSNRPSGVPDRFSGSKSGTSASLAITGLQAEDEADFYCQSYDTSLSSWVFGGGTKLTVL,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYQQLPGTAPKLLIYGNSNRPSGVPDRFSGSKSGTSASLAITGLQAEDEADYYCQSYDSSLSGWVFGGGTKLTVL,QSYDTSLSSWV,QVQLQQWGAGLLKPSETLSLTCSVYGGTFSGYYWTWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARRAGGYCDSSACSTYWYLDVWGRGTLVTVSS[SEP]QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYRQLPGTAPKLLISGNSNRPSGVPDRFSGSKSGTSASLAITGLQAEDEADFYCQSYDTSLSSWVFGGGTKLTVL,Memory-B-Cells,,human,IGHV4-34,IGLV1-40
4,GAGGGTCCTGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACACCTGTGGTTTTTCCTCCTGCTGGAGGCAGCTCCCAGATGGGTCGTGTCCCAGGTGCAGGTACAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGATGGCTCCATCAGCGGTGGTTATTTCTATTGGAGTTGGATCCGCCAGCCCCCAGGGAAGGGCCTGGAGTGGGTTGGGTCCATCCATCACAGTGGGAACACCTACTACAACCCGGCCCTCGAGAGTCGAGTTACCGTATCAATAGACACGTCGATGAAGCAGTTCTCCCTGAAAATGAGGTCTGTGACGGCCGCAGACACGGCCGTGTATTTCTGTGCCAGAGGAGAGGCAACGGTGATGATGTTTCCTCCGGACTACTGGGGCCAGGGAACCCGGGTCATCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCCCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV4-30-4*01,CAGGTGCAGGTACAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGATGGCTCCATCAGCGGTGGTTATTTCTATTGGAGTTGGATCCGCCAGCCCCCAGGGAAGGGCCTGGAGTGGGTTGGGTCCATCCATCACAGTGGGAACACCTACTACAACCCGGCCCTCGAGAGTCGAGTTACCGTATCAATAGACACGTCGATGAAGCAGTTCTCCCTGAAAATGAGGTCTGTGACGGCCGCAGACACGGCCGTGTATTTCTGTGCCAGAGGAGAGGCAACGGTGATGATGTTTCCTCCGGACTACTGGGGCCAGGGAACCCGGGTCATCGTCTCCTCAG,QVQVQESGPGLVKPSQTLSLTCTVSDGSISGGYFYWSWIRQPPGKGLEWVGSIHHSGNTYYNPALESRVTVSIDTSMKQFSLKMRSVTAADTAVYFCARGEATVMMFPPDYWGQGTRVIVSS,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGDYYWSWIRQPPGKGLEWIGYIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXTVXXXXXDYWGQGTLVTVSS,ARGEATVMMFPPDY,TTATGGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCCGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGCCGTTGGTGGTTACAACTTTGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCGTGATTTATGGTGTCAGTAATCGGCCCTCAGGGGTTTCTGATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCGCGTCATATAGAACCGACACCATTGTGTTCGGCGGAGGGACGAAGCTGACCGTCCTAAGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCCGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGCCGTTGGTGGTTACAACTTTGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCGTGATTTATGGTGTCAGTAATCGGCCCTCAGGGGTTTCTGATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCGCGTCATATAGAACCGACACCATTGTGTTCGGCGGAGGGACGAAGCTGACCGTCCTA,QSAPTQPASVSGSPGQSITISCTGTSSAVGGYNFVSWYQQHPGKAPKLVIYGVSNRPSGVSDRFSGSKSGNTASLTISGLQADDEADYYCASYRTDTIVFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSXVFGGGTKLTVL,ASYRTDTIV,QVQVQESGPGLVKPSQTLSLTCTVSDGSISGGYFYWSWIRQPPGKGLEWVGSIHHSGNTYYNPALESRVTVSIDTSMKQFSLKMRSVTAADTAVYFCARGEATVMMFPPDYWGQGTRVIVSS[SEP]QSAPTQPASVSGSPGQSITISCTGTSSAVGGYNFVSWYQQHPGKAPKLVIYGVSNRPSGVSDRFSGSKSGNTASLTISGLQADDEADYYCASYRTDTIVFGGGTKLTVL,Memory-B-Cells,,human,IGHV4-30-4,IGLV2-14
5,CCACATCCCTCCTCAGAAGCCCCCAGAGCACAACGCCTCACCATGGACTGGACCTGGAGGATCCTCTTTTTGGTGGCAGCAGCCACAGGTGCCCACTCCCAGGTCCAACTTGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAGGACTTCTGGATACAACTTCCCTTTTTATACTATACATTGGGTGCGCCAGGCCCCCGGACAAGGGCTTGAGTGGATGGGATGGGTCAACGCTGGCAATGGTGACACAAGATATTCACAGAACTTCCAGGGCAGAGTCACCATTACCAGGGACACATCCGCGAGCACAGCCTACATGGAGCTGAGAAGCCTGAGATCTGAGGACACAGCTGTCTATTTTTGTGCGAGAGGGGTTATGCTGGTTTCTTCGCTTGATTACTGGGGCCAGGGAACCCTGATCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV1-3*01,CAGGTCCAACTTGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAGGACTTCTGGATACAACTTCCCTTTTTATACTATACATTGGGTGCGCCAGGCCCCCGGACAAGGGCTTGAGTGGATGGGATGGGTCAACGCTGGCAATGGTGACACAAGATATTCACAGAACTTCCAGGGCAGAGTCACCATTACCAGGGACACATCCGCGAGCACAGCCTACATGGAGCTGAGAAGCCTGAGATCTGAGGACACAGCTGTCTATTTTTGTGCGAGAGGGGTTATGCTGGTTTCTTCGCTTGATTACTGGGGCCAGGGAACCCTGATCACCGTCTCCTCAG,QVQLVQSGAEVKKPGASVKVSCRTSGYNFPFYTIHWVRQAPGQGLEWMGWVNAGNGDTRYSQNFQGRVTITRDTSASTAYMELRSLRSEDTAVYFCARGVMLVSSLDYWGQGTLITVSS,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYAMHWVRQAPGQRLEWMGWINAGNGNTKYSQKFQGRVTITRDTSASTAYMELSSLRSEDTAVYYCARXVMLXXXXDYWGQGTLVTVSS,ARGVMLVSSLDY,AGTCTGGGCGTAAGGAAGCAGCACTGGTGGTGCCTCAGCCATGGCCTGGACCGTTCTCCTCCTCGGCCTCCTCTCTCACTGCACAGGGTCTGCGACCTCCTATGTGCTGACTCAGCCACCCTCGGTGTCAGTGGCCCCAGGACAGACGGCCAGGATTTCCTGTGGGGGAAACTCCATTGGAGCTAAAAGCGTGCATTGGTACCAGCAGAAGCCAGGCCAGGCCCCTGTGTTGGTCGTCCGTGATGACAGGGCCCGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAACTCTTGGAACACGGCCACCCTGACCATCAGCAGGGTCGAAGCCGGGGATGAGGCCGACTATTACTGTCAGGTCTGGGATCCTAATAGTGATCATCCAGGGGTGTTCGGCGGAGGGACCAAGTTGACTGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV3-21*02,TCCTATGTGCTGACTCAGCCACCCTCGGTGTCAGTGGCCCCAGGACAGACGGCCAGGATTTCCTGTGGGGGAAACTCCATTGGAGCTAAAAGCGTGCATTGGTACCAGCAGAAGCCAGGCCAGGCCCCTGTGTTGGTCGTCCGTGATGACAGGGCCCGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAACTCTTGGAACACGGCCACCCTGACCATCAGCAGGGTCGAAGCCGGGGATGAGGCCGACTATTACTGTCAGGTCTGGGATCCTAATAGTGATCATCCAGGGGTGTTCGGCGGAGGGACCAAGTTGACTGTCCTAG,SYVLTQPPSVSVAPGQTARISCGGNSIGAKSVHWYQQKPGQAPVLVVRDDRARPSGIPERFSGSNSWNTATLTISRVEAGDEADYYCQVWDPNSDHPGVFGGGTKLTVL,SYVLTQPPSVSVAPGQTARITCGGNNIGSKSVHWYQQKPGQAPVLVVYDDSDRPSGIPERFSGSNSGNTATLTISRVEAGDEADYYCQVWDSSSDHPXVFGGGTKLTVL,QVWDPNSDHPGV,QVQLVQSGAEVKKPGASVKVSCRTSGYNFPFYTIHWVRQAPGQGLEWMGWVNAGNGDTRYSQNFQGRVTITRDTSASTAYMELRSLRSEDTAVYFCARGVMLVSSLDYWGQGTLITVSS[SEP]SYVLTQPPSVSVAPGQTARISCGGNSIGAKSVHWYQQKPGQAPVLVVRDDRARPSGIPERFSGSNSWNTATLTISRVEAGDEADYYCQVWDPNSDHPGVFGGGTKLTVL,Memory-B-Cells,,human,IGHV1-3,IGLV3-21
6,CGAGCCCAGCACTGGAAGTCGCCGGTGTTTCCATTCGGTGATCATCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGTTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCACTGTCAGGTGCAAATGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGAAGGTCCCTGAGACTCTCCTGTGCAACGTCTGGATTTAAGTTCGGTATCTATCCTATGCACTGGGTCCGGCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCTGTCATCTCCTATGATGGGAGCAAAACAGACTACGCAGACTCCCTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACTTTGTTTCTCCAAATGAACAACCTGAGACCTGAGGACACGGCTGTGTATTTCTGTGCGACCCCGGAAGGCAATTGTGATGGTGTTTGCTATTGGTCGGAGGGATACTTCCAGCAGTGGGGCCAGGGCACCCTGGTCAGCGTTTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-30-3*02,CAGGTGCAAATGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGAAGGTCCCTGAGACTCTCCTGTGCAACGTCTGGATTTAAGTTCGGTATCTATCCTATGCACTGGGTCCGGCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCTGTCATCTCCTATGATGGGAGCAAAACAGACTACGCAGACTCCCTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACTTTGTTTCTCCAAATGAACAACCTGAGACCTGAGGACACGGCTGTGTATTTCTGTGCGACCCCGGAAGGCAATTGTGATGGTGTTTGCTATTGGTCGGAGGGATACTTCCAGCAGTGGGGCCAGGGCACCCTGGTCAGCGTTTCCTCAG,QVQMVESGGGVVQPGRSLRLSCATSGFKFGIYPMHWVRQAPGKGLEWVAVISYDGSKTDYADSLKGRFTISRDNSKNTLFLQMNNLRPEDTAVYFCATPEGNCDGVCYWSEGYFQQWGQGTLVSVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAXXXXXCGGDCYXXXXYFQHWGQGTLVTVSS,ATPEGNCDGVCYWSEGYFQQ,GGGGAGAAGTCTCTCTCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGACTCCTGCTGCTCTGGCTCCCAGAAAGCAGCTGTGACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTATTGGAGACAGAGTCACCATCACTTGCCGGGCGAGTCAGGACATTAATAATTATTTAGCCTGGTATCAGCAGAAACCAGGGAAAGTTCCTAAACTCCTGATCTATGCTGCATCCACTTTGCATTCAGGAGTCCCATCTCGGTTCAGTGGCAGTGGGTCTGGGACAGATTTCTCTCTTACCATCAGCAGCCTGCAGCCTGACGATATTGCATCTTATTACTGTCAAAAGTATAACAGTGCCATCACCTTCGGCCAAGGGACCCGACTGGACATTAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-27*01,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTATTGGAGACAGAGTCACCATCACTTGCCGGGCGAGTCAGGACATTAATAATTATTTAGCCTGGTATCAGCAGAAACCAGGGAAAGTTCCTAAACTCCTGATCTATGCTGCATCCACTTTGCATTCAGGAGTCCCATCTCGGTTCAGTGGCAGTGGGTCTGGGACAGATTTCTCTCTTACCATCAGCAGCCTGCAGCCTGACGATATTGCATCTTATTACTGTCAAAAGTATAACAGTGCCATCACCTTCGGCCAAGGGACCCGACTGGACATTAAAC,DIQMTQSPSSLSASIGDRVTITCRASQDINNYLAWYQQKPGKVPKLLIYAASTLHSGVPSRFSGSGSGTDFSLTISSLQPDDIASYYCQKYNSAITFGQGTRLDIK,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWYQQKPGKVPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQKYNSAITFGQGTRLEIK,QKYNSAIT,QVQMVESGGGVVQPGRSLRLSCATSGFKFGIYPMHWVRQAPGKGLEWVAVISYDGSKTDYADSLKGRFTISRDNSKNTLFLQMNNLRPEDTAVYFCATPEGNCDGVCYWSEGYFQQWGQGTLVSVSS[SEP]DIQMTQSPSSLSASIGDRVTITCRASQDINNYLAWYQQKPGKVPKLLIYAASTLHSGVPSRFSGSGSGTDFSLTISSLQPDDIASYYCQKYNSAITFGQGTRLDIK,Memory-B-Cells,,human,IGHV3-30-3,IGKV1-27
7,AGCTCTGGGAGAGGAGCCCCAGCCTTGGGATTCCCAAGTGTTTTCATTCAGTGATCAGGACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGATTTTCCTTGCTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGGTGGTGGAGTCTGGGGGAGGCTTGGTAAAGCCGGGGGGGTCCCTCAGACTCTCCTGTGCAGCCTCTGGATTCACTTTCAGTAACGCCTGGATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGTATTAAAAGGAAAATTGAAGGTTGGACAACAGACTACGCTGCTCCCGTGAAAGGCAGATTCACCATTTCAAGAGATGATTCAAAAAACACACTGTATCTGCAAATGAACAGCCTGAAAACCGAGGACACAGCCATATATTATTGTACCACAGACACAGTGGCTACGAGTGATTATGCTTTTGATTTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-15*01,GAGGTGCAGGTGGTGGAGTCTGGGGGAGGCTTGGTAAAGCCGGGGGGGTCCCTCAGACTCTCCTGTGCAGCCTCTGGATTCACTTTCAGTAACGCCTGGATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGTATTAAAAGGAAAATTGAAGGTTGGACAACAGACTACGCTGCTCCCGTGAAAGGCAGATTCACCATTTCAAGAGATGATTCAAAAAACACACTGTATCTGCAAATGAACAGCCTGAAAACCGAGGACACAGCCATATATTATTGTACCACAGACACAGTGGCTACGAGTGATTATGCTTTTGATTTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG,EVQVVESGGGLVKPGGSLRLSCAASGFTFSNAWMNWVRQAPGKGLEWVGRIKRKIEGWTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAIYYCTTDTVATSDYAFDFWGQGTMVTVSS,EVQLVESGGGLVKPGGSLRLSCAASGFTFSNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTTXXVATXXXAFDVWGQGTMVTVSS,TTDTVATSDYAFDF,AGCTTCAGCTGTGGGTAGAGAAGACAGGACTCAGGACAATCTCCAGCATGGCCAGCTTCCCTCTCCTCCTCACCCTCCTCACTCACTGTGCAGGGTCCTGGGCCCAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCCTCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAAGACTGTAAATTGGTACCAGCAGCTCCCAGGAGCGGCCCCCAAACTCCTCATCTATACTACTGATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCTGCCTCCCTGGCCATCAGTGGGCTCCAGTCTGAGGATGAGGCTGATTATTACTGTGCAGCATGGGATGACAGCCTGAATGGTTATGTCTTCGGAACTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-44*01,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCCTCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAAGACTGTAAATTGGTACCAGCAGCTCCCAGGAGCGGCCCCCAAACTCCTCATCTATACTACTGATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCTGCCTCCCTGGCCATCAGTGGGCTCCAGTCTGAGGATGAGGCTGATTATTACTGTGCAGCATGGGATGACAGCCTGAATGGTTATGTCTTCGGAACTGGGACCAAGGTCACCGTCCTAG,QSVLTQPPSASGTPGQRVTLSCSGSSSNIGSKTVNWYQQLPGAAPKLLIYTTDQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDSLNGYVFGTGTKVTVL,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPKLLIYSNNQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDSLNGYVFGTGTKVTVL,AAWDDSLNGYV,EVQVVESGGGLVKPGGSLRLSCAASGFTFSNAWMNWVRQAPGKGLEWVGRIKRKIEGWTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAIYYCTTDTVATSDYAFDFWGQGTMVTVSS[SEP]QSVLTQPPSASGTPGQRVTLSCSGSSSNIGSKTVNWYQQLPGAAPKLLIYTTDQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDSLNGYVFGTGTKVTVL,Memory-B-Cells,,human,IGHV3-15,IGLV1-44
8,CTGCTGAAGAAAACCAGCCCTGCAGCTCTGGGAGAGGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAACTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCTACTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTACTGGATGTACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTAATATTGATGGGAGTAGGACAACCTACGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACACTATATCTGCAAATGAACAGTCTGACAGCCGAGGACACGGCTGTATATTACTGTGCGAGAGACCCGGACACTTCGAACAAAATTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-74*01,GAGGTGCTACTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTACTGGATGTACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTAATATTGATGGGAGTAGGACAACCTACGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACACTATATCTGCAAATGAACAGTCTGACAGCCGAGGACACGGCTGTATATTACTGTGCGAGAGACCCGGACACTTCGAACAAAATTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVLLVESGGGLVQPGGSLRLSCAASGFTFSSYWMYWVRQAPGKGLVWVSRINIDGSRTTYADSVKGRFTISRDNAKNTLYLQMNSLTAEDTAVYYCARDPDTSNKIDYWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARXXXXXXXXDYWGQGTLVTVSS,ARDPDTSNKIDY,GATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCCAGTGGGGATGTTGTGGTGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCGGGTCTAGTGAAAGCCTCCTATACAGTAATGGAAACACCTACTTGAGTTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTAATTTATCAGGTTTCTAACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGTTGAGGATGTTGGGGTTTATTTCTGCATGCAAGGTACATATTTGCCGATCACCTTCGGCCAGGGGACACGACTGGAGATTAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,GATGTTGTGGTGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCGGGTCTAGTGAAAGCCTCCTATACAGTAATGGAAACACCTACTTGAGTTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTAATTTATCAGGTTTCTAACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGTTGAGGATGTTGGGGTTTATTTCTGCATGCAAGGTACATATTTGCCGATCACCTTCGGCCAGGGGACACGACTGGAGATTAAAC,DVVVTQSPLSLPVTLGQPASISCGSSESLLYSNGNTYLSWFQQRPGQSPRRLIYQVSNRDSGVPDRFSGSGSGTDFTLKISRVEVEDVGVYFCMQGTYLPITFGQGTRLEIK,DVVMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWPITFGQGTRLEIK,MQGTYLPIT,EVLLVESGGGLVQPGGSLRLSCAASGFTFSSYWMYWVRQAPGKGLVWVSRINIDGSRTTYADSVKGRFTISRDNAKNTLYLQMNSLTAEDTAVYYCARDPDTSNKIDYWGQGTLVTVSS[SEP]DVVVTQSPLSLPVTLGQPASISCGSSESLLYSNGNTYLSWFQQRPGQSPRRLIYQVSNRDSGVPDRFSGSGSGTDFTLKISRVEVEDVGVYFCMQGTYLPITFGQGTRLEIK,Memory-B-Cells,,human,IGHV3-74,IGKV2-30
9,AGCTCTCAGAGAGGTGCCTTAGCCCTGGATTCCAAGGCATTTCCACTTGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTGGGGCTGTGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTCAGTACCTATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTAGTAGTAGTAGTAATAGCATATACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGTCTCATTATAGACTGGGGATCTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-48*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTCAGTACCTATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTAGTAGTAGTAGTAATAGCATATACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGTCTCATTATAGACTGGGGATCTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFIFSTYSMNWVRQAPGKGLEWVSYISSSSNSIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCASLIIDWGSDYWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSYISSSSSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAXXXXXWGXDYWGQGTLVTVSS,ASLIIDWGSDY,GATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCCAGTGGGGAAGTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTATACAATGATGGAAACACCTACTTGAATTGGTTTCACCAGAGGCCAGGCCAATCTCCAAGGCGCCTAATTTATAAGGTTTCTAACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGGTACACAGTGGCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,GAAGTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTATACAATGATGGAAACACCTACTTGAATTGGTTTCACCAGAGGCCAGGCCAATCTCCAAGGCGCCTAATTTATAAGGTTTCTAACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGGTACACAGTGGCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,EVVMTQSPLSLPVTLGQPASISCRSSQSLVYNDGNTYLNWFHQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTQWPLTFGGGTKVEIK,DVVMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWPLTFGGGTKVEIK,MQGTQWPLT,EVQLVESGGGLVQPGGSLRLSCAASGFIFSTYSMNWVRQAPGKGLEWVSYISSSSNSIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCASLIIDWGSDYWGQGTLVTVSS[SEP]EVVMTQSPLSLPVTLGQPASISCRSSQSLVYNDGNTYLNWFHQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTQWPLTFGGGTKVEIK,Memory-B-Cells,,human,IGHV3-48,IGKV2-30
10,AGCTCTGAGAGAGGAGCCTTAGCCCTGGATTCCAAGGCCTATCCACTTGGTGATCAGCACTGAGTACCGAGGATTAACCATGGAACTGGGGCTCCGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCCTGGTCAAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGGCTCTGGATTCACCTTCAGTGCCTATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCATCCATTAGTACGAAGAGTAATTACATATTCTATGCAGACTCACTGAAGGGCCGATTCACCATATCCAGAGACAGCGCCAAGAACTCAGTCTTTCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGGAAACTAGGATATTGTACTGCTGATGTATGTTATGGGGGGGATGCCTTTGATCTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-21*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCCTGGTCAAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGGCTCTGGATTCACCTTCAGTGCCTATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCATCCATTAGTACGAAGAGTAATTACATATTCTATGCAGACTCACTGAAGGGCCGATTCACCATATCCAGAGACAGCGCCAAGAACTCAGTCTTTCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGGAAACTAGGATATTGTACTGCTGATGTATGTTATGGGGGGGATGCCTTTGATCTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG,EVQLVESGGGLVKPGGSLRLSCVGSGFTFSAYSMNWVRQAPGKGLEWVSSISTKSNYIFYADSLKGRFTISRDSAKNSVFLQMNSLRAEDTAVYYCARKLGYCTADVCYGGDAFDLWGQGTMVTVSS,EVQLVESGGGLVKPGGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAXXXGYCTGGVCYXXDAFDVWGQGTMVTVSS,ARKLGYCTADVCYGGDAFDL,AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGACACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGTAGGGCCAGTCAGAGTATTGCCGACAACTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGGAGATTTTGCAGTGTATTACTGTCAGCAATATGGTACCTCATCGTACACTTTTGGCCAGGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGTAGGGCCAGTCAGAGTATTGCCGACAACTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGGAGATTTTGCAGTGTATTACTGTCAGCAATATGGTACCTCATCGTACACTTTTGGCCAGGGGACCAAGGTGGAGATCAAAC,EIVLTQSPGTLSLSPGERATLSCRASQSIADNYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPGDFAVYYCQQYGTSSYTFGQGTKVEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSXYTFGQGTKLEIK,QQYGTSSYT,EVQLVESGGGLVKPGGSLRLSCVGSGFTFSAYSMNWVRQAPGKGLEWVSSISTKSNYIFYADSLKGRFTISRDSAKNSVFLQMNSLRAEDTAVYYCARKLGYCTADVCYGGDAFDLWGQGTMVTVSS[SEP]EIVLTQSPGTLSLSPGERATLSCRASQSIADNYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPGDFAVYYCQQYGTSSYTFGQGTKVEIK,Memory-B-Cells,,human,IGHV3-21,IGKV3-20

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,general_v_gene_heavy,general_v_gene_light
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String,String,String7,String,String
1,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCATCATGGAGTTTGGGCTGAGTTGGCTTTTTCTTGTGGCTACTTTAAAAGGTGTCCAGTGTGTGGTGCAGCTGGTGGAGTCGGGGGGAGGCTTGGTACAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCGCTTTTAGAAACTATGTCATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGGTATTAGTGACAGTGGTACTAACACATACTATGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAGGACCACGCTGTATCTGCAAATGAGCAGCCTGAGAGCCGAGGACGCGGCCGTATATTACTGTGCGAAGCCCCCCGACTGGAACCCCTTCTTATTACGATATTTTGTCGACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-23*04,GGTGCAGCTGGTGGAGTCGGGGGGAGGCTTGGTACAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCGCTTTTAGAAACTATGTCATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGGTATTAGTGACAGTGGTACTAACACATACTATGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAGGACCACGCTGTATCTGCAAATGAGCAGCCTGAGAGCCGAGGACGCGGCCGTATATTACTGTGCGAAGCCCCCCGACTGGAACCCCTTCTTATTACGATATTTTGTCGACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,VQLVESGGGLVQPGGSLRLSCAASGFAFRNYVMSWVRQAPGKGLEWVSGISDSGTNTYYADSVKGRFTISRDNSRTTLYLQMSSLRAEDAAVYYCAKPPDWNPFLLRYFVDWGQGTLVTVSS,VQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAXXXXXXXXXLRYFDYWGQGTLVTVSS,AKPPDWNPFLLRYFVD,GGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGCGGTTATAACTATGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGCTTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAACACTTATGTCTTCGGAAGTGGGACCAAGGTCACCGTCCTTAGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGCGGTTATAACTATGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGCTTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAACACTTATGTCTTCGGAAGTGGGACCAAGGTCACCGTCCT,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQHHPGKAPKLMLYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSNTYVFGSGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTYVFGTGTKVTVL,SSYTSSNTYV,VQLVESGGGLVQPGGSLRLSCAASGFAFRNYVMSWVRQAPGKGLEWVSGISDSGTNTYYADSVKGRFTISRDNSRTTLYLQMSSLRAEDAAVYYCAKPPDWNPFLLRYFVDWGQGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQHHPGKAPKLMLYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSNTYVFGSGTKVTVL,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV3-23,IGLV2-14
2,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTCTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCCGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGAAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGCAATGAATATTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACACTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAGTCGTGGGCGTCCCGGACCAAGGCTACTACTACTACGGTATGGACGTCTGGGGCCAGGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-30*18,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCCGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGAAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCATATGATGGAAGCAATGAATATTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACACTGTATCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAGTCGTGGGCGTCCCGGACCAAGGCTACTACTACTACGGTATGGACGTCTGGGGCCAGGGGACCACGGTCACCGTCTCCTCA,QVQLVESGGGVVQPGRSPRLSCAASGFTFRSYGMHWVRQAPGKGLEWVAVISYDGSNEYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKVVGVPDQGYYYYGMDVWGQGTTVTVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXVGXXXXXYYYYGMDVWGQGTTVTVSS,AKVVGVPDQGYYYYGMDV,GGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACATTGGTGATTATAACTATGTCTCCTGGTATCAACAGTACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAAGCGGCCCTCAGGGGTTTCTGATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCGGGACTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGTACCATTTATGTCTTCGGAACTGGGACCAAGGTCTCCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*01,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACATTGGTGATTATAACTATGTCTCCTGGTATCAACAGTACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAAGCGGCCCTCAGGGGTTTCTGATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCGGGACTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGTACCATTTATGTCTTCGGAACTGGGACCAAGGTCTCCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDIGDYNYVSWYQQYPGKAPKLMIYDVSKRPSGVSDRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSTIYVFGTGTKVSVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYEVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSXYVFGTGTKVTVL,SSYTSSTIYV,QVQLVESGGGVVQPGRSPRLSCAASGFTFRSYGMHWVRQAPGKGLEWVAVISYDGSNEYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKVVGVPDQGYYYYGMDVWGQGTTVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDIGDYNYVSWYQQYPGKAPKLMIYDVSKRPSGVSDRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSTIYVFGTGTKVSVL,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV3-30,IGLV2-14
3,ACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTCCTCCTGGTGGCAGCTCCCAGATGGGTCCTTTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGCAGCCTTCACAGACCCTGTCCCTCACCTGCATTGTCTCTGGTGGCTCCATTAGCAGTGGTACTTACTACTGGAGCTGGATCCGGCAGCCCGCCGGGAAGGGACTGGAGTGGATTGGGCGTATCTATACCAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAGGCTGAGTTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGAGAGGAGCGGGGGAAGTATTACTATGATAGTAGTGGCTATTACCCGGGGCACCTTGACTCCTGGGGCCAGGGAACCCTGGTCAGTGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV4-61*02,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGCAGCCTTCACAGACCCTGTCCCTCACCTGCATTGTCTCTGGTGGCTCCATTAGCAGTGGTACTTACTACTGGAGCTGGATCCGGCAGCCCGCCGGGAAGGGACTGGAGTGGATTGGGCGTATCTATACCAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAGGCTGAGTTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGAGAGGAGCGGGGGAAGTATTACTATGATAGTAGTGGCTATTACCCGGGGCACCTTGACTCCTGGGGCCAGGGAACCCTGGTCAGTGTCTCCTCAG,QVQLQESGPGLVQPSQTLSLTCIVSGGSISSGTYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLRLSSVTAADTAVYYCAREERGKYYYDSSGYYPGHLDSWGQGTLVSVSS,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXXYYYDSSGYYXXXXDYWGQGTLVTVSS,AREERGKYYYDSSGYYPGHLDS,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAGTTATAACCTTGTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCATTAAGCGGCCCTCAGGACTTTCTACTCGCTTCTCTGGCTCCAAGTCTGGCATCACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGAGGACGAGGCTGAATATTACTGCTGCTCATATGCAGGTAGTGGCCTTTGGGTGTTCGGCGGAGGGACCGAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAGTTATAACCTTGTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCATTAAGCGGCCCTCAGGACTTTCTACTCGCTTCTCTGGCTCCAAGTCTGGCATCACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGAGGACGAGGCTGAATATTACTGCTGCTCATATGCAGGTAGTGGCCTTTGGGTGTTCGGCGGAGGGACCGAGCTGACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVIKRPSGLSTRFSGSKSGITASLTISGLQAEDEAEYYCCSYAGSGLWVFGGGTELTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSSXWVFGGGTKLTVL,CSYAGSGLWV,QVQLQESGPGLVQPSQTLSLTCIVSGGSISSGTYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLRLSSVTAADTAVYYCAREERGKYYYDSSGYYPGHLDSWGQGTLVSVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVIKRPSGLSTRFSGSKSGITASLTISGLQAEDEAEYYCCSYAGSGLWVFGGGTELTVL,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV4-61,IGLV2-23
4,TGGGGAGTGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTCTCTGGGTTCTCACTCAACACTAATGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAGAGGCCCTGGAGTGGCTTGCACTCATTTATTGGGATGGTGATGAGCGCTACAGTCCATCTCTGAAGAGCAGGCTCACCATGACCAAGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGTGGACACAGCCACATATTACTGTGCACACCATTATTATGGTTCGGGAGGTTCTTATTCTTCATTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV2-5*02,CAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTCTCTGGGTTCTCACTCAACACTAATGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAGAGGCCCTGGAGTGGCTTGCACTCATTTATTGGGATGGTGATGAGCGCTACAGTCCATCTCTGAAGAGCAGGCTCACCATGACCAAGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGTGGACACAGCCACATATTACTGTGCACACCATTATTATGGTTCGGGAGGTTCTTATTCTTCATTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QITLKESGPTLVKPTQTLTLTCTFSGFSLNTNGVGVGWIRQPPGEALEWLALIYWDGDERYSPSLKSRLTMTKDTSKNQVVLTMTNMDPVDTATYYCAHHYYGSGGSYSSFDYWGQGTLVTVSS,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHXYYGSGSYYXXFDYWGQGTLVTVSS,AHHYYGSGGSYSSFDY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTTCCAGATGCGACATCCAGATGACCCAGTCTCCATCTTCCGTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGGTATTGGCACCTGGTTAGCCTGGTATCAGCAGAAACCAGGGAGAGCCCCTAAGCTCCTGATCTATGCTACATCCAGATTGCAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCTTCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGTTAACAGTTTCCCGTTCACTTTTGGCCAGGGGTCCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-12*01,GACATCCAGATGACCCAGTCTCCATCTTCCGTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGGTATTGGCACCTGGTTAGCCTGGTATCAGCAGAAACCAGGGAGAGCCCCTAAGCTCCTGATCTATGCTACATCCAGATTGCAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCTTCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGTTAACAGTTTCCCGTTCACTTTTGGCCAGGGGTCCAAGCTGGAGATCAAAC,DIQMTQSPSSVSASVGDRVTITCRASQGIGTWLAWYQQKPGRAPKLLIYATSRLQSGVPSRFSGSGSGTDFTLTFSSLQPEDFATYYCQQVNSFPFTFGQGSKLEIK,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPXTFGQGTKLEIK,QQVNSFPFT,QITLKESGPTLVKPTQTLTLTCTFSGFSLNTNGVGVGWIRQPPGEALEWLALIYWDGDERYSPSLKSRLTMTKDTSKNQVVLTMTNMDPVDTATYYCAHHYYGSGGSYSSFDYWGQGTLVTVSS[SEP]DIQMTQSPSSVSASVGDRVTITCRASQGIGTWLAWYQQKPGRAPKLLIYATSRLQSGVPSRFSGSGSGTDFTLTFSSLQPEDFATYYCQQVNSFPFTFGQGSKLEIK,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV2-5,IGKV1-12
5,ACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTCCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTCTGTCTGGTGGCTCCATCAGCAGTACCAGTTACTACTGGACCTGGATCCGGCAGACCGCCGAGAAGGGACTGGAGTGGATTGGGCGTATCTATGCCAGTGGGGCCACCCACTATAACCCCTCCCTCAAGAGTCGAGTCACCATGTCAATAGACACGTCCAAGAATGAATTCTCCCTGACGGTGACGTCTGTGACCGCCGCAGACACGGCCGTATATTTCTGTGCGAGGGGCCCCTTCGAGTTTAACAGCTATCGGGGTGCTTTTGATATCTGGGGCCAAGGGATAATGGTCACCGTCTCTTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCCCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV4-61*02,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTCTGTCTGGTGGCTCCATCAGCAGTACCAGTTACTACTGGACCTGGATCCGGCAGACCGCCGAGAAGGGACTGGAGTGGATTGGGCGTATCTATGCCAGTGGGGCCACCCACTATAACCCCTCCCTCAAGAGTCGAGTCACCATGTCAATAGACACGTCCAAGAATGAATTCTCCCTGACGGTGACGTCTGTGACCGCCGCAGACACGGCCGTATATTTCTGTGCGAGGGGCCCCTTCGAGTTTAACAGCTATCGGGGTGCTTTTGATATCTGGGGCCAAGGGATAATGGTCACCGTCTCTTCAG,QVQLQESGPGLVKPSQTLSLTCTLSGGSISSTSYYWTWIRQTAEKGLEWIGRIYASGATHYNPSLKSRVTMSIDTSKNEFSLTVTSVTAADTAVYFCARGPFEFNSYRGAFDIWGQGIMVTVSS,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXXXXXXSYXXAFDIWGQGTMVTVSS,ARGPFEFNSYRGAFDI,AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTCTTGACGCAGTCTCCTGGCACCCTGTCTTTGTCTCCAGGGGAGAGAGCCACCCTCTCCTGCAGGACCAGTCAGAGTGTTGGCAGCAGTTACTTAGGCTGGTATCAGCAGAGACGTGGCCAGGCTCCCAGGCTCCTCATTTATGGTGCATCCAGAAGGGTCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTACTTCACCTCTCACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTGTCTTGACGCAGTCTCCTGGCACCCTGTCTTTGTCTCCAGGGGAGAGAGCCACCCTCTCCTGCAGGACCAGTCAGAGTGTTGGCAGCAGTTACTTAGGCTGGTATCAGCAGAGACGTGGCCAGGCTCCCAGGCTCCTCATTTATGGTGCATCCAGAAGGGTCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTACTTCACCTCTCACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC,EIVLTQSPGTLSLSPGERATLSCRTSQSVGSSYLGWYQQRRGQAPRLLIYGASRRVTGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGTSPLTFGQGTKLEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPXTFGQGTKLEIK,QQYGTSPLT,QVQLQESGPGLVKPSQTLSLTCTLSGGSISSTSYYWTWIRQTAEKGLEWIGRIYASGATHYNPSLKSRVTMSIDTSKNEFSLTVTSVTAADTAVYFCARGPFEFNSYRGAFDIWGQGIMVTVSS[SEP]EIVLTQSPGTLSLSPGERATLSCRTSQSVGSSYLGWYQQRRGQAPRLLIYGASRRVTGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGTSPLTFGQGTKLEIK,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV4-61,IGKV3-20
6,GGGAGCATCACCCAGCAACCACATCTGTCCTCTAGAGAATCCCCTGAGAGCTCCGTTCCTCACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCCACAGGAGCCCACTCCCAGGTGCGCTTGGTGCAGTCGGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAGGGCTTCTGGATACACCTTCACCGGCAATTATATACACTGGGTGCGCCAGGCCCCTGGACAAGGGCCTGAATGGATGGGACGGATCAACTCTAATAGTGGTGGCACAAAATATGCACAGAAGTTTCAGGGCAGGGTCACCATGACCAGGGACACGTCCATCAATACGGTCTACGTGGAGCTGAATAGCCTGCGATCTGACGACACGGCCGTGTATTATTGTGCTACAGGGGTAATAGTTACGGATTCTTTTGATTTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTTAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV1-2*06,CAGGTGCGCTTGGTGCAGTCGGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAGGGCTTCTGGATACACCTTCACCGGCAATTATATACACTGGGTGCGCCAGGCCCCTGGACAAGGGCCTGAATGGATGGGACGGATCAACTCTAATAGTGGTGGCACAAAATATGCACAGAAGTTTCAGGGCAGGGTCACCATGACCAGGGACACGTCCATCAATACGGTCTACGTGGAGCTGAATAGCCTGCGATCTGACGACACGGCCGTGTATTATTGTGCTACAGGGGTAATAGTTACGGATTCTTTTGATTTCTGGGGCCAAGGGACAATGGTCACCGTCTCTT,QVRLVQSGAEVKKPGASVKVSCRASGYTFTGNYIHWVRQAPGQGPEWMGRINSNSGGTKYAQKFQGRVTMTRDTSINTVYVELNSLRSDDTAVYYCATGVIVTDSFDFWGQGTMVTVS,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGRINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARXXIVXDAFDVWGQGTMVTVS,ATGVIVTDSFDF,GGGACTGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCTCTGGATCCAGTGGGGATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGAATAGTAATGGCTACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGCTTCTAATCGGGCCACCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGCATTTATTACTGCATGCAAACTCTACAAACTCCTCGAACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-28*01,GATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGAATAGTAATGGCTACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGCTTCTAATCGGGCCACCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGCATTTATTACTGCATGCAAACTCTACAAACTCCTCGAACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,DIVMTQSPLSLPVTPGEPASISCRSSQSLLNSNGYNYLDWYLQKPGQSPQLLIYLASNRATGVPDRFSGSGSGTDFTLKISRVEAEDVGIYYCMQTLQTPRTFGQGTKVEIK,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPXTFGQGTKVEIK,MQTLQTPRT,QVRLVQSGAEVKKPGASVKVSCRASGYTFTGNYIHWVRQAPGQGPEWMGRINSNSGGTKYAQKFQGRVTMTRDTSINTVYVELNSLRSDDTAVYYCATGVIVTDSFDFWGQGTMVTVS[SEP]DIVMTQSPLSLPVTPGEPASISCRSSQSLLNSNGYNYLDWYLQKPGQSPQLLIYLASNRATGVPDRFSGSGSGTDFTLKISRVEAEDVGIYYCMQTLQTPRTFGQGTKVEIK,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV1-2,IGKV2-28
7,TCGATTTTATTTTCTTATATGGGGATGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAGTGAAGCACCTGTGGTTCTTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGCAGTAATTACTTCTGGGGTTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTACATTGGGAGTATCTATTATAGTGCGCGCACCTATATCAACCCGTCTCTCAGGAGTCGAGTCACCATGTCCGTCGACACATCCGGAAACCGGATTTCCCTCAAGCTGACCTCTGTGACCGCGGCAGACACGGCTCTGTATTTCTGTGTGAGACATGCCTTCCTGGGCGGACGGACCAAGGACTTTGACTTCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV4-39*01,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGCAGTAATTACTTCTGGGGTTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTACATTGGGAGTATCTATTATAGTGCGCGCACCTATATCAACCCGTCTCTCAGGAGTCGAGTCACCATGTCCGTCGACACATCCGGAAACCGGATTTCCCTCAAGCTGACCTCTGTGACCGCGGCAGACACGGCTCTGTATTTCTGTGTGAGACATGCCTTCCTGGGCGGACGGACCAAGGACTTTGACTTCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSNYFWGWIRQPPGKGLEYIGSIYYSARTYINPSLRSRVTMSVDTSGNRISLKLTSVTAADTALYFCVRHAFLGGRTKDFDFWGQGTLVTVSS,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXLGXXXXXFDYWGQGTLVTVSS,VRHAFLGGRTKDFDF,AGGAGTCAGACTCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGCTCCCAGGTGCCAAATGTGACATCCAGATGACCCAGTCTCCTTCCACCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCAGTTGTCGGGCCAGTCAGAGTATTAGTAACTGGGTGGCCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAACTCCTGATCTATGATGCCTCCAGTTTGGAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGGACAGAATTCACTCTCACCATCAGCAGCCTGCAGCCTGATGATTTTGCAACTTATTACTGCCAGCACTATAATATTTATTCTCCGTGGATGTTCGGCCAAGGGACCAAGGTGGAAGTCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-5*01,GACATCCAGATGACCCAGTCTCCTTCCACCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCAGTTGTCGGGCCAGTCAGAGTATTAGTAACTGGGTGGCCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAACTCCTGATCTATGATGCCTCCAGTTTGGAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGGACAGAATTCACTCTCACCATCAGCAGCCTGCAGCCTGATGATTTTGCAACTTATTACTGCCAGCACTATAATATTTATTCTCCGTGGATGTTCGGCCAAGGGACCAAGGTGGAAGTCAAAC,DIQMTQSPSTLSASVGDRVTISCRASQSISNWVAWYQQKPGKAPKLLIYDASSLESGVPSRFSGSGSGTEFTLTISSLQPDDFATYYCQHYNIYSPWMFGQGTKVEVK,DIQMTQSPSTLSASVGDRVTITCRASQSISSWLAWYQQKPGKAPKLLIYDASSLESGVPSRFSGSGSGTEFTLTISSLQPDDFATYYCQQYNSYSPWTFGQGTKVEIK,QHYNIYSPWM,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSNYFWGWIRQPPGKGLEYIGSIYYSARTYINPSLRSRVTMSVDTSGNRISLKLTSVTAADTALYFCVRHAFLGGRTKDFDFWGQGTLVTVSS[SEP]DIQMTQSPSTLSASVGDRVTISCRASQSISNWVAWYQQKPGKAPKLLIYDASSLESGVPSRFSGSGSGTEFTLTISSLQPDDFATYYCQHYNIYSPWMFGQGTKVEVK,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV4-39,IGKV1-5
8,AGTGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCATCTTGAAGGAGTCTGGTCCTACCCTGGTGAAACCCACACAGACCCTCACGCTGACCTGTATTTTCTCTGGATTTTCACTCAGCGAAGATAGAGTGGCTGTGGCCTGGATCCGCCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGTCTCTTTTATTGGGATGATGACAAACGCTTTAGTCCATCCCTGAAGAGCCGCCTCTCCCTCACCAGGGACCCCTCCAGAGACCAGGTGGTCCTTACCATGACCAACATGGACCCTGCGGACACAGGCACCTATTACTGTGCCCACAGACCGCGCGGGGGACAAGTTTTTTTTGAATCGTGGGGCCCGGGAGCCCTGGTCACCGTCTCCTCAGCTTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCCCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV2-5*02,CAGATCATCTTGAAGGAGTCTGGTCCTACCCTGGTGAAACCCACACAGACCCTCACGCTGACCTGTATTTTCTCTGGATTTTCACTCAGCGAAGATAGAGTGGCTGTGGCCTGGATCCGCCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGTCTCTTTTATTGGGATGATGACAAACGCTTTAGTCCATCCCTGAAGAGCCGCCTCTCCCTCACCAGGGACCCCTCCAGAGACCAGGTGGTCCTTACCATGACCAACATGGACCCTGCGGACACAGGCACCTATTACTGTGCCCACAGACCGCGCGGGGGACAAGTTTTTTTTGAATCGTGGGGCCCGGGAGCCCTGGTCACCGTCTCCTCAG,QIILKESGPTLVKPTQTLTLTCIFSGFSLSEDRVAVAWIRQPPGKALEWLGLFYWDDDKRFSPSLKSRLSLTRDPSRDQVVLTMTNMDPADTGTYYCAHRPRGGQVFFESWGPGALVTVSS,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHRXXGGXXXXXXWGQGTLVTVSS,AHRPRGGQVFFES,GCTCTGCTTCAGCTGTGGGCACAAGAGGCAGCACTCAGGACAATCTCCAGCATGGCCTGGTCTCCTCTCCTCCTCACTCTCCTCGCTCACTGCACAGGGTCCTGGGCCCAGTCTCTACTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGCAGTCGGTCACCATCTCCTGCACTGGGACCAGCTCCAATCTCGGGGCAGGATATGATGTACATTGGTATCAGCAGCGTCCACGATCGGCCCCCACACTTCTCATCCATAATAACTACCATCGGCCCTCAGGTGTCTCTGACCGATTTTCAGGCTCCAAGTCTGGCACCTCAGCCTCACTGACCATCACTGGACTCCAGGCGGAGGATGAGGCTGATTATTACTGCCAATCATATGACCTGAACCTCAGCGCTCCCTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,L,IGLV1-40*01,CAGTCTCTACTGACGCAGCCGCCCTCAGTGTCTGGGGCCCCAGGGCAGTCGGTCACCATCTCCTGCACTGGGACCAGCTCCAATCTCGGGGCAGGATATGATGTACATTGGTATCAGCAGCGTCCACGATCGGCCCCCACACTTCTCATCCATAATAACTACCATCGGCCCTCAGGTGTCTCTGACCGATTTTCAGGCTCCAAGTCTGGCACCTCAGCCTCACTGACCATCACTGGACTCCAGGCGGAGGATGAGGCTGATTATTACTGCCAATCATATGACCTGAACCTCAGCGCTCCCTTCGGCGGAGGGACCAAG,QSLLTQPPSVSGAPGQSVTISCTGTSSNLGAGYDVHWYQQRPRSAPTLLIHNNYHRPSGVSDRFSGSKSGTSASLTITGLQAEDEADYYCQSYDLNLSAPFGGGTK,QSVLTQPPSVSGAPGQRVTISCTGSSSNIGAGYDVHWYQQLPGTAPKLLIYGNSNRPSGVPDRFSGSKSGTSASLAITGLQAEDEADYYCQSYDSSLXXXFGGGTK,QSYDLNLSAP,QIILKESGPTLVKPTQTLTLTCIFSGFSLSEDRVAVAWIRQPPGKALEWLGLFYWDDDKRFSPSLKSRLSLTRDPSRDQVVLTMTNMDPADTGTYYCAHRPRGGQVFFESWGPGALVTVSS[SEP]QSLLTQPPSVSGAPGQSVTISCTGTSSNLGAGYDVHWYQQRPRSAPTLLIHNNYHRPSGVSDRFSGSKSGTSASLTITGLQAEDEADYYCQSYDLNLSAPFGGGTK,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV2-5,IGLV1-40
9,AGCTCTGGGAGAGGAGCCCCAGCCGTGAGATTCCCAGGAGTTTCCACTTGGTGATCAGCACTGAACACAGACCACCAACCATGGAGTTTGGGCTTAGCTGGGTTTTCCTTGTTGCTCTTATAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCAGGGCGGTCCCTGAGACTCTCCTGTACAGCTTCTGGATTCACCTTTGGTGATTATGCTATGAATTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTAGGTTTCATTAGAGACAAAGCTTATGGTGGGACAGCAGAATACGCCGCGTCTGTGGAGGGCAGATTCACCGTCTCAAGGGATGATTCCAAAGGCATCGCCTATCTGCAAATGAACAGCCTGAAAACTGAGGACTCCGCCCTGTACTACTGCACTAGAGGAGTTTGGTGGGAGCTAGCCCCCGGTGGCCACTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCCCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-49*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCAGGGCGGTCCCTGAGACTCTCCTGTACAGCTTCTGGATTCACCTTTGGTGATTATGCTATGAATTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTAGGTTTCATTAGAGACAAAGCTTATGGTGGGACAGCAGAATACGCCGCGTCTGTGGAGGGCAGATTCACCGTCTCAAGGGATGATTCCAAAGGCATCGCCTATCTGCAAATGAACAGCCTGAAAACTGAGGACTCCGCCCTGTACTACTGCACTAGAGGAGTTTGGTGGGAGCTAGCCCCCGGTGGCCACTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,EVQLVESGGGLVQPGRSLRLSCTASGFTFGDYAMNWVRQAPGKGLEWVGFIRDKAYGGTAEYAASVEGRFTVSRDDSKGIAYLQMNSLKTEDSALYYCTRGVWWELAPGGHYYYYMDVWGKGTTVTVSS,EVQLVESGGGLVQPGRSLRLSCTASGFTFGDYAMSWVRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTRXXXWELXXXXXYYYYMDVWGKGTTVTVSS,TRGVWWELAPGGHYYYYMDV,TGAGCGCAGAAGGCAGGACTCGGGACAATCTTCATCATGACCTGCTCCCCTCTCCTCCTCACCCTTCTCATTCACTGCACAGGGTCCTGGGCCCAGTCTGTGTTGACGCAGCCGCCCTCAATGTCTGCGGCCCCAGGACAGAGGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAACATTGGGAACAATCAAGTATCCTGGTACCAGCAACTCCCAGGAACAGCCCCCAAACTCCTCATTTATGACAATAATAAACGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGACATCACCGGACTCCAGACTGGGGACGAGGCCGATTATTACTGCGGGACATGGGATGCCAGCCTGAGTGGTTTATTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-51*01,CAGTCTGTGTTGACGCAGCCGCCCTCAATGTCTGCGGCCCCAGGACAGAGGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAACATTGGGAACAATCAAGTATCCTGGTACCAGCAACTCCCAGGAACAGCCCCCAAACTCCTCATTTATGACAATAATAAACGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGACATCACCGGACTCCAGACTGGGGACGAGGCCGATTATTACTGCGGGACATGGGATGCCAGCCTGAGTGGTTTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSVLTQPPSMSAAPGQRVTISCSGSSSNIGNNQVSWYQQLPGTAPKLLIYDNNKRPSGIPDRFSGSKSGTSATLDITGLQTGDEADYYCGTWDASLSGLFGGGTKLTVL,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPKLLIYDNNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCGTWDSSLSXXFGGGTKLTVL,GTWDASLSGL,EVQLVESGGGLVQPGRSLRLSCTASGFTFGDYAMNWVRQAPGKGLEWVGFIRDKAYGGTAEYAASVEGRFTVSRDDSKGIAYLQMNSLKTEDSALYYCTRGVWWELAPGGHYYYYMDVWGKGTTVTVSS[SEP]QSVLTQPPSMSAAPGQRVTISCSGSSSNIGNNQVSWYQQLPGTAPKLLIYDNNKRPSGIPDRFSGSKSGTSATLDITGLQTGDEADYYCGTWDASLSGLFGGGTKLTVL,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV3-49,IGLV1-51
10,ATCATCCAACAACCACATCCCTTCTCTACAGAAGCCTCTGAGAGGAAAGTTCTTCACCATGGACTGGACCTGGAGGGTCTTCTGCTTGCTGGCTGTAGCTCCAGGTGCTCTCTCCCAGGACCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCATCTGGATACATCTTCACCACCTACTATATGCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAATGATCTACCCTAGTGGTGGTCGCACAAGCTACGCACAGAAGTTCCAGGGCAGAGTCACCATGACCAGGGACACGTCCACGAGCACAGTCTACATGGAGCTGAGCGGCCTGAGATCTGAGGATACGGCCGTGTATTACTGTGCGAAAAGCTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV1-46*01,CAGGACCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCATCTGGATACATCTTCACCACCTACTATATGCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGAATGATCTACCCTAGTGGTGGTCGCACAAGCTACGCACAGAAGTTCCAGGGCAGAGTCACCATGACCAGGGACACGTCCACGAGCACAGTCTACATGGAGCTGAGCGGCCTGAGATCTGAGGATACGGCCGTGTATTACTGTGCGAAAAGCTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,QDQLVQSGAEVKKPGASVKVSCKASGYIFTTYYMHWVRQAPGQGLEWMGMIYPSGGRTSYAQKFQGRVTMTRDTSTSTVYMELSGLRSEDTAVYYCAKSYYYYMDVWGKGTTVTVSS,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYMHWVRQAPGQGLEWMGIINPSGGSTSYAQKFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAXXYYYYMDVWGKGTTVTVSS,AKSYYYYMDV,CCTGGGTCAGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCACCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGTTCACCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCACCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGTTCACCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,EIVLTQSPGTLSLSPGERATLSCRASQSVSSTYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPWTFGQGTKVEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPWTFGQGTKVEIK,QQYGSSPWT,QDQLVQSGAEVKKPGASVKVSCKASGYIFTTYYMHWVRQAPGQGLEWMGMIYPSGGRTSYAQKFQGRVTMTRDTSTSTVYMELSGLRSEDTAVYYCAKSYYYYMDVWGKGTTVTVSS[SEP]EIVLTQSPGTLSLSPGERATLSCRASQSVSSTYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPWTFGQGTKVEIK,Plasmablast_Plasma-B-Cells,Multiple-sclerosis,human,IGHV1-46,IGKV3-20


In [9]:
output_dir = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes"

# Save each group as a separate CSV file
for (btype_group_id, btype_group) in enumerate(grouped_by_btype)
    # Extract the BType name (to use in the file name)
    btype_name = btype_group.BType[1]  # Assuming BType exists and is consistent within each group
    
    # Create a sanitized file name (replace spaces or special characters if needed)
    file_name = "BType_$(replace(btype_name, r"\s" => "_")).csv"
    
    # Write the group to a CSV file
    CSV.write(joinpath(output_dir, file_name), btype_group, writeheader=true)
end

println("Grouped data saved to CSV files.")

Grouped data saved to CSV files.


In [10]:
# using Pkg
# #Pkg.add("StatsBase")
# Pkg.add("DataStructures")

In [11]:
# Check the column names of the DataFrame to identify any discrepancies
memory_grouped_df = CSV.read("/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_Memory-B-Cells.csv", DataFrame)
println("Column names in memory_grouped_df:")
names(memory_grouped_df)

Column names in memory_grouped_df:


20-element Vector{String}:
 "sequence_heavy"
 "locus_heavy"
 "v_call_heavy"
 "sequence_alignment_heavy"
 "sequence_alignment_aa_heavy"
 "germline_alignment_aa_heavy"
 "cdr3_aa_heavy"
 "sequence_light"
 "locus_light"
 "v_call_light"
 "sequence_alignment_light"
 "sequence_alignment_aa_light"
 "germline_alignment_aa_light"
 "cdr3_aa_light"
 "sequence_alignment_heavy_sep_light"
 "BType"
 "Disease"
 "Species"
 "general_v_gene_heavy"
 "general_v_gene_light"

In [12]:
# Load the CSV file into a DataFrame
file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_Plasma-B-Cells.csv" 
df_plasma_b = CSV.read(file_path, DataFrame);

In [13]:
println("Column names in df_plasma_b:")
names(df_plasma_b)

Column names in df_plasma_b:


20-element Vector{String}:
 "sequence_heavy"
 "locus_heavy"
 "v_call_heavy"
 "sequence_alignment_heavy"
 "sequence_alignment_aa_heavy"
 "germline_alignment_aa_heavy"
 "cdr3_aa_heavy"
 "sequence_light"
 "locus_light"
 "v_call_light"
 "sequence_alignment_light"
 "sequence_alignment_aa_light"
 "germline_alignment_aa_light"
 "cdr3_aa_light"
 "sequence_alignment_heavy_sep_light"
 "BType"
 "Disease"
 "Species"
 "general_v_gene_heavy"
 "general_v_gene_light"

In [14]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
plasma_b_grouped_df = groupby(df_plasma_b, [:general_v_gene_heavy, :cdr3_aa_heavy])


Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,general_v_gene_heavy,general_v_gene_light
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String7,String7,String15,String15
1,AGATCACCGTTTCTTATATGGGGAGTGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV2-5*01,CAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWNDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAXXXXXXXXQHYFDYWGQGTLVTVSS,AREPYTDHDQHYFDY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTTCCAGATGCGACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAATGGGGTCCCATCAAGGTTCAGTGGCAGTGGATCCGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-12*01,GACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAATGGGGTCCCATCAAGGTTCAGTGGCAGTGGATCCGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQNGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPXTFGGGTKVEIK,QQENSFPLT,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS[SEP]DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQNGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,Plasma-B-Cells,,human,IGHV2-5,IGKV1-12
2,TTAGGCTTTCTTATATGGGGAGTGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV2-5*01,CAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWNDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAXXXXXXXXQHYFDYWGQGTLVTVSS,AREPYTDHDQHYFDY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTTCCAGATGCGACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-12*01,GACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPXTFGGGTKVEIK,QQENSFPLT,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS[SEP]DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,Plasma-B-Cells,,human,IGHV2-5,IGKV1-12

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,general_v_gene_heavy,general_v_gene_light
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String7,String7,String15,String15
1,TGGGGAGCTCTGGGAGAGGAGCCCCAGGCCCGGGATTCCCAGGTGTTTCCATTCAGTGATCAGCACTGAAGACAGAAGACTCATCATGGAGTTCTGGCTGAGCTGGGTTCTCCTTGTTGCCATTTTAAAAGATGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGACTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAATTATGCTATGCACTGGGTCCGCCAGGCTCCAGGGAAGGGACTGGAATATGTTTCAGGTATTATTAGTAATGGGGGTAGCACATACTATGCAGACTCCGTGAAGGGCAGATTCATCACCTCCAGAGACAATTCCAAGAACACGCTGTATCTTCAAATGAGCAGTCTGAGAACTGAGGACACGGCTCTGTATTACTGTGTGAAAGTATTGCAGGGCTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-64D*06,GAGGTGCAGCTGGTGGAGTCTGGGGGAGACTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAATTATGCTATGCACTGGGTCCGCCAGGCTCCAGGGAAGGGACTGGAATATGTTTCAGGTATTATTAGTAATGGGGGTAGCACATACTATGCAGACTCCGTGAAGGGCAGATTCATCACCTCCAGAGACAATTCCAAGAACACGCTGTATCTTCAAATGAGCAGTCTGAGAACTGAGGACACGGCTCTGTATTACTGTGTGAAAGTATTGCAGGGCTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,EVQLVESGGDLVQPGGSLRLSCSASGFTFSNYAMHWVRQAPGKGLEYVSGIISNGGSTYYADSVKGRFITSRDNSKNTLYLQMSSLRTEDTALYYCVKVLQGYYYYMDVWGKGTTVTVSS,EVQLVESGGGLVQPGGSLRLSCSASGFTFSSYAMHWVRQAPGKGLEYVSAISSNGGSTYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCVKVLXXYYYYMDVWGKGTTVTVSS,VKVLQGYYYYMDV,GGGAGAGCCCTGGGGAGGAACTGCTCAGTTAGGACCCAGAGGGAACCATGGAAGCCCCAGCTCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCTACTTAGCCTGGTACCAACAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGATGCATCCAACAGGGCCACTGGCATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGCCTAGAGCCTGAAGATTTTGCAGTTTATTATTGTCAGCAACGTAGCAACTGGCTGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-11*01,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCTACTTAGCCTGGTACCAACAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGATGCATCCAACAGGGCCACTGGCATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGCCTAGAGCCTGAAGATTTTGCAGTTTATTATTGTCAGCAACGTAGCAACTGGCTGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWLLTFGGGTKVEIK,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWXLTFGGGTKVEIK,QQRSNWLLT,EVQLVESGGDLVQPGGSLRLSCSASGFTFSNYAMHWVRQAPGKGLEYVSGIISNGGSTYYADSVKGRFITSRDNSKNTLYLQMSSLRTEDTALYYCVKVLQGYYYYMDVWGKGTTVTVSS[SEP]EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWLLTFGGGTKVEIK,Plasma-B-Cells,,human,IGHV3-64D,IGKV3-11


In [15]:
# Step 1: Get the size of each group
group_sizes = [nrow(group) for group in plasma_b_grouped_df]

# Step 2: Count the occurrences of each group size
size_counts = combine(groupby(DataFrame(size = group_sizes), :size), nrow => :count)

println(size_counts)

[1m72×2 DataFrame[0m
[1m Row [0m│[1m size  [0m[1m count [0m
     │[90m Int64 [0m[90m Int64 [0m
─────┼──────────────
   1 │     1  16149
   2 │     2   3196
   3 │     3   1046
   4 │     4    514
   5 │     5    277
   6 │     6    158
   7 │     7    114
   8 │     8     86
   9 │     9     50
  10 │    10     51
  11 │    11     37
  12 │    12     32
  13 │    13     36
  14 │    14     30
  15 │    15     19
  16 │    16     19
  17 │    17     11
  18 │    18     14
  19 │    19     11
  20 │    20     12
  21 │    21      6
  22 │    22      6
  23 │    23      9
  24 │    24      5
  25 │    25      7
  26 │    26      4
  27 │    27      3
  28 │    28      1
  29 │    29      6
  30 │    30      8
  31 │    31      2
  32 │    32      2
  33 │    33      2
  34 │    34      2
  35 │    35      3
  36 │    37      1
  37 │    38      1
  38 │    41      2
  39 │    42      1
  40 │    43      1
  41 │    46      2
  42 │    47      1
  43 │    48      1
  44 │    49

In [16]:
# Step 1: Filter out groups with only one row
filtered_groups = filter(g -> nrow(g) > 1, plasma_b_grouped_df)

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,general_v_gene_heavy,general_v_gene_light
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String7,String7,String15,String15
1,AGATCACCGTTTCTTATATGGGGAGTGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV2-5*01,CAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWNDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAXXXXXXXXQHYFDYWGQGTLVTVSS,AREPYTDHDQHYFDY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTTCCAGATGCGACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAATGGGGTCCCATCAAGGTTCAGTGGCAGTGGATCCGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-12*01,GACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAATGGGGTCCCATCAAGGTTCAGTGGCAGTGGATCCGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQNGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPXTFGGGTKVEIK,QQENSFPLT,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS[SEP]DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQNGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,Plasma-B-Cells,,human,IGHV2-5,IGKV1-12
2,TTAGGCTTTCTTATATGGGGAGTGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV2-5*01,CAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTTTCTGGGTTCTCACTCAGCACTCGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCAGGAAAGGCCCTGGAGTGGCTTGGAATCATTTATTACAATGGTGATAAACGCTACAGCCCATCTCTGAAGAGCAGGCTCACCATCACCAGGGACACCTCCAAAAACCAGGTGGTCCTTACAATGACCAACATGGACCCTGGAGACACAGCCACATATTACTGTGCACGCGAGCCCTATACTGACCACGACCAGCACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWNDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAXXXXXXXXQHYFDYWGQGTLVTVSS,AREPYTDHDQHYFDY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTTCCAGATGCGACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-12*01,GACATCCAGATGACCCAGTCTCCATCTTCCGTGTCAGCATCTGTAGGAGACAGAGTCAGCATCACTTGCCGGGCGAGTCAGGGTATTAGCAGCTGGTTGGCCTGGTATCAGCAAAAACCAGGGAAAGTCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTACTATTGTCAACAGGAGAACAGTTTCCCGTTGACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPXTFGGGTKVEIK,QQENSFPLT,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTRGVGVGWIRQPPGKALEWLGIIYYNGDKRYSPSLKSRLTITRDTSKNQVVLTMTNMDPGDTATYYCAREPYTDHDQHYFDYWGQGTLVTVSS[SEP]DIQMTQSPSSVSASVGDRVSITCRASQGISSWLAWYQQKPGKVPKLLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQENSFPLTFGGGTKVEIK,Plasma-B-Cells,,human,IGHV2-5,IGKV1-12

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,general_v_gene_heavy,general_v_gene_light
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String7,String7,String15,String15
1,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGGTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGACTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACTTTTAGCGACTATGCCATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAATTATTGGTGTGGATGGTACTACCACATTCTACGCAGACTCCCTGAAGGGCCGGTTTACCATCTCCAGAGACAACTCGAAGAACACGGTGTTTCTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCCGTTTATTACTGTGCGACAGGCCGATATTGTGATAGAATCAGCTGCCCCGGGACAAGGTGGTTCGACTCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCAGCTGTCCTACAGTCCTCAGGA,H,IGHV3-23*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGACTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACTTTTAGCGACTATGCCATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAATTATTGGTGTGGATGGTACTACCACATTCTACGCAGACTCCCTGAAGGGCCGGTTTACCATCTCCAGAGACAACTCGAAGAACACGGTGTTTCTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCCGTTTATTACTGTGCGACAGGCCGATATTGTGATAGAATCAGCTGCCCCGGGACAAGGTGGTTCGACTCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGDLVQPGGSLRLSCAASGFTFSDYAMTWVRQAPGKGLEWVSIIGVDGTTTFYADSLKGRFTISRDNSKNTVFLQMNSLRAEDTAVYYCATGRYCDRISCPGTRWFDSWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXYCSSTSCXXXXWFDSWGQGTLVTVSS,ATGRYCDRISCPGTRWFDS,AGAGCCCTGGGGAGGAACTGCTCAGTTAGGACCCAGAGGGAACCATGGAAGCCCCAGCTCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATGCCACCGGAGAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCCACTTAGCCTGGTACCAACAAAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGAGGCATCCAACAGGGCCACGGGCATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGCCTAGAGGCTGAAGATTTTGCGGTTTATTACTGTCAGCAGCGGAGCGACTGGCCCGCCACCTTCGGCCCTGGGACCAAAGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-11*01,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCCACTTAGCCTGGTACCAACAAAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGAGGCATCCAACAGGGCCACGGGCATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGCCTAGAGGCTGAAGATTTTGCGGTTTATTACTGTCAGCAGCGGAGCGACTGGCCCGCCACCTTCGGCCCTGGGACCAAAGTGGATATCAAAC,EIVLTQSPATLSLSPGERATLSCRASQSVSSHLAWYQQKPGQAPRLLIYEASNRATGIPARFSGSGSGTDFTLTISSLEAEDFAVYYCQQRSDWPATFGPGTKVDIK,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWPXTFGPGTKVDIK,QQRSDWPAT,EVQLVESGGDLVQPGGSLRLSCAASGFTFSDYAMTWVRQAPGKGLEWVSIIGVDGTTTFYADSLKGRFTISRDNSKNTVFLQMNSLRAEDTAVYYCATGRYCDRISCPGTRWFDSWGQGTLVTVSS[SEP]EIVLTQSPATLSLSPGERATLSCRASQSVSSHLAWYQQKPGQAPRLLIYEASNRATGIPARFSGSGSGTDFTLTISSLEAEDFAVYYCQQRSDWPATFGPGTKVDIK,Plasma-B-Cells,,human,IGHV3-23,IGKV3-11
2,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGGTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGACTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACTTTTAGCGACTATGCCATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAATTATTGGTGTGGATGGTACTACCACATTCTACGCAGACTCCCTGAAGGGCCGGTTTACCATCTCCAGAGACAACTCGAAGAACACGGTGTTTCTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCCGTTTATTACTGTGCGACAGGCCGATATTGTGATAGAATCAGCTGCCCCGGGACAAGGTGGTTCGACTCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCAGCTGTCCTACAGTCCTCAGGA,H,IGHV3-23*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGACTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACTTTTAGCGACTATGCCATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAATTATTGGTGTGGATGGTACTACCACATTCTACGCAGACTCCCTGAAGGGCCGGTTTACCATCTCCAGAGACAACTCGAAGAACACGGTGTTTCTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCCGTTTATTACTGTGCGACAGGCCGATATTGTGATAGAATCAGCTGCCCCGGGACAAGGTGGTTCGACTCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGDLVQPGGSLRLSCAASGFTFSDYAMTWVRQAPGKGLEWVSIIGVDGTTTFYADSLKGRFTISRDNSKNTVFLQMNSLRAEDTAVYYCATGRYCDRISCPGTRWFDSWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXYCSSTSCXXXXWFDSWGQGTLVTVSS,ATGRYCDRISCPGTRWFDS,GGGGAGGAACTGCTCAGTTAGGACCCAGAGGGAACCATGGAAGCCCCAGCTCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATGCCACCGGAGAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCCACTTAGCCTGGTACCAACAAAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGAGGCATCCAACAGGGCCACGGGCATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGCCTAGAGGCTGAAGATTTTGCGGTTTATTACTGTCAGCAGCGGAGCGACTGGCCCGCCACCTTCGGCCCTGGGACCAAAGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-11*01,GAAATTGTGTTGACACAGTCTCCAGCCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCCACTTAGCCTGGTACCAACAAAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGAGGCATCCAACAGGGCCACGGGCATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGCCTAGAGGCTGAAGATTTTGCGGTTTATTACTGTCAGCAGCGGAGCGACTGGCCCGCCACCTTCGGCCCTGGGACCAAAGTGGATATCAAAC,EIVLTQSPATLSLSPGERATLSCRASQSVSSHLAWYQQKPGQAPRLLIYEASNRATGIPARFSGSGSGTDFTLTISSLEAEDFAVYYCQQRSDWPATFGPGTKVDIK,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWPXTFGPGTKVDIK,QQRSDWPAT,EVQLVESGGDLVQPGGSLRLSCAASGFTFSDYAMTWVRQAPGKGLEWVSIIGVDGTTTFYADSLKGRFTISRDNSKNTVFLQMNSLRAEDTAVYYCATGRYCDRISCPGTRWFDSWGQGTLVTVSS[SEP]EIVLTQSPATLSLSPGERATLSCRASQSVSSHLAWYQQKPGQAPRLLIYEASNRATGIPARFSGSGSGTDFTLTISSLEAEDFAVYYCQQRSDWPATFGPGTKVDIK,Plasma-B-Cells,,human,IGHV3-23,IGKV3-11


In [17]:
group_sizes = [nrow(group) for group in filtered_groups]

size_counts = combine(groupby(DataFrame(size = group_sizes), :size), nrow => :count) # size is the group size, count is the number of groups with that size

println(size_counts)

[1m71×2 DataFrame[0m
[1m Row [0m│[1m size  [0m[1m count [0m
     │[90m Int64 [0m[90m Int64 [0m
─────┼──────────────
   1 │     2   3196
   2 │     3   1046
   3 │     4    514
   4 │     5    277
   5 │     6    158
   6 │     7    114
   7 │     8     86
   8 │     9     50
   9 │    10     51
  10 │    11     37
  11 │    12     32
  12 │    13     36
  13 │    14     30
  14 │    15     19
  15 │    16     19
  16 │    17     11
  17 │    18     14
  18 │    19     11
  19 │    20     12
  20 │    21      6
  21 │    22      6
  22 │    23      9
  23 │    24      5
  24 │    25      7
  25 │    26      4
  26 │    27      3
  27 │    28      1
  28 │    29      6
  29 │    30      8
  30 │    31      2
  31 │    32      2
  32 │    33      2
  33 │    34      2
  34 │    35      3
  35 │    37      1
  36 │    38      1
  37 │    41      2
  38 │    42      1
  39 │    43      1
  40 │    46      2
  41 │    47      1
  42 │    48      1
  43 │    49      1
  44 │    50

In [18]:
# Step 2: Calculate the percentage of identical genes for each group
percentages = []

for group in filtered_groups
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    
    push!(percentages, percentage_identical)  # Store the percentage
end

# Step 3: Calculate the average percentage
average_percentage = mean(percentages)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 97.35545178851979%


97.35545178851979

In [19]:
percentages_plasma = []

for group in filtered_groups
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    #println("Percentage of identical genes in general_v_gene_light: $percentage_identical%")

    push!(percentages_plasma, percentage_identical)
    
end

println("Percentages:")
println(percentages_plasma)



Percentages:
Any[100.0, 100.0, 100.0, 100.0, 57.14285714285714, 100.0, 100.0, 100.0, 85.71428571428571, 50.0, 100.0, 66.66666666666666, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 80.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 50.0, 50.0, 66.66666666666666, 100.0, 100.0, 90.2439024390244, 50.0, 100.0, 40.0, 100.0, 100.0, 100.0, 100.0, 50.0, 100.0, 100.0, 100.0, 100.0, 57.14285714285714, 100.0, 100.0, 80.0, 100.0, 100.0, 100.0, 50.0, 100.0, 100.0, 33.33333333333333, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 33.33333333333333, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 56.666666666666664, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 75.0, 100.0, 100.0, 100.0, 100.0, 66.66666666666666, 100.0, 100.0, 100.0, 100.0, 85.71428571428571, 100.0, 100.0, 100.0, 100.0, 100.0, 50.0, 100.0, 100.0, 100.0, 100.0, 100.0, 50.0, 80.0, 100.0, 75.

In [20]:
# Step 3: Calculate the average percentage
average_percentage = mean(percentages_plasma)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 97.35545178851979%


97.35545178851979

In [21]:
# Create a sample dataset
test_df = DataFrame(
    general_v_gene_heavy = ["VH1", "VH1", "VH2", "VH2", "VH2", "VH3", "VH3", "VH3", "VH3"],
    cdr3_aa_heavy = ["A", "A", "B", "B", "B", "C", "C", "C", "C"],
    general_v_gene_light = ["VL1", "VL1", "VL2", "VL2", "VL3", "VL4", "VL4", "VL5", "VL5"]
)

# Group the DataFrame by `general_v_gene_heavy` and `cdr3_aa_heavy`
grouped_test_df = groupby(test_df, [:general_v_gene_heavy, :cdr3_aa_heavy])

println("Grouped DataFrame:")
println(grouped_test_df)

Grouped DataFrame:
GroupedDataFrame with 3 groups based on keys: general_v_gene_heavy, cdr3_aa_heavy
Group 1 (2 rows): general_v_gene_heavy = "VH1", cdr3_aa_heavy = "A"
[1m Row [0m│[1m general_v_gene_heavy [0m[1m cdr3_aa_heavy [0m[1m general_v_gene_light [0m
     │[90m String               [0m[90m String        [0m[90m String               [0m
─────┼───────────────────────────────────────────────────────────
   1 │ VH1                   A              VL1
   2 │ VH1                   A              VL1
Group 2 (3 rows): general_v_gene_heavy = "VH2", cdr3_aa_heavy = "B"
[1m Row [0m│[1m general_v_gene_heavy [0m[1m cdr3_aa_heavy [0m[1m general_v_gene_light [0m
     │[90m String               [0m[90m String        [0m[90m String               [0m
─────┼───────────────────────────────────────────────────────────
   1 │ VH2                   B              VL2
   2 │ VH2                   B              VL2
   3 │ VH2                   B              VL3
Group 3 (

In [22]:
all_percentages = []

for group in grouped_test_df
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    println("Percentage of identical genes in general_v_gene_light: $percentage_identical%")

    push!(all_percentages, percentage_identical)
    
end

println("Percentages:")
println(all_percentages)
all_percentages


Percentage of identical genes in general_v_gene_light: 100.0%
Percentage of identical genes in general_v_gene_light: 66.66666666666666%
Percentage of identical genes in general_v_gene_light: 50.0%
Percentages:
Any[100.0, 66.66666666666666, 50.0]


3-element Vector{Any}:
 100.0
  66.66666666666666
  50.0

In [23]:
# Step 3: Calculate the average percentage
average_percentage = mean(all_percentages)

println("Average percentage of identical genes across groups: $average_percentage%")

Average percentage of identical genes across groups: 72.22222222222221%


In [24]:
# Load the CSV file into a DataFrame
file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_Plasma-B-Cells.csv" 
df_plasma_b = CSV.read(file_path, DataFrame);

# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
plasma_b_grouped_df = groupby(df_plasma_b, [:general_v_gene_heavy, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
filtered_groups = filter(g -> nrow(g) > 1, plasma_b_grouped_df)

# Step 2: Calculate the percentage of identical genes for each group
percentages = []

for group in filtered_groups
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    
    push!(percentages, percentage_identical)  # Store the percentage
end

# Step 3: Calculate the average percentage
average_percentage = mean(percentages)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 97.35545178851979%


97.35545178851979

In [25]:
using Pkg
Pkg.add("Glob")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`


In [26]:
using CSV
using DataFrames
using Statistics
using Glob  # For file pattern matching

group_size_threshold = 20

# Define the directory containing the files
directory_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes"

# Get all CSV files in the directory
file_paths = glob("*.csv", directory_path)

# Create a DataFrame to store results
results = DataFrame(file_name = String[], average_percentage = Float64[])

# Loop through each file and perform the analysis
for file_path in file_paths
    println("Processing file: $file_path")
    
    # Load the CSV file into a DataFrame
    df = CSV.read(file_path, DataFrame)
    
    # Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
    grouped_df = groupby(df, [:general_v_gene_heavy, :cdr3_aa_heavy])
    
    # Step 1: Filter out groups with only one row
    filtered_groups = filter(g -> nrow(g) > group_size_threshold, grouped_df)
    
    # Step 2: Calculate the percentage of identical genes for each group
    percentages = []
    for group in filtered_groups
        # Count occurrences of each unique gene in `general_v_gene_light`
        gene_counts = combine(groupby(group, :general_v_gene_light), nrow => :count)
        
        # Calculate percentage of the most common gene
        most_common_count = maximum(gene_counts.count)
        total_count = sum(gene_counts.count)
        percentage_identical = (most_common_count / total_count) * 100
        
        push!(percentages, percentage_identical)  # Store the percentage
    end
    
    # Step 3: Calculate the average percentage
    average_percentage = isempty(percentages) ? 0.0 : mean(percentages)
    
    println("Average percentage of identical genes across groups in $file_path: $average_percentage%")
    
    # Add results to the DataFrame
    push!(results, (file_name = file_path, average_percentage = average_percentage))
end

# Save results to a CSV file
output_file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/results/BTypes_oas_coherence_results_thresh_$group_size_threshold.csv"
CSV.write(output_file_path, results)

println("Results saved to $output_file_path")


Processing file: /ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_ASC.csv
Average percentage of identical genes across groups in /ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_ASC.csv: 100.0%
Processing file: /ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_CD27-memory-and-Plasmablast_Plasma-B-Cells.csv
Average percentage of identical genes across groups in /ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_CD27-memory-and-Plasmablast_Plasma-B-Cells.csv: 98.66666666666667%
Processing file: /ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_Memory-B-Cells.csv
Average percentage of identical genes across groups in /ibmm_data2/oas_database/pa

In [27]:
using CSV
using DataFrames
using Statistics
using Glob  # For file pattern matching

group_size_threshold = 1

only_group_by_v_gene = true

if only_group_by_v_gene
    save_file_name = "only_group_by_v_gene"
    println("Grouping only by `general_v_gene_heavy`")
else
    save_file_name = "group_by_v_gene_and_cdr3"
    println("Grouping by `general_v_gene_heavy` and `cdr3_aa_heavy`")
end

# Define the directory containing the files
directory_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes"

# Get all CSV files in the directory
file_paths = glob("*.csv", directory_path)

# Create a DataFrame to store results
results = DataFrame(file_name = String[], average_percentage = Float64[])

# Loop through each file and perform the analysis
for file_path in file_paths
    println("Processing file: $file_path")
    
    # Load the CSV file into a DataFrame
    df = CSV.read(file_path, DataFrame)
    
    # Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
    if only_group_by_v_gene
        grouped_df = groupby(df, [:general_v_gene_heavy])
    else
        grouped_df = groupby(df, [:general_v_gene_heavy, :cdr3_aa_heavy])
    end
    
    # Step 1: Filter out groups with only one row
    filtered_groups = filter(g -> nrow(g) > group_size_threshold, grouped_df)

    group_sizes = [nrow(group) for group in filtered_groups]
    size_counts = combine(groupby(DataFrame(size = group_sizes), :size), nrow => :count)
    println(size_counts)
    
    # Step 2: Calculate the percentage of identical genes for each group
    percentages = []
    for group in filtered_groups
        # Count occurrences of each unique gene in `general_v_gene_light`
        gene_counts = combine(groupby(group, :general_v_gene_light), nrow => :count)
        
        # Calculate percentage of the most common gene
        most_common_count = maximum(gene_counts.count)
        total_count = sum(gene_counts.count)
        percentage_identical = (most_common_count / total_count) * 100
        
        push!(percentages, percentage_identical)  # Store the percentage
    end
    
    # Step 3: Calculate the average percentage
    average_percentage = isempty(percentages) ? 0.0 : mean(percentages)
    
    println("Average percentage of identical genes across groups in $file_path: $average_percentage%")
    
    # Add results to the DataFrame
    push!(results, (file_name = file_path, average_percentage = average_percentage))
end

# Save results to a CSV file
output_file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/results/$(save_file_name)_BTypes_oas_coherence_results_thresh_$(group_size_threshold).csv"
CSV.write(output_file_path, results)

println("Results saved to $output_file_path")

Grouping only by `general_v_gene_heavy`
Processing file: /ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_ASC.csv
[1m36×2 DataFrame[0m
[1m Row [0m│[1m size  [0m[1m count [0m
     │[90m Int64 [0m[90m Int64 [0m
─────┼──────────────
   1 │     9      4
   2 │   119      1
   3 │    53      2
   4 │    65      1
   5 │   114      1
   6 │    85      1
   7 │    33      1
   8 │   143      1
   9 │    43      1
  10 │    35      1
  11 │    18      1
  12 │    59      1
  13 │    24      2
  14 │    36      1
  15 │    11      2
  16 │    62      1
  17 │     4      2
  18 │    55      1
  19 │    87      1
  20 │    32      1
  21 │    17      1
  22 │    58      1
  23 │     8      1
  24 │    51      1
  25 │    12      1
  26 │    26      1
  27 │    20      1
  28 │    39      1
  29 │     5      1
  30 │     7      1
  31 │    40      1
  32 │    22      1
  33 │     6      1
  34 │     3      1
  35 │ 

In [28]:
# Load the CSV file into a DataFrame
file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/BType_Naive-B-Cells.csv" 
df_naive_b = CSV.read(file_path, DataFrame);

# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
naive_b_grouped_df = groupby(df_naive_b, [:general_v_gene_heavy, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
filtered_groups = filter(g -> nrow(g) > 1, naive_b_grouped_df)

# Step 2: Calculate the percentage of identical genes for each group
percentages = []

for group in filtered_groups
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    
    push!(percentages, percentage_identical)  # Store the percentage
end

# Step 3: Calculate the average percentage
average_percentage = mean(percentages)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 98.23227669659636%


98.23227669659636

# Patient Information (Subjects)

In [29]:
file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/full_extraction_for_coherence_paired_data_extra_cols_header.csv" 
subjects_df = CSV.read(file_path, DataFrame);

print(subjects_df.Subject[1:100])

String15["390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c", "390c"]

In [30]:
subjects_counts = countmap(subjects_df.Subject)

Dict{String15, Int64} with 35 entries:
  "Patient-6"    => 1605
  "390c"         => 1012
  "Patient-15"   => 3849
  "Patient-14"   => 4232
  "Patient-5"    => 3574
  "Subject-BCP4" => 2515
  "Donor-45"     => 4103
  "S-CoV11"      => 21204
  "S-CoV10"      => 10915
  "S-CoV13"      => 20187
  "Subject-BCP5" => 8728
  "Patient-2"    => 4162
  "None"         => 179468
  "Patient-1"    => 6770
  "Patient-12"   => 3314
  "Donor-2"      => 421075
  "Subject-BCP9" => 6368
  "Patient-7"    => 1584
  "Subject-BCP8" => 3963
  ⋮              => ⋮

### Preprocess the data 

In [31]:
# Filter out rows where BType is "Unsorted-B-Cells"
filtered_df_subjects = filter(row -> row.BType != "Unsorted-B-Cells", subjects_df);

# Extract up to the first two segments (e.g., "IGKV2-30" from "IGKV2-30*01")
filtered_df_subjects[!, :general_v_gene_heavy] = replace.(filtered_df_subjects.v_call_heavy, r"(^[^*]+?)(?:\*.*)?$" => s"\1");
filtered_df_subjects[!, :general_v_gene_light] = replace.(filtered_df_subjects.v_call_light, r"(^[^*]+?)(?:\*.*)?$" => s"\1");

# extract names of the BTypes
replace!(filtered_df_subjects.BType, "CD27-memory-and-Plasmablast/Plasma-B-Cells" => "CD27-memory-and-Plasmablast_Plasma-B-Cells")
replace!(filtered_df_subjects.BType, "Plasmablast/Plasma-B-Cells" => "Plasmablast_Plasma-B-Cells")

unique_btypes = unique(filtered_df_subjects.BType)

println("unique BTypes: ", unique_btypes)

# Group by BType
grouped_by_btype = groupby(filtered_df_subjects, :BType)

output_dir = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/extra_cols"

# Save each group as a separate CSV file
for (btype_group_id, btype_group) in enumerate(grouped_by_btype)
    # Extract the BType name (to use in the file name)
    btype_name = btype_group.BType[1]  # Assuming BType exists and is consistent within each group

    # print number of rows in each group
    println("Number of rows in group $btype_name: ", nrow(btype_group))
    
    # Create a sanitized file name (replace spaces or special characters if needed)
    file_name = "BType_$(replace(btype_name, r"\s" => "_"))_extra_cols.csv"
    
    # Write the group to a CSV file
    CSV.write(joinpath(output_dir, file_name), btype_group, writeheader=true)
end

println("Grouped data saved to CSV files.")

unique BTypes: ["Memory-B-Cells", "Plasma-B-Cells", "Naive-B-Cells", "RV+B-Cells", "double-nagative-B-cells", "Plasmablast", "CD27-memory-and-Plasmablast_Plasma-B-Cells", "ASC", "Plasmablast_Plasma-B-Cells"]
Number of rows in group Memory-B-Cells: 509879
Number of rows in group Plasma-B-Cells: 41614
Number of rows in group Naive-B-Cells: 666907
Number of rows in group RV+B-Cells: 771
Number of rows in group double-nagative-B-cells: 4826
Number of rows in group Plasmablast: 9248
Number of rows in group ASC: 1534
Number of rows in group CD27-memory-and-Plasmablast_Plasma-B-Cells: 31232
Number of rows in group Plasmablast_Plasma-B-Cells: 17257
Grouped data saved to CSV files.


### Memory B Cells

In [32]:
file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/extra_cols/BType_Memory-B-Cells_extra_cols.csv" 
mem_subj_df = CSV.read(file_path, DataFrame);

print(mem_subj_df.Subject[1:100])

String15["Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "Donor-1", "

In [33]:
# Filter and print entries with "D" in their name for each column
println("Entries with 'D' in general_v_gene_heavy:")
println(filter(row -> occursin("D", row[:general_v_gene_heavy]), mem_subj_df)[:, :general_v_gene_heavy])

println("Entries with 'D' in general_v_gene_light:")
filter(row -> occursin("D", row[:general_v_gene_light]), mem_subj_df)[:, :general_v_gene_light]


Entries with 'D' in general_v_gene_heavy:
String15["IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-43D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-43D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", 

9007-element PooledArrays.PooledVector{String15, UInt32, Vector{UInt32}}:
 "IGKV1D-16"
 "IGKV1D-8"
 "IGKV2D-29"
 "IGKV2D-29"
 "IGKV3D-15"
 "IGKV3D-11"
 "IGKV1D-12"
 "IGKV3D-20"
 "IGKV6D-21"
 "IGKV1D-13"
 ⋮
 "IGKV1D-12"
 "IGKV1D-13"
 "IGKV3D-20"
 "IGKV3D-15"
 "IGKV1D-16"
 "IGKV1D-16"
 "IGKV3D-15"
 "IGKV3D-20"
 "IGKV3D-15"

In [34]:

# Add the new columns
mem_subj_df[!, :general_v_gene_heavy_no_para] = replace.(mem_subj_df.general_v_gene_heavy, r"D" => "")
mem_subj_df[!, :general_v_gene_light_no_para] = replace.(mem_subj_df.general_v_gene_light, r"D" => "")

# Display the modified DataFrame
println(first(mem_subj_df, 10))  # Print the first 10 rows for verification

[1m10×25 DataFrame[0m
[1m Row [0m│[1m sequence_heavy                    [0m[1m locus_heavy [0m[1m v_call_heavy  [0m[1m sequence_alignment_heavy          [0m[1m sequence_alignment_aa_heavy       [0m[1m germline_alignment_aa_heavy       [0m[1m cdr3_aa_heavy         [0m[1m sequence_light                    [0m[1m locus_light [0m[1m v_call_light [0m[1m sequence_alignment_light          [0m[1m sequence_alignment_aa_light       [0m[1m germline_alignment_aa_light       [0m[1m cdr3_aa_light [0m[1m sequence_alignment_heavy_sep_light [0m[1m BType          [0m[1m Disease [0m[1m Species [0m[1m Subject  [0m[1m Author            [0m[1m Age     [0m[1m general_v_gene_heavy [0m[1m general_v_gene_light [0m[1m general_v_gene_heavy_no_para [0m[1m general_v_gene_light_no_para [0m
     │[90m String                            [0m[90m String1     [0m[90m String15      [0m[90m String                            [0m[90m String                        

In [35]:
# Filter and print entries with "D" in their name for each column
println("Entries with 'D' in general_v_gene_heavy_no_para:")
println(filter(row -> occursin("D", row[:general_v_gene_heavy_no_para]), mem_subj_df)[:, :general_v_gene_heavy_no_para])

println("Entries with 'D' in general_v_gene_light_no_para:")
filter(row -> occursin("D", row[:general_v_gene_light_no_para]), mem_subj_df)[:, :general_v_gene_light_no_para]


Entries with 'D' in general_v_gene_heavy_no_para:
String[]
Entries with 'D' in general_v_gene_light_no_para:


String[]

In [36]:
mem_subjects_counts = countmap(mem_subj_df.Subject)

Dict{String15, Int64} with 15 entries:
  "Subject-BCP4" => 2090
  "S-CoV11"      => 21204
  "S-CoV10"      => 10915
  "S-CoV13"      => 20187
  "Subject-BCP5" => 5793
  "None"         => 17050
  "Donor-2"      => 140581
  "Subject-BCP9" => 3390
  "Subject-BCP8" => 1075
  "Donor-3"      => 44097
  "S-CoV1"       => 9210
  "Donor-4"      => 50177
  "Donor-1"      => 178545
  "Subject-BCP6" => 3358
  "Subject-BCP3" => 2207

In [37]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
mem_subj_grouped_df = groupby(mem_subj_df, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
mem_subj_filtered_grouped = filter(g -> nrow(g) > 1, mem_subj_grouped_df)

# Step 2: Calculate the percentage of identical genes for each group
percentages = []

for group in mem_subj_filtered_grouped
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light_no_para"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    
    push!(percentages, percentage_identical)  # Store the percentage
end

# Step 3: Calculate the average percentage
average_percentage = mean(percentages)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 96.84673826790085%


96.84673826790085

In [38]:
group_sizes_mem_subj = [nrow(group) for group in mem_subj_filtered_grouped]

size_counts_mem_subj = combine(groupby(DataFrame(size = group_sizes_mem_subj), :size), nrow => :count)

println(size_counts_mem_subj)

[1m131×2 DataFrame[0m
[1m Row [0m│[1m size  [0m[1m count [0m
     │[90m Int64 [0m[90m Int64 [0m
─────┼──────────────
   1 │     2  29552
   2 │     3   8629
   3 │     4   3923
   4 │     5   2185
   5 │     6   1338
   6 │     7    904
   7 │     8    612
   8 │     9    469
   9 │    10    361
  10 │    11    249
  11 │    12    194
  12 │    13    153
  13 │    14    148
  14 │    15    103
  15 │    16     92
  16 │    17     68
  17 │    18     67
  18 │    19     66
  19 │    20     50
  20 │    21     39
  21 │    22     32
  22 │    23     30
  23 │    24     24
  24 │    25     22
  25 │    26     14
  26 │    27     25
  27 │    28     25
  28 │    29     20
  29 │    30     14
  30 │    31     15
  31 │    32     14
  32 │    33      2
  33 │    34     13
  34 │    35     14
  35 │    36     11
  36 │    37      6
  37 │    38     10
  38 │    39      7
  39 │    40      7
  40 │    41      4
  41 │    42      5
  42 │    43      7
  43 │    44      6
  44 │    4

In [39]:
# filter out every group that has the same entry in Subject
filtered_groups_only_sev_subj = filter(g -> length(unique(g.Subject)) > 1, mem_subj_filtered_grouped)

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTAGCTTTGGCATGCACTGGGTCCGGCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCGGTCATATCATTTTCATTTATGGGAAGTTTTGAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAGCACGCTGTATCTGCAAATGGACAACCTGAGAGTTGAGGACACGGCTGTATATTACTGTGTGAAAGAATCGAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-30*18,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTAGCTTTGGCATGCACTGGGTCCGGCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCGGTCATATCATTTTCATTTATGGGAAGTTTTGAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAGCACGCTGTATCTGCAAATGGACAACCTGAGAGTTGAGGACACGGCTGTATATTACTGTGTGAAAGAATCGAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSFGMHWVRQAPGKGLEWVAVISFSFMGSFEYYSDSVKGRFTISRDNSKSTLYLQMDNLRVEDTAVYYCVKESNAFDVWGQGTMVTVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXXAFDVWGQGTMVTVSS,VKESNAFDV,GAAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCCTGAGACAGACCGCCACAGTCACCTGCACTGGGAACAGCAACAATGTTGGCGACCAAGGAGCCGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTTTCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTATCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACAGTAGACTCAATGTTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*01,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCCTGAGACAGACCGCCACAGTCACCTGCACTGGGAACAGCAACAATGTTGGCGACCAAGGAGCCGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTTTCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTATCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACAGTAGACTCAATGTTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAGLTQPPSVSKGLRQTATVTCTGNSNNVGDQGAAWLQQHQGHPPKLLSYRNNNRPSGISERLSASRSGNTASLTITGLQPEDEADYYCSAWDSRLNVWVFGGGTKLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERLSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSXWVFGGGTKLTVL,SAWDSRLNVWV,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSFGMHWVRQAPGKGLEWVAVISFSFMGSFEYYSDSVKGRFTISRDNSKSTLYLQMDNLRVEDTAVYYCVKESNAFDVWGQGTMVTVSS[SEP]QAGLTQPPSVSKGLRQTATVTCTGNSNNVGDQGAAWLQQHQGHPPKLLSYRNNNRPSGISERLSASRSGNTASLTITGLQPEDEADYYCSAWDSRLNVWVFGGGTKLTVL,Memory-B-Cells,,human,Donor-1,"Phad et al., 2022",no,IGHV3-30,IGLV10-54,IGHV3-30,IGLV10-54
2,GGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAATGTCAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTAGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTAACTCTCAGTAATTATGTCACGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACCGGAGTGGCTGGCAGATATGTCACAGGATGGAAGTATTAGAATGTATGCAGGCTCCGTGAGGGGCCGATTCACCATCTCTAGAGACAATTCCGAGAACACTCTCTATCTGCAAATGAACAGCCTGAGAGTTGAGGACACGGGTGTATATTACTGTGTGAAAGAATCCAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCATCGTCTCTTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCAGCTGTCCTACAGTCCTCAGGA,H,IGHV3-30*18,CAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTAGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTAACTCTCAGTAATTATGTCACGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACCGGAGTGGCTGGCAGATATGTCACAGGATGGAAGTATTAGAATGTATGCAGGCTCCGTGAGGGGCCGATTCACCATCTCTAGAGACAATTCCGAGAACACTCTCTATCTGCAAATGAACAGCCTGAGAGTTGAGGACACGGGTGTATATTACTGTGTGAAAGAATCCAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCATCGTCTCTTCAG,QVQLAESGGGLVQPGRSLRLSCAASGLTLSNYVTHWVRQAPGKGPEWLADMSQDGSIRMYAGSVRGRFTISRDNSENTLYLQMNSLRVEDTGVYYCVKESNAFDVWGQGTMVIVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXXAFDVWGQGTMVTVSS,VKESNAFDV,GCTCCAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGCCTTGGGACAGACCGCCACACTCACCTGCACTGGGAACAACTACGATGTTGGCAGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTCTCGTACAGGAATAACAACCGGCCCTCAGGGATTTCAGAGAGGTTCTCTGCGTCCAGGTCAGGGAACGCAGACTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTATTGCTCAGCATGGGACGACAGTATCAGTGGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGCCTTGGGACAGACCGCCACACTCACCTGCACTGGGAACAACTACGATGTTGGCAGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTCTCGTACAGGAATAACAACCGGCCCTCAGGGATTTCAGAGAGGTTCTCTGCGTCCAGGTCAGGGAACGCAGACTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTATTGCTCAGCATGGGACGACAGTATCAGTGGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAGLTQPPSVSKALGQTATLTCTGNNYDVGSQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNADSLTITGLQPEDEADYYCSAWDDSISGWVFGGGTKLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSXWVFGGGTKLTVL,SAWDDSISGWV,QVQLAESGGGLVQPGRSLRLSCAASGLTLSNYVTHWVRQAPGKGPEWLADMSQDGSIRMYAGSVRGRFTISRDNSENTLYLQMNSLRVEDTGVYYCVKESNAFDVWGQGTMVIVSS[SEP]QAGLTQPPSVSKALGQTATLTCTGNNYDVGSQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNADSLTITGLQPEDEADYYCSAWDDSISGWVFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-2,"Jaffe et al., 2022",35,IGHV3-30,IGLV10-54,IGHV3-30,IGLV10-54

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,ACTCTGCTGAAGAAAACCAGCCCTGCAGCTCTGGGAGAGGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAACTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCAAGTATTAATAATGATGGGAGTAGGACAGATTATGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACCGCTGTGTATTACTGTGCAACAGTATTTGAGTACTGGGGCCAGGGAATTCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-74*01,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAACTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCAAGTATTAATAATGATGGGAGTAGGACAGATTATGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACCGCTGTGTATTACTGTGCAACAGTATTTGAGTACTGGGGCCAGGGAATTCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMHWVRQAPGKGLVWVSSINNDGSRTDYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGILVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARXFDYWGQGTLVTVSS,ATVFEY,TGGGCTCCAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACCCTCACCTGCACTGGGAACAGCAACAATGTTGGCAACCAAGGAGCATCTTGGCTGCAGCATCACCAGGGCCACCCTCCCAAAGTCCTATCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTCTCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGTCTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACACCAGCCTCAATGCCGGGCTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACCCTCACCTGCACTGGGAACAGCAACAATGTTGGCAACCAAGGAGCATCTTGGCTGCAGCATCACCAGGGCCACCCTCCCAAAGTCCTATCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTCTCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGTCTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACACCAGCCTCAATGCCGGGCTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGASWLQHHQGHPPKVLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDTSLNAGLFGGGTKLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSAXXFGGGTKLTVL,SAWDTSLNAGL,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMHWVRQAPGKGLVWVSSINNDGSRTDYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGILVTVSS[SEP]QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGASWLQHHQGHPPKVLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDTSLNAGLFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,S-CoV13,"Sokal et al, 2021",51,IGHV3-74,IGLV10-54,IGHV3-74,IGLV10-54
2,GGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCGGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGAGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAACTACTGGATCCACTGGGTCCGCCAAGCGCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTGATACTGATGGGAGTGGCACAAGTTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTATTGTGCAACCGTCTTTGAATATTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCAGCTGTCCTACAGTCCTCAGGA,H,IGHV3-74*01,GAGGTGCGGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGAGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAACTACTGGATCCACTGGGTCCGCCAAGCGCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTGATACTGATGGGAGTGGCACAAGTTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTATTGTGCAACCGTCTTTGAATATTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAG,EVRLVESGGGLVQPGESLRLSCSASGFTFSNYWIHWVRQAPGKGLVWVSRIDTDGSGTSYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGALVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAXXFDYWGQGTLVTVSS,ATVFEY,GGGGAAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACACTCACCTGCACTGGGGACAGCAACAATGTTGGCCGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAAGCCCTATCCTACAGGGATAACAACCGGCCCTCAGACATCTCAGAGAGATTCTCTGCGTCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACATCAGTCTCAATGCTGTGGTCTTCGGCGGAGGGACCACGCTGACCGTCTTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACACTCACCTGCACTGGGGACAGCAACAATGTTGGCCGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAAGCCCTATCCTACAGGGATAACAACCGGCCCTCAGACATCTCAGAGAGATTCTCTGCGTCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACATCAGTCTCAATGCTGTGGTCTTCGGCGGAGGGACCACGCTGACCGTCTTAG,QAGLTQPPSVSKGLRQTATLTCTGDSNNVGRQGAAWLQQHQGHPPKALSYRDNNRPSDISERFSASRSGNTASLTITGLQPEDEADYYCSAWDISLNAVVFGGGTTLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSAVVFGGGTKLTVL,SAWDISLNAVV,EVRLVESGGGLVQPGESLRLSCSASGFTFSNYWIHWVRQAPGKGLVWVSRIDTDGSGTSYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGALVTVSS[SEP]QAGLTQPPSVSKGLRQTATLTCTGDSNNVGRQGAAWLQQHQGHPPKALSYRDNNRPSDISERFSASRSGNTASLTITGLQPEDEADYYCSAWDISLNAVVFGGGTTLTVL,Memory-B-Cells,,human,Donor-1,"Phad et al., 2022",no,IGHV3-74,IGLV10-54,IGHV3-74,IGLV10-54


In [40]:
# Step 2: Calculate the percentage of identical genes for each group
percentages = []

for group in filtered_groups_only_sev_subj
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light_no_para"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    
    push!(percentages, percentage_identical)  # Store the percentage
end

# Step 3: Calculate the average percentage
average_percentage = mean(percentages)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 86.58351670985553%


86.58351670985553

In [41]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases = 0
total_groups = length(filtered_groups_only_sev_subj)

for group in filtered_groups_only_sev_subj
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes = unique(group.general_v_gene_light_no_para)
    if length(unique_genes) == 1
        true_cases += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true = (true_cases / total_groups) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true%")
fraction_true

Percentage of groups where all entries have the same general_v_gene_light: 55.8169375534645%


55.8169375534645

## only with data from Jaffe et al. 2022

In [42]:
filtered_groups_only_sev_subj

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTAGCTTTGGCATGCACTGGGTCCGGCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCGGTCATATCATTTTCATTTATGGGAAGTTTTGAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAGCACGCTGTATCTGCAAATGGACAACCTGAGAGTTGAGGACACGGCTGTATATTACTGTGTGAAAGAATCGAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-30*18,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTAGCTTTGGCATGCACTGGGTCCGGCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCGGTCATATCATTTTCATTTATGGGAAGTTTTGAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAGCACGCTGTATCTGCAAATGGACAACCTGAGAGTTGAGGACACGGCTGTATATTACTGTGTGAAAGAATCGAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSFGMHWVRQAPGKGLEWVAVISFSFMGSFEYYSDSVKGRFTISRDNSKSTLYLQMDNLRVEDTAVYYCVKESNAFDVWGQGTMVTVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXXAFDVWGQGTMVTVSS,VKESNAFDV,GAAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCCTGAGACAGACCGCCACAGTCACCTGCACTGGGAACAGCAACAATGTTGGCGACCAAGGAGCCGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTTTCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTATCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACAGTAGACTCAATGTTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*01,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCCTGAGACAGACCGCCACAGTCACCTGCACTGGGAACAGCAACAATGTTGGCGACCAAGGAGCCGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTTTCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTATCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACAGTAGACTCAATGTTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAGLTQPPSVSKGLRQTATVTCTGNSNNVGDQGAAWLQQHQGHPPKLLSYRNNNRPSGISERLSASRSGNTASLTITGLQPEDEADYYCSAWDSRLNVWVFGGGTKLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERLSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSXWVFGGGTKLTVL,SAWDSRLNVWV,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSFGMHWVRQAPGKGLEWVAVISFSFMGSFEYYSDSVKGRFTISRDNSKSTLYLQMDNLRVEDTAVYYCVKESNAFDVWGQGTMVTVSS[SEP]QAGLTQPPSVSKGLRQTATVTCTGNSNNVGDQGAAWLQQHQGHPPKLLSYRNNNRPSGISERLSASRSGNTASLTITGLQPEDEADYYCSAWDSRLNVWVFGGGTKLTVL,Memory-B-Cells,,human,Donor-1,"Phad et al., 2022",no,IGHV3-30,IGLV10-54,IGHV3-30,IGLV10-54
2,GGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAATGTCAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTAGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTAACTCTCAGTAATTATGTCACGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACCGGAGTGGCTGGCAGATATGTCACAGGATGGAAGTATTAGAATGTATGCAGGCTCCGTGAGGGGCCGATTCACCATCTCTAGAGACAATTCCGAGAACACTCTCTATCTGCAAATGAACAGCCTGAGAGTTGAGGACACGGGTGTATATTACTGTGTGAAAGAATCCAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCATCGTCTCTTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCAGCTGTCCTACAGTCCTCAGGA,H,IGHV3-30*18,CAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTAGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTAACTCTCAGTAATTATGTCACGCACTGGGTCCGCCAGGCTCCAGGCAAGGGACCGGAGTGGCTGGCAGATATGTCACAGGATGGAAGTATTAGAATGTATGCAGGCTCCGTGAGGGGCCGATTCACCATCTCTAGAGACAATTCCGAGAACACTCTCTATCTGCAAATGAACAGCCTGAGAGTTGAGGACACGGGTGTATATTACTGTGTGAAAGAATCCAATGCTTTTGATGTCTGGGGCCAAGGGACAATGGTCATCGTCTCTTCAG,QVQLAESGGGLVQPGRSLRLSCAASGLTLSNYVTHWVRQAPGKGPEWLADMSQDGSIRMYAGSVRGRFTISRDNSENTLYLQMNSLRVEDTGVYYCVKESNAFDVWGQGTMVIVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXXAFDVWGQGTMVTVSS,VKESNAFDV,GCTCCAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGCCTTGGGACAGACCGCCACACTCACCTGCACTGGGAACAACTACGATGTTGGCAGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTCTCGTACAGGAATAACAACCGGCCCTCAGGGATTTCAGAGAGGTTCTCTGCGTCCAGGTCAGGGAACGCAGACTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTATTGCTCAGCATGGGACGACAGTATCAGTGGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGCCTTGGGACAGACCGCCACACTCACCTGCACTGGGAACAACTACGATGTTGGCAGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAACTCCTCTCGTACAGGAATAACAACCGGCCCTCAGGGATTTCAGAGAGGTTCTCTGCGTCCAGGTCAGGGAACGCAGACTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTATTGCTCAGCATGGGACGACAGTATCAGTGGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAGLTQPPSVSKALGQTATLTCTGNNYDVGSQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNADSLTITGLQPEDEADYYCSAWDDSISGWVFGGGTKLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSXWVFGGGTKLTVL,SAWDDSISGWV,QVQLAESGGGLVQPGRSLRLSCAASGLTLSNYVTHWVRQAPGKGPEWLADMSQDGSIRMYAGSVRGRFTISRDNSENTLYLQMNSLRVEDTGVYYCVKESNAFDVWGQGTMVIVSS[SEP]QAGLTQPPSVSKALGQTATLTCTGNNYDVGSQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNADSLTITGLQPEDEADYYCSAWDDSISGWVFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-2,"Jaffe et al., 2022",35,IGHV3-30,IGLV10-54,IGHV3-30,IGLV10-54

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,ACTCTGCTGAAGAAAACCAGCCCTGCAGCTCTGGGAGAGGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAACTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCAAGTATTAATAATGATGGGAGTAGGACAGATTATGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACCGCTGTGTATTACTGTGCAACAGTATTTGAGTACTGGGGCCAGGGAATTCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-74*01,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAACTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCAAGTATTAATAATGATGGGAGTAGGACAGATTATGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACCGCTGTGTATTACTGTGCAACAGTATTTGAGTACTGGGGCCAGGGAATTCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMHWVRQAPGKGLVWVSSINNDGSRTDYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGILVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARXFDYWGQGTLVTVSS,ATVFEY,TGGGCTCCAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACCCTCACCTGCACTGGGAACAGCAACAATGTTGGCAACCAAGGAGCATCTTGGCTGCAGCATCACCAGGGCCACCCTCCCAAAGTCCTATCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTCTCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGTCTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACACCAGCCTCAATGCCGGGCTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACCCTCACCTGCACTGGGAACAGCAACAATGTTGGCAACCAAGGAGCATCTTGGCTGCAGCATCACCAGGGCCACCCTCCCAAAGTCCTATCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTCTCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGTCTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACACCAGCCTCAATGCCGGGCTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGASWLQHHQGHPPKVLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDTSLNAGLFGGGTKLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSAXXFGGGTKLTVL,SAWDTSLNAGL,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMHWVRQAPGKGLVWVSSINNDGSRTDYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGILVTVSS[SEP]QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGASWLQHHQGHPPKVLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDTSLNAGLFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,S-CoV13,"Sokal et al, 2021",51,IGHV3-74,IGLV10-54,IGHV3-74,IGLV10-54
2,GGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCGGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGAGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAACTACTGGATCCACTGGGTCCGCCAAGCGCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTGATACTGATGGGAGTGGCACAAGTTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTATTGTGCAACCGTCTTTGAATATTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCAGCTGTCCTACAGTCCTCAGGA,H,IGHV3-74*01,GAGGTGCGGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGAGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAACTACTGGATCCACTGGGTCCGCCAAGCGCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTGATACTGATGGGAGTGGCACAAGTTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTATTGTGCAACCGTCTTTGAATATTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAG,EVRLVESGGGLVQPGESLRLSCSASGFTFSNYWIHWVRQAPGKGLVWVSRIDTDGSGTSYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGALVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAXXFDYWGQGTLVTVSS,ATVFEY,GGGGAAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACACTCACCTGCACTGGGGACAGCAACAATGTTGGCCGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAAGCCCTATCCTACAGGGATAACAACCGGCCCTCAGACATCTCAGAGAGATTCTCTGCGTCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACATCAGTCTCAATGCTGTGGTCTTCGGCGGAGGGACCACGCTGACCGTCTTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACACTCACCTGCACTGGGGACAGCAACAATGTTGGCCGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAAGCCCTATCCTACAGGGATAACAACCGGCCCTCAGACATCTCAGAGAGATTCTCTGCGTCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACATCAGTCTCAATGCTGTGGTCTTCGGCGGAGGGACCACGCTGACCGTCTTAG,QAGLTQPPSVSKGLRQTATLTCTGDSNNVGRQGAAWLQQHQGHPPKALSYRDNNRPSDISERFSASRSGNTASLTITGLQPEDEADYYCSAWDISLNAVVFGGGTTLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSAVVFGGGTKLTVL,SAWDISLNAVV,EVRLVESGGGLVQPGESLRLSCSASGFTFSNYWIHWVRQAPGKGLVWVSRIDTDGSGTSYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGALVTVSS[SEP]QAGLTQPPSVSKGLRQTATLTCTGDSNNVGRQGAAWLQQHQGHPPKALSYRDNNRPSDISERFSASRSGNTASLTITGLQPEDEADYYCSAWDISLNAVVFGGGTTLTVL,Memory-B-Cells,,human,Donor-1,"Phad et al., 2022",no,IGHV3-74,IGLV10-54,IGHV3-74,IGLV10-54


In [43]:
mem_subj_df_jaffe_only = filter(g -> g.Author == "Jaffe et al., 2022", mem_subj_df)


Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,CCACATCCCTCCTCAGAAGCCCCCAGAGCACAACGCCTCACCATGGACTGGACCTGGAGGATCCTCTTTTTGGTGGCAGCAGCCACAGGTGCCCACTCCCAGGTCCAACTTGTGCAGTCTGGGGCTGAGGTGAGGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTACTTATGCTATACATTGGGTGCGCCGGGCCCCCGGACAAAGGCTTGAGTGGATGGGATGGATCAACGCTGCCAATGGTAACACAGAATATTCACAGAGGTTCCAGGGCAGAGTCACATTTACAAGGGACACATCCGCGACCACAGCCTACATGGAACTGAGGAGCCTGAGATCTGAAGACACGGCTATTTATTATTGTGCGGGAGACTCTCTGGCAGCAGATGGCCAATTCTTTGATTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV1-3*01,CAGGTCCAACTTGTGCAGTCTGGGGCTGAGGTGAGGAAGCCTGGGGCCTCAGTGAAGGTTTCCTGCAAGGCTTCTGGATACACCTTCACTACTTATGCTATACATTGGGTGCGCCGGGCCCCCGGACAAAGGCTTGAGTGGATGGGATGGATCAACGCTGCCAATGGTAACACAGAATATTCACAGAGGTTCCAGGGCAGAGTCACATTTACAAGGGACACATCCGCGACCACAGCCTACATGGAACTGAGGAGCCTGAGATCTGAAGACACGGCTATTTATTATTGTGCGGGAGACTCTCTGGCAGCAGATGGCCAATTCTTTGATTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLVQSGAEVRKPGASVKVSCKASGYTFTTYAIHWVRRAPGQRLEWMGWINAANGNTEYSQRFQGRVTFTRDTSATTAYMELRSLRSEDTAIYYCAGDSLAADGQFFDYWGQGTLVTVSS,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYAMHWVRQAPGQRLEWMGWINAGNGNTKYSQKFQGRVTITRDTSASTAYMELSSLRSEDTAVYYCARXXXAAAGXXFDYWGQGTLVTVSS,AGDSLAADGQFFDY,GGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATGCCACCGGAGAAGTTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGGGTGTTAAGTACAGTTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGTTTCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTACCTCATCCACTTTCGGCCCTGGGACCAAGGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAGTTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGGGTGTTAAGTACAGTTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGTTTCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTACCTCATCCACTTTCGGCCCTGGGACCAAGGTGGATATCAAAC,EVVLTQSPGTLSLSPGERATLSCRASQGVKYSYLAWYQQKPGQAPRLLIYGVSSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGTSSTFGPGTKVDIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSXTFGPGTKVDIK,QQYGTSST,QVQLVQSGAEVRKPGASVKVSCKASGYTFTTYAIHWVRRAPGQRLEWMGWINAANGNTEYSQRFQGRVTFTRDTSATTAYMELRSLRSEDTAIYYCAGDSLAADGQFFDYWGQGTLVTVSS[SEP]EVVLTQSPGTLSLSPGERATLSCRASQGVKYSYLAWYQQKPGQAPRLLIYGVSSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGTSSTFGPGTKVDIK,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV1-3,IGKV3-20,IGHV1-3,IGKV3-20
2,AGCTCTCAGAGAGGTGCCTTAGCCCTGGATTCCAAGGCATTTCCACTTGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTGGGGCTGTGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCGCCTTCAGTAGATATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTCGTAGTAGTGGTAGTCCCGTATACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAATGCCAAGAACTTACTGTTTCTACAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTTTATTACTGTGCGAGAGATCTACCCATGAATGCTTTTGATATTTGGGGCCCAGGGACAATGGTCACCGTCTCTTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-48*01,GAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCGCCTTCAGTAGATATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTCGTAGTAGTGGTAGTCCCGTATACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAATGCCAAGAACTTACTGTTTCTACAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTTTATTACTGTGCGAGAGATCTACCCATGAATGCTTTTGATATTTGGGGCCCAGGGACAATGGTCACCGTCTCTTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFAFSRYSMNWVRQAPGKGLEWVSYIRSSGSPVYYADSVKGRFTISRDNAKNLLFLQMNSLRAEDTAVYYCARDLPMNAFDIWGPGTMVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSYISSSSSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXXAFDIWGQGTMVTVSS,ARDLPMNAFDI,TGAGCGCAGAAGGCAGGACTCGGGACAATCTTCATCATGACCTGCTCCCCTCTCCTCCTCACCCTTCTCATTCACTGCACAGGGTCCTGGGCCCAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGACAGAAGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAATATTGGGAATAATTATGTATCCTGGTACCAACACCTCCCAGGAACAGCCCCCAAACTCCTCATCTATGAAAATAATAAGCGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGGCATCACCGGACTCCAGACTGGGGACGAGGCCGATTTTTATTGCGCAACATGGGATAGCAGCCTGAGTGTTGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTACGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-51*02,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGACAGAAGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAATATTGGGAATAATTATGTATCCTGGTACCAACACCTCCCAGGAACAGCCCCCAAACTCCTCATCTATGAAAATAATAAGCGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGGCATCACCGGACTCCAGACTGGGGACGAGGCCGATTTTTATTGCGCAACATGGGATAGCAGCCTGAGTGTTGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTA,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQHLPGTAPKLLIYENNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADFYCATWDSSLSVGVFGGGTKLTVL,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPKLLIYENNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCGTWDSSLSAGVFGGGTKLTVL,ATWDSSLSVGV,EVQLVESGGGLVQPGGSLRLSCAASGFAFSRYSMNWVRQAPGKGLEWVSYIRSSGSPVYYADSVKGRFTISRDNAKNLLFLQMNSLRAEDTAVYYCARDLPMNAFDIWGPGTMVTVSS[SEP]QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQHLPGTAPKLLIYENNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADFYCATWDSSLSVGVFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-48,IGLV1-51,IGHV3-48,IGLV1-51
3,AGCTCTGGGAGAGGAGCCCCAGCCTTGGGATTCCCAAGTGTTTTCACTCAGTGATCAGGACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGATTTTCCTTGCTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGATGGTGGAGTCTGGGGGAGGCTTGGTAAAGCCTGGGGGGTCCCTTAGACTCTCCTGTGCAGCCTCTGGATTCACTTTCAGTAACGCCTGGATGAGCTGGGTCCGCCAGCCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGTATTAAAAATAGAGGTGAAGGTGAGACAACAGACTACGCTGCACCCGTGAAAGGCAGATTCACCATCTCAAGAGATGATTCAAAAAACACGCTGTATCTGCAGATGAACAGCCTGAAAACCGAGGACACAGCCGTGTATTATTGTACCGCTTACTATTATAGTGGAATCTATACCGGAGACTTCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-15*01,GAGGTGCAGATGGTGGAGTCTGGGGGAGGCTTGGTAAAGCCTGGGGGGTCCCTTAGACTCTCCTGTGCAGCCTCTGGATTCACTTTCAGTAACGCCTGGATGAGCTGGGTCCGCCAGCCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGTATTAAAAATAGAGGTGAAGGTGAGACAACAGACTACGCTGCACCCGTGAAAGGCAGATTCACCATCTCAAGAGATGATTCAAAAAACACGCTGTATCTGCAGATGAACAGCCTGAAAACCGAGGACACAGCCGTGTATTATTGTACCGCTTACTATTATAGTGGAATCTATACCGGAGACTTCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQMVESGGGLVKPGGSLRLSCAASGFTFSNAWMSWVRQPPGKGLEWVGRIKNRGEGETTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTAYYYSGIYTGDFWGQGTLVTVSS,EVQLVESGGGLVKPGGSLRLSCAASGFTFSNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTXYYDSXXXXXDYWGQGTLVTVSS,TAYYYSGIYTGDF,GATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCTCTGGATCCAGTGGGGATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGCCAGAGCCTCCTGAATAGTAACGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAAGTCCTGATCTATTTGGCTTCTAATCGGGCCCCCGGGGTCCCTGACAGGTTCAGTGGTAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAACTCTGCAAAATCCTCGGGCTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-28*01,GATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGCCAGAGCCTCCTGAATAGTAACGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAAGTCCTGATCTATTTGGCTTCTAATCGGGCCCCCGGGGTCCCTGACAGGTTCAGTGGTAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAACTCTGCAAAATCCTCGGGCTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC,DIVMTQSPLSLPVTPGEPASISCRSSQSLLNSNGYNYLDWYLQKPGQSPQVLIYLASNRAPGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQTLQNPRAFGQGTKLEIK,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPXXFGQGTKLEIK,MQTLQNPRA,EVQMVESGGGLVKPGGSLRLSCAASGFTFSNAWMSWVRQPPGKGLEWVGRIKNRGEGETTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTAYYYSGIYTGDFWGQGTLVTVSS[SEP]DIVMTQSPLSLPVTPGEPASISCRSSQSLLNSNGYNYLDWYLQKPGQSPQVLIYLASNRAPGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQTLQNPRAFGQGTKLEIK,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-15,IGKV2-28,IGHV3-15,IGKV2-28
4,TGGGGACCCAAAAACCACACCCCTCCTTGGGAGAATCCCCTAGATCACAGCTCCTCACCATGGACTGGACCTGGAGCATCCTTTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTTCAGCTGCTTCAGTCTGGAGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTTCCGACTATGGTATCAGCTGGGTGCGACAGGCCCCTGGACAAGCGCTTGAGTGGATGGGATGGATCAGCCCTAACAACGGTAATACACACCTTCGACAGGAGCTCCGGGGCAGAGTCTTCCTGACCACAGACAGATCCACGACCACAGCCTACATGGAGCTGAGGAACCTGAGATCTGACGACACGGCCGTGTACTTCTGTGGGAGAGATCCTCGTGATTACGTCTGGGGGAATTATCCTCACTACACGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCCGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV1-18*01,CAGGTTCAGCTGCTTCAGTCTGGAGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTTCCGACTATGGTATCAGCTGGGTGCGACAGGCCCCTGGACAAGCGCTTGAGTGGATGGGATGGATCAGCCCTAACAACGGTAATACACACCTTCGACAGGAGCTCCGGGGCAGAGTCTTCCTGACCACAGACAGATCCACGACCACAGCCTACATGGAGCTGAGGAACCTGAGATCTGACGACACGGCCGTGTACTTCTGTGGGAGAGATCCTCGTGATTACGTCTGGGGGAATTATCCTCACTACACGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTC,QVQLLQSGAEVKKPGASVKVSCKASGYTFSDYGISWVRQAPGQALEWMGWISPNNGNTHLRQELRGRVFLTTDRSTTTAYMELRNLRSDDTAVYFCGRDPRDYVWGNYPHYTDVWGKGTTVTVSS,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCARXXXDYVWGSYXXYMDVWGKGTTVTVSS,GRDPRDYVWGNYPHYTDV,AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAATATTAGTAGCAGGTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGACTCCTCATATATGTTGCATCCAGCAGGGCCATTGGCATCCCAGACAGGTTCGGTGGCGGTGGGTCTGGGACAGACTTCAGTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTATTGTCAGCAATATGATACCTCACCCTTGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAATATTAGTAGCAGGTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGACTCCTCATATATGTTGCATCCAGCAGGGCCATTGGCATCCCAGACAGGTTCGGTGGCGGTGGGTCTGGGACAGACTTCAGTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTATTGTCAGCAATATGATACCTCACCCTTGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,EIVLTQSPGTLSLSPGERATLSCRASQNISSRYLAWYQQKPGQAPRLLIYVASSRAIGIPDRFGGGGSGTDFSLTISRLEPEDFAVYYCQQYDTSPLTFGQGTKVEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPXTFGQGTKVEIK,QQYDTSPLT,QVQLLQSGAEVKKPGASVKVSCKASGYTFSDYGISWVRQAPGQALEWMGWISPNNGNTHLRQELRGRVFLTTDRSTTTAYMELRNLRSDDTAVYFCGRDPRDYVWGNYPHYTDVWGKGTTVTVSS[SEP]EIVLTQSPGTLSLSPGERATLSCRASQNISSRYLAWYQQKPGQAPRLLIYVASSRAIGIPDRFGGGGSGTDFSLTISRLEPEDFAVYYCQQYDTSPLTFGQGTKVEIK,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV1-18,IGKV3-20,IGHV1-18,IGKV3-20
5,GGGGGAATCCTGCTCGCCACCATGGACATACTGTGTTCCACGCTCCTGCTACTGACTGTCCCGTCCTGGGTCTTATCCCAGGTCACCTTGAGGGAGTCTGGTCCTGCGCTGGTGAAACCCACACAGACCCTCACATTGACCTGCACCTTCTCTGGGTTCTCACTCAGTACTAGTGGAATGTGCGTGAGCTGGCTCCGTCAGGCCCCGGGGAAGGCCCCGGAGTGGCTTGGCCTCATCGATTGGGTTGATACAAAATACTACAGAGCATCACTAGAGCCCAGGCTCACCATCTCCAAGGACACCTCCAAAAACCGGGTGGTCCTTACAATGACCAGCATGGACCCTCTGGACACAGCCACGTACTATTGTGCACGGAACCGACTACGGTCTGGGAACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV2-70*13,CAGGTCACCTTGAGGGAGTCTGGTCCTGCGCTGGTGAAACCCACACAGACCCTCACATTGACCTGCACCTTCTCTGGGTTCTCACTCAGTACTAGTGGAATGTGCGTGAGCTGGCTCCGTCAGGCCCCGGGGAAGGCCCCGGAGTGGCTTGGCCTCATCGATTGGGTTGATACAAAATACTACAGAGCATCACTAGAGCCCAGGCTCACCATCTCCAAGGACACCTCCAAAAACCGGGTGGTCCTTACAATGACCAGCATGGACCCTCTGGACACAGCCACGTACTATTGTGCACGGAACCGACTACGGTCTGGGAACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWLRQAPGKAPEWLGLIDWVDTKYYRASLEPRLTISKDTSKNRVVLTMTSMDPLDTATYYCARNRLRSGNMDVWGKGTTVTVSS,QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLALIDWDDDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARXXLRXXXMDVWGKGTTVTVSS,ARNRLRSGNMDV,AGCTTCAGCTGTGGGTAGAGAAGACAGGACTCAGGACAATCTCCAGCATGGCCAGCTTCCCTCTCCTCCTCACCCTCCTCACTCACTGTGCAGGGTCCTGGGCCCAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAATAGTGTAAACTGGTACCAGCAGCTCCCAGGGACGGCCCCCAAACTCCTCATCTATAGTGATAATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCGGTCTGAGGATGAGGCTGATTATTACTGTGCAGCATGGGATCACAGACTGAATGGTCTTGTGCTATTCGGCGGAGGGACCAAGCTGACCGTCCTGGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-44*01,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAATAGTGTAAACTGGTACCAGCAGCTCCCAGGGACGGCCCCCAAACTCCTCATCTATAGTGATAATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCGGTCTGAGGATGAGGCTGATTATTACTGTGCAGCATGGGATCACAGACTGAATGGTCTTGTGCTATTCGGCGGAGGGACCAAGCTGACCGTCCT,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNSVNWYQQLPGTAPKLLIYSDNQRPSGVPDRFSGSKSGTSASLAISGLRSEDEADYYCAAWDHRLNGLVLFGGGTKLTVL,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPKLLIYSNNQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDSLNGXVVFGGGTKLTVL,AAWDHRLNGLVL,QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWLRQAPGKAPEWLGLIDWVDTKYYRASLEPRLTISKDTSKNRVVLTMTSMDPLDTATYYCARNRLRSGNMDVWGKGTTVTVSS[SEP]QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNSVNWYQQLPGTAPKLLIYSDNQRPSGVPDRFSGSKSGTSASLAISGLRSEDEADYYCAAWDHRLNGLVLFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV2-70,IGLV1-44,IGHV2-70,IGLV1-44
6,CGAGCCCAGCACTGGAAGTCGCCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCTGCCTCTGGATTCACCTTCAGTAACTATGCTATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATTTCATATGATGGAAGTAGTAAATACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACACTGTATCTACAAATGAACAGTCTGAGAGCTGAGGACACGGCTGTATTTTACTGTGCGAGAGGGTTCGATGGGGGTCATGGCTACTTTGACTGCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-30*04,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCTGCCTCTGGATTCACCTTCAGTAACTATGCTATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATTTCATATGATGGAAGTAGTAAATACTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACACTGTATCTACAAATGAACAGTCTGAGAGCTGAGGACACGGCTGTATTTTACTGTGCGAGAGGGTTCGATGGGGGTCATGGCTACTTTGACTGCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLVESGGGVVQPGRSLRLSCAASGFTFSNYAMHWVRQAPGKGLEWVAVISYDGSSKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVFYCARGFDGGHGYFDCWGQGTLVTVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARGFXXXXXYFDYWGQGTLVTVSS,ARGFDGGHGYFDC,AGCTTCAGCTGTGGTAGAGAAGACAGGATTCAGGACAATCTCCAGCATGGCCGGCTTCCCTCTCCTCCTCACCCTCCTCACTCACTGTGCAGGGTCCTGGGCCCAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAATAATTATGTATACTGGTACCAGCAGTTCCCAGGAACGGCCCCCAAACTCCTCATCTATAGGAATAATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCGGTCCGAGGATGAGGCTGATTATTACTGTGCAGCATGGGATGACAGCCTGAGTGGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-47*01,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAATAATTATGTATACTGGTACCAGCAGTTCCCAGGAACGGCCCCCAAACTCCTCATCTATAGGAATAATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCGGTCCGAGGATGAGGCTGATTATTACTGTGCAGCATGGGATGACAGCCTGAGTGGTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSVLTQPPSASGTPGQRVTISCSGSSSNIGNNYVYWYQQFPGTAPKLLIYRNNQRPSGVPDRFSGSKSGTSASLAISGLRSEDEADYYCAAWDDSLSGWVFGGGTKLTVL,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNYVYWYQQLPGTAPKLLIYRNNQRPSGVPDRFSGSKSGTSASLAISGLRSEDEADYYCAAWDDSLSGWVFGGGTKLTVL,AAWDDSLSGWV,QVQLVESGGGVVQPGRSLRLSCAASGFTFSNYAMHWVRQAPGKGLEWVAVISYDGSSKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVFYCARGFDGGHGYFDCWGQGTLVTVSS[SEP]QSVLTQPPSASGTPGQRVTISCSGSSSNIGNNYVYWYQQFPGTAPKLLIYRNNQRPSGVPDRFSGSKSGTSASLAISGLRSEDEADYYCAAWDDSLSGWVFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-30,IGLV1-47,IGHV3-30,IGLV1-47
7,TGGGGAGCTCTGGGAGAGGAGCCCCAGCCCTGAGATTCCCAGGTGTTTCCACTCAGTGATCAGCACTGAACACAGAGGGCTCACCATGGGGTTGGGACTGAGCTGGATTTTCCTTTTGGCTATTTTAAAAGGTGTCCAATGTGAAGCGCAGCTGGTGGAGTCTGGGGGAGACTTGGTACAGCCTGGCTGGTCCCTGAGACTCTCATGTGCAGCCTCTGGATTCACCATTCATGATTATGCCATGCACTGGGTCCGGCAAGCTCCAGGGAAGGGCCCGGAATGGGTCTCAGGTATCAGTTGGAATAGCGGTACCATAGGCTATGCGGACTCTGTGAAGGGCCGGTTCACCATCTCCAGAGACAACGCCAAGAACTCCCTGTATCTCCAAATGAACAGTCTGAGAGTTGAGGACACGGCCTTATATTATTGTGTAAATCTGATTGGGGCATCGAACGATGCTTTTGAGATCTGGGGCCAAGGGACAATGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-9*01,GAAGCGCAGCTGGTGGAGTCTGGGGGAGACTTGGTACAGCCTGGCTGGTCCCTGAGACTCTCATGTGCAGCCTCTGGATTCACCATTCATGATTATGCCATGCACTGGGTCCGGCAAGCTCCAGGGAAGGGCCCGGAATGGGTCTCAGGTATCAGTTGGAATAGCGGTACCATAGGCTATGCGGACTCTGTGAAGGGCCGGTTCACCATCTCCAGAGACAACGCCAAGAACTCCCTGTATCTCCAAATGAACAGTCTGAGAGTTGAGGACACGGCCTTATATTATTGTGTAAATCTGATTGGGGCATCGAACGATGCTTTTGAGATCTGGGGCCAAGGGACAATGGTCACCGTCTCCTCAG,EAQLVESGGDLVQPGWSLRLSCAASGFTIHDYAMHWVRQAPGKGPEWVSGISWNSGTIGYADSVKGRFTISRDNAKNSLYLQMNSLRVEDTALYYCVNLIGASNDAFEIWGQGTMVTVSS,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLEWVSGISWNSGSIGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAXXIXXXXDAFDIWGQGTMVTVSS,VNLIGASNDAFEI,GAGCTACAACGGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGTGACTGCAGGTGTCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGGGACGGCCACCATCAGCTGCAAGACCAGCCAGAGTGTTTTGTACACGTCCAACAATATAAACTACTTAGCTTGGTACCAGCAGAAACCAGGGCAGCCTCCTAAGCTGCTCATTTACTGGGCATCGACCCGGAAATCCGGGGTCCCTGACCGATTCCGTGGCAGCGGATCTGGGACACAATTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATACTTCTCCTCGGACGTTCGGCCAGGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGGGACGGCCACCATCAGCTGCAAGACCAGCCAGAGTGTTTTGTACACGTCCAACAATATAAACTACTTAGCTTGGTACCAGCAGAAACCAGGGCAGCCTCCTAAGCTGCTCATTTACTGGGCATCGACCCGGAAATCCGGGGTCCCTGACCGATTCCGTGGCAGCGGATCTGGGACACAATTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATACTTCTCCTCGGACGTTCGGCCAGGGGACCAAGGTGGAAATCAAAC,DIVMTQSPDSLAVSLGGTATISCKTSQSVLYTSNNINYLAWYQQKPGQPPKLLIYWASTRKSGVPDRFRGSGSGTQFTLTISSLQAEDVAVYYCQQYYTSPRTFGQGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPRTFGQGTKVEIK,QQYYTSPRT,EAQLVESGGDLVQPGWSLRLSCAASGFTIHDYAMHWVRQAPGKGPEWVSGISWNSGTIGYADSVKGRFTISRDNAKNSLYLQMNSLRVEDTALYYCVNLIGASNDAFEIWGQGTMVTVSS[SEP]DIVMTQSPDSLAVSLGGTATISCKTSQSVLYTSNNINYLAWYQQKPGQPPKLLIYWASTRKSGVPDRFRGSGSGTQFTLTISSLQAEDVAVYYCQQYYTSPRTFGQGTKVEIK,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-9,IGKV4-1,IGHV3-9,IGKV4-1
8,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAACTGTTGGAATCTGGGGGAGGCTTGGTACAGCGGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCGCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGGTATTAGTGGTAGTGGTGGTAGTACATTCTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTATATCTGCAAATGAAGAGCCTGAGAGCCGAGGACACGGCCCTATACTACTGTGCGAATGTAGTTGGATCGTATAGCAGCAGCTGGGGTTTCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCACCCTCCTCCAAGAGCACCTCTGGGGGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCCCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-23*01,GAGGTGCAACTGTTGGAATCTGGGGGAGGCTTGGTACAGCGGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCGCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGGTATTAGTGGTAGTGGTGGTAGTACATTCTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTATATCTGCAAATGAAGAGCCTGAGAGCCGAGGACACGGCCCTATACTACTGTGCGAATGTAGTTGGATCGTATAGCAGCAGCTGGGGTTTCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLLESGGGLVQRGGSLRLSCAAAGFTFSSYAMSWVRQAPGKGLEWVSGISGSGGSTFYADSVKGRFTISRDNSKNTLYLQMKSLRAEDTALYYCANVVGSYSSSWGFDPWGQGTLVTVSS,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAXXXXXYSSSWXFDPWGQGTLVTVSS,ANVVGSYSSSWGFDP,TGAGCGCAGAAGGCAGGACTCGGGACAATCTTCATCATGACCTGCTCCCCTCTCCTCCTCACCCTTCTCATTCACTGCACAGGGTCCTGGGCCCAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGACAGAAGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAACATTGGGAATAATTATGTATGCTGGTACCAGCAGCTCCCAGGAACAGCCCCCAAACTCCTCATTTATGACAATAATAAGCGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGGCATCACCGGACTCCAGACTGGGGACGAGGCCGATTATTACTGCGAAACATGGGATAGCAGGCTGAGGGGGGTGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-51*01,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGACAGAAGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAACATTGGGAATAATTATGTATGCTGGTACCAGCAGCTCCCAGGAACAGCCCCCAAACTCCTCATTTATGACAATAATAAGCGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGGCATCACCGGACTCCAGACTGGGGACGAGGCCGATTATTACTGCGAAACATGGGATAGCAGGCTGAGGGGGGTGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVCWYQQLPGTAPKLLIYDNNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCETWDSRLRGVVFGGGTKLTVL,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPKLLIYDNNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCGTWDSSLXXXVFGGGTKLTVL,ETWDSRLRGVV,EVQLLESGGGLVQRGGSLRLSCAAAGFTFSSYAMSWVRQAPGKGLEWVSGISGSGGSTFYADSVKGRFTISRDNSKNTLYLQMKSLRAEDTALYYCANVVGSYSSSWGFDPWGQGTLVTVSS[SEP]QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVCWYQQLPGTAPKLLIYDNNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCETWDSRLRGVVFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-23,IGLV1-51,IGHV3-23,IGLV1-51
9,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGCTTTTTCTTGTGGCTATTTTAAAACGTGTCCAGTGTGAGGTGCAGTTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGAGTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCATTTATGGCATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAACTATTAGGGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAGGAACACACTGTCTCTGCAAATGAACAGCCTGAGAACCGAGGACACGGCCGTATATTACTGTGCGAAAGATCGGTATAGTGCCTACGGGGGGCTTACGGACTGCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-23*01,GAGGTGCAGTTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGAGTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCATTTATGGCATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAACTATTAGGGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAGGAACACACTGTCTCTGCAAATGAACAGCCTGAGAACCGAGGACACGGCCGTATATTACTGTGCGAAAGATCGGTATAGTGCCTACGGGGGGCTTACGGACTGCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLLESGGGLVQPGGSLRVSCAASGFTFSIYGMSWVRQAPGKGLEWVSTIRGSGGSTYYADSVKGRFTISRDNSRNTLSLQMNSLRTEDTAVYYCAKDRYSAYGGLTDCWGQGTLVTVSS,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXYSGYXXXXDYWGQGTLVTVSS,AKDRYSAYGGLTDC,GAGCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTATACAGCTCCAACAATAAGAACTACTTAGCTTGGTACCAGCAGAAACCAGGTCAGACTCCTAAGTTGCTCATTTACTGGTCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATAGTTCTTTTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTATACAGCTCCAACAATAAGAACTACTTAGCTTGGTACCAGCAGAAACCAGGTCAGACTCCTAAGTTGCTCATTTACTGGTCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATAGTTCTTTTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQTPKLLIYWSSTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSSFLTFGGGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTXLTFGGGTKVEIK,QQYYSSFLT,EVQLLESGGGLVQPGGSLRVSCAASGFTFSIYGMSWVRQAPGKGLEWVSTIRGSGGSTYYADSVKGRFTISRDNSRNTLSLQMNSLRTEDTAVYYCAKDRYSAYGGLTDCWGQGTLVTVSS[SEP]DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQTPKLLIYWSSTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSSFLTFGGGTKVEIK,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-23,IGKV4-1,IGHV3-23,IGKV4-1
10,TACGGGGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCACCTGTGGTTCTTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAATGATTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGCAGTATCTACTATAGTGGGAGCACTTACTACAACCCGTCCCTCAAGAGTCGACTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTATGTATTACTGTGCGAGACATCCGACTAACGCAGCACCTGGTACTGGCTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV4-39*01,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAATGATTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGCAGTATCTACTATAGTGGGAGCACTTACTACAACCCGTCCCTCAAGAGTCGACTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTATGTATTACTGTGCGAGACATCCGACTAACGCAGCACCTGGTACTGGCTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNDYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRLTISVDTSKNQFSLKLSSVTAADTAMYYCARHPTNAAPGTGYYYGMDVWGQGTTVTVSS,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXAAAGTXYYYGMDVWGQGTTVTVSS,ARHPTNAAPGTGYYYGMDV,ACTGCGGGGGTAAGAGGTTGTGTCCACCATGGCCTGGACTCCTCTCCTCCTCCTGTTCCTCTCTCACTGCACAGGTTCCCTCTCGCAGGCTGTGCTGACTCAGCCGTCTTCCCTCTCTGCATATCCTGGAGCATCAGCCAGTCTCACCTGCACCTTGCGCCGTGGCATCAATGTTGGTACCTACAGGATATACTGGTACCAGCAGAAGCCAGGGAGTCCTCCCCAGTATCTCCTGAGGTACAAATCAGACTCAGATAACCAGCAGGGCTCTGGAGTCCCCAGCCGCTTCTCTGGATCCAAAGATGCTTCGGCCAATGCAGGGATTTTAGTCATCTCTGAGCTCCAGTCTGAGGATGAGGCTGACTATTACTGTATGATTTGGCACATCAGCGCTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV5-45*03,CAGGCTGTGCTGACTCAGCCGTCTTCCCTCTCTGCATATCCTGGAGCATCAGCCAGTCTCACCTGCACCTTGCGCCGTGGCATCAATGTTGGTACCTACAGGATATACTGGTACCAGCAGAAGCCAGGGAGTCCTCCCCAGTATCTCCTGAGGTACAAATCAGACTCAGATAACCAGCAGGGCTCTGGAGTCCCCAGCCGCTTCTCTGGATCCAAAGATGCTTCGGCCAATGCAGGGATTTTAGTCATCTCTGAGCTCCAGTCTGAGGATGAGGCTGACTATTACTGTATGATTTGGCACATCAGCGCTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAVLTQPSSLSAYPGASASLTCTLRRGINVGTYRIYWYQQKPGSPPQYLLRYKSDSDNQQGSGVPSRFSGSKDASANAGILVISELQSEDEADYYCMIWHISAWVFGGGTKLTVL,QAVLTQPSSLSASPGASASLTCTLRSGINVGTYRIYWYQQKPGSPPQYLLRYKSDSDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAWVFGGGTKLTVL,MIWHISAWV,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNDYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRLTISVDTSKNQFSLKLSSVTAADTAMYYCARHPTNAAPGTGYYYGMDVWGQGTTVTVSS[SEP]QAVLTQPSSLSAYPGASASLTCTLRRGINVGTYRIYWYQQKPGSPPQYLLRYKSDSDNQQGSGVPSRFSGSKDASANAGILVISELQSEDEADYYCMIWHISAWVFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV4-39,IGLV5-45,IGHV4-39,IGLV5-45


In [44]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
mem_subj_grouped_df_jaffe_only = groupby(mem_subj_df_jaffe_only, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
mem_subj_filtered_grouped_jaffe_only = filter(g -> nrow(g) > 1, mem_subj_grouped_df_jaffe_only)

# filter out every group that has the same entry in Subject
filtered_groups_only_sev_subj_jaffe_only = filter(g -> length(unique(g.Subject)) > 1, mem_subj_filtered_grouped_jaffe_only)


Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTCAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACTATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-7*01,AGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACTATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAG,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS,VQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAXXXXSSWFDYWGQGTLVTVSS,AGWAGSSWFDY,TCTGGCACCAGGGGTCCCTTCCAATATCAGCACCATGGCCTGGACTCCTCTCTTTCTGTTCCTCCTCACTTGCTGCCCAGGGTCCAATTCCCAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGACCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGCCAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV7-46*01,CAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGACCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGCCAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAG,QAVVTQEPSLTVSPGGTVTLTCGSSTGPVTSTHYPYWFQQKPGQAPRTLIYDTANKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQKPGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLSGAQPEDEAEYYCLLSYSGARXXXVVFGGGTKLTVL,LLSYSGPRGPRVV,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS[SEP]QAVVTQEPSLTVSPGGTVTLTCGSSTGPVTSTHYPYWFQQKPGQAPRTLIYDTANKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-7,IGLV7-46,IGHV3-7,IGLV7-46
2,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTCAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACTATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-7*01,AGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACTATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAG,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS,VQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAXXXXSSWFDYWGQGTLVTVSS,AGWAGSSWFDY,GGGGGCACCAGGGGTCCCTTCCAATATCAGCACCATGGCCTGGACTCCTCTCTTTCTGTTCCTCCTCACTTGCTGCCCAGGGTCCAATTCCCAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGACCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGCCAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV7-46*01,CAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGACCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGCCAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAG,QAVVTQEPSLTVSPGGTVTLTCGSSTGPVTSTHYPYWFQQKPGQAPRTLIYDTANKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQKPGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLSGAQPEDEAEYYCLLSYSGARXXXVVFGGGTKLTVL,LLSYSGPRGPRVV,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS[SEP]QAVVTQEPSLTVSPGGTVTLTCGSSTGPVTSTHYPYWFQQKPGQAPRTLIYDTANKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-7,IGLV7-46,IGHV3-7,IGLV7-46
3,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTCAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACCATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-7*01,AGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACCATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAG,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS,VQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAXXXXSSWFDYWGQGTLVTVSS,AGWAGSSWFDY,TCTGGCACCAGGGGTCCCTTCCAATATCAGCACCATGGCCTGGACTCCTCTCTTTCTGTTCCTCCTCACTTGCTGCCCAGGGTCCAATTCCCAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGAGCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGACAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV7-46*01,CAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGAGCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGACAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAG,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSTHYPYWFQQKPGQAPRTLIYDTDNKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQKPGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLSGAQPEDEAEYYCLLSYSGARXXXVVFGGGTKLTVL,LLSYSGPRGPRVV,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS[SEP]QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSTHYPYWFQQKPGQAPRTLIYDTDNKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-7,IGLV7-46,IGHV3-7,IGLV7-46
4,AGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGCCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCTCCTTTAGTAAATATTGGATGGGTTGGGTCCGCCAGGCTCCAGGGAAGGGACTGGAGTGGGTGGCCAACATAAAGCAAGATGGAAGTGAGAAGTTGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTCTATCTGCAAATGAGCAGCCTGAGAGTCGACGACACGGCTCTGTATTACTGTGCGGGCTGGGCGGGCAGCAGCTGGTTCGACTACTGGGGCCAGGGAACCCAGGTCACCGTTAACTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGCCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCTCCTTTAGTAAATATTGGATGGGTTGGGTCCGCCAGGCTCCAGGGAAGGGACTGGAGTGGGTGGCCAACATAAAGCAAGATGGAAGTGAGAAGTTGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTCTATCTGCAAATGAGCAGCCTGAGAGTCGACGACACGGCTCTGTATTACTGTGCGGGCTGGGCGGGCAGCAGCTGGTTCGACTACTGGGGCCAGGGAACCCAGGTCACCGT,EVQLVESGGALVQPGGSLRLSCAASGFSFSKYWMGWVRQAPGKGLEWVANIKQDGSEKLYVDSVKGRFTISRDNAKNSLYLQMSSLRVDDTALYYCAGWAGSSWFDYWGQGTQVTV,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAXXXXSSWFDSWGQGTLVTV,AGWAGSSWFDY,TGAGCGCAGAAGGCAGGACTCGGGACAATCTTCATCATGACCTGCTCCCCTCTCCTCCTCACCCTTCTCATTCACTGCACAGGGTGCTGGGCCCAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGACAGAAGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAACATTGGGAATAATCATGTTTCCTGGTACCAGCAGCTCCCAGGAACAGCCCCCAAACTCCTCATTTCTGACAATAATAAGCGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGGCATCACCGGACTCCAGACTGGGGACGAGGCCGATTATTACTGCGGAACATGGGATAGCAGCCTGAGTGCTGGCGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-51*01,CAGTCTGTGTTGACGCAGCCGCCCTCAGTGTCTGCGGCCCCAGGACAGAAGGTCACCATCTCCTGCTCTGGAAGCAGCTCCAACATTGGGAATAATCATGTTTCCTGGTACCAGCAGCTCCCAGGAACAGCCCCCAAACTCCTCATTTCTGACAATAATAAGCGACCCTCAGGGATTCCTGACCGATTCTCTGGCTCCAAGTCTGGCACGTCAGCCACCCTGGGCATCACCGGACTCCAGACTGGGGACGAGGCCGATTATTACTGCGGAACATGGGATAGCAGCCTGAGTGCTGGCGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNHVSWYQQLPGTAPKLLISDNNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCGTWDSSLSAGVFGGGTKLTVL,QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNYVSWYQQLPGTAPKLLIYDNNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCGTWDSSLSAGVFGGGTKLTVL,GTWDSSLSAGV,EVQLVESGGALVQPGGSLRLSCAASGFSFSKYWMGWVRQAPGKGLEWVANIKQDGSEKLYVDSVKGRFTISRDNAKNSLYLQMSSLRVDDTALYYCAGWAGSSWFDYWGQGTQVTV[SEP]QSVLTQPPSVSAAPGQKVTISCSGSSSNIGNNHVSWYQQLPGTAPKLLISDNNKRPSGIPDRFSGSKSGTSATLGITGLQTGDEADYYCGTWDSSLSAGVFGGGTKLTVL,Memory-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGLV1-51,IGHV3-7,IGLV1-51
5,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTCAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACTATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-7*01,AGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCATCTTTAGTAGTTATAAGATGAGTTGGGTCCGGCAAATTCCAGGGAAGGGGCTGGAATGGGTGGCCAACATAATGCAAGATGGGAGTGGGCGGGACTATGTGGACTCTGTGCAGGGCCGATTCACTATCTCCAGAGATAACGCCAAGAATTCACTGTATCTGCAAATGAACAGCCTGAGAGTCGAGGACACGGCTGTGTATTACTGTGCGGGTTGGGCTGGCAGCAGTTGGTTTGACTACTGGGGCCAGGGAACCCTGGTCGTCGTCTCCTCAG,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS,VQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAXXXXSSWFDYWGQGTLVTVSS,AGWAGSSWFDY,TCTGGCACCAGGGGTCCCTTCCAATATCAGCACCATGGCCTGGACTCCTCTCTTTCTGTTCCTCCTCACTTGCTGCCCAGGGTCCAATTCCCAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGACCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGCCAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV7-46*01,CAGGCTGTGGTGACTCAGGAGCCCTCACTGACTGTGTCCCCAGGAGGGACAGTCACTCTCACCTGTGGCTCCAGCACTGGACCTGTCACCAGTACTCACTATCCCTACTGGTTCCAGCAGAAGCCTGGCCAAGCCCCCAGGACACTGATTTATGATACAGCCAACAAACACTCCTGGACACCTGCCCGATTCTCAGGCTCCCTCCTTGGGGACAAAGCTGCCCTGACCCTTTCGGGTGCGCAGCCTGAGGATGAAGCTGAATATTACTGCTTGCTCTCCTATAGCGGTCCTCGGGGTCCTCGGGTGGTATTCGGCGGAGGGACCAGGCTGACCGTCCTAG,QAVVTQEPSLTVSPGGTVTLTCGSSTGPVTSTHYPYWFQQKPGQAPRTLIYDTANKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQKPGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLSGAQPEDEAEYYCLLSYSGARXXXVVFGGGTKLTVL,LLSYSGPRGPRVV,VQLVESGGGLVQPGGSLRLSCAASGFIFSSYKMSWVRQIPGKGLEWVANIMQDGSGRDYVDSVQGRFTISRDNAKNSLYLQMNSLRVEDTAVYYCAGWAGSSWFDYWGQGTLVVVSS[SEP]QAVVTQEPSLTVSPGGTVTLTCGSSTGPVTSTHYPYWFQQKPGQAPRTLIYDTANKHSWTPARFSGSLLGDKAALTLSGAQPEDEAEYYCLLSYSGPRGPRVVFGGGTRLTVL,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-7,IGLV7-46,IGHV3-7,IGLV7-46

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAGGCAAGATGGAAGTGCGACATACTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTTTCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTATATTACTGTGCGAGAGAAGCCTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAGGCAAGATGGAAGTGCGACATACTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTTTCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTATATTACTGTGCGAGAGAAGCCTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMSWVRQAPGKGLEWVANIRQDGSATYYVDSVKGRFTISRDNAKNSLFLQMNSLRAEDTAVYYCAREAYWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXYWGQGTLVTVSS,AREAY,ACTGTGGGGGTAAGAGGTTGTGTCCACCATGGCCTGGACTCCTCTCCTCCTCCTGTTCCTCTCTCACTGCACAGGTTCCCTCTCGCAGGCTGTGCTGACTCAGCCGTCTTCCCTCTCTGCATCTCCTGGAGCATCAGCCAGTCTCACCTGCACCTTGCGCAGTGGCATCAATGTTGGTTCCTACAGGATATACTGGTACCAGCAGAAGCCAGGGAGTCCTCCCCAGTATCTCCTGACGTACAAGTCAGACTCAGATAAGCAGCAGGGCTCTGGAGTCCCCAGCCGCTTCTCTGGATCCAAAGATGCTTCGGCCAATGCAGGGATTTTACTCATCTCTGGGCTCCAGTCTGAGGATGAGGCTGACTATTACTGTATGATTTGGCACAGCAGCGCTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV5-45*03,CAGGCTGTGCTGACTCAGCCGTCTTCCCTCTCTGCATCTCCTGGAGCATCAGCCAGTCTCACCTGCACCTTGCGCAGTGGCATCAATGTTGGTTCCTACAGGATATACTGGTACCAGCAGAAGCCAGGGAGTCCTCCCCAGTATCTCCTGACGTACAAGTCAGACTCAGATAAGCAGCAGGGCTCTGGAGTCCCCAGCCGCTTCTCTGGATCCAAAGATGCTTCGGCCAATGCAGGGATTTTACTCATCTCTGGGCTCCAGTCTGAGGATGAGGCTGACTATTACTGTATGATTTGGCACAGCAGCGCTTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAVLTQPSSLSASPGASASLTCTLRSGINVGSYRIYWYQQKPGSPPQYLLTYKSDSDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAWVFGGGTKLTVL,QAVLTQPSSLSASPGASASLTCTLRSGINVGTYRIYWYQQKPGSPPQYLLRYKSDSDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAWVFGGGTKLTVL,MIWHSSAWV,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMSWVRQAPGKGLEWVANIRQDGSATYYVDSVKGRFTISRDNAKNSLFLQMNSLRAEDTAVYYCAREAYWGQGTLVTVSS[SEP]QAVLTQPSSLSASPGASASLTCTLRSGINVGSYRIYWYQQKPGSPPQYLLTYKSDSDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAWVFGGGTKLTVL,Memory-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGLV5-45,IGHV3-7,IGLV5-45
2,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAGTTGGGGCTGAGCTGGGTCTTCCTTGTTGCTATATTAGAAGGTGTCCAGTGTGAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTACCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAGCCAGACGGAAGTGAAAAATACTACGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACTCCCAGAAGTCACTGTTTCTACAAATGAATAGCCTGAGAGACGAAGACACGGCCGTGTATTATTGTGCGAGAGAAGCCTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-7*03,GAGGTGCAACTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGTACCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAGCCAGACGGAAGTGAAAAATACTACGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACTCCCAGAAGTCACTGTTTCTACAAATGAATAGCCTGAGAGACGAAGACACGGCCGTGTATTATTGTGCGAGAGAAGCCTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSTYWMSWVRQAPGKGLEWVANIKPDGSEKYYVDSVKGRFTISRDNSQKSLFLQMNSLRDEDTAVYYCAREAYWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXYWGQGTLVTVSS,AREAY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGCTCCCAGGTGCCAAATGTGACATCCAGATGACCCAGTCTCCTTCCACCCTGTCTGCATCTATAGGAGACAGAGTCACCATCACTTGCCGGGCCAGTCAGAGTATTAATAACTGGTTGGCCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCAATAAGGCGTCTAGTTTAGAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGGACAGACTTCACTCTCACCATCAGCAGCCTGCAGCCTGATGATTTTGCAACTTATTTCTGCCAGCAGTTTAATGGTCATTTCGGCGGAGGGACCAGGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-5*03,GACATCCAGATGACCCAGTCTCCTTCCACCCTGTCTGCATCTATAGGAGACAGAGTCACCATCACTTGCCGGGCCAGTCAGAGTATTAATAACTGGTTGGCCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCAATAAGGCGTCTAGTTTAGAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGGACAGACTTCACTCTCACCATCAGCAGCCTGCAGCCTGATGATTTTGCAACTTATTTCTGCCAGCAGTTTAATGGTCATTTCGGCGGAGGGACCAGGGTGGAGATCAAAC,DIQMTQSPSTLSASIGDRVTITCRASQSINNWLAWYQQKPGKAPKLLINKASSLESGVPSRFSGSGSGTDFTLTISSLQPDDFATYFCQQFNGHFGGGTRVEIK,DIQMTQSPSTLSASVGDRVTITCRASQSISSWLAWYQQKPGKAPKLLIYKASSLESGVPSRFSGSGSGTEFTLTISSLQPDDFATYYCQQYNSYFGGGTKVEIK,QQFNGH,EVQLVESGGGLVQPGGSLRLSCAASGFTFSTYWMSWVRQAPGKGLEWVANIKPDGSEKYYVDSVKGRFTISRDNSQKSLFLQMNSLRDEDTAVYYCAREAYWGQGTLVTVSS[SEP]DIQMTQSPSTLSASIGDRVTITCRASQSINNWLAWYQQKPGKAPKLLINKASSLESGVPSRFSGSGSGTDFTLTISSLQPDDFATYFCQQFNGHFGGGTRVEIK,Memory-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV3-7,IGKV1-5,IGHV3-7,IGKV1-5


In [45]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases_jaffe_only = 0
total_groups_jaffe_only = length(filtered_groups_only_sev_subj_jaffe_only)

for group in filtered_groups_only_sev_subj_jaffe_only
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes_jaffe_only = unique(group.general_v_gene_light_no_para)
    if length(unique_genes_jaffe_only) == 1
        true_cases_jaffe_only += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true_jaffe_only = (true_cases_jaffe_only / total_groups_jaffe_only) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true_jaffe_only%")
fraction_true_jaffe_only

Percentage of groups where all entries have the same general_v_gene_light: 81.90045248868778%


81.90045248868778

# Data with Jaffe et al., 2022 excluded

In [46]:
mem_subj_df_no_jaffe = filter(g -> g.Author != "Jaffe et al., 2022", mem_subj_df)

# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
mem_subj_grouped_df_no_jaffe = groupby(mem_subj_df_no_jaffe, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
mem_subj_filtered_grouped_no_jaffe = filter(g -> nrow(g) > 1, mem_subj_grouped_df_no_jaffe)

# filter out every group that has the same entry in Subject
filtered_groups_only_sev_subj_no_jaffe = filter(g -> length(unique(g.Subject)) > 1, mem_subj_filtered_grouped_no_jaffe)


Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,AGCTCTGGGAGAGGAGCCCCAGCCTTGGGATTCCCAAGTGTTTTTATTCAGTGATCAGGACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGATTTTCCTTGCTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCACATGGTGGAATCTGGGGGAGGCTTGGTAAAGCCTGGGGGGTCCCTTAGCCTCTCCTGTGCAGCCTCTGAATTCACTTTCAGTAAGGCCTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGTTGGAATGGGTTGGCCGTATTAAAAGAAAAATTGATGGTGAGACAACAGACTACGCTGCACCCGTGAGAGGCAGATTCACCATCTCAAGAGATGATTCAAAAAACACTCTGTATCTACACATGAACAGCCTGAGAACCGAGGACACAGCCGTATATTACTGTGCCACAGGTTTGATTAATGCTTTTGATATCTGGGGCCAAGGGACACTGGTCACCGTCTCTTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-15*01,GAGGTGCACATGGTGGAATCTGGGGGAGGCTTGGTAAAGCCTGGGGGGTCCCTTAGCCTCTCCTGTGCAGCCTCTGAATTCACTTTCAGTAAGGCCTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGTTGGAATGGGTTGGCCGTATTAAAAGAAAAATTGATGGTGAGACAACAGACTACGCTGCACCCGTGAGAGGCAGATTCACCATCTCAAGAGATGATTCAAAAAACACTCTGTATCTACACATGAACAGCCTGAGAACCGAGGACACAGCCGTATATTACTGTGCCACAGGTTTGATTAATGCTTTTGATATCTGGGGCCAAGGGACACTGGTCACCGTCTCTTCAG,EVHMVESGGGLVKPGGSLSLSCAASEFTFSKAWMTWVRQAPGKGLEWVGRIKRKIDGETTDYAAPVRGRFTISRDDSKNTLYLHMNSLRTEDTAVYYCATGLINAFDIWGQGTLVTVSS,EVQLVESGGGLVKPGGSLRLSCAASGFTFSNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTTXXINAFDIWGQGTMVTVSS,ATGLINAFDI,GGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTGTGGGTCTCTTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGCTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAAGTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGCCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCTGCTCATATACACTCACCAATACTGTGGTATTCGGCGGCGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTGTGGGTCTCTTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGCTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAAGTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGCCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCTGCTCATATACACTCACCAATACTGTGGTATTCGGCGGCGGGACCAAGCTGACCGTCCTAG,QSALTQPASVCGSLGQSITISCTGTSSDVGAYNYVSWYQQHPGKAPKVMIYDVSNRPSGVSNRFSGSKSANTASLTISGLQAEDEADYYCCSYTLTNTVVFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTVVFGGGTKLTVL,CSYTLTNTVV,EVHMVESGGGLVKPGGSLSLSCAASEFTFSKAWMTWVRQAPGKGLEWVGRIKRKIDGETTDYAAPVRGRFTISRDDSKNTLYLHMNSLRTEDTAVYYCATGLINAFDIWGQGTLVTVSS[SEP]QSALTQPASVCGSLGQSITISCTGTSSDVGAYNYVSWYQQHPGKAPKVMIYDVSNRPSGVSNRFSGSKSANTASLTISGLQAEDEADYYCCSYTLTNTVVFGGGTKLTVL,Memory-B-Cells,,human,Donor-1,"Phad et al., 2022",no,IGHV3-15,IGLV2-14,IGHV3-15,IGLV2-14
2,AGCTCTGGGAGAGGAGCCCCAGCCTTGGGATTCCCAGGTCTTTTCATTCAGTGATCAGGACTGAACACAGAGGACTCACCATGGACTTTGGGCTGAACTGGATTTTCCTTGCTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTAAAGCCTGGGGGGTCCCTTAGACTCTCCTGTGCAGCCTCTGGATTCACTTTCAGTAGTGTCTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGTGTTAAAAGGAAAACTGATGGTGAGACAACAGACTACGCTGCACCCGTGAAAGGCAGATTCACCATCTCAAGAGATGATTTAAAAAACACGCTGTATCTGCAAATGAACAGCCTGAAAACCGAGGACACAGCCGTGTATTACTGTGCCACAGGTCTCATCAATGCTTTTGATATCTGGGGCCAAGGGACAGTGGTCACCGTCTCTTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-15*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTAAAGCCTGGGGGGTCCCTTAGACTCTCCTGTGCAGCCTCTGGATTCACTTTCAGTAGTGTCTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTGGCCGTGTTAAAAGGAAAACTGATGGTGAGACAACAGACTACGCTGCACCCGTGAAAGGCAGATTCACCATCTCAAGAGATGATTTAAAAAACACGCTGTATCTGCAAATGAACAGCCTGAAAACCGAGGACACAGCCGTGTATTACTGTGCCACAGGTCTCATCAATGCTTTTGATATCTGGGGCCAAGGGACAGTGGTCACCGTCTCTTCAG,EVQLVESGGGLVKPGGSLRLSCAASGFTFSSVWMTWVRQAPGKGLEWVGRVKRKTDGETTDYAAPVKGRFTISRDDLKNTLYLQMNSLKTEDTAVYYCATGLINAFDIWGQGTVVTVSS,EVQLVESGGGLVKPGGSLRLSCAASGFTFSNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTTXXXXAFDIWGQGTMVTVSS,ATGLINAFDI,ATACGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAATTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAGCCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCTGCTCATATAGAAGCACCATCTCTGTAGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAATTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAGCCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCTGCTCATATAGAAGCACCATCTCTGTAGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSSRFSGSKSGNTASLTISGLQAEDEADYYCCSYRSTISVVFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTXVFGGGTKLTVL,CSYRSTISVV,EVQLVESGGGLVKPGGSLRLSCAASGFTFSSVWMTWVRQAPGKGLEWVGRVKRKTDGETTDYAAPVKGRFTISRDDLKNTLYLQMNSLKTEDTAVYYCATGLINAFDIWGQGTVVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSSRFSGSKSGNTASLTISGLQAEDEADYYCCSYRSTISVVFGGGTKLTVL,Memory-B-Cells,,human,Donor-2,"Phad et al., 2022",no,IGHV3-15,IGLV2-14,IGHV3-15,IGLV2-14

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String,String7,String15,String31,String7,String15,String15,String,String
1,ACTCTGCTGAAGAAAACCAGCCCTGCAGCTCTGGGAGAGGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAACTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCAAGTATTAATAATGATGGGAGTAGGACAGATTATGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACCGCTGTGTATTACTGTGCAACAGTATTTGAGTACTGGGGCCAGGGAATTCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-74*01,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAACTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCAAGTATTAATAATGATGGGAGTAGGACAGATTATGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACCGCTGTGTATTACTGTGCAACAGTATTTGAGTACTGGGGCCAGGGAATTCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMHWVRQAPGKGLVWVSSINNDGSRTDYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGILVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARXFDYWGQGTLVTVSS,ATVFEY,TGGGCTCCAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACCCTCACCTGCACTGGGAACAGCAACAATGTTGGCAACCAAGGAGCATCTTGGCTGCAGCATCACCAGGGCCACCCTCCCAAAGTCCTATCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTCTCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGTCTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACACCAGCCTCAATGCCGGGCTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACCCTCACCTGCACTGGGAACAGCAACAATGTTGGCAACCAAGGAGCATCTTGGCTGCAGCATCACCAGGGCCACCCTCCCAAAGTCCTATCCTACAGGAATAACAACCGGCCCTCAGGGATCTCAGAGAGATTCTCTGCATCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGTCTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACACCAGCCTCAATGCCGGGCTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGASWLQHHQGHPPKVLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDTSLNAGLFGGGTKLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSAXXFGGGTKLTVL,SAWDTSLNAGL,EVQLVESGGGLVQPGGSLRLSCAASGFTFSNYWMHWVRQAPGKGLVWVSSINNDGSRTDYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGILVTVSS[SEP]QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGASWLQHHQGHPPKVLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDTSLNAGLFGGGTKLTVL,Memory-B-Cells,SARS-COV-2,human,S-CoV13,"Sokal et al, 2021",51,IGHV3-74,IGLV10-54,IGHV3-74,IGLV10-54
2,GGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCGGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGAGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAACTACTGGATCCACTGGGTCCGCCAAGCGCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTGATACTGATGGGAGTGGCACAAGTTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTATTGTGCAACCGTCTTTGAATATTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCAGCTGTCCTACAGTCCTCAGGA,H,IGHV3-74*01,GAGGTGCGGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGAGTCCCTGAGACTCTCCTGTTCAGCCTCTGGATTCACCTTCAGTAACTACTGGATCCACTGGGTCCGCCAAGCGCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTGATACTGATGGGAGTGGCACAAGTTACGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGGTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTATTGTGCAACCGTCTTTGAATATTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAG,EVRLVESGGGLVQPGESLRLSCSASGFTFSNYWIHWVRQAPGKGLVWVSRIDTDGSGTSYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGALVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAXXFDYWGQGTLVTVSS,ATVFEY,GGGGAAAACAGAGCTTCAGCAAGCATAGTGGGAATCTGCACCATGCCCTGGGCTCTGCTCCTCCTGACCCTCCTCACTCACTCTGCAGTGTCAGTGGTCCAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACACTCACCTGCACTGGGGACAGCAACAATGTTGGCCGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAAGCCCTATCCTACAGGGATAACAACCGGCCCTCAGACATCTCAGAGAGATTCTCTGCGTCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACATCAGTCTCAATGCTGTGGTCTTCGGCGGAGGGACCACGCTGACCGTCTTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV10-54*04,CAGGCAGGGCTGACTCAGCCACCCTCGGTGTCCAAGGGCTTGAGACAGACCGCCACACTCACCTGCACTGGGGACAGCAACAATGTTGGCCGCCAAGGAGCAGCTTGGCTGCAGCAGCACCAGGGCCACCCTCCCAAAGCCCTATCCTACAGGGATAACAACCGGCCCTCAGACATCTCAGAGAGATTCTCTGCGTCCAGGTCAGGAAACACAGCCTCCCTGACCATTACTGGACTCCAGCCTGAGGACGAGGCTGACTATTACTGCTCAGCATGGGACATCAGTCTCAATGCTGTGGTCTTCGGCGGAGGGACCACGCTGACCGTCTTAG,QAGLTQPPSVSKGLRQTATLTCTGDSNNVGRQGAAWLQQHQGHPPKALSYRDNNRPSDISERFSASRSGNTASLTITGLQPEDEADYYCSAWDISLNAVVFGGGTTLTVL,QAGLTQPPSVSKGLRQTATLTCTGNSNNVGNQGAAWLQQHQGHPPKLLSYRNNNRPSGISERFSASRSGNTASLTITGLQPEDEADYYCSAWDSSLSAVVFGGGTKLTVL,SAWDISLNAVV,EVRLVESGGGLVQPGESLRLSCSASGFTFSNYWIHWVRQAPGKGLVWVSRIDTDGSGTSYADSVKGRFTISRDNAKNTVYLQMNSLRAEDTAVYYCATVFEYWGQGALVTVSS[SEP]QAGLTQPPSVSKGLRQTATLTCTGDSNNVGRQGAAWLQQHQGHPPKALSYRDNNRPSDISERFSASRSGNTASLTITGLQPEDEADYYCSAWDISLNAVVFGGGTTLTVL,Memory-B-Cells,,human,Donor-1,"Phad et al., 2022",no,IGHV3-74,IGLV10-54,IGHV3-74,IGLV10-54


In [47]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases_no_jaffe = 0
total_groups_no_jaffe = length(filtered_groups_only_sev_subj_no_jaffe)

for group in filtered_groups_only_sev_subj_no_jaffe
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes_no_jaffe = unique(group.general_v_gene_light_no_para)
    if length(unique_genes_no_jaffe) == 1
        true_cases_no_jaffe += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true_no_jaffe = (true_cases_no_jaffe / total_groups_no_jaffe) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true_no_jaffe%")
fraction_true_no_jaffe

Percentage of groups where all entries have the same general_v_gene_light: 51.358277806253206%


51.358277806253206

## Naive B Cells

In [48]:
file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/OAS_data_grouped_by_BTypes/extra_cols/BType_Naive-B-Cells_extra_cols.csv" 
naive_subj_df = CSV.read(file_path, DataFrame);

print(naive_subj_df.Subject[1:100])

String15["Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "Donor-4", "

In [49]:
# Filter and print entries with "D" in their name for each column
println("Entries with 'D' in general_v_gene_heavy:")
println(filter(row -> occursin("D", row[:general_v_gene_heavy]), naive_subj_df)[:, :general_v_gene_heavy])

println("Entries with 'D' in general_v_gene_light:")
println(filter(row -> occursin("D", row[:general_v_gene_light]), naive_subj_df)[:, :general_v_gene_light])


Entries with 'D' in general_v_gene_heavy:
String15["IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", "IGHV3-64D", 

In [50]:

# Add the new columns
naive_subj_df[!, :general_v_gene_heavy_no_para] = replace.(naive_subj_df.general_v_gene_heavy, r"D" => "")
naive_subj_df[!, :general_v_gene_light_no_para] = replace.(naive_subj_df.general_v_gene_light, r"D" => "")

# Display the modified DataFrame
println(first(naive_subj_df, 10))  # Print the first 10 rows for verification

[1m10×25 DataFrame[0m
[1m Row [0m│[1m sequence_heavy                    [0m[1m locus_heavy [0m[1m v_call_heavy [0m[1m sequence_alignment_heavy          [0m[1m sequence_alignment_aa_heavy       [0m[1m germline_alignment_aa_heavy       [0m[1m cdr3_aa_heavy                [0m[1m sequence_light                    [0m[1m locus_light [0m[1m v_call_light [0m[1m sequence_alignment_light          [0m[1m sequence_alignment_aa_light       [0m[1m germline_alignment_aa_light       [0m[1m cdr3_aa_light [0m[1m sequence_alignment_heavy_sep_light [0m[1m BType         [0m[1m Disease  [0m[1m Species [0m[1m Subject  [0m[1m Author             [0m[1m Age   [0m[1m general_v_gene_heavy [0m[1m general_v_gene_light [0m[1m general_v_gene_heavy_no_para [0m[1m general_v_gene_light_no_para [0m
     │[90m String                            [0m[90m String1     [0m[90m String15     [0m[90m String                            [0m[90m String                    

In [51]:
# Filter and print entries with "D" in their name for each column
println("Entries with 'D' in general_v_gene_heavy:")
println(filter(row -> occursin("D", row[:general_v_gene_heavy_no_para]), naive_subj_df)[:, :general_v_gene_heavy_no_para])

println("Entries with 'D' in general_v_gene_light:")
filter(row -> occursin("D", row[:general_v_gene_light_no_para]), naive_subj_df)[:, :general_v_gene_light_no_para]


Entries with 'D' in general_v_gene_heavy:
String[]
Entries with 'D' in general_v_gene_light:


String[]

In [52]:
naive_subjects_counts = countmap(naive_subj_df.Subject)

Dict{String15, Int64} with 5 entries:
  "Donor-4"   => 183974
  "Patient-1" => 1896
  "Donor-2"   => 149869
  "Donor-1"   => 169841
  "Donor-3"   => 161327

In [53]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
naive_subj_grouped_df = groupby(naive_subj_df, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
naive_subj_filtered_grouped = filter(g -> nrow(g) > 1, naive_subj_grouped_df)

# Step 2: Calculate the percentage of identical genes for each group
percentages = []

for group in naive_subj_filtered_grouped
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light_no_para"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    
    push!(percentages, percentage_identical)  # Store the percentage
end

# Step 3: Calculate the average percentage
average_percentage = mean(percentages)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 98.2537563123416%


98.2537563123416

In [54]:
group_sizes_naive_subj = [nrow(group) for group in naive_subj_filtered_grouped]

size_counts_naive_subj = combine(groupby(DataFrame(size = group_sizes_naive_subj), :size), nrow => :count)

println(size_counts_naive_subj)

[1m44×2 DataFrame[0m
[1m Row [0m│[1m size  [0m[1m count [0m
     │[90m Int64 [0m[90m Int64 [0m
─────┼──────────────
   1 │     2   7924
   2 │     3   1079
   3 │     4    473
   4 │     5    269
   5 │     6    159
   6 │     7    109
   7 │     8     73
   8 │     9     54
   9 │    10     46
  10 │    11     35
  11 │    12     23
  12 │    13     26
  13 │    14     12
  14 │    15      8
  15 │    16     10
  16 │    17     13
  17 │    18      5
  18 │    19      6
  19 │    20      3
  20 │    21      9
  21 │    22      5
  22 │    23      2
  23 │    24      1
  24 │    25      1
  25 │    26      5
  26 │    27      5
  27 │    28      2
  28 │    30      1
  29 │    31      1
  30 │    32      3
  31 │    34      1
  32 │    35      2
  33 │    36      2
  34 │    37      2
  35 │    38      1
  36 │    41      1
  37 │    45      1
  38 │    48      2
  39 │    50      1
  40 │    57      2
  41 │    61      1
  42 │    65      1
  43 │    90      1
  44 │   122

In [55]:
# filter out every group that has the same entry in Subject
naive_filtered_groups_only_sev_subj = filter(g -> length(unique(g.Subject)) > 1, naive_subj_filtered_grouped)

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String15,String7,String15,String31,Int64,String15,String15,String,String
1,AGCTCTGGGAGAAGAGCCCCAGCCCCAGAATTCCCAGGAGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTATAAAAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCAAGCCTGGAGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTGACTACTACATGAGCTGGATCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTAGTAGTAGTAGTAGTTACACAAACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGAGATCTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-11*06,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCAAGCCTGGAGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTGACTACTACATGAGCTGGATCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTAGTAGTAGTAGTAGTTACACAAACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGAGATCTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLEWVSYISSSSSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARDLDYWGQGTLVTVSS,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLEWVSYISSSSSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXDYWGQGTLVTVSS,ARDLDY,TGGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGGGGATATGTCTTCGGAACTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGGGGATATGTCTTCGGAACTGGGACCAAGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSRGYVFGTGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSXXYVFGTGTKVTVL,SSYTSSRGYV,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLEWVSYISSSSSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARDLDYWGQGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSRGYVFGTGTKVTVL,Naive-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-11,IGLV2-14,IGHV3-11,IGLV2-14
2,AGCTCTGGGAGAAGAGCCCCAGCCCCAGAATTCCCAGGAGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTATAAAAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCAAGCCTGGAGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTGACTACTACATGAGCTGGATCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTAGTAGTAGTGGTAGTACCATATACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGGGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGATCTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-11*01,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCAAGCCTGGAGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTGACTACTACATGAGCTGGATCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTTTCATACATTAGTAGTAGTGGTAGTACCATATACTACGCAGACTCTGTGAAGGGCCGATTCACCATCTCCAGGGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGATCTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLEWVSYISSSGSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARDLDYWGQGTLVTVSS,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLEWVSYISSSGSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXDYWGQGTLVTVSS,ARDLDY,GAGGCTGGTCAGACTTTGTGCAGGAATCAGACCCAGTCAGGACACAGCATGGACATGAGAGTCCTCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGTTTCCCAGGTGCCAGATGTGACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGGCATTAGCAATTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTAAGTCCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTATTACTGCCAACAGTATAATAGTTACCCTCGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-16*02,GACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGGCATTAGCAATTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTAAGTCCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTATTACTGCCAACAGTATAATAGTTACCCTCGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPRTFGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPRTFGQGTKVEIK,QQYNSYPRT,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMSWIRQAPGKGLEWVSYISSSGSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARDLDYWGQGTLVTVSS[SEP]DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPRTFGQGTKVEIK,Naive-B-Cells,SARS-COV-2,human,Donor-2,"Jaffe et al., 2022",35,IGHV3-11,IGKV1-16,IGHV3-11,IGKV1-16

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String15,String7,String15,String31,Int64,String15,String15,String,String
1,ACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTCCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTAGTTACTACTGGAGCTGGATCCGGCAGCCCGCCGGGAAGGGACTGGAGTGGATTGGGCGTATCTATACCAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGAGGGGGATCCTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGCACCCACCAAGGCTCCGGATGTGTTCCCCATCATATCAGGGTGCAGACACCCAAAGGATAACAGCCCTGTGGTCCTGGCATGCTTGATAACTGGGTACCACC,H,IGHV4-61*02,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTAGTTACTACTGGAGCTGGATCCGGCAGCCCGCCGGGAAGGGACTGGAGTGGATTGGGCGTATCTATACCAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGAGGGGGATCCTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGGSYYYYYGMDVWGQGTTVTVSS,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGGXYYYYYGMDVWGQGTTVTVSS,ARGGSYYYYYGMDV,GGCTGGGGTCTCAGGAGGCAGCACTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTCCTCCTCAGCCTCCTCACTCAGGGCACAGGATCCTGGGCTCAGTCTGCCCTGACTCAGCCTCGCTCAGTGTCCGGGTCTCCTGGACAGTCAGTCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAAGCGGCCCTCAGGGGTCCCTGATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGATGAGGCTGATTATTACTGCTGCTCATATGCAGGCAGCTATGTCTTCGGAACTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-11*01,CAGTCTGCCCTGACTCAGCCTCGCTCAGTGTCCGGGTCTCCTGGACAGTCAGTCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAAGCGGCCCTCAGGGGTCCCTGATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGATGAGGCTGATTATTACTGCTGCTCATATGCAGGCAGCTATGTCTTCGGAACTGGGACCAAGGTCACCGTCCTAG,QSALTQPRSVSGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSKRPSGVPDRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSYVFGTGTKVTVL,QSALTQPRSVSGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSKRPSGVPDRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSYVFGTGTKVTVL,CSYAGSYV,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGGSYYYYYGMDVWGQGTTVTVSS[SEP]QSALTQPRSVSGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSKRPSGVPDRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSYVFGTGTKVTVL,Naive-B-Cells,SARS-COV-2,human,Donor-2,"Jaffe et al., 2022",35,IGHV4-61,IGLV2-11,IGHV4-61,IGLV2-11
2,ACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTCCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTAGTTACTACTGGAGCTGGATCCGGCAGCCCGCCGGGAAGGGACTGGAGTGGATTGGGCGTATCTATACCAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGGGGTGGGAGCTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV4-61*02,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTAGTTACTACTGGAGCTGGATCCGGCAGCCCGCCGGGAAGGGACTGGAGTGGATTGGGCGTATCTATACCAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGGGGTGGGAGCTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGGSYYYYYGMDVWGQGTTVTVSS,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXGSYYYYYGMDVWGQGTTVTVSS,ARGGSYYYYYGMDV,TCTGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCTCTGGATCCAGTGGGGATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGTAATGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAACTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-28*01,GATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGTAATGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAACTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPLTFGGGTKVEIK,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPLTFGGGTKVEIK,MQALQTPLT,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGGSYYYYYGMDVWGQGTTVTVSS[SEP]DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPLTFGGGTKVEIK,Naive-B-Cells,SARS-COV-2,human,Donor-1,"Jaffe et al., 2022",45,IGHV4-61,IGKV2-28,IGHV4-61,IGKV2-28


In [56]:
# Step 2: Calculate the percentage of identical genes for each group
percentages = []

for group in naive_filtered_groups_only_sev_subj
    # Count occurrences of each unique gene in `general_v_gene_light`
    gene_counts = combine(groupby(group, "general_v_gene_light_no_para"), nrow => :count)
    
    # Calculate percentage of the most common gene
    most_common_count = maximum(gene_counts.count)
    total_count = sum(gene_counts.count)
    percentage_identical = (most_common_count / total_count) * 100
    
    push!(percentages, percentage_identical)  # Store the percentage
end

# Step 3: Calculate the average percentage
average_percentage = mean(percentages)

println("Average percentage of identical genes across groups: $average_percentage%")
average_percentage

Average percentage of identical genes across groups: 57.11544795783927%


57.11544795783927

In [57]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases = 0
total_groups = length(naive_filtered_groups_only_sev_subj)

for group in naive_filtered_groups_only_sev_subj
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes = unique(group.general_v_gene_light)
    if length(unique_genes) == 1
        true_cases += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true = (true_cases / total_groups) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true%")
fraction_true

Percentage of groups where all entries have the same general_v_gene_light: 13.83399209486166%


13.83399209486166

# Unsorted B Cells (as "control")

In [58]:
uns_subj_df = filter(row -> row.BType == "Unsorted-B-Cells", subjects_df);

In [59]:

# Extract up to the first two segments (e.g., "IGKV2-30" from "IGKV2-30*01")
uns_subj_df[!, :general_v_gene_heavy] = replace.(uns_subj_df.v_call_heavy, r"(^[^*]+?)(?:\*.*)?$" => s"\1");
uns_subj_df[!, :general_v_gene_light] = replace.(uns_subj_df.v_call_light, r"(^[^*]+?)(?:\*.*)?$" => s"\1");

unique_btypes = unique(uns_subj_df.BType)

println("unique BTypes: ", unique_btypes)


unique BTypes: ["Unsorted-B-Cells"]


In [60]:
# Filter and print entries with "D" in their name for each column
println("Entries with 'D' in general_v_gene_heavy:")
println(filter(row -> occursin("D", row[:general_v_gene_heavy]), uns_subj_df)[:, :general_v_gene_heavy])

println("Entries with 'D' in general_v_gene_light:")
println(filter(row -> occursin("D", row[:general_v_gene_light]), uns_subj_df)[:, :general_v_gene_light])


Entries with 'D' in general_v_gene_heavy:
["IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-43D", "IGHV3-4

Excessive output truncated after 524288 bytes.

, "IGKV3D-15", "IGKV1D-12", "IGKV3D-20", "IGKV1D-8", "IGKV1D-12", "IGKV2D-29", "IGKV1D-12", "IGKV3D-20", "IGKV2D-29", "IGKV1D-8", "IGKV6D-21", "IGKV3D-20", "IGKV1D-16", "IGKV2D-29", "IGKV3D-15", "IGKV3D-15", "IGKV6D-21", "IGKV2D-29", "IGKV3D-20", "IGKV2D-29", "IGKV3D-20", "IGKV3D-15", "IGKV2D-29", "IGKV3D-11", "IGKV1D-16", "IGKV6D-21", "IGKV1D-16", "IGKV3D-15", "IGKV3D-15", "IGKV3D-20", "IGKV2D-30", "IGKV3D-20", "IGKV2D-29", "IGKV2D-29", "IGKV3D-20", "IGKV3D-20", "IGKV2D-29", "IGKV3D-15", "IGKV3D-20", "IGKV1D-8", "IGKV3D-20", "IGKV1D-8", "IGKV3D-15", "IGKV1D-16", "IGKV1D-12", "IGKV2D-29", "IGKV3D-15", "IGKV1D-12", "IGKV1D-12", "IGKV1D-12", "IGKV2D-29", "IGKV3D-20", "IGKV3D-15", "IGKV3D-20", "IGKV3D-15", "IGKV6D-21", "IGKV2D-29", "IGKV1D-12", "IGKV6D-21", "IGKV2D-29", "IGKV1D-8", "IGKV3D-15", "IGKV3D-15", "IGKV1D-16", "IGKV1D-8", "IGKV1D-12", "IGKV2D-29", "IGKV3D-15", "IGKV2D-29", "IGKV1D-12", "IGKV3D-15", "IGKV1D-12", "IGKV1D-8", "IGKV1D-12", "IGKV1D-12", "IGKV2D-29", "IGKV6D-21", "IGK

In [61]:

# Add the new columns
uns_subj_df[!, :general_v_gene_heavy_no_para] = replace.(uns_subj_df.general_v_gene_heavy, r"D" => "")
uns_subj_df[!, :general_v_gene_light_no_para] = replace.(uns_subj_df.general_v_gene_light, r"D" => "")

# Display the modified DataFrame
println(first(uns_subj_df, 10))  # Print the first 10 rows for verification

In [62]:
# Filter and print entries with "D" in their name for each column
println("Entries with 'D' in general_v_gene_heavy_no_para:")
println(filter(row -> occursin("D", row[:general_v_gene_heavy_no_para]), uns_subj_df)[:, :general_v_gene_heavy_no_para])

println("Entries with 'D' in general_v_gene_light_no_para:")
filter(row -> occursin("D", row[:general_v_gene_light_no_para]), uns_subj_df)[:, :general_v_gene_light_no_para]


String[]

In [63]:
uns_subj_df_counts = countmap(uns_subj_df.Subject)

Dict{String15, Int64} with 30 entries:
  "Patient-6"    => 1605
  "390c"         => 1012
  "Patient-15"   => 3849
  "Patient-14"   => 4232
  "Patient-5"    => 3574
  "Subject-BCP4" => 425
  "Donor-45"     => 4103
  "Subject-BCP5" => 2935
  "Patient-2"    => 4162
  "None"         => 109103
  "Patient-1"    => 3340
  "Patient-12"   => 3314
  "Donor-2"      => 110575
  "Subject-BCP9" => 2978
  "Patient-7"    => 1584
  "Patient-3"    => 1812
  "Donor-3"      => 143053
  "Subject-BCP8" => 2888
  "Patient-13"   => 3482
  ⋮              => ⋮

In [64]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
uns_subj_df_grouped = groupby(uns_subj_df, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
uns_subj_df_grouped_filtered = filter(g -> nrow(g) > 1, uns_subj_df_grouped)


Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String,String,String7,String15,String31,String15,String,String,String,String
1,GATTTCCTTAAATTCAGGGTCCAGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXYSSXXXXXXYFDYWGQGTLVTVSS,ARDVGPYNSISPGRYYFDY,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSXXXYVFGTGTKVTVL,CSYAGSRILYV,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,Unsorted-B-Cells,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23
2,ATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXYSSXXXXXXYFDYWGQGTLVTVSS,ARDVGPYNSISPGRYYFDY,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSXXXYVFGTGTKVTVL,CSYAGSRILYV,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,Unsorted-B-Cells,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23
3,GGGTCCAGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXYSSXXXXXXYFDYWGQGTLVTVSS,ARDVGPYNSISPGRYYFDY,GTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCC,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSXXXYVFGTGTKVTVL,CSYAGSRILYV,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,Unsorted-B-Cells,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23
4,AAATTCAGGGTCCAGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXYSSXXXXXXYFDYWGQGTLVTVSS,ARDVGPYNSISPGRYYFDY,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSXXXYVFGTGTKVTVL,CSYAGSRILYV,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,Unsorted-B-Cells,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String,String,String7,String15,String31,String15,String,String,String,String
1,TGGGGGCATTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCACCTGTGGTTCTTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCAGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCACGAGGACTTGCTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCCATTATAGTGGGAACACCTATTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTATATTACTGTGCGAGACAGGCAAATTACGGCACCTATCCCTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-39*01,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCAGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCACGAGGACTTGCTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCCATTATAGTGGGAACACCTATTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTATATTACTGTGCGAGACAGGCAAATTACGGCACCTATCCCTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,QLQLQESGPGLVKPSETQSLTCTVSGGSISTRTCYWGWIRQPPGKGLEWIGSIHYSGNTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARQANYGTYPYYMDVWGKGTTVTVSS,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXYXXXXYYMDVWGKGTTVTVSS,ARQANYGTYPYYMDV,CTGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCTCTGGATCCAGTGGGGATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGAAGTGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGAAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAAGTCCTACCTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-28*01,GATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGAAGTGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGAAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAAGTCCTACCTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHRSGYNYLDWYLQKPGKSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQSPTFGQGTKVEIK,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPXFGQGTKVEIK,MQALQSPT,QLQLQESGPGLVKPSETQSLTCTVSGGSISTRTCYWGWIRQPPGKGLEWIGSIHYSGNTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARQANYGTYPYYMDVWGKGTTVTVSS[SEP]DIVMTQSPLSLPVTPGEPASISCRSSQSLLHRSGYNYLDWYLQKPGKSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQSPTFGQGTKVEIK,Unsorted-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV4-39,IGKV2-28,IGHV4-39,IGKV2-28
2,GGGGCATTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCACCTGTGGTTCTTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCAGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCACGAGGACTTGCTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCCATTATAGTGGGAACACCTATTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTATATTACTGTGCGAGACAGGCAAATTACGGCACCTATCCCTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-39*01,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCAGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCACGAGGACTTGCTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCCATTATAGTGGGAACACCTATTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTATATTACTGTGCGAGACAGGCAAATTACGGCACCTATCCCTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,QLQLQESGPGLVKPSETQSLTCTVSGGSISTRTCYWGWIRQPPGKGLEWIGSIHYSGNTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARQANYGTYPYYMDVWGKGTTVTVSS,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXYXXXXYYMDVWGKGTTVTVSS,ARQANYGTYPYYMDV,GATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCTCTGGATCCAGTGGGGATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGAAGTGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGAAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAAGTCCTACCTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-28*01,GATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGAAGTGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGAAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAAGTCCTACCTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHRSGYNYLDWYLQKPGKSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQSPTFGQGTKVEIK,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPXFGQGTKVEIK,MQALQSPT,QLQLQESGPGLVKPSETQSLTCTVSGGSISTRTCYWGWIRQPPGKGLEWIGSIHYSGNTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARQANYGTYPYYMDVWGKGTTVTVSS[SEP]DIVMTQSPLSLPVTPGEPASISCRSSQSLLHRSGYNYLDWYLQKPGKSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQSPTFGQGTKVEIK,Unsorted-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV4-39,IGKV2-28,IGHV4-39,IGKV2-28


In [65]:
group_sizes_uns_subj = [nrow(group) for group in uns_subj_df_grouped_filtered]

size_counts_uns_subj = combine(groupby(DataFrame(size = group_sizes_uns_subj), :size), nrow => :count)

println(size_counts_uns_subj)

In [66]:
# filter out every group that has the same entry in Subject
uns_filtered_groups_only_sev_subj = filter(g -> length(unique(g.Subject)) > 1, uns_subj_df_grouped_filtered)

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String,String,String7,String15,String31,String15,String,String,String,String
1,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGGATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTCTTTTAGAAGTTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGAGTCCCTGAGACTCTCCTGTGAAGCCTCTGGAATCACGTTCAGTAGCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAAAAAGATGGAAGTGAGACATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAACCTGAGAGTCGAGGACGCGGCTGTGTATTACTGTGGGAGAGGGAGTGGCTGGTTACAGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGAGTCCCTGAGACTCTCCTGTGAAGCCTCTGGAATCACGTTCAGTAGCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAAAAAGATGGAAGTGAGACATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAACCTGAGAGTCGAGGACGCGGCTGTGTATTACTGTGGGAGAGGGAGTGGCTGGTTACAGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGESLRLSCEASGITFSSYWMSWVRQAPGKGLEWVANIKKDGSETWYVDSVKGRFTISRDNAKNSLYLQMNNLRVEDAAVYYCGRGSGWLQDYWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXSGWXXDYWGQGTLVTVSS,GRGSGWLQDY,GAGCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTGTTTTCTCTAGCTCCAGTAATAAGAACTTCCTAGCTTGGTTCCAGAAGAAACCAGGGCAGCCTCCTAAGTTGCTAATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATCATAGTGGTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTGTTTTCTCTAGCTCCAGTAATAAGAACTTCCTAGCTTGGTTCCAGAAGAAACCAGGGCAGCCTCCTAAGTTGCTAATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATCATAGTGGTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPDSLAVSLGERATVNCKSSQSVFSSSSNKNFLAWFQKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYHSGPLTFGGGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLTFGGGTKVEIK,QQYHSGPLT,EVQLVESGGGLVQPGESLRLSCEASGITFSSYWMSWVRQAPGKGLEWVANIKKDGSETWYVDSVKGRFTISRDNAKNSLYLQMNNLRVEDAAVYYCGRGSGWLQDYWGQGTLVTVSS[SEP]DIVMTQSPDSLAVSLGERATVNCKSSQSVFSSSSNKNFLAWFQKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYHSGPLTFGGGTKVEIK,Unsorted-B-Cells,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV3-7,IGKV4-1,IGHV3-7,IGKV4-1
2,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGACCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGAGTCCGCCTCAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAAGAAAGATGGAAGTGAGAAATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATTTCCAGAGACAACGCCGAGAACTCACTGTTTCTGCAAATGGACAAACTGAGAGACGACGACACGGCTGTGTATTACTGCGGGAGGGGCAGTGGCTGGCTACAAGATTACTGGGGCCAGGGAATATCGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGAGTCCGCCTCAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAAGAAAGATGGAAGTGAGAAATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATTTCCAGAGACAACGCCGAGAACTCACTGTTTCTGCAAATGGACAAACTGAGAGACGACGACACGGCTGTGTATTACTGCGGGAGGGGCAGTGGCTGGCTACAAGATTACTGGGGCCAGGGAATATCGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGVRLSNYWMSWVRQAPGKGLEWVANMKKDGSEKWYVDSVKGRFTISRDNAENSLFLQMDKLRDDDTAVYYCGRGSGWLQDYWGQGISVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGSGWXXXYWGQGTLVTVSS,GRGSGWLQDY,GCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTATTTTATCCAGTTCCAACAATAAGAACTACTTAGCTTGGTTCCACAAGAAACCAGGACAGCCTCCTAAACTACTCATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATGGTGGTCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTATTTTATCCAGTTCCAACAATAAGAACTACTTAGCTTGGTTCCACAAGAAACCAGGACAGCCTCCTAAACTACTCATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATGGTGGTCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPDSLAVSLGERATVNCKSSQSILSSSNNKNYLAWFHKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAVYYCQQYYGGPLTFGGGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLTFGGGTKVEIK,QQYYGGPLT,EVQLVESGGGLVQPGGSLRLSCAASGVRLSNYWMSWVRQAPGKGLEWVANMKKDGSEKWYVDSVKGRFTISRDNAENSLFLQMDKLRDDDTAVYYCGRGSGWLQDYWGQGISVTVSS[SEP]DIVMTQSPDSLAVSLGERATVNCKSSQSILSSSNNKNYLAWFHKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAVYYCQQYYGGPLTFGGGTKVEIK,Unsorted-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50.0,IGHV3-7,IGKV4-1,IGHV3-7,IGKV4-1

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String,String,String7,String15,String31,String15,String,String,String,String
1,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAGTTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGTCTCTGGATTCACGTTTAGTGACTACTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAATGAAGATGGAAGTGAGAAATACTGTCTGGACGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGGAGATGTCAACTCGGGCGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-7*03,GAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGTCTCTGGATTCACGTTTAGTGACTACTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAATGAAGATGGAAGTGAGAAATACTGTCTGGAC---------GGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGGAGATGTCAACTCGGGCGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLAESGGGLVQPGGSLRLSCVVSGFTFSDYWMSWVRQAPGKGLEWVANMNEDGSEKYCLDGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGDVNSGDYWGQGTLVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXNXXDYWGQGTLVTVSS,ARGDVNSGDY,GATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCCAGTGGGGATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTACACAGTGATGGAAACATCTACTTGAATTGGCTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTCATACATAGGGTTTCTACCCGGGACTCTGGGGTCCCAGAAAGATTCAGCGGCAGTGGGTCAGGCACTAATTTCACACTGAGAATCAGCAGGGTGGAGGCTGAGGATGTTGGCGTTTATTACTGCATGCAAGGTAAACACTGGACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*02,GATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTACACAGTGATGGAAACATCTACTTGAATTGGCTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTCATACATAGGGTTTCTACCCGGGACTCTGGGGTCCCAGAAAGATTCAGCGGCAGTGGGTCAGGCACTAATTTCACACTGAGAATCAGCAGGGTGGAGGCTGAGGATGTTGGCGTTTATTACTGCATGCAAGGTAAACACTGGACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC,DLVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNIYLNWLQQRPGQSPRRLIHRVSTRDSGVPERFSGSGSGTNFTLRISRVEAEDVGVYYCMQGKHWTFGQGTKLEIK,DVVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWPFGQGTKLEIK,MQGKHWT,EVQLAESGGGLVQPGGSLRLSCVVSGFTFSDYWMSWVRQAPGKGLEWVANMNEDGSEKYCLDGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGDVNSGDYWGQGTLVTVSS[SEP]DLVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNIYLNWLQQRPGQSPRRLIHRVSTRDSGVPERFSGSGSGTNFTLRISRVEAEDVGVYYCMQGKHWTFGQGTKLEIK,Unsorted-B-Cells,SARS-COV-2,human,Patient-15,"Mor et al., 2021",51,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30
2,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACGCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGTTATTCTAGAAGGTGTCCAGTGTGAGGTGCAATTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGCAGGCTCTGGATTTATGTTTAGCGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGCGTGGCCATCACAGACCAAGAAGGAAATGAGAGATACTCTGTTCACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACGCCAAAAATTCTCTGTATTTGGAAATGCACAGCCTGAGAGCCGAAGACACGGCTCTATATTACTGTGCGAGAGGGGATGTCAATTCGGGGGACTATTGGGGCCAGGGAACCATGGTCACCGTCGCGTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAATTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGCAGGCTCTGGATTTATGTTTAGCGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGCGTGGCCATCACAGACCAAGAAGGAAATGAGAGATACTCTGTTCACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACGCCAAAAATTCTCTGTATTTGGAAATGCACAGCCTGAGAGCCGAAGACACGGCTCTATATTACTGTGCGAGAGGGGATGTCAATTCGGGGGACTATTGGGGCCAGGGAACCATGGTCACCGTCGCGTCAG,EVQLVESGGGLVQPGGSLRLSCAGSGFMFSDYWMTWVRQAPGKGLECVAITDQEGNERYSVHSVRGRFTISRDNAKNSLYLEMHSLRAEDTALYYCARGDVNSGDYWGQGTMVTVAS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXSGDYWGQGTLVTVSS,ARGDVNSGDY,GGGGACTGATCAGGGCTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCTCGTGGGTTTTTTGTGCTGACTCAGTCTCCACTCTCACTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGGGATGGATATACCTACCTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTTCTTTATAAGGTTTCTCACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAGAATCAGTAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAACATACTGGATGTTCGGCCAAGGGACCAAACTGGAGATCAAAGGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,TTGTGCTGACTCAGTCTCCACTCTCACTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGGGATGGATATACCTACCTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTTCTTTATAAGGTTTCTCACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAGAATCAGTAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAACATACTGGATGTTCGGCCAAGGGACCAAACTGGAGATCAAA,VLTQSPLSLPVTLGQPASISCRSNASLLDRDGYTYLNWFQQRPGQSPRRLLYKVSHRDSGVPDRFSGSGSGTDFTLRISRVEAEDVAVYYCMQATYWMFGQGTKLEIK,VMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWXFGQGTKVEIK,MQATYWM,EVQLVESGGGLVQPGGSLRLSCAGSGFMFSDYWMTWVRQAPGKGLECVAITDQEGNERYSVHSVRGRFTISRDNAKNSLYLEMHSLRAEDTALYYCARGDVNSGDYWGQGTMVTVAS[SEP]VLTQSPLSLPVTLGQPASISCRSNASLLDRDGYTYLNWFQQRPGQSPRRLLYKVSHRDSGVPDRFSGSGSGTDFTLRISRVEAEDVAVYYCMQATYWMFGQGTKLEIK,Unsorted-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30
3,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACGCACCATGGAATTGGGGCTGAACTGGGTTTTCCTTGTTGCTATTCTGGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGTAGCCTCTGGATTTATGTTTAGTGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACACAAACCAAGATGGGAGTGACAAGCACTATGTCTACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACACCGAAAATTCTCTGTTTCTGGAAATGCACAGCCTGAGACCCGAAGACACGGCTCTATATTATTGTGCGCGAGGGGATGTCAACTCGGGGGACTACTGGGGCCAGGGAACCATGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGTAGCCTCTGGATTTATGTTTAGTGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACACAAACCAAGATGGGAGTGACAAGCACTATGTCTACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACACCGAAAATTCTCTGTTTCTGGAAATGCACAGCCTGAGACCCGAAGACACGGCTCTATATTATTGTGCGCGAGGGGATGTCAACTCGGGGGACTACTGGGGCCAGGGAACCATGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCVASGFMFSDYWMTWVRQAPGKGLEWVANTNQDGSDKHYVYSVRGRFTISRDNTENSLFLEMHSLRPEDTALYYCARGDVNSGDYWGQGTMVTVSS,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXSGDYWGQGTLVTVSS,ARGDVNSGDY,CTGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCTCGTGGGTATTTTGTGATGACTCAGTCTCCTCTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGTGATGGGAACACCCACTTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTACTTTATAAGGTTTCTCGCCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAATATACTGGACGTTCGGCCAAGGGACCAAACTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,ATTTTGTGATGACTCAGTCTCCTCTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGTGATGGGAACACCCACTTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTACTTTATAAGGTTTCTCGCCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAATATACTGGACGTTCGGCCAAGGGACCAAACTGGAAATCAAAC,FVMTQSPLSLPVTLGQPASISCRSNASLLDSDGNTHLNWFQQRPGQSPRRLLYKVSRRDSGVPDRFSGSGSGTDFTLKISRVEAEDVAVYYCMQAIYWTFGQGTKLEIK,VVMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWTFGQGTKVEIK,MQAIYWT,EVQLVESGGGLVQPGGSLRLSCVASGFMFSDYWMTWVRQAPGKGLEWVANTNQDGSDKHYVYSVRGRFTISRDNTENSLFLEMHSLRPEDTALYYCARGDVNSGDYWGQGTMVTVSS[SEP]FVMTQSPLSLPVTLGQPASISCRSNASLLDSDGNTHLNWFQQRPGQSPRRLLYKVSRRDSGVPDRFSGSGSGTDFTLKISRVEAEDVAVYYCMQAIYWTFGQGTKLEIK,Unsorted-B-Cells,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30


In [67]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases_uns = 0
total_groups_uns = length(uns_filtered_groups_only_sev_subj)

for group in uns_filtered_groups_only_sev_subj
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes_uns = unique(group.general_v_gene_light_no_para)
    if length(unique_genes_uns) == 1
        true_cases_uns += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true_uns = (true_cases_uns / total_groups_uns) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true_uns%")
fraction_true_uns

64.45672191528546

## Test Set

In [68]:
test_set_file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/test_set/df_merged_final_test_set.csv"

df_test_set = CSV.read(test_set_file_path, DataFrame);

In [69]:
names(df_test_set)

33-element Vector{String}:
 "sequence_heavy"
 "locus_heavy"
 "v_call_heavy"
 "sequence_alignment_heavy"
 "sequence_alignment_aa_heavy"
 "germline_alignment_aa_heavy"
 "cdr3_aa_heavy"
 "sequence_light"
 "locus_light"
 "v_call_light"
 ⋮
 "BLOSUM_score"
 "similarity"
 "perplexity"
 "calculated_blosum"
 "calculated_similarity"
 "general_v_gene_heavy"
 "general_v_gene_light"
 "v_gene_heavy_family"
 "v_gene_light_family"

In [70]:
df_test_set = countmap(df_test_set.BType)

Dict{String31, Int64} with 6 entries:
  "Plasma-B-Cells"   => 6020
  "Memory-B-Cells"   => 26539
  "Unsorted-B-Cells" => 11237
  "Naive-B-Cells"    => 8019
  "Plasmablast"      => 263
  "RV+B-Cells"       => 60

In [71]:

# Extract up to the first two segments (e.g., "IGKV2-30" from "IGKV2-30*01")
df_test_set[!, :general_v_gene_heavy] = replace.(df_test_set.v_call_heavy, r"(^[^*]+?)(?:\*.*)?$" => s"\1");
df_test_set[!, :general_v_gene_light] = replace.(df_test_set.v_call_light, r"(^[^*]+?)(?:\*.*)?$" => s"\1");

# Add the new columns
df_test_set[!, :general_v_gene_heavy_no_para] = replace.(df_test_set.general_v_gene_heavy, r"D" => "")
df_test_set[!, :general_v_gene_light_no_para] = replace.(df_test_set.general_v_gene_light, r"D" => "")

# extract names of the BTypes
replace!(df_test_set.BType, "CD27-memory-and-Plasmablast/Plasma-B-Cells" => "CD27-memory-and-Plasmablast_Plasma-B-Cells")
replace!(df_test_set.BType, "Plasmablast/Plasma-B-Cells" => "Plasmablast_Plasma-B-Cells")

unique_btypes = unique(df_test_set.BType)

println("unique BTypes: ", unique_btypes)

# Group by BType
grouped_by_btype = groupby(df_test_set, :BType)

output_dir = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/test_set"

# Save each group as a separate CSV file
for (btype_group_id, btype_group) in enumerate(grouped_by_btype)
    # Extract the BType name (to use in the file name)
    btype_name = btype_group.BType[1]  # Assuming BType exists and is consistent within each group

    # print number of rows in each group
    println("Number of rows in group $btype_name: ", nrow(btype_group))
    
    # Create a sanitized file name (replace spaces or special characters if needed)
    file_name = "BType_$(replace(btype_name, r"\s" => "_"))_extra_cols.csv"
    
    # Write the group to a CSV file
    CSV.write(joinpath(output_dir, file_name), btype_group, writeheader=true)
end

println("Grouped data saved to CSV files.")

ErrorException: type Dict has no field v_call_heavy

## Memory B Cells test set

In [72]:
mem_test_set_file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/test_set/BType_Memory-B-Cells_extra_cols.csv"

mem_df_test_set = CSV.read(mem_test_set_file_path, DataFrame);

In [73]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
mem_df_test_set_grouped = groupby(mem_df_test_set, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy]);

# Step 1: Filter out groups with only one row
mem_df_test_set_grouped_filt = filter(g -> nrow(g) > 1, mem_df_test_set_grouped);

In [74]:
# filter out every group that has the same entry in Subject
mem_df_test_set_grouped_filt_subj = filter(g -> length(unique(g.Subject)) > 1, mem_df_test_set_grouped_filt)

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,sequence_alignment_aa_light_1,generated_sequence_light,input_heavy_sequence,BLOSUM_score,similarity,perplexity,calculated_blosum,calculated_similarity,general_v_gene_heavy,general_v_gene_light,v_gene_heavy_family,v_gene_light_family,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String7,String7,String7,String31,String3,String,String,String,Int64,Float64,Float64,Int64,Float64,String15,String15,String7,String7,String15,String15
1,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGGCTCTGGATTCCCCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCACATGATGGAAGTAATGAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTTCAGAGACAATTCCAAGAACACAATGTATCTGCAAATGAACAGCCTGAGAGCTGGGGACTCGGCTCTGTATTACTGTGCGAAAGAAGGTTACTATGGTTCGGGGAGTTTCCCAGATTACTGGGGCCAGGGAACCCTGATCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-30*18,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGGCTCTGGATTCCCCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCACATGATGGAAGTAATGAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTTCAGAGACAATTCCAAGAACACAATGTATCTGCAAATGAACAGCCTGAGAGCTGGGGACTCGGCTCTGTATTACTGTGCGAAAGAAGGTTACTATGGTTCGGGGAGTTTCCCAGATTACTGGGGCCAGGGAACCCTGATCACCGTCTCCTCAG,QVQLVESGGGVVQPGRSLRLSCAGSGFPFSSYGMHWVRQAPGKGLEWVAVISHDGSNEYYADSVKGRFTIFRDNSKNTMYLQMNSLRAGDSALYYCAKEGYYGSGSFPDYWGQGTLITVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXYYGSGSXXXYWGQGTLVTVSS,AKEGYYGSGSFPDY,AGGAATCAGACCCAGTCAGGACACAGCATGGACATGAGAGTCCTCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGTTTCCCAGGTGCCAGATGTGACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCATCATCACTTGTCGGGCGAGTCAGGGCATTGCCAATTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTAAGTCCCTAATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAATTTATTACTGCCAACAGTATAATTCTTACCCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-16*02,GACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCATCATCACTTGTCGGGCGAGTCAGGGCATTGCCAATTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTAAGTCCCTAATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAATTTATTACTGCCAACAGTATAATTCTTACCCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,DIQMTQSPSSLSASVGDRVIITCRASQGIANYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFAIYYCQQYNSYPWTFGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPWTFGQGTKVEIK,QQYNSYPWT,QVQLVESGGGVVQPGRSLRLSCAGSGFPFSSYGMHWVRQAPGKGLEWVAVISHDGSNEYYADSVKGRFTIFRDNSKNTMYLQMNSLRAGDSALYYCAKEGYYGSGSFPDYWGQGTLITVSS[SEP]DIQMTQSPSSLSASVGDRVIITCRASQGIANYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFAIYYCQQYNSYPWTFGQGTKVEIK,Memory-B-Cells,,human,Donor-2,"Phad et al., 2022",no,DIQMTQSPSSLSASVGDRVIITCRASQGIANYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFAIYYCQQYNSYPWTFGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWYQQKPGKVPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQKYNSAPFTFGPGTKVDIK,QVQLVESGGGVVQPGRSLRLSCAGSGFPFSSYGMHWVRQAPGKGLEWVAVISHDGSNEYYADSVKGRFTIFRDNSKNTMYLQMNSLRAGDSALYYCAKEGYYGSGSFPDYWGQGTLITVSS,485,86.9159,2.21785,485,86.9159,IGHV3-30,IGKV1-16,IGHV3,IGKV1,IGHV3-30,IGKV1-16
2,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTGTATCATATGATGGAAGTAATAAGTATTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTCTCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAGAGGGTTACTATGGTTCAGGGAGTTTCCCTGACTACTGGGGCCAGGGAACCCTGGTCAGCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-30*18,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTGTATCATATGATGGAAGTAATAAGTATTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTCTCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAGAGGGTTACTATGGTTCAGGGAGTTTCCCTGACTACTGGGGCCAGGGAACCCTGGTCAGCGTCTCCTCAG,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVVSYDGSNKYYADSVKGRFTISRDNSKNTLSLQMNSLRAEDTAVYYCAKEGYYGSGSFPDYWGQGTLVSVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXYYGSGSXXDYWGQGTLVTVSS,AKEGYYGSGSFPDY,AGGAATCAGACCCAGTCAGGACACAGCATGGACATGAGAGTCCTCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGTTTCCCAGGTGCCAGATGTGACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGCCATTGACACTTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTACGTCCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTACAGCCTGAAGATTTTGCAACTTATTACTGCCAACAGTATAAGAGTTACCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCGAGCGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-16*02,GACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGCCATTGACACTTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTACGTCCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTACAGCCTGAAGATTTTGCAACTTATTACTGCCAACAGTATAAGAGTTACCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATC,DIQMTQSPSSLSASVGDRVTITCRASQAIDTYLAWFQQKPGKAPTSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYKSYPLTFGGGTKVEI,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPLTFGGGTKVEI,QQYKSYPLT,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVVSYDGSNKYYADSVKGRFTISRDNSKNTLSLQMNSLRAEDTAVYYCAKEGYYGSGSFPDYWGQGTLVSVSS[SEP]DIQMTQSPSSLSASVGDRVTITCRASQAIDTYLAWFQQKPGKAPTSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYKSYPLTFGGGTKVEI,Memory-B-Cells,,human,Donor-3,"Jaffe et al., 2022",38,DIQMTQSPSSLSASVGDRVTITCRASQAIDTYLAWFQQKPGKAPTSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYKSYPLTFGGGTKVEI,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTLVFGGGTKLTVL,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVVSYDGSNKYYADSVKGRFTISRDNSKNTLSLQMNSLRAEDTAVYYCAKEGYYGSGSFPDYWGQGTLVSVSS,225,47.2727,2.13457,207,48.1818,IGHV3-30,IGKV1-16,IGHV3,IGKV1,IGHV3-30,IGKV1-16


## Naive B Cells test set

In [75]:
naive_test_set_file_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/coherence_analysis_in_oas_db/data/test_set/BType_Memory-B-Cells_extra_cols.csv"

naive_df_test_set = CSV.read(naive_test_set_file_path, DataFrame);

# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
naive_df_test_set_grouped = groupby(naive_df_test_set, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy]);

# Step 1: Filter out groups with only one row
naive_df_test_set_grouped_filt = filter(g -> nrow(g) > 1, naive_df_test_set_grouped);

# filter out every group that has the same entry in Subject
naive_df_test_set_grouped_filt_subj = filter(g -> length(unique(g.Subject)) > 1, naive_df_test_set_grouped_filt)

Row,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,sequence_alignment_aa_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,BType,Disease,Species,Subject,Author,Age,sequence_alignment_aa_light_1,generated_sequence_light,input_heavy_sequence,BLOSUM_score,similarity,perplexity,calculated_blosum,calculated_similarity,general_v_gene_heavy,general_v_gene_light,v_gene_heavy_family,v_gene_light_family,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String1,String15,String,String,String,String,String,String1,String15,String,String,String,String31,String,String15,String7,String7,String7,String31,String3,String,String,String,Int64,Float64,Float64,Int64,Float64,String15,String15,String7,String7,String15,String15
1,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGGCTCTGGATTCCCCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCACATGATGGAAGTAATGAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTTCAGAGACAATTCCAAGAACACAATGTATCTGCAAATGAACAGCCTGAGAGCTGGGGACTCGGCTCTGTATTACTGTGCGAAAGAAGGTTACTATGGTTCGGGGAGTTTCCCAGATTACTGGGGCCAGGGAACCCTGATCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-30*18,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGGCTCTGGATTCCCCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTATATCACATGATGGAAGTAATGAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATCTTCAGAGACAATTCCAAGAACACAATGTATCTGCAAATGAACAGCCTGAGAGCTGGGGACTCGGCTCTGTATTACTGTGCGAAAGAAGGTTACTATGGTTCGGGGAGTTTCCCAGATTACTGGGGCCAGGGAACCCTGATCACCGTCTCCTCAG,QVQLVESGGGVVQPGRSLRLSCAGSGFPFSSYGMHWVRQAPGKGLEWVAVISHDGSNEYYADSVKGRFTIFRDNSKNTMYLQMNSLRAGDSALYYCAKEGYYGSGSFPDYWGQGTLITVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXYYGSGSXXXYWGQGTLVTVSS,AKEGYYGSGSFPDY,AGGAATCAGACCCAGTCAGGACACAGCATGGACATGAGAGTCCTCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGTTTCCCAGGTGCCAGATGTGACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCATCATCACTTGTCGGGCGAGTCAGGGCATTGCCAATTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTAAGTCCCTAATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAATTTATTACTGCCAACAGTATAATTCTTACCCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-16*02,GACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCATCATCACTTGTCGGGCGAGTCAGGGCATTGCCAATTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTAAGTCCCTAATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAATTTATTACTGCCAACAGTATAATTCTTACCCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,DIQMTQSPSSLSASVGDRVIITCRASQGIANYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFAIYYCQQYNSYPWTFGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPWTFGQGTKVEIK,QQYNSYPWT,QVQLVESGGGVVQPGRSLRLSCAGSGFPFSSYGMHWVRQAPGKGLEWVAVISHDGSNEYYADSVKGRFTIFRDNSKNTMYLQMNSLRAGDSALYYCAKEGYYGSGSFPDYWGQGTLITVSS[SEP]DIQMTQSPSSLSASVGDRVIITCRASQGIANYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFAIYYCQQYNSYPWTFGQGTKVEIK,Memory-B-Cells,,human,Donor-2,"Phad et al., 2022",no,DIQMTQSPSSLSASVGDRVIITCRASQGIANYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFAIYYCQQYNSYPWTFGQGTKVEIK,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWYQQKPGKVPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQKYNSAPFTFGPGTKVDIK,QVQLVESGGGVVQPGRSLRLSCAGSGFPFSSYGMHWVRQAPGKGLEWVAVISHDGSNEYYADSVKGRFTIFRDNSKNTMYLQMNSLRAGDSALYYCAKEGYYGSGSFPDYWGQGTLITVSS,485,86.9159,2.21785,485,86.9159,IGHV3-30,IGKV1-16,IGHV3,IGKV1,IGHV3-30,IGKV1-16
2,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTGTATCATATGATGGAAGTAATAAGTATTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTCTCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAGAGGGTTACTATGGTTCAGGGAGTTTCCCTGACTACTGGGGCCAGGGAACCCTGGTCAGCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-30*18,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTGGCAGTTGTATCATATGATGGAAGTAATAAGTATTATGCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTCTCTGCAAATGAACAGCCTGAGAGCTGAGGACACGGCTGTGTATTACTGTGCGAAAGAGGGTTACTATGGTTCAGGGAGTTTCCCTGACTACTGGGGCCAGGGAACCCTGGTCAGCGTCTCCTCAG,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVVSYDGSNKYYADSVKGRFTISRDNSKNTLSLQMNSLRAEDTAVYYCAKEGYYGSGSFPDYWGQGTLVSVSS,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXYYGSGSXXDYWGQGTLVTVSS,AKEGYYGSGSFPDY,AGGAATCAGACCCAGTCAGGACACAGCATGGACATGAGAGTCCTCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGTTTCCCAGGTGCCAGATGTGACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGCCATTGACACTTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTACGTCCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTACAGCCTGAAGATTTTGCAACTTATTACTGCCAACAGTATAAGAGTTACCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCGAGCGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-16*02,GACATCCAGATGACCCAGTCTCCATCCTCACTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGTCGGGCGAGTCAGGCCATTGACACTTATTTAGCCTGGTTTCAGCAGAAACCAGGGAAAGCCCCTACGTCCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAAGTTCAGCGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGCCTACAGCCTGAAGATTTTGCAACTTATTACTGCCAACAGTATAAGAGTTACCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATC,DIQMTQSPSSLSASVGDRVTITCRASQAIDTYLAWFQQKPGKAPTSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYKSYPLTFGGGTKVEI,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWFQQKPGKAPKSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPLTFGGGTKVEI,QQYKSYPLT,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVVSYDGSNKYYADSVKGRFTISRDNSKNTLSLQMNSLRAEDTAVYYCAKEGYYGSGSFPDYWGQGTLVSVSS[SEP]DIQMTQSPSSLSASVGDRVTITCRASQAIDTYLAWFQQKPGKAPTSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYKSYPLTFGGGTKVEI,Memory-B-Cells,,human,Donor-3,"Jaffe et al., 2022",38,DIQMTQSPSSLSASVGDRVTITCRASQAIDTYLAWFQQKPGKAPTSLIYAASSLQSGVPSKFSGSGSGTDFTLTISSLQPEDFATYYCQQYKSYPLTFGGGTKVEI,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTLVFGGGTKLTVL,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVVSYDGSNKYYADSVKGRFTISRDNSKNTLSLQMNSLRAEDTAVYYCAKEGYYGSGSFPDYWGQGTLVSVSS,225,47.2727,2.13457,207,48.1818,IGHV3-30,IGKV1-16,IGHV3,IGKV1,IGHV3-30,IGKV1-16


# unsorted b cells classified to naive and memory by classifier

In [76]:
class_unsorted_b_cells_path = "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/BERT2GPT/naive_memory_classification/test_results/unsorted_test_results_all_metrics_OAS_paired_classifier_batch_64_epochs_10_lr_1e-06_group_size_421255-2_unlabelled.csv"

"/ibmm_data2/oas_database/paired_lea_tmp/paired_model/BERT2GPT/naive_memory_classification/test_results/unsorted_test_results_all_metrics_OAS_paired_classifier_batch_64_epochs_10_lr_1e-06_group_size_421255-2_unlabelled.csv"

In [77]:
class_unsorted_b_cells = CSV.read(class_unsorted_b_cells_path, DataFrame);


In [78]:
# divide by memory and naive (1 in predicted class for memory and 0 for naive)
class_unsorted_b_cells[!, :predicted_btype] = ifelse.(class_unsorted_b_cells.predicted_class .== 1, "Memory-B-Cells", "Naive-B-Cells")

632436-element Vector{String}:
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 "Memory-B-Cells"
 ⋮
 "Naive-B-Cells"
 "Naive-B-Cells"
 "Naive-B-Cells"
 "Naive-B-Cells"
 "Naive-B-Cells"
 "Naive-B-Cells"
 "Memory-B-Cells"
 "Naive-B-Cells"
 "Naive-B-Cells"

In [79]:
class_unsorted_b_cells

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype
Unnamed: 0_level_1,String,String31,Int64,String
1,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
2,QVQLQESGPGLVKPSETLSLICNVTGFSISGYFWSWVRQPPGKGLEWIASMTYSGTTNYNPSLQGRVTMSLSMSKDQVSLKLSSATAADTAVYYCARMARDGYVLRDWYFDLWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
3,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYHWSWIRQPPGKGLEWIGYMYYSGSTNYNPSLKSRVTISVDTSKTQFSLKLSSVTTADTAVYYCARGRLIWSADYTGGDYFDPWGQGILVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
4,QVQLQESGPGLVKPSETLSLTCNVSGYSISSGYYWGWIRQPPGKGLEWIGIIYQNGHSFYNPSLKSRAALSVAASKNQFSLNLRSVTAADTAVYFCARVASNAPTDWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
5,QPQLQESGPRLVKPSETLSLTCSVSGGSITNDNYYWVWIRQPPGKGLDWVGSINYSGRTYYNPSLKSRLTMSVDTSKNQFSLKLTSVTAADTAIYYCARLFDPFVNDYSPGTGYGWLDPWGQGTPVTVSA,Unsorted-B-Cells,1,Memory-B-Cells
6,QVQLQVSGPGLVKPSETLSLTCSVSNYSIGSGYYWGWVRQPPGRGLEWIGSIFRNGNTYYNPSLQSRVTISVETSKNHFSLRLSPVTAADTAVYYCARHNRYNQRNPFDLWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
7,QVQLQESGPGLVKSSETLSLTCTVSGVPISSSSYYWAWIRQPPGKGLEWIGSIYYSGSSFYHPSLGSRVTISMDKSKNVFSLKVHSLTAADTAVYFCARTALITYSYGEGRAFFDYWGQGRLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
8,QVQLQESGPGLVRPSETLSLECSVSGSSLSNDYYWGWIRQPPGKGLQWIGNIYHSGTTYYNPSLKSRLTMSVDTSRNHFSLQLDSVTAADTAVYYCARLIYTGYGKRCFDYWGQGALVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
9,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVGVIWYDGSKKYYSDSVKGRFTISRDSPNNMLYLQMNSLRAEDTAVYFCARDDDGSNQYGIFEYWGQGTVVTVSS,Unsorted-B-Cells,1,Memory-B-Cells
10,QLRLQESGPGLVKPSETLSLTCSVSGVSISSSSYFWGWIRQSPGKGLEWIGNIYDRGSTYYNPSLKTRATLRVDASKNEFSLELNSVSAADTGVYYCARTRFSVETYYYNGMDVWGQGTTVTVSS,Unsorted-B-Cells,1,Memory-B-Cells


In [80]:
# merge class_unsorted_b_cells and uns_subj_df on sequence_alignment_aa_heavy and BType
merged_unsorted_b_cells = innerjoin(class_unsorted_b_cells, uns_subj_df, on = [:sequence_alignment_aa_heavy, :BType])

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GATTTCCTTAAATTCAGGGTCCAGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXYSSXXXXXXYFDYWGQGTLVTVSS,ARDVGPYNSISPGRYYFDY,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSXXXYVFGTGTKVTVL,CSYAGSRILYV,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23
2,QVQLQESGPGLVKPSETLSLICNVTGFSISGYFWSWVRQPPGKGLEWIASMTYSGTTNYNPSLQGRVTMSLSMSKDQVSLKLSSATAADTAVYYCARMARDGYVLRDWYFDLWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,ATACTTTCTGAGAGCCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGAGGGGTCGTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCATCTGCAATGTCACTGGTTTCTCCATCAGTGGTTACTTCTGGAGCTGGGTCCGGCAGCCCCCAGGGAAGGGACTGGAATGGATTGCCTCTATGACTTACAGTGGCACCACCAACTACAACCCCTCCCTCCAGGGCCGAGTCACTATGTCACTTTCCATGTCCAAGGACCAGGTCTCCCTGAAACTGAGCTCTGCGACCGCTGCGGACACGGCCGTCTATTACTGTGCGAGAATGGCGCGAGATGGCTACGTTTTGCGGGACTGGTACTTCGATCTCTGGGGCCAAGGCACCCTGGTCACTGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCATCTGCAATGTCACTGGTTTCTCCATCAGTGGTTACTTCTGGAGCTGGGTCCGGCAGCCCCCAGGGAAGGGACTGGAATGGATTGCCTCTATGACTTACAGTGGCACCACCAACTACAACCCCTCCCTCCAGGGCCGAGTCACTATGTCACTTTCCATGTCCAAGGACCAGGTCTCCCTGAAACTGAGCTCTGCGACCGCTGCGGACACGGCCGTCTATTACTGTGCGAGAATGGCGCGAGATGGCTACGTTTTGCGGGACTGGTACTTCGATCTCTGGGGCCAAGGCACCCTGGTCACTGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXGYXXXXWYFDLWGRGTLVTVSS,ARMARDGYVLRDWYFDL,AAGCCCAGCACCCGCCCCAGCTGCTTTGCATGTCCCTCCCAGCCGCCCTGCAGTCCAGAGCCCATATCAATGCCTGGGTCAGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATATCACCGGAGAAATTGTGTTGACGCAGTCGCCAGGCACCCTGTCTTTGTCTACAGGGGAAAGAGCCACCCTCTCTTGCAGGGCCGGTCAGACTGTTGACGGCAACTCCTTAGCCTGGTACCAGCACAAACCTGGCCAGGCTCCCAGGCTCCTCATCTTTCGTGCATCTCGTAGGGCCGCTGACATCCCAGACAGGTTCACTGGCAGTGGGTCTGGGACCGACTTCACTCTCACCATTAGCAGACTGGAGGTTGAAGATTTCGCAGTTTATTACTGTCAGCAGTATGGTGCCTCACCAAAAACGTTCGGCCAAGGGACCAAGGTGGAACTAAAAAGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTGTGTTGACGCAGTCGCCAGGCACCCTGTCTTTGTCTACAGGGGAAAGAGCCACCCTCTCTTGCAGGGCCGGTCAGACTGTTGACGGCAACTCCTTAGCCTGGTACCAGCACAAACCTGGCCAGGCTCCCAGGCTCCTCATCTTTCGTGCATCTCGTAGGGCCGCTGACATCCCAGACAGGTTCACTGGCAGTGGGTCTGGGACCGACTTCACTCTCACCATTAGCAGACTGGAGGTTGAAGATTTCGCAGTTTATTACTGTCAGCAGTATGGTGCCTCACCAAAAACGTTCGGCCAAGGGACCAAGGTGGAA,EIVLTQSPGTLSLSTGERATLSCRAGQTVDGNSLAWYQHKPGQAPRLLIFRASRRAADIPDRFTGSGSGTDFTLTISRLEVEDFAVYYCQQYGASPKTFGQGTKVE,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPXTFGQGTKVE,QQYGASPKT,QVQLQESGPGLVKPSETLSLICNVTGFSISGYFWSWVRQPPGKGLEWIASMTYSGTTNYNPSLQGRVTMSLSMSKDQVSLKLSSATAADTAVYYCARMARDGYVLRDWYFDLWGQGTLVTVSS[SEP]EIVLTQSPGTLSLSTGERATLSCRAGQTVDGNSLAWYQHKPGQAPRLLIFRASRRAADIPDRFTGSGSGTDFTLTISRLEVEDFAVYYCQQYGASPKTFGQGTKVE,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGKV3-20,IGHV4-59,IGKV3-20
3,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYHWSWIRQPPGKGLEWIGYMYYSGSTNYNPSLKSRVTISVDTSKTQFSLKLSSVTTADTAVYYCARGRLIWSADYTGGDYFDPWGQGILVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TTTTCACCTCTCCATACAAAGGCACCACCCACATGCACATCCTCACTTAAGCACCCACAGGAAACCACCACACATTTCCTTAAATTCAGGGTCCAGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCACATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGCAGTTACCACTGGAGCTGGATCCGGCAGCCCCCAGGGAAGGGACTGGAGTGGATCGGCTATATGTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGACCCAATTCTCCCTGAAGCTGAGCTCTGTGACCACTGCGGACACGGCCGTGTATTACTGTGCGAGAGGTAGACTCATTTGGAGTGCTGATTATACCGGTGGGGACTACTTTGACCCCTGGGGCCAGGGAATCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGCAGTTACCACTGGAGCTGGATCCGGCAGCCCCCAGGGAAGGGACTGGAGTGGATCGGCTATATGTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGACCCAATTCTCCCTGAAGCTGAGCTCTGTGACCACTGCGGACACGGCCGTGTATTACTGTGCGAGAGGTAGACTCATTTGGAGTGCTGATTATACCGGTGGGGACTACTTTGACCCCTGGGGCCAGGGAATCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXWSGYYTXXXYFDYWGQGTLVTVSS,ARGRLIWSADYTGGDYFDP,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAAGCAGCAGTGATGTTGGGAGTTATAACCTTGTCTCTTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGAGGACGAGGCTCAATATTACTGCTGCTCATATGGAGGTAGGAATTTTCATGTGCTATTCGGCGGAGGGACCGAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAAGCAGCAGTGATGTTGGGAGTTATAACCTTGTCTCTTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGAGGACGAGGCTCAATATTACTGCTGCTCATATGGAGGTAGGAATTTTCATGTGCTATTCGGCGGAGGGACCGAGCTGACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGSSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEAQYYCCSYGGRNFHVLFGGGTELTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGXXXXVVFGGGTKLTVL,CSYGGRNFHVL,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYHWSWIRQPPGKGLEWIGYMYYSGSTNYNPSLKSRVTISVDTSKTQFSLKLSSVTTADTAVYYCARGRLIWSADYTGGDYFDPWGQGILVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGSSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEAQYYCCSYGGRNFHVLFGGGTELTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23
4,QVQLQESGPGLVKPSETLSLTCNVSGYSISSGYYWGWIRQPPGKGLEWIGIIYQNGHSFYNPSLKSRAALSVAASKNQFSLNLRSVTAADTAVYFCARVASNAPTDWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TCCTGTGCAAGAACATGAAACACCTGTGGTTCTTCCTCCTGCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAACCTTCGGAGACCCTGTCCCTCACCTGCAATGTCTCTGGTTACTCCATTAGCAGTGGTTACTACTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGAATTATCTATCAAAATGGGCATTCCTTCTACAATCCGTCCCTCAAGAGTCGAGCCGCCCTATCAGTGGCCGCGTCCAAGAACCAGTTCTCCCTGAACCTGCGCTCTGTGACCGCCGCAGACACGGCCGTGTATTTCTGTGCGAGAGTCGCGAGCAACGCGCCTACCGACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTG,H,IGHV4-38-2*02,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAACCTTCGGAGACCCTGTCCCTCACCTGCAATGTCTCTGGTTACTCCATTAGCAGTGGTTACTACTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGAATTATCTATCAAAATGGGCATTCCTTCTACAATCCGTCCCTCAAGAGTCGAGCCGCCCTATCAGTGGCCGCGTCCAAGAACCAGTTCTCCCTGAACCTGCGCTCTGTGACCGCCGCAGACACGGCCGTGTATTTCTGTGCGAGAGTCGCGAGCAACGCGCCTACCGACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXXXXXWGQGTLVTVSS,ARVASNAPTD,GCTGGGGTCTCAGGAGGCAGCGCTCTCAGGACGTCACCACCATGGCCTGGGCTCTGCTCCTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTCCCTCCGCGTCCGGGTCTCTTGGACAGTCAGTCACCATCTCCTGCACTGGAAGTAGTAGTGACGTTGGTGGGTATGCCTATGTCTCCTGGTATCAACAACACCCAGGCAAAGCCCCCAAAGTCGTAATTTATGAGGTCACTAAGCGGCCCTCAGGGGTCCCTGAACGGTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCGTCTCTGGGCTCCAGGCTGAAGATGAGGCTGATTATTACTGCATCTCATATGCCGGCGCCAACAAATTAGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-8*01,CAGTCTGCCCTGACTCAGCCTCCCTCCGCGTCCGGGTCTCTTGGACAGTCAGTCACCATCTCCTGCACTGGAAGTAGTAGTGACGTTGGTGGGTATGCCTATGTCTCCTGGTATCAACAACACCCAGGCAAAGCCCCCAAAGTCGTAATTTATGAGGTCACTAAGCGGCCCTCAGGGGTCCCTGAACGGTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCGTCTCTGGGCTCCAGGCTGAAGATGAGGCTGATTATTACTGCATCTCATATGCCGGCGCCAACAAATTAGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSALTQPPSASGSLGQSVTISCTGSSSDVGGYAYVSWYQQHPGKAPKVVIYEVTKRPSGVPERFSGSKSGNTASLTVSGLQAEDEADYYCISYAGANKLGVFGGGTKLTVL,QSALTQPPSASGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYEVSKRPSGVPDRFSGSKSGNTASLTVSGLQAEDEADYYCSSYAGSNNXXVFGGGTKLTVL,ISYAGANKLGV,QVQLQESGPGLVKPSETLSLTCNVSGYSISSGYYWGWIRQPPGKGLEWIGIIYQNGHSFYNPSLKSRAALSVAASKNQFSLNLRSVTAADTAVYFCARVASNAPTDWGQGTLVTVSS[SEP]QSALTQPPSASGSLGQSVTISCTGSSSDVGGYAYVSWYQQHPGKAPKVVIYEVTKRPSGVPERFSGSKSGNTASLTVSGLQAEDEADYYCISYAGANKLGVFGGGTKLTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-38-2,IGLV2-8,IGHV4-38-2,IGLV2-8
5,QPQLQESGPRLVKPSETLSLTCSVSGGSITNDNYYWVWIRQPPGKGLDWVGSINYSGRTYYNPSLKSRLTMSVDTSKNQFSLKLTSVTAADTAIYYCARLFDPFVNDYSPGTGYGWLDPWGQGTPVTVSA,Unsorted-B-Cells,1,Memory-B-Cells,GCCATTGACAAGCGTTTTCTTATATGGGATGCTTTCTGAGAGTCATGGATCTCACGTGCAAGAAAATGAAGCACCTGTGGTTCTTCCTCCTGCTGGGGGCGGCTCCCGGATGGGTCCTGTCCCAGCCGCAGCTGCAGGAGTCGGGCCCACGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCTCTGTCTCTGGTGGCTCCATCACAAATGATAATTATTACTGGGTCTGGATCCGCCAGCCCCCAGGTAAGGGGCTGGACTGGGTTGGCAGTATCAATTATAGTGGGAGAACCTACTATAATCCGTCCCTCAAGAGTCGACTCACCATGTCCGTGGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGACCTCTGTGACCGCCGCAGACACGGCTATATATTACTGTGCGAGACTTTTTGACCCCTTCGTCAATGACTACTCCCCGGGGACCGGCTACGGCTGGCTCGACCCCTGGGGCCAGGGAACCCCGGTCACCGTCTCCGCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-39*01,CAGCCGCAGCTGCAGGAGTCGGGCCCACGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCTCTGTCTCTGGTGGCTCCATCACAAATGATAATTATTACTGGGTCTGGATCCGCCAGCCCCCAGGTAAGGGGCTGGACTGGGTTGGCAGTATCAATTATAGTGGGAGAACCTACTATAATCCGTCCCTCAAGAGTCGACTCACCATGTCCGTGGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGACCTCTGTGACCGCCGCAGACACGGCTATATATTACTGTGCGAGACTTTTTGACCCCTTCGTCAATGACTACTCCCCGGGGACCGGCTACGGCTGGCTCGACCCCTGGGGCCAGGGAACCCCGGTCACCGTCTCCGCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXXXXXYSXXXXXXWFDPWGQGTLVTVSS,ARLFDPFVNDYSPGTGYGWLDP,AGGAGTCAGTCCCAACCAGGACACAGCATGGACATGAGGGTCCCTGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGCTCTCAGGTGCCAGATGTGACATCCAGTTGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTTGGAGACAGAGTCACCATCACTTGCCAGGCGACTCAGGACATCAGGAAGTCTTTAAATTGGTATCAACAGAAACCAGGGAAAGCCCCTAAACTCCTGATCAACGATGCGTCCAATTTGCAAACAGGGGTCCCATCAAGGTTCAGTGGAAGTGGATCTGGGACAGATTTTTCTTTCACCATCAACACCCTGCAGCCTGAAGATATCGCAACATATTTCTGTCAACAATATAGAAGTCTCCCTCTCACTTTCGGCGGAGGGTCCAACGTAGAGATCAAGCGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-33*01,GACATCCAGTTGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTTGGAGACAGAGTCACCATCACTTGCCAGGCGACTCAGGACATCAGGAAGTCTTTAAATTGGTATCAACAGAAACCAGGGAAAGCCCCTAAACTCCTGATCAACGATGCGTCCAATTTGCAAACAGGGGTCCCATCAAGGTTCAGTGGAAGTGGATCTGGGACAGATTTTTCTTTCACCATCAACACCCTGCAGCCTGAAGATATCGCAACATATTTCTGTCAACAATATAGAAGTCTCCCTCTCACTTTCGGCGGAGGGTCCAACGTAGAGATCAA,DIQLTQSPSSLSASVGDRVTITCQATQDIRKSLNWYQQKPGKAPKLLINDASNLQTGVPSRFSGSGSGTDFSFTINTLQPEDIATYFCQQYRSLPLTFGGGSNVEI,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLPLTFGGGTKVEI,QQYRSLPLT,QPQLQESGPRLVKPSETLSLTCSVSGGSITNDNYYWVWIRQPPGKGLDWVGSINYSGRTYYNPSLKSRLTMSVDTSKNQFSLKLTSVTAADTAIYYCARLFDPFVNDYSPGTGYGWLDPWGQGTPVTVSA[SEP]DIQLTQSPSSLSASVGDRVTITCQATQDIRKSLNWYQQKPGKAPKLLINDASNLQTGVPSRFSGSGSGTDFSFTINTLQPEDIATYFCQQYRSLPLTFGGGSNVEI,,human,390c,"James et al, 2020",65 to 70,IGHV4-39,IGKV1-33,IGHV4-39,IGKV1-33
6,QVQLQVSGPGLVKPSETLSLTCSVSNYSIGSGYYWGWVRQPPGRGLEWIGSIFRNGNTYYNPSLQSRVTISVETSKNHFSLRLSPVTAADTAVYYCARHNRYNQRNPFDLWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GAACATGAAGCACCTGTGGTTTTTCCTCCTGCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTACAGGTGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTAATTACTCCATCGGCAGTGGTTACTACTGGGGCTGGGTCCGGCAGCCCCCAGGGAGGGGGCTGGAGTGGATTGGAAGTATCTTTCGTAATGGGAACACATACTACAACCCGTCCCTCCAGAGTCGAGTCACCATATCAGTAGAAACGTCCAAGAACCACTTCTCCTTGAGGCTGAGCCCTGTGACCGCCGCAGACACGGCCGTCTATTACTGTGCGAGACACAATCGATATAATCAGAGGAATCCATTTGACTTGTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-38-2*02,CAGGTGCAGCTACAGGTGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTAATTACTCCATCGGCAGTGGTTACTACTGGGGCTGGGTCCGGCAGCCCCCAGGGAGGGGGCTGGAGTGGATTGGAAGTATCTTTCGTAATGGGAACACATACTACAACCCGTCCCTCCAGAGTCGAGTCACCATATCAGTAGAAACGTCCAAGAACCACTTCTCCTTGAGGCTGAGCCCTGTGACCGCCGCAGACACGGCCGTCTATTACTGTGCGAGACACAATCGATATAATCAGAGGAATCCATTTGACTTGTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXRYXXXXXFDYWGQGTLVTVSS,ARHNRYNQRNPFDL,GAGCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCCGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTGGACAACTCCTTCAATAAGAACTACTTAGCTTGGTACCAGCAAAAACCAGGACTGCCTCCTAAGTTACTCATTTACTGGGCATTTACCCGGGAATCCGGGGTCCCTGATCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAATTTATTACTGTCAGCAATATTATACTTATCCATTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCCGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTGGACAACTCCTTCAATAAGAACTACTTAGCTTGGTACCAGCAAAAACCAGGACTGCCTCCTAAGTTACTCATTTACTGGGCATTTACCCGGGAATCCGGGGTCCCTGATCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAATTTATTACTGTCAGCAATATTATACTTATCCATTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAAC,DIVMTQSPDSLAVSLGERATINCKSSQSVLDNSFNKNYLAWYQQKPGLPPKLLIYWAFTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAIYYCQQYYTYPFTFGPGTKVDIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPFTFGPGTKVDIK,QQYYTYPFT,QVQLQVSGPGLVKPSETLSLTCSVSNYSIGSGYYWGWVRQPPGRGLEWIGSIFRNGNTYYNPSLQSRVTISVETSKNHFSLRLSPVTAADTAVYYCARHNRYNQRNPFDLWGQGTLVTVSS[SEP]DIVMTQSPDSLAVSLGERATINCKSSQSVLDNSFNKNYLAWYQQKPGLPPKLLIYWAFTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAIYYCQQYYTYPFTFGPGTKVDIK,,human,390c,"James et al, 2020",65 to 70,IGHV4-38-2,IGKV4-1,IGHV4-38-2,IGKV4-1
7,QVQLQESGPGLVKSSETLSLTCTVSGVPISSSSYYWAWIRQPPGKGLEWIGSIYYSGSSFYHPSLGSRVTISMDKSKNVFSLKVHSLTAADTAVYFCARTALITYSYGEGRAFFDYWGQGRLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TTTCTTATATGGGGATGCTTTCTGAGAGTCATGGACCTCCTGTGCAAGAACATGAAGCACCTGTGGCTCTTCCTCCTGCTGGTGGCGGCTCCCAGAGCGATCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGTCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCCGGCGTCCCCATTAGTAGTAGCAGTTACTACTGGGCCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGCAGCATATATTATAGTGGCAGTTCGTTCTACCACCCGTCCCTCGGGAGTCGAGTCACCATTTCCATGGACAAGTCCAAGAATGTGTTCTCCCTGAAGGTACATTCTCTGACCGCCGCGGACACGGCCGTATATTTCTGTGCGAGGACCGCCCTCATCACATACAGTTATGGTGAGGGGCGCGCCTTCTTTGACTACTGGGGCCAGGGAAGGTTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV4-39*07,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGTCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCCGGCGTCCCCATTAGTAGTAGCAGTTACTACTGGGCCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGCAGCATATATTATAGTGGCAGTTCGTTCTACCACCCGTCCCTCGGGAGTCGAGTCACCATTTCCATGGACAAGTCCAAGAATGTGTTCTCCCTGAAGGTACATTCTCTGACCGCCGCGGACACGGCCGTATATTTCTGTGCGAGGACCGCCCTCATCACATACAGTTATGGTGAGGGGCGCGCCTTCTTTGACTACTGGGGCCAGGGAAGGTTGGTCACCGTCTCCTCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXXXXXYSYGXXXXXFDYWGQGTLVTVSS,ARTALITYSYGEGRAFFDY,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCATGCTCTGGAACCAGCAGTGATGTAGGGAATTATAACCTTATCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAAGTCATGATTTATGAGGTCAGTCAGAGGCCCTCAGGGGTTTCTAGTCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGCCTGAGGACGAGGCTGATTATTACTGCTCCTCATATGCAGGGAGATACACTTTTGTCTTCGGCGCTGGGACCAGGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCATGCTCTGGAACCAGCAGTGATGTAGGGAATTATAACCTTATCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAAGTCATGATTTATGAGGTCAGTCAGAGGCCCTCAGGGGTTTCTAGTCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGCCTGAGGACGAGGCTGATTATTACTGCTCCTCATATGCAGGGAGATACACTTTTGTCTTCGGCGCTGGGACCAGGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCSGTSSDVGNYNLISWYQQHPGKAPKVMIYEVSQRPSGVSSRFSGSKSGNTASLTISGLQPEDEADYYCSSYAGRYTFVFGAGTRVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSSTFVFGTGTKVTVL,SSYAGRYTFV,QVQLQESGPGLVKSSETLSLTCTVSGVPISSSSYYWAWIRQPPGKGLEWIGSIYYSGSSFYHPSLGSRVTISMDKSKNVFSLKVHSLTAADTAVYFCARTALITYSYGEGRAFFDYWGQGRLVTVSS[SEP]QSALTQPASVSGSPGQSITISCSGTSSDVGNYNLISWYQQHPGKAPKVMIYEVSQRPSGVSSRFSGSKSGNTASLTISGLQPEDEADYYCSSYAGRYTFVFGAGTRVTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-39,IGLV2-23,IGHV4-39,IGLV2-23
8,QVQLQESGPGLVRPSETLSLECSVSGSSLSNDYYWGWIRQPPGKGLQWIGNIYHSGTTYYNPSLKSRLTMSVDTSRNHFSLQLDSVTAADTAVYYCARLIYTGYGKRCFDYWGQGALVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,CTTTCTGAGAGTCATGGACCTCCTGTGCAAGAACATGAAGCACCTGTGGTTTTTCCTCCTGCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAGGCCTTCGGAGACCCTGTCCCTCGAATGCTCTGTCTCTGGTTCCTCTCTCAGCAATGATTATTATTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGCAGTGGATTGGTAATATCTATCATAGTGGGACCACCTACTACAACCCGTCCCTCAAGAGTCGACTCACCATGTCAGTGGACACGTCCCGGAACCACTTCTCCTTGCAGCTGGACTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGGCTAATCTATACTGGCTACGGCAAGAGATGCTTTGACTACTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-38-2*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAGGCCTTCGGAGACCCTGTCCCTCGAATGCTCTGTCTCTGGTTCCTCTCTCAGCAATGATTATTATTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGCAGTGGATTGGTAATATCTATCATAGTGGGACCACCTACTACAACCCGTCCCTCAAGAGTCGACTCACCATGTCAGTGGACACGTCCCGGAACCACTTCTCCTTGCAGCTGGACTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGGCTAATCTATACTGGCTACGGCAAGAGATGCTTTGACTACTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCAVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXXYSGYXXXXFDYWGQGTLVTVSS,ARLIYTGYGKRCFDY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTTCCAGATGCGACATCCAGATGACCCAGTCTCCACCTTTCGTGTCTGCATCTGTGGGAGACAGCGTCACCATCACTTGTCGGGCGAGTCAGGGTATTACCGACTGGTTAGCCTGGTATCAGCATAAACAAGGGAAAGCCCCTAAGCTCCTCATCTTCGCTGCATCCACTTTGCAGAGTGGGGTCCCGTCACGATTCAGCGGCACTGGATCTGGAACAGATTTCACTCTCACCATCACCAGACTACAGCCTGAAGATTCTGCAACTTACTATTGTCAACAGGGTTACACATTCCCCGGGGGTTTCACTTTCGGCCCTGGGACCAAAGTGGATGTCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-12*01,GACATCCAGATGACCCAGTCTCCACCTTTCGTGTCTGCATCTGTGGGAGACAGCGTCACCATCACTTGTCGGGCGAGTCAGGGTATTACCGACTGGTTAGCCTGGTATCAGCATAAACAAGGGAAAGCCCCTAAGCTCCTCATCTTCGCTGCATCCACTTTGCAGAGTGGGGTCCCGTCACGATTCAGCGGCACTGGATCTGGAACAGATTTCACTCTCACCATCACCAGACTACAGCCTGAAGATTCTGCAACTTACTATTGTCAACAGGGTTACACATTCCCCGGGGGTTTCACTTTCGGCCCTGGGACCAAAGTGGATGTCAAAC,DIQMTQSPPFVSASVGDSVTITCRASQGITDWLAWYQHKQGKAPKLLIFAASTLQSGVPSRFSGTGSGTDFTLTITRLQPEDSATYYCQQGYTFPGGFTFGPGTKVDVK,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPXXFTFGPGTKVDIK,QQGYTFPGGFT,QVQLQESGPGLVRPSETLSLECSVSGSSLSNDYYWGWIRQPPGKGLQWIGNIYHSGTTYYNPSLKSRLTMSVDTSRNHFSLQLDSVTAADTAVYYCARLIYTGYGKRCFDYWGQGALVTVSS[SEP]DIQMTQSPPFVSASVGDSVTITCRASQGITDWLAWYQHKQGKAPKLLIFAASTLQSGVPSRFSGTGSGTDFTLTITRLQPEDSATYYCQQGYTFPGGFTFGPGTKVDVK,,human,390c,"James et al, 2020",65 to 70,IGHV4-38-2,IGKV1-12,IGHV4-38-2,IGKV1-12
9,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVGVIWYDGSKKYYSDSVKGRFTISRDSPNNMLYLQMNSLRAEDTAVYFCARDDDGSNQYGIFEYWGQGTVVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCGTCTGGATTCACCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTTGGAGTTATATGGTATGATGGAAGTAAAAAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAGTCCCAACAACATGCTGTATTTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCTGTTTATTTCTGTGCGAGAGATGATGATGGTAGTAATCAGTATGGGATCTTTGAATACTGGGGCCAGGGAACCGTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-33*01,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCGTCTGGATTCACCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTTGGAGTTATATGGTATGATGGAAGTAAAAAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAGTCCCAACAACATGCTGTATTTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCTGTTTATTTCTGTGCGAGAGATGATGATGGTAGTAATCAGTATGGGATCTTTGAATACTGGGGCCAGGGAACCGTGGTCACCGTCTCCTCAG,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVIWYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARXXDSSXXXXXFDYWGQGTLVTVSS,ARDDDGSNQYGIFEY,TGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGTCTCCGTGTCTGGGTCTCCTGGACAGTCGATCGCCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTCTGTCTCCTGGTTCCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGCCTCCAGGCTGAAGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCGGCTGTTCGGCGGAGGGACCAAGTTGACCGTCCTAAGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGTCTCCGTGTCTGGGTCTCCTGGACAGTCGATCGCCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTCTGTCTCCTGGTTCCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGCCTCCAGGCTGAAGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCGGCTGTTCGGCGGAGGGACCAAGTTGACCGTCCTA,QSALTQPVSVSGSPGQSIAISCTGTSSDVGGYNSVSWFQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTRLFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTXXFGGGTKLTVL,SSYTSSSTRL,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVGVIWYDGSKKYYSDSVKGRFTISRDSPNNMLYLQMNSLRAEDTAVYFCARDDDGSNQYGIFEYWGQGTVVTVSS[SEP]QSALTQPVSVSGSPGQSIAISCTGTSSDVGGYNSVSWFQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTRLFGGGTKLTVL,,human,390c,"James et al, 2020",65 to 70,IGHV3-33,IGLV2-14,IGHV3-33,IGLV2-14
10,QLRLQESGPGLVKPSETLSLTCSVSGVSISSSSYFWGWIRQSPGKGLEWIGNIYDRGSTYYNPSLKTRATLRVDASKNEFSLELNSVSAADTGVYYCARTRFSVETYYYNGMDVWGQGTTVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TTGATATTTCTTATATGGGGATGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCACCTGTGGTTCCTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAACTTCGACTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTGGTGTCTCCATCAGTAGTAGCAGTTACTTCTGGGGCTGGATCCGCCAGTCCCCAGGGAAGGGGCTGGAGTGGATTGGAAACATCTATGATCGTGGGAGTACCTACTACAATCCGTCCCTCAAGACTCGAGCCACCTTACGCGTTGACGCGTCGAAGAACGAGTTTTCCCTGGAACTGAACTCTGTGAGCGCCGCAGACACGGGTGTCTATTACTGTGCGAGGACGAGGTTTAGTGTGGAGACTTATTACTACAACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-39*01,CAACTTCGACTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTGGTGTCTCCATCAGTAGTAGCAGTTACTTCTGGGGCTGGATCCGCCAGTCCCCAGGGAAGGGGCTGGAGTGGATTGGAAACATCTATGATCGTGGGAGTACCTACTACAATCCGTCCCTCAAGACTCGAGCCACCTTACGCGTTGACGCGTCGAAGAACGAGTTTTCCCTGGAACTGAACTCTGTGAGCGCCGCAGACACGGGTGTCTATTACTGTGCGAGGACGAGGTTTAGTGTGGAGACTTATTACTACAACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXXYSXXXXYYYGMDVWGQGTTVTVSS,ARTRFSVETYYYNGMDV,CTGGGCCTCAGGAAGCAGCATCGGGGGTGCCTCAGCCATGGCATGGATCCCTCTCTTCCTCGGCGTCCTTGCTTACTGCACAGGATCCGTGGCCTCCTTTGCGCTGACTCAGCCACCCTCTCTGTCCGTGTCCCCAGGACAGACAGCCAGCGTCAATTGTTCTGGAGATAAATTGGAGGATAAATATGCTTGTTGGTATCAACATAAGCCAGGCCAGTCCCCTCTCTTGATCATATATCAGGATAACAAGCGGCCCCCCGGGATCCCTGAGCGATTCTCTGGCTCCAATTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGCCCATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGTTTCTCTGGTGTTTTTGTCTTCGGAGCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV3-1*01,TCCTTTGCGCTGACTCAGCCACCCTCTCTGTCCGTGTCCCCAGGACAGACAGCCAGCGTCAATTGTTCTGGAGATAAATTGGAGGATAAATATGCTTGTTGGTATCAACATAAGCCAGGCCAGTCCCCTCTCTTGATCATATATCAGGATAACAAGCGGCCCCCCGGGATCCCTGAGCGATTCTCTGGCTCCAATTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGCCCATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGTTTCTCTGGTGTTTTTGTCTTCGGAGCTGGGACCAAGGTCACCGTCCTAG,SFALTQPPSLSVSPGQTASVNCSGDKLEDKYACWYQHKPGQSPLLIIYQDNKRPPGIPERFSGSNSGNTATLTISGTQPMDEADYYCQAWDSFSGVFVFGAGTKVTVL,SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDXXXXXXVFGTGTKVTVL,QAWDSFSGVFV,QLRLQESGPGLVKPSETLSLTCSVSGVSISSSSYFWGWIRQSPGKGLEWIGNIYDRGSTYYNPSLKTRATLRVDASKNEFSLELNSVSAADTGVYYCARTRFSVETYYYNGMDVWGQGTTVTVSS[SEP]SFALTQPPSLSVSPGQTASVNCSGDKLEDKYACWYQHKPGQSPLLIIYQDNKRPPGIPERFSGSNSGNTATLTISGTQPMDEADYYCQAWDSFSGVFVFGAGTKVTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-39,IGLV3-1,IGHV4-39,IGLV3-1


In [81]:
merged_unsorted_b_cells_unique = unique(merged_unsorted_b_cells, :sequence_alignment_aa_heavy)

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GATTTCCTTAAATTCAGGGTCCAGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGTGGTTTTTATTGGAGCTGGATCCGGCAGTCCCCAGGGAAGGGACTGGAGTGGATTGCATATATCTATTTTAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCTTATCAGTGGACACGTCCAAGAACCAGTTCTCCCTGAAACTGAGCTCTGTTACCGCTGCGGACTCGGCCGTTTATTACTGTGCGAGAGATGTCGGCCCGTATAACAGCATCTCCCCGGGGCGTTACTATTTTGACTACTGGGGCCCGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXYSSXXXXXXYFDYWGQGTLVTVSS,ARDVGPYNSISPGRYYFDY,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCAGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGATGTTGGGAATTATAACCTTGTCTCCTGGTACCAACACCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGATTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGACGACGAGGCTGATTATTACTGCTGCTCATATGCAGGTAGTAGAATCCTTTATGTCTTCGGATCTGGGACCAAGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSXXXYVFGTGTKVTVL,CSYAGSRILYV,QVQLQESGPGLVKPSETLSLTCTVSGGSISGFYWSWIRQSPGKGLEWIAYIYFSGSTNYNPSLKSRVTLSVDTSKNQFSLKLSSVTAADSAVYYCARDVGPYNSISPGRYYFDYWGPGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGNYNLVSWYQHHPGKAPKLMIYEVSKRPSGISNRFSGSKSGNTASLTISGLQADDEADYYCCSYAGSRILYVFGSGTKVTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23
2,QVQLQESGPGLVKPSETLSLICNVTGFSISGYFWSWVRQPPGKGLEWIASMTYSGTTNYNPSLQGRVTMSLSMSKDQVSLKLSSATAADTAVYYCARMARDGYVLRDWYFDLWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,ATACTTTCTGAGAGCCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCAGAGGGGTCGTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCATCTGCAATGTCACTGGTTTCTCCATCAGTGGTTACTTCTGGAGCTGGGTCCGGCAGCCCCCAGGGAAGGGACTGGAATGGATTGCCTCTATGACTTACAGTGGCACCACCAACTACAACCCCTCCCTCCAGGGCCGAGTCACTATGTCACTTTCCATGTCCAAGGACCAGGTCTCCCTGAAACTGAGCTCTGCGACCGCTGCGGACACGGCCGTCTATTACTGTGCGAGAATGGCGCGAGATGGCTACGTTTTGCGGGACTGGTACTTCGATCTCTGGGGCCAAGGCACCCTGGTCACTGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCATCTGCAATGTCACTGGTTTCTCCATCAGTGGTTACTTCTGGAGCTGGGTCCGGCAGCCCCCAGGGAAGGGACTGGAATGGATTGCCTCTATGACTTACAGTGGCACCACCAACTACAACCCCTCCCTCCAGGGCCGAGTCACTATGTCACTTTCCATGTCCAAGGACCAGGTCTCCCTGAAACTGAGCTCTGCGACCGCTGCGGACACGGCCGTCTATTACTGTGCGAGAATGGCGCGAGATGGCTACGTTTTGCGGGACTGGTACTTCGATCTCTGGGGCCAAGGCACCCTGGTCACTGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXGYXXXXWYFDLWGRGTLVTVSS,ARMARDGYVLRDWYFDL,AAGCCCAGCACCCGCCCCAGCTGCTTTGCATGTCCCTCCCAGCCGCCCTGCAGTCCAGAGCCCATATCAATGCCTGGGTCAGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATATCACCGGAGAAATTGTGTTGACGCAGTCGCCAGGCACCCTGTCTTTGTCTACAGGGGAAAGAGCCACCCTCTCTTGCAGGGCCGGTCAGACTGTTGACGGCAACTCCTTAGCCTGGTACCAGCACAAACCTGGCCAGGCTCCCAGGCTCCTCATCTTTCGTGCATCTCGTAGGGCCGCTGACATCCCAGACAGGTTCACTGGCAGTGGGTCTGGGACCGACTTCACTCTCACCATTAGCAGACTGGAGGTTGAAGATTTCGCAGTTTATTACTGTCAGCAGTATGGTGCCTCACCAAAAACGTTCGGCCAAGGGACCAAGGTGGAACTAAAAAGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTGTGTTGACGCAGTCGCCAGGCACCCTGTCTTTGTCTACAGGGGAAAGAGCCACCCTCTCTTGCAGGGCCGGTCAGACTGTTGACGGCAACTCCTTAGCCTGGTACCAGCACAAACCTGGCCAGGCTCCCAGGCTCCTCATCTTTCGTGCATCTCGTAGGGCCGCTGACATCCCAGACAGGTTCACTGGCAGTGGGTCTGGGACCGACTTCACTCTCACCATTAGCAGACTGGAGGTTGAAGATTTCGCAGTTTATTACTGTCAGCAGTATGGTGCCTCACCAAAAACGTTCGGCCAAGGGACCAAGGTGGAA,EIVLTQSPGTLSLSTGERATLSCRAGQTVDGNSLAWYQHKPGQAPRLLIFRASRRAADIPDRFTGSGSGTDFTLTISRLEVEDFAVYYCQQYGASPKTFGQGTKVE,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPXTFGQGTKVE,QQYGASPKT,QVQLQESGPGLVKPSETLSLICNVTGFSISGYFWSWVRQPPGKGLEWIASMTYSGTTNYNPSLQGRVTMSLSMSKDQVSLKLSSATAADTAVYYCARMARDGYVLRDWYFDLWGQGTLVTVSS[SEP]EIVLTQSPGTLSLSTGERATLSCRAGQTVDGNSLAWYQHKPGQAPRLLIFRASRRAADIPDRFTGSGSGTDFTLTISRLEVEDFAVYYCQQYGASPKTFGQGTKVE,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGKV3-20,IGHV4-59,IGKV3-20
3,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYHWSWIRQPPGKGLEWIGYMYYSGSTNYNPSLKSRVTISVDTSKTQFSLKLSSVTTADTAVYYCARGRLIWSADYTGGDYFDPWGQGILVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TTTTCACCTCTCCATACAAAGGCACCACCCACATGCACATCCTCACTTAAGCACCCACAGGAAACCACCACACATTTCCTTAAATTCAGGGTCCAGCTCACATGGGAAATACTTTCTGAGAGTCCTGGACCTCCTGTGCAAGAACATGAAACATCTGTGGTTCTTCCTTCTCCTGGTGGCAGCTCCCACATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGCAGTTACCACTGGAGCTGGATCCGGCAGCCCCCAGGGAAGGGACTGGAGTGGATCGGCTATATGTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGACCCAATTCTCCCTGAAGCTGAGCTCTGTGACCACTGCGGACACGGCCGTGTATTACTGTGCGAGAGGTAGACTCATTTGGAGTGCTGATTATACCGGTGGGGACTACTTTGACCCCTGGGGCCAGGGAATCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV4-59*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATTAGCAGTTACCACTGGAGCTGGATCCGGCAGCCCCCAGGGAAGGGACTGGAGTGGATCGGCTATATGTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGACCCAATTCTCCCTGAAGCTGAGCTCTGTGACCACTGCGGACACGGCCGTGTATTACTGTGCGAGAGGTAGACTCATTTGGAGTGCTGATTATACCGGTGGGGACTACTTTGACCCCTGGGGCCAGGGAATCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYYWSWIRQPPGKGLEWIGYIYYSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXWSGYYTXXXYFDYWGQGTLVTVSS,ARGRLIWSADYTGGDYFDP,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAAGCAGCAGTGATGTTGGGAGTTATAACCTTGTCTCTTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGAGGACGAGGCTCAATATTACTGCTGCTCATATGGAGGTAGGAATTTTCATGTGCTATTCGGCGGAGGGACCGAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAAGCAGCAGTGATGTTGGGAGTTATAACCTTGTCTCTTGGTACCAACAGCACCCAGGCAAAGCCCCCAAACTCATGATTTATGAGGTCAGTAAGCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGGCTGAGGACGAGGCTCAATATTACTGCTGCTCATATGGAGGTAGGAATTTTCATGTGCTATTCGGCGGAGGGACCGAGCTGACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGSSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEAQYYCCSYGGRNFHVLFGGGTELTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGXXXXVVFGGGTKLTVL,CSYGGRNFHVL,QVQLQESGPGLVKPSETLSLTCTVSGGSISSYHWSWIRQPPGKGLEWIGYMYYSGSTNYNPSLKSRVTISVDTSKTQFSLKLSSVTTADTAVYYCARGRLIWSADYTGGDYFDPWGQGILVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGSSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEAQYYCCSYGGRNFHVLFGGGTELTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-59,IGLV2-23,IGHV4-59,IGLV2-23
4,QVQLQESGPGLVKPSETLSLTCNVSGYSISSGYYWGWIRQPPGKGLEWIGIIYQNGHSFYNPSLKSRAALSVAASKNQFSLNLRSVTAADTAVYFCARVASNAPTDWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TCCTGTGCAAGAACATGAAACACCTGTGGTTCTTCCTCCTGCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAACCTTCGGAGACCCTGTCCCTCACCTGCAATGTCTCTGGTTACTCCATTAGCAGTGGTTACTACTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGAATTATCTATCAAAATGGGCATTCCTTCTACAATCCGTCCCTCAAGAGTCGAGCCGCCCTATCAGTGGCCGCGTCCAAGAACCAGTTCTCCCTGAACCTGCGCTCTGTGACCGCCGCAGACACGGCCGTGTATTTCTGTGCGAGAGTCGCGAGCAACGCGCCTACCGACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTG,H,IGHV4-38-2*02,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAACCTTCGGAGACCCTGTCCCTCACCTGCAATGTCTCTGGTTACTCCATTAGCAGTGGTTACTACTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGAATTATCTATCAAAATGGGCATTCCTTCTACAATCCGTCCCTCAAGAGTCGAGCCGCCCTATCAGTGGCCGCGTCCAAGAACCAGTTCTCCCTGAACCTGCGCTCTGTGACCGCCGCAGACACGGCCGTGTATTTCTGTGCGAGAGTCGCGAGCAACGCGCCTACCGACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXXXXXWGQGTLVTVSS,ARVASNAPTD,GCTGGGGTCTCAGGAGGCAGCGCTCTCAGGACGTCACCACCATGGCCTGGGCTCTGCTCCTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTCCCTCCGCGTCCGGGTCTCTTGGACAGTCAGTCACCATCTCCTGCACTGGAAGTAGTAGTGACGTTGGTGGGTATGCCTATGTCTCCTGGTATCAACAACACCCAGGCAAAGCCCCCAAAGTCGTAATTTATGAGGTCACTAAGCGGCCCTCAGGGGTCCCTGAACGGTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCGTCTCTGGGCTCCAGGCTGAAGATGAGGCTGATTATTACTGCATCTCATATGCCGGCGCCAACAAATTAGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-8*01,CAGTCTGCCCTGACTCAGCCTCCCTCCGCGTCCGGGTCTCTTGGACAGTCAGTCACCATCTCCTGCACTGGAAGTAGTAGTGACGTTGGTGGGTATGCCTATGTCTCCTGGTATCAACAACACCCAGGCAAAGCCCCCAAAGTCGTAATTTATGAGGTCACTAAGCGGCCCTCAGGGGTCCCTGAACGGTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCGTCTCTGGGCTCCAGGCTGAAGATGAGGCTGATTATTACTGCATCTCATATGCCGGCGCCAACAAATTAGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSALTQPPSASGSLGQSVTISCTGSSSDVGGYAYVSWYQQHPGKAPKVVIYEVTKRPSGVPERFSGSKSGNTASLTVSGLQAEDEADYYCISYAGANKLGVFGGGTKLTVL,QSALTQPPSASGSPGQSVTISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYEVSKRPSGVPDRFSGSKSGNTASLTVSGLQAEDEADYYCSSYAGSNNXXVFGGGTKLTVL,ISYAGANKLGV,QVQLQESGPGLVKPSETLSLTCNVSGYSISSGYYWGWIRQPPGKGLEWIGIIYQNGHSFYNPSLKSRAALSVAASKNQFSLNLRSVTAADTAVYFCARVASNAPTDWGQGTLVTVSS[SEP]QSALTQPPSASGSLGQSVTISCTGSSSDVGGYAYVSWYQQHPGKAPKVVIYEVTKRPSGVPERFSGSKSGNTASLTVSGLQAEDEADYYCISYAGANKLGVFGGGTKLTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-38-2,IGLV2-8,IGHV4-38-2,IGLV2-8
5,QPQLQESGPRLVKPSETLSLTCSVSGGSITNDNYYWVWIRQPPGKGLDWVGSINYSGRTYYNPSLKSRLTMSVDTSKNQFSLKLTSVTAADTAIYYCARLFDPFVNDYSPGTGYGWLDPWGQGTPVTVSA,Unsorted-B-Cells,1,Memory-B-Cells,GCCATTGACAAGCGTTTTCTTATATGGGATGCTTTCTGAGAGTCATGGATCTCACGTGCAAGAAAATGAAGCACCTGTGGTTCTTCCTCCTGCTGGGGGCGGCTCCCGGATGGGTCCTGTCCCAGCCGCAGCTGCAGGAGTCGGGCCCACGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCTCTGTCTCTGGTGGCTCCATCACAAATGATAATTATTACTGGGTCTGGATCCGCCAGCCCCCAGGTAAGGGGCTGGACTGGGTTGGCAGTATCAATTATAGTGGGAGAACCTACTATAATCCGTCCCTCAAGAGTCGACTCACCATGTCCGTGGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGACCTCTGTGACCGCCGCAGACACGGCTATATATTACTGTGCGAGACTTTTTGACCCCTTCGTCAATGACTACTCCCCGGGGACCGGCTACGGCTGGCTCGACCCCTGGGGCCAGGGAACCCCGGTCACCGTCTCCGCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-39*01,CAGCCGCAGCTGCAGGAGTCGGGCCCACGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCTCTGTCTCTGGTGGCTCCATCACAAATGATAATTATTACTGGGTCTGGATCCGCCAGCCCCCAGGTAAGGGGCTGGACTGGGTTGGCAGTATCAATTATAGTGGGAGAACCTACTATAATCCGTCCCTCAAGAGTCGACTCACCATGTCCGTGGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGACCTCTGTGACCGCCGCAGACACGGCTATATATTACTGTGCGAGACTTTTTGACCCCTTCGTCAATGACTACTCCCCGGGGACCGGCTACGGCTGGCTCGACCCCTGGGGCCAGGGAACCCCGGTCACCGTCTCCGCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXXXXXXYSXXXXXXWFDPWGQGTLVTVSS,ARLFDPFVNDYSPGTGYGWLDP,AGGAGTCAGTCCCAACCAGGACACAGCATGGACATGAGGGTCCCTGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGCTCTCAGGTGCCAGATGTGACATCCAGTTGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTTGGAGACAGAGTCACCATCACTTGCCAGGCGACTCAGGACATCAGGAAGTCTTTAAATTGGTATCAACAGAAACCAGGGAAAGCCCCTAAACTCCTGATCAACGATGCGTCCAATTTGCAAACAGGGGTCCCATCAAGGTTCAGTGGAAGTGGATCTGGGACAGATTTTTCTTTCACCATCAACACCCTGCAGCCTGAAGATATCGCAACATATTTCTGTCAACAATATAGAAGTCTCCCTCTCACTTTCGGCGGAGGGTCCAACGTAGAGATCAAGCGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-33*01,GACATCCAGTTGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTTGGAGACAGAGTCACCATCACTTGCCAGGCGACTCAGGACATCAGGAAGTCTTTAAATTGGTATCAACAGAAACCAGGGAAAGCCCCTAAACTCCTGATCAACGATGCGTCCAATTTGCAAACAGGGGTCCCATCAAGGTTCAGTGGAAGTGGATCTGGGACAGATTTTTCTTTCACCATCAACACCCTGCAGCCTGAAGATATCGCAACATATTTCTGTCAACAATATAGAAGTCTCCCTCTCACTTTCGGCGGAGGGTCCAACGTAGAGATCAA,DIQLTQSPSSLSASVGDRVTITCQATQDIRKSLNWYQQKPGKAPKLLINDASNLQTGVPSRFSGSGSGTDFSFTINTLQPEDIATYFCQQYRSLPLTFGGGSNVEI,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLPLTFGGGTKVEI,QQYRSLPLT,QPQLQESGPRLVKPSETLSLTCSVSGGSITNDNYYWVWIRQPPGKGLDWVGSINYSGRTYYNPSLKSRLTMSVDTSKNQFSLKLTSVTAADTAIYYCARLFDPFVNDYSPGTGYGWLDPWGQGTPVTVSA[SEP]DIQLTQSPSSLSASVGDRVTITCQATQDIRKSLNWYQQKPGKAPKLLINDASNLQTGVPSRFSGSGSGTDFSFTINTLQPEDIATYFCQQYRSLPLTFGGGSNVEI,,human,390c,"James et al, 2020",65 to 70,IGHV4-39,IGKV1-33,IGHV4-39,IGKV1-33
6,QVQLQVSGPGLVKPSETLSLTCSVSNYSIGSGYYWGWVRQPPGRGLEWIGSIFRNGNTYYNPSLQSRVTISVETSKNHFSLRLSPVTAADTAVYYCARHNRYNQRNPFDLWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GAACATGAAGCACCTGTGGTTTTTCCTCCTGCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTACAGGTGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTAATTACTCCATCGGCAGTGGTTACTACTGGGGCTGGGTCCGGCAGCCCCCAGGGAGGGGGCTGGAGTGGATTGGAAGTATCTTTCGTAATGGGAACACATACTACAACCCGTCCCTCCAGAGTCGAGTCACCATATCAGTAGAAACGTCCAAGAACCACTTCTCCTTGAGGCTGAGCCCTGTGACCGCCGCAGACACGGCCGTCTATTACTGTGCGAGACACAATCGATATAATCAGAGGAATCCATTTGACTTGTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-38-2*02,CAGGTGCAGCTACAGGTGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTAATTACTCCATCGGCAGTGGTTACTACTGGGGCTGGGTCCGGCAGCCCCCAGGGAGGGGGCTGGAGTGGATTGGAAGTATCTTTCGTAATGGGAACACATACTACAACCCGTCCCTCCAGAGTCGAGTCACCATATCAGTAGAAACGTCCAAGAACCACTTCTCCTTGAGGCTGAGCCCTGTGACCGCCGCAGACACGGCCGTCTATTACTGTGCGAGACACAATCGATATAATCAGAGGAATCCATTTGACTTGTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCTVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXRYXXXXXFDYWGQGTLVTVSS,ARHNRYNQRNPFDL,GAGCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCCGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTGGACAACTCCTTCAATAAGAACTACTTAGCTTGGTACCAGCAAAAACCAGGACTGCCTCCTAAGTTACTCATTTACTGGGCATTTACCCGGGAATCCGGGGTCCCTGATCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAATTTATTACTGTCAGCAATATTATACTTATCCATTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCCGTGTCTCTGGGCGAGAGGGCCACCATCAACTGCAAGTCCAGCCAGAGTGTTTTGGACAACTCCTTCAATAAGAACTACTTAGCTTGGTACCAGCAAAAACCAGGACTGCCTCCTAAGTTACTCATTTACTGGGCATTTACCCGGGAATCCGGGGTCCCTGATCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAATTTATTACTGTCAGCAATATTATACTTATCCATTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAAC,DIVMTQSPDSLAVSLGERATINCKSSQSVLDNSFNKNYLAWYQQKPGLPPKLLIYWAFTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAIYYCQQYYTYPFTFGPGTKVDIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPFTFGPGTKVDIK,QQYYTYPFT,QVQLQVSGPGLVKPSETLSLTCSVSNYSIGSGYYWGWVRQPPGRGLEWIGSIFRNGNTYYNPSLQSRVTISVETSKNHFSLRLSPVTAADTAVYYCARHNRYNQRNPFDLWGQGTLVTVSS[SEP]DIVMTQSPDSLAVSLGERATINCKSSQSVLDNSFNKNYLAWYQQKPGLPPKLLIYWAFTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAIYYCQQYYTYPFTFGPGTKVDIK,,human,390c,"James et al, 2020",65 to 70,IGHV4-38-2,IGKV4-1,IGHV4-38-2,IGKV4-1
7,QVQLQESGPGLVKSSETLSLTCTVSGVPISSSSYYWAWIRQPPGKGLEWIGSIYYSGSSFYHPSLGSRVTISMDKSKNVFSLKVHSLTAADTAVYFCARTALITYSYGEGRAFFDYWGQGRLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TTTCTTATATGGGGATGCTTTCTGAGAGTCATGGACCTCCTGTGCAAGAACATGAAGCACCTGTGGCTCTTCCTCCTGCTGGTGGCGGCTCCCAGAGCGATCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGTCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCCGGCGTCCCCATTAGTAGTAGCAGTTACTACTGGGCCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGCAGCATATATTATAGTGGCAGTTCGTTCTACCACCCGTCCCTCGGGAGTCGAGTCACCATTTCCATGGACAAGTCCAAGAATGTGTTCTCCCTGAAGGTACATTCTCTGACCGCCGCGGACACGGCCGTATATTTCTGTGCGAGGACCGCCCTCATCACATACAGTTATGGTGAGGGGCGCGCCTTCTTTGACTACTGGGGCCAGGGAAGGTTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV4-39*07,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGTCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCCGGCGTCCCCATTAGTAGTAGCAGTTACTACTGGGCCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGCAGCATATATTATAGTGGCAGTTCGTTCTACCACCCGTCCCTCGGGAGTCGAGTCACCATTTCCATGGACAAGTCCAAGAATGTGTTCTCCCTGAAGGTACATTCTCTGACCGCCGCGGACACGGCCGTATATTTCTGTGCGAGGACCGCCCTCATCACATACAGTTATGGTGAGGGGCGCGCCTTCTTTGACTACTGGGGCCAGGGAAGGTTGGTCACCGTCTCCTCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXXXXXYSYGXXXXXFDYWGQGTLVTVSS,ARTALITYSYGEGRAFFDY,GGGGTCACAAGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTGCTCCTCACCCTCCTCACTCAGGACACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCATGCTCTGGAACCAGCAGTGATGTAGGGAATTATAACCTTATCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAAGTCATGATTTATGAGGTCAGTCAGAGGCCCTCAGGGGTTTCTAGTCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGCCTGAGGACGAGGCTGATTATTACTGCTCCTCATATGCAGGGAGATACACTTTTGTCTTCGGCGCTGGGACCAGGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-23*02,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCATGCTCTGGAACCAGCAGTGATGTAGGGAATTATAACCTTATCTCCTGGTACCAACAGCACCCAGGCAAAGCCCCCAAAGTCATGATTTATGAGGTCAGTCAGAGGCCCTCAGGGGTTTCTAGTCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACAATCTCTGGGCTCCAGCCTGAGGACGAGGCTGATTATTACTGCTCCTCATATGCAGGGAGATACACTTTTGTCTTCGGCGCTGGGACCAGGGTCACCGTCCTAG,QSALTQPASVSGSPGQSITISCSGTSSDVGNYNLISWYQQHPGKAPKVMIYEVSQRPSGVSSRFSGSKSGNTASLTISGLQPEDEADYYCSSYAGRYTFVFGAGTRVTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGKAPKLMIYEVSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCCSYAGSSTFVFGTGTKVTVL,SSYAGRYTFV,QVQLQESGPGLVKSSETLSLTCTVSGVPISSSSYYWAWIRQPPGKGLEWIGSIYYSGSSFYHPSLGSRVTISMDKSKNVFSLKVHSLTAADTAVYFCARTALITYSYGEGRAFFDYWGQGRLVTVSS[SEP]QSALTQPASVSGSPGQSITISCSGTSSDVGNYNLISWYQQHPGKAPKVMIYEVSQRPSGVSSRFSGSKSGNTASLTISGLQPEDEADYYCSSYAGRYTFVFGAGTRVTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-39,IGLV2-23,IGHV4-39,IGLV2-23
8,QVQLQESGPGLVRPSETLSLECSVSGSSLSNDYYWGWIRQPPGKGLQWIGNIYHSGTTYYNPSLKSRLTMSVDTSRNHFSLQLDSVTAADTAVYYCARLIYTGYGKRCFDYWGQGALVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,CTTTCTGAGAGTCATGGACCTCCTGTGCAAGAACATGAAGCACCTGTGGTTTTTCCTCCTGCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAGGCCTTCGGAGACCCTGTCCCTCGAATGCTCTGTCTCTGGTTCCTCTCTCAGCAATGATTATTATTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGCAGTGGATTGGTAATATCTATCATAGTGGGACCACCTACTACAACCCGTCCCTCAAGAGTCGACTCACCATGTCAGTGGACACGTCCCGGAACCACTTCTCCTTGCAGCTGGACTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGGCTAATCTATACTGGCTACGGCAAGAGATGCTTTGACTACTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-38-2*01,CAGGTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAGGCCTTCGGAGACCCTGTCCCTCGAATGCTCTGTCTCTGGTTCCTCTCTCAGCAATGATTATTATTGGGGCTGGATCCGGCAGCCCCCAGGGAAGGGGCTGCAGTGGATTGGTAATATCTATCATAGTGGGACCACCTACTACAACCCGTCCCTCAAGAGTCGACTCACCATGTCAGTGGACACGTCCCGGAACCACTTCTCCTTGCAGCTGGACTCTGTGACCGCCGCAGACACGGCCGTGTATTACTGTGCGAGGCTAATCTATACTGGCTACGGCAAGAGATGCTTTGACTACTGGGGCCAGGGAGCCCTGGTCACCGTCTCCTCAG,QVQLQESGPGLVKPSETLSLTCAVSGYSISSGYYWGWIRQPPGKGLEWIGSIYHSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXXYSGYXXXXFDYWGQGTLVTVSS,ARLIYTGYGKRCFDY,AGGAGTCAGACCCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGTTCCCAGGTTCCAGATGCGACATCCAGATGACCCAGTCTCCACCTTTCGTGTCTGCATCTGTGGGAGACAGCGTCACCATCACTTGTCGGGCGAGTCAGGGTATTACCGACTGGTTAGCCTGGTATCAGCATAAACAAGGGAAAGCCCCTAAGCTCCTCATCTTCGCTGCATCCACTTTGCAGAGTGGGGTCCCGTCACGATTCAGCGGCACTGGATCTGGAACAGATTTCACTCTCACCATCACCAGACTACAGCCTGAAGATTCTGCAACTTACTATTGTCAACAGGGTTACACATTCCCCGGGGGTTTCACTTTCGGCCCTGGGACCAAAGTGGATGTCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-12*01,GACATCCAGATGACCCAGTCTCCACCTTTCGTGTCTGCATCTGTGGGAGACAGCGTCACCATCACTTGTCGGGCGAGTCAGGGTATTACCGACTGGTTAGCCTGGTATCAGCATAAACAAGGGAAAGCCCCTAAGCTCCTCATCTTCGCTGCATCCACTTTGCAGAGTGGGGTCCCGTCACGATTCAGCGGCACTGGATCTGGAACAGATTTCACTCTCACCATCACCAGACTACAGCCTGAAGATTCTGCAACTTACTATTGTCAACAGGGTTACACATTCCCCGGGGGTTTCACTTTCGGCCCTGGGACCAAAGTGGATGTCAAAC,DIQMTQSPPFVSASVGDSVTITCRASQGITDWLAWYQHKQGKAPKLLIFAASTLQSGVPSRFSGTGSGTDFTLTITRLQPEDSATYYCQQGYTFPGGFTFGPGTKVDVK,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQANSFPXXFTFGPGTKVDIK,QQGYTFPGGFT,QVQLQESGPGLVRPSETLSLECSVSGSSLSNDYYWGWIRQPPGKGLQWIGNIYHSGTTYYNPSLKSRLTMSVDTSRNHFSLQLDSVTAADTAVYYCARLIYTGYGKRCFDYWGQGALVTVSS[SEP]DIQMTQSPPFVSASVGDSVTITCRASQGITDWLAWYQHKQGKAPKLLIFAASTLQSGVPSRFSGTGSGTDFTLTITRLQPEDSATYYCQQGYTFPGGFTFGPGTKVDVK,,human,390c,"James et al, 2020",65 to 70,IGHV4-38-2,IGKV1-12,IGHV4-38-2,IGKV1-12
9,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVGVIWYDGSKKYYSDSVKGRFTISRDSPNNMLYLQMNSLRAEDTAVYFCARDDDGSNQYGIFEYWGQGTVVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GAGCTCTGGGAGAGGAGCCCAGCACTAGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCAGTGTCAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCGTCTGGATTCACCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTTGGAGTTATATGGTATGATGGAAGTAAAAAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAGTCCCAACAACATGCTGTATTTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCTGTTTATTTCTGTGCGAGAGATGATGATGGTAGTAATCAGTATGGGATCTTTGAATACTGGGGCCAGGGAACCGTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-33*01,CAGGTGCAGCTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCGTCTGGATTCACCTTCAGTAGTTATGGCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGGCTGGAGTGGGTTGGAGTTATATGGTATGATGGAAGTAAAAAATACTATTCAGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAGTCCCAACAACATGCTGTATTTGCAGATGAACAGCCTGAGAGCCGAGGACACGGCTGTTTATTTCTGTGCGAGAGATGATGATGGTAGTAATCAGTATGGGATCTTTGAATACTGGGGCCAGGGAACCGTGGTCACCGTCTCCTCAG,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVAVIWYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARXXDSSXXXXXFDYWGQGTLVTVSS,ARDDDGSNQYGIFEY,TGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGTCTCCGTGTCTGGGTCTCCTGGACAGTCGATCGCCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTCTGTCTCCTGGTTCCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGCCTCCAGGCTGAAGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCGGCTGTTCGGCGGAGGGACCAAGTTGACCGTCCTAAGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGTCTCCGTGTCTGGGTCTCCTGGACAGTCGATCGCCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTCTGTCTCCTGGTTCCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGCCTCCAGGCTGAAGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCGGCTGTTCGGCGGAGGGACCAAGTTGACCGTCCTA,QSALTQPVSVSGSPGQSIAISCTGTSSDVGGYNSVSWFQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTRLFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTXXFGGGTKLTVL,SSYTSSSTRL,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLEWVGVIWYDGSKKYYSDSVKGRFTISRDSPNNMLYLQMNSLRAEDTAVYFCARDDDGSNQYGIFEYWGQGTVVTVSS[SEP]QSALTQPVSVSGSPGQSIAISCTGTSSDVGGYNSVSWFQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTRLFGGGTKLTVL,,human,390c,"James et al, 2020",65 to 70,IGHV3-33,IGLV2-14,IGHV3-33,IGLV2-14
10,QLRLQESGPGLVKPSETLSLTCSVSGVSISSSSYFWGWIRQSPGKGLEWIGNIYDRGSTYYNPSLKTRATLRVDASKNEFSLELNSVSAADTGVYYCARTRFSVETYYYNGMDVWGQGTTVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,TTGATATTTCTTATATGGGGATGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCACCTGTGGTTCCTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAACTTCGACTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTGGTGTCTCCATCAGTAGTAGCAGTTACTTCTGGGGCTGGATCCGCCAGTCCCCAGGGAAGGGGCTGGAGTGGATTGGAAACATCTATGATCGTGGGAGTACCTACTACAATCCGTCCCTCAAGACTCGAGCCACCTTACGCGTTGACGCGTCGAAGAACGAGTTTTCCCTGGAACTGAACTCTGTGAGCGCCGCAGACACGGGTGTCTATTACTGTGCGAGGACGAGGTTTAGTGTGGAGACTTATTACTACAACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-39*01,CAACTTCGACTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCCTCGGAGACCCTGTCCCTCACCTGCAGTGTCTCTGGTGTCTCCATCAGTAGTAGCAGTTACTTCTGGGGCTGGATCCGCCAGTCCCCAGGGAAGGGGCTGGAGTGGATTGGAAACATCTATGATCGTGGGAGTACCTACTACAATCCGTCCCTCAAGACTCGAGCCACCTTACGCGTTGACGCGTCGAAGAACGAGTTTTCCCTGGAACTGAACTCTGTGAGCGCCGCAGACACGGGTGTCTATTACTGTGCGAGGACGAGGTTTAGTGTGGAGACTTATTACTACAACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAXXXYSXXXXYYYGMDVWGQGTTVTVSS,ARTRFSVETYYYNGMDV,CTGGGCCTCAGGAAGCAGCATCGGGGGTGCCTCAGCCATGGCATGGATCCCTCTCTTCCTCGGCGTCCTTGCTTACTGCACAGGATCCGTGGCCTCCTTTGCGCTGACTCAGCCACCCTCTCTGTCCGTGTCCCCAGGACAGACAGCCAGCGTCAATTGTTCTGGAGATAAATTGGAGGATAAATATGCTTGTTGGTATCAACATAAGCCAGGCCAGTCCCCTCTCTTGATCATATATCAGGATAACAAGCGGCCCCCCGGGATCCCTGAGCGATTCTCTGGCTCCAATTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGCCCATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGTTTCTCTGGTGTTTTTGTCTTCGGAGCTGGGACCAAGGTCACCGTCCTAGGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGTGTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV3-1*01,TCCTTTGCGCTGACTCAGCCACCCTCTCTGTCCGTGTCCCCAGGACAGACAGCCAGCGTCAATTGTTCTGGAGATAAATTGGAGGATAAATATGCTTGTTGGTATCAACATAAGCCAGGCCAGTCCCCTCTCTTGATCATATATCAGGATAACAAGCGGCCCCCCGGGATCCCTGAGCGATTCTCTGGCTCCAATTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGCCCATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGTTTCTCTGGTGTTTTTGTCTTCGGAGCTGGGACCAAGGTCACCGTCCTAG,SFALTQPPSLSVSPGQTASVNCSGDKLEDKYACWYQHKPGQSPLLIIYQDNKRPPGIPERFSGSNSGNTATLTISGTQPMDEADYYCQAWDSFSGVFVFGAGTKVTVL,SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDXXXXXXVFGTGTKVTVL,QAWDSFSGVFV,QLRLQESGPGLVKPSETLSLTCSVSGVSISSSSYFWGWIRQSPGKGLEWIGNIYDRGSTYYNPSLKTRATLRVDASKNEFSLELNSVSAADTGVYYCARTRFSVETYYYNGMDVWGQGTTVTVSS[SEP]SFALTQPPSLSVSPGQTASVNCSGDKLEDKYACWYQHKPGQSPLLIIYQDNKRPPGIPERFSGSNSGNTATLTISGTQPMDEADYYCQAWDSFSGVFVFGAGTKVTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-39,IGLV3-1,IGHV4-39,IGLV3-1


In [82]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
merged_unsorted_b_cells_unique_grouped = groupby(merged_unsorted_b_cells_unique, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
merged_unsorted_b_cells_unique_grouped_filtered = filter(g -> nrow(g) > 1, merged_unsorted_b_cells_unique_grouped)


Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,QLVESGGGVVQPGRSLRLSCAASGFTFSNHAMHWVRQAPGKGLEWVAFISYDEAEQIYADAVRGRFTISRDNSKNTVYLQMNSLTTDDTAVYFCAKGAYGPLELFHGTDVWGQGTTVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GAGCTCTGGGAGAGGAGCCCAGCACTCGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCATGTGTCGGGAACAACTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAATCATGCCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGTCTGGAGTGGGTGGCATTTATATCATATGATGAAGCTGAGCAAATTTACGCAGACGCCGTGAGGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGGTGTATCTGCAAATGAACAGCCTGACAACTGACGACACGGCTGTGTATTTCTGTGCGAAAGGGGCCTATGGTCCCTTAGAACTATTCCACGGAACGGACGTCTGGGGCCAAGGGACTACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-30*04,CAACTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAATCATGCCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGTCTGGAGTGGGTGGCATTTATATCATATGATGAAGCTGAGCAAATTTACGCAGACGCCGTGAGGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGGTGTATCTGCAAATGAACAGCCTGACAACTGACGACACGGCTGTGTATTTCTGTGCGAAAGGGGCCTATGGTCCCTTAGAACTATTCCACGGAACGGACGTCTGGGGCCAAGGGACTACGGTCACCGTCTCCTCA,QLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARXXXXXXXLFHGMDVWGQGTTVTVSS,AKGAYGPLELFHGTDV,CCTGGGTCAGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGCTATCGCCTTAGAAATTATGCTGACGCAGTCTCCAGACACCCTATCTTTGTCTCCAGGAGAAAGAGCCACCCTCTCCTGCAAGGCCAGTCAGAGTATTGTCACCAATTACCTAGCCTGGTACCAGCAGACACCTGGCCAGGCTCCCAAACTCCTCGTCTTTGGTGTGTCTAACACGGTCACTGGCATCCCAGGCAGGTTCGTTGGCGGTGGGTCTGGGACAGACTTCACTCTCACCATCACCAGCCTGGAGCCTGAAGACTTTGCGGTGTACTACTGTCATCACTATGGAAGTCAGTCGTGGACGTTCGGCCAGGGGACCACGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTATGCTGACGCAGTCTCCAGACACCCTATCTTTGTCTCCAGGAGAAAGAGCCACCCTCTCCTGCAAGGCCAGTCAGAGTATTGTCACCAATTACCTAGCCTGGTACCAGCAGACACCTGGCCAGGCTCCCAAACTCCTCGTCTTTGGTGTGTCTAACACGGTCACTGGCATCCCAGGCAGGTTCGTTGGCGGTGGGTCTGGGACAGACTTCACTCTCACCATCACCAGCCTGGAGCCTGAAGACTTTGCGGTGTACTACTGTCATCACTATGGAAGTCAGTCGTGGACGTTCGGCCAGGGGACCACGGTGGAGATCAAAC,EIMLTQSPDTLSLSPGERATLSCKASQSIVTNYLAWYQQTPGQAPKLLVFGVSNTVTGIPGRFVGGGSGTDFTLTITSLEPEDFAVYYCHHYGSQSWTFGQGTTVEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGXXXWTFGQGTKVEIK,HHYGSQSWT,QLVESGGGVVQPGRSLRLSCAASGFTFSNHAMHWVRQAPGKGLEWVAFISYDEAEQIYADAVRGRFTISRDNSKNTVYLQMNSLTTDDTAVYFCAKGAYGPLELFHGTDVWGQGTTVTVSS[SEP]EIMLTQSPDTLSLSPGERATLSCKASQSIVTNYLAWYQQTPGQAPKLLVFGVSNTVTGIPGRFVGGGSGTDFTLTITSLEPEDFAVYYCHHYGSQSWTFGQGTTVEIK,,human,390c,"James et al, 2020",65 to 70,IGHV3-30,IGKV3-20,IGHV3-30,IGKV3-20
2,QLVESGGGVVQPGRSLRLSCAASGFTFSNHAMHWVRQAPGKGLEWVAFISYDEAEQIYADAVRGRFTISRDNSKNTVYLQMNSVTTDDTAVYFCAKGAYGPLELFHGTDVWGQGTTVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GAGCTCTGGGAGAGGAGCCCAGCACTCGAAGTCGGCGGTGTTTCCATTCGGTGATCAGCACTGAACACAGAGGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTCGTTGCTCTTTTAAGAGGTGTCCTGTGTCGGGAACAACTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAATCATGCCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGTCTGGAGTGGGTGGCATTTATATCATATGATGAAGCTGAGCAAATTTACGCAGACGCCGTGAGGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGGTGTATCTGCAAATGAACAGCGTGACAACTGACGACACGGCTGTGTATTTCTGTGCGAAAGGGGCCTATGGTCCCTTAGAACTGTTCCACGGAACGGACGTCTGGGGCCAAGGGACTACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-30*04,CAACTGGTGGAGTCTGGGGGAGGCGTGGTCCAGCCTGGGAGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAATCATGCCATGCACTGGGTCCGCCAGGCTCCAGGCAAGGGTCTGGAGTGGGTGGCATTTATATCATATGATGAAGCTGAGCAAATTTACGCAGACGCCGTGAGGGGCCGATTCACCATCTCCAGAGACAATTCCAAGAACACGGTGTATCTGCAAATGAACAGCGTGACAACTGACGACACGGCTGTGTATTTCTGTGCGAAAGGGGCCTATGGTCCCTTAGAACTGTTCCACGGAACGGACGTCTGGGGCCAAGGGACTACGGTCACCGTCTCCTCA,QLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARXXYGXXXXXXGMDVWGQGTTVTVSS,AKGAYGPLELFHGTDV,GAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGCTTTCGCCGTAGACATTATTCTGACGCAGTCTCCAGACACCCTATCTTTGTCTCCAGGAGAAAGAGCCACCCTCTCCTGCAAGGCCAGTCAGAGTATTGTCACCAATTACCTAGCCTGGTACCAGCAGACACCTGGCCAGGCTCCCAAACTCCTCGTCTTTGGTGTGTCTAACACGGTCGCTGGCATCCCAGGCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCACCAGCCTGGAGCCTGAAGACTTTGCGGTGTACTACTGTCATCACTATGGAAGTCAGTCGTGGACGTTCGGCCAGGGGACCACGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GACATTATTCTGACGCAGTCTCCAGACACCCTATCTTTGTCTCCAGGAGAAAGAGCCACCCTCTCCTGCAAGGCCAGTCAGAGTATTGTCACCAATTACCTAGCCTGGTACCAGCAGACACCTGGCCAGGCTCCCAAACTCCTCGTCTTTGGTGTGTCTAACACGGTCGCTGGCATCCCAGGCAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCACCAGCCTGGAGCCTGAAGACTTTGCGGTGTACTACTGTCATCACTATGGAAGTCAGTCGTGGACGTTCGGCCAGGGGACCACGGTGGAGATCAAAC,DIILTQSPDTLSLSPGERATLSCKASQSIVTNYLAWYQQTPGQAPKLLVFGVSNTVAGIPGRFSGSGSGTDFTLTITSLEPEDFAVYYCHHYGSQSWTFGQGTTVEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGXXXWTFGQGTKVEIK,HHYGSQSWT,QLVESGGGVVQPGRSLRLSCAASGFTFSNHAMHWVRQAPGKGLEWVAFISYDEAEQIYADAVRGRFTISRDNSKNTVYLQMNSVTTDDTAVYFCAKGAYGPLELFHGTDVWGQGTTVTVSS[SEP]DIILTQSPDTLSLSPGERATLSCKASQSIVTNYLAWYQQTPGQAPKLLVFGVSNTVAGIPGRFSGSGSGTDFTLTITSLEPEDFAVYYCHHYGSQSWTFGQGTTVEIK,,human,390c,"James et al, 2020",65 to 70,IGHV3-30,IGKV3-20,IGHV3-30,IGKV3-20

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,QITLRESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVAWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKSHVILSMTNMDPVDTGTYYCAHIHRDVIRYYFYMDVWGKGTTVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GGGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCACCTTGAGGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTCTCTGGGTTCTCACTCAGCACTAGTGGAGTGGGTGTGGCCTGGATCCGTCAGCCCCCCGGAAAGGCCCTGGAGTGGCTTGCACTCATTTACTGGGATGATGATAAGCGCTACAGCCCATCTCTGAAGAGCAGGCTCACTATCACCAAGGATACCTCCAAAAGCCATGTGATCCTTAGCATGACCAACATGGACCCTGTGGACACAGGCACATATTACTGTGCGCACATTCACAGGGACGTTATCAGGTACTACTTCTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV2-5*02,CAGATCACCTTGAGGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTCTCTGGGTTCTCACTCAGCACTAGTGGAGTGGGTGTGGCCTGGATCCGTCAGCCCCCCGGAAAGGCCCTGGAGTGGCTTGCACTCATTTACTGGGATGATGATAAGCGCTACAGCCCATCTCTGAAGAGCAGGCTCACTATCACCAAGGATACCTCCAAAAGCCATGTGATCCTTAGCATGACCAACATGGACCCTGTGGACACAGGCACATATTACTGTGCGCACATTCACAGGGACGTTATCAGGTACTACTTCTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHXXXXVIXYYYYMDVWGKGTTVTVSS,AHIHRDVIRYYFYMDV,AGCTTCAGCTGTGGGTAGAGAAGACAGGACTCAGGACAATCTCCAGCATGGCCAGCTTCCCTCTCCTCCTCACCCTCCTCACTCACTGTGCAGGGTCCTGGGCCCAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAATCCTGTAAACTGGTACCAGCAGCTCCCAGGAACGGCCCCCAAACTCCTCATCTATAGTTATTATCACCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCAGTCTGAGGATGAGGCTGATTACTACTGTGCAGCATGGGATGACGGCCTGAATGGTCCGGAACTGGTTTTCGGCGGAGGGACCAAGCTGATCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-44*01,CAGTCTGTACTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAATCCTGTAAACTGGTACCAGCAGCTCCCAGGAACGGCCCCCAAACTCCTCATCTATAGTTATTATCACCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCAGTCTGAGGATGAGGCTGATTACTACTGTGCAGCATGGGATGACGGCCTGAATGGTCCGGAACTGGTTTTCGGCGGAGGGACCAAGCTGATCGTCCTAG,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNPVNWYQQLPGTAPKLLIYSYYHRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDGLNGPELVFGGGTKLIVL,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPKLLIYSNNQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDSLNGPXXVFGGGTKLTVL,AAWDDGLNGPELV,QITLRESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVAWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKSHVILSMTNMDPVDTGTYYCAHIHRDVIRYYFYMDVWGKGTTVTVSS[SEP]QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNPVNWYQQLPGTAPKLLIYSYYHRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDGLNGPELVFGGGTKLIVL,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV2-5,IGLV1-44,IGHV2-5,IGLV1-44
2,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKNHVILSMTNMDPVDTGTYYCAHIHRDVIRYYFYMDVWGKGTTVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,GGACTCCTGTGCCCCACCATGGACACACTTTGCTCCACGCTCCTGCTGCTGACCATCCCTTCATGGGTCTTGTCCCAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTCTCTGGGTTCTCACTCAGCACTAGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCCGGAAAGGCCCTGGAGTGGCTTGCACTCATTTACTGGGATGATGATAAGCGCTACAGCCCATCTCTGAAGAGCAGGCTCACTATCACCAAGGATACCTCCAAAAACCATGTGATCCTTAGCATGACCAACATGGACCCTGTGGACACAGGCACATATTACTGTGCGCACATTCACAGGGACGTTATCAGGTACTACTTCTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV2-5*02,CAGATCACCTTGAAGGAGTCTGGTCCTACGCTGGTGAAACCCACACAGACCCTCACGCTGACCTGCACCTTCTCTGGGTTCTCACTCAGCACTAGTGGAGTGGGTGTGGGCTGGATCCGTCAGCCCCCCGGAAAGGCCCTGGAGTGGCTTGCACTCATTTACTGGGATGATGATAAGCGCTACAGCCCATCTCTGAAGAGCAGGCTCACTATCACCAAGGATACCTCCAAAAACCATGTGATCCTTAGCATGACCAACATGGACCCTGTGGACACAGGCACATATTACTGTGCGCACATTCACAGGGACGTTATCAGGTACTACTTCTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCA,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHXXXXVIXYYYYMDVWGKGTTVTVSS,AHIHRDVIRYYFYMDV,TCAGCTGTGGGTAGAGAAGACAGGACTCAGGACAATCTCCAGCATGGCCAGCTTCCCTCTCCTCCTCACCCTCCTCACTCACTGTGCAGGGTCCTGGGCCCAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAATCCTGTAAACTGGTACCAGCAGCTCCCAGGAACGGCCCCCAAACTCCTCATCTATAGTTATAATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCAGTCTGAGGATGAGGCTGATTACTACTGTGCAGCATGGGATGACGGCCTGAATGGTCCGGAACTGGTTTTCGGCGGAGGGACCAAGCTGATCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV1-44*01,CAGTCTGTGCTGACTCAGCCACCCTCAGCGTCTGGGACCCCCGGGCAGAGGGTCACCATCTCTTGTTCTGGAAGCAGCTCCAACATCGGAAGTAATCCTGTAAACTGGTACCAGCAGCTCCCAGGAACGGCCCCCAAACTCCTCATCTATAGTTATAATCAGCGGCCCTCAGGGGTCCCTGACCGATTCTCTGGCTCCAAGTCTGGCACCTCAGCCTCCCTGGCCATCAGTGGGCTCCAGTCTGAGGATGAGGCTGATTACTACTGTGCAGCATGGGATGACGGCCTGAATGGTCCGGAACTGGTTTTCGGCGGAGGGACCAAGCTGATCGTCCTAG,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNPVNWYQQLPGTAPKLLIYSYNQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDGLNGPELVFGGGTKLIVL,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNTVNWYQQLPGTAPKLLIYSNNQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDSLNGPXXVFGGGTKLTVL,AAWDDGLNGPELV,QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWDDDKRYSPSLKSRLTITKDTSKNHVILSMTNMDPVDTGTYYCAHIHRDVIRYYFYMDVWGKGTTVTVSS[SEP]QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNPVNWYQQLPGTAPKLLIYSYNQRPSGVPDRFSGSKSGTSASLAISGLQSEDEADYYCAAWDDGLNGPELVFGGGTKLIVL,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV2-5,IGLV1-44,IGHV2-5,IGLV1-44


In [83]:

# filter out every group that has the same entry in Subject
merged_unsorted_b_cells_unique_grouped_filtered_o_1_s = filter(g -> length(unique(g.Subject)) > 1, merged_unsorted_b_cells_unique_grouped_filtered)

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,EVQLVESGGGLVQPGESLRLSCEASGITFSSYWMSWVRQAPGKGLEWVANIKKDGSETWYVDSVKGRFTISRDNAKNSLYLQMNNLRVEDAAVYYCGRGSGWLQDYWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGGATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTCTTTTAGAAGTTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGAGTCCCTGAGACTCTCCTGTGAAGCCTCTGGAATCACGTTCAGTAGCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAAAAAGATGGAAGTGAGACATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAACCTGAGAGTCGAGGACGCGGCTGTGTATTACTGTGGGAGAGGGAGTGGCTGGTTACAGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGAGTCCCTGAGACTCTCCTGTGAAGCCTCTGGAATCACGTTCAGTAGCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAAAAAGATGGAAGTGAGACATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAACCTGAGAGTCGAGGACGCGGCTGTGTATTACTGTGGGAGAGGGAGTGGCTGGTTACAGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXSGWXXDYWGQGTLVTVSS,GRGSGWLQDY,GAGCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTGTTTTCTCTAGCTCCAGTAATAAGAACTTCCTAGCTTGGTTCCAGAAGAAACCAGGGCAGCCTCCTAAGTTGCTAATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATCATAGTGGTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTGTTTTCTCTAGCTCCAGTAATAAGAACTTCCTAGCTTGGTTCCAGAAGAAACCAGGGCAGCCTCCTAAGTTGCTAATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATCATAGTGGTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPDSLAVSLGERATVNCKSSQSVFSSSSNKNFLAWFQKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYHSGPLTFGGGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLTFGGGTKVEIK,QQYHSGPLT,EVQLVESGGGLVQPGESLRLSCEASGITFSSYWMSWVRQAPGKGLEWVANIKKDGSETWYVDSVKGRFTISRDNAKNSLYLQMNNLRVEDAAVYYCGRGSGWLQDYWGQGTLVTVSS[SEP]DIVMTQSPDSLAVSLGERATVNCKSSQSVFSSSSNKNFLAWFQKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYHSGPLTFGGGTKVEIK,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV3-7,IGKV4-1,IGHV3-7,IGKV4-1
2,EVQLVESGGGLVQPGGSLRLSCAASGVRLSNYWMSWVRQAPGKGLEWVANMKKDGSEKWYVDSVKGRFTISRDNAENSLFLQMDKLRDDDTAVYYCGRGSGWLQDYWGQGISVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGACCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGAGTCCGCCTCAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAAGAAAGATGGAAGTGAGAAATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATTTCCAGAGACAACGCCGAGAACTCACTGTTTCTGCAAATGGACAAACTGAGAGACGACGACACGGCTGTGTATTACTGCGGGAGGGGCAGTGGCTGGCTACAAGATTACTGGGGCCAGGGAATATCGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGAGTCCGCCTCAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAAGAAAGATGGAAGTGAGAAATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATTTCCAGAGACAACGCCGAGAACTCACTGTTTCTGCAAATGGACAAACTGAGAGACGACGACACGGCTGTGTATTACTGCGGGAGGGGCAGTGGCTGGCTACAAGATTACTGGGGCCAGGGAATATCGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGSGWXXXYWGQGTLVTVSS,GRGSGWLQDY,GCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTATTTTATCCAGTTCCAACAATAAGAACTACTTAGCTTGGTTCCACAAGAAACCAGGACAGCCTCCTAAACTACTCATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATGGTGGTCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTATTTTATCCAGTTCCAACAATAAGAACTACTTAGCTTGGTTCCACAAGAAACCAGGACAGCCTCCTAAACTACTCATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATGGTGGTCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPDSLAVSLGERATVNCKSSQSILSSSNNKNYLAWFHKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAVYYCQQYYGGPLTFGGGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLTFGGGTKVEIK,QQYYGGPLT,EVQLVESGGGLVQPGGSLRLSCAASGVRLSNYWMSWVRQAPGKGLEWVANMKKDGSEKWYVDSVKGRFTISRDNAENSLFLQMDKLRDDDTAVYYCGRGSGWLQDYWGQGISVTVSS[SEP]DIVMTQSPDSLAVSLGERATVNCKSSQSILSSSNNKNYLAWFHKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAVYYCQQYYGGPLTFGGGTKVEIK,CMV,human,Donor-4,"Jaffe et al., 2022",50.0,IGHV3-7,IGKV4-1,IGHV3-7,IGKV4-1

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,EVQLAESGGGLVQPGGSLRLSCVVSGFTFSDYWMSWVRQAPGKGLEWVANMNEDGSEKYCLDGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGDVNSGDYWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAGTTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGTCTCTGGATTCACGTTTAGTGACTACTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAATGAAGATGGAAGTGAGAAATACTGTCTGGACGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGGAGATGTCAACTCGGGCGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-7*03,GAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGTCTCTGGATTCACGTTTAGTGACTACTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAATGAAGATGGAAGTGAGAAATACTGTCTGGAC---------GGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGGAGATGTCAACTCGGGCGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXNXXDYWGQGTLVTVSS,ARGDVNSGDY,GATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCCAGTGGGGATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTACACAGTGATGGAAACATCTACTTGAATTGGCTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTCATACATAGGGTTTCTACCCGGGACTCTGGGGTCCCAGAAAGATTCAGCGGCAGTGGGTCAGGCACTAATTTCACACTGAGAATCAGCAGGGTGGAGGCTGAGGATGTTGGCGTTTATTACTGCATGCAAGGTAAACACTGGACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*02,GATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTACACAGTGATGGAAACATCTACTTGAATTGGCTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTCATACATAGGGTTTCTACCCGGGACTCTGGGGTCCCAGAAAGATTCAGCGGCAGTGGGTCAGGCACTAATTTCACACTGAGAATCAGCAGGGTGGAGGCTGAGGATGTTGGCGTTTATTACTGCATGCAAGGTAAACACTGGACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC,DLVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNIYLNWLQQRPGQSPRRLIHRVSTRDSGVPERFSGSGSGTNFTLRISRVEAEDVGVYYCMQGKHWTFGQGTKLEIK,DVVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWPFGQGTKLEIK,MQGKHWT,EVQLAESGGGLVQPGGSLRLSCVVSGFTFSDYWMSWVRQAPGKGLEWVANMNEDGSEKYCLDGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGDVNSGDYWGQGTLVTVSS[SEP]DLVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNIYLNWLQQRPGQSPRRLIHRVSTRDSGVPERFSGSGSGTNFTLRISRVEAEDVGVYYCMQGKHWTFGQGTKLEIK,SARS-COV-2,human,Patient-15,"Mor et al., 2021",51,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30
2,EVQLVESGGGLVQPGGSLRLSCAGSGFMFSDYWMTWVRQAPGKGLECVAITDQEGNERYSVHSVRGRFTISRDNAKNSLYLEMHSLRAEDTALYYCARGDVNSGDYWGQGTMVTVAS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACGCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGTTATTCTAGAAGGTGTCCAGTGTGAGGTGCAATTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGCAGGCTCTGGATTTATGTTTAGCGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGCGTGGCCATCACAGACCAAGAAGGAAATGAGAGATACTCTGTTCACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACGCCAAAAATTCTCTGTATTTGGAAATGCACAGCCTGAGAGCCGAAGACACGGCTCTATATTACTGTGCGAGAGGGGATGTCAATTCGGGGGACTATTGGGGCCAGGGAACCATGGTCACCGTCGCGTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAATTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGCAGGCTCTGGATTTATGTTTAGCGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGCGTGGCCATCACAGACCAAGAAGGAAATGAGAGATACTCTGTTCACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACGCCAAAAATTCTCTGTATTTGGAAATGCACAGCCTGAGAGCCGAAGACACGGCTCTATATTACTGTGCGAGAGGGGATGTCAATTCGGGGGACTATTGGGGCCAGGGAACCATGGTCACCGTCGCGTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXSGDYWGQGTLVTVSS,ARGDVNSGDY,GGGGACTGATCAGGGCTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCTCGTGGGTTTTTTGTGCTGACTCAGTCTCCACTCTCACTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGGGATGGATATACCTACCTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTTCTTTATAAGGTTTCTCACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAGAATCAGTAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAACATACTGGATGTTCGGCCAAGGGACCAAACTGGAGATCAAAGGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,TTGTGCTGACTCAGTCTCCACTCTCACTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGGGATGGATATACCTACCTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTTCTTTATAAGGTTTCTCACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAGAATCAGTAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAACATACTGGATGTTCGGCCAAGGGACCAAACTGGAGATCAAA,VLTQSPLSLPVTLGQPASISCRSNASLLDRDGYTYLNWFQQRPGQSPRRLLYKVSHRDSGVPDRFSGSGSGTDFTLRISRVEAEDVAVYYCMQATYWMFGQGTKLEIK,VMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWXFGQGTKVEIK,MQATYWM,EVQLVESGGGLVQPGGSLRLSCAGSGFMFSDYWMTWVRQAPGKGLECVAITDQEGNERYSVHSVRGRFTISRDNAKNSLYLEMHSLRAEDTALYYCARGDVNSGDYWGQGTMVTVAS[SEP]VLTQSPLSLPVTLGQPASISCRSNASLLDRDGYTYLNWFQQRPGQSPRRLLYKVSHRDSGVPDRFSGSGSGTDFTLRISRVEAEDVAVYYCMQATYWMFGQGTKLEIK,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30
3,EVQLVESGGGLVQPGGSLRLSCVASGFMFSDYWMTWVRQAPGKGLEWVANTNQDGSDKHYVYSVRGRFTISRDNTENSLFLEMHSLRPEDTALYYCARGDVNSGDYWGQGTMVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACGCACCATGGAATTGGGGCTGAACTGGGTTTTCCTTGTTGCTATTCTGGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGTAGCCTCTGGATTTATGTTTAGTGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACACAAACCAAGATGGGAGTGACAAGCACTATGTCTACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACACCGAAAATTCTCTGTTTCTGGAAATGCACAGCCTGAGACCCGAAGACACGGCTCTATATTATTGTGCGCGAGGGGATGTCAACTCGGGGGACTACTGGGGCCAGGGAACCATGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGTAGCCTCTGGATTTATGTTTAGTGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACACAAACCAAGATGGGAGTGACAAGCACTATGTCTACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACACCGAAAATTCTCTGTTTCTGGAAATGCACAGCCTGAGACCCGAAGACACGGCTCTATATTATTGTGCGCGAGGGGATGTCAACTCGGGGGACTACTGGGGCCAGGGAACCATGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXSGDYWGQGTLVTVSS,ARGDVNSGDY,CTGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCTCGTGGGTATTTTGTGATGACTCAGTCTCCTCTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGTGATGGGAACACCCACTTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTACTTTATAAGGTTTCTCGCCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAATATACTGGACGTTCGGCCAAGGGACCAAACTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,ATTTTGTGATGACTCAGTCTCCTCTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGTGATGGGAACACCCACTTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTACTTTATAAGGTTTCTCGCCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAATATACTGGACGTTCGGCCAAGGGACCAAACTGGAAATCAAAC,FVMTQSPLSLPVTLGQPASISCRSNASLLDSDGNTHLNWFQQRPGQSPRRLLYKVSRRDSGVPDRFSGSGSGTDFTLKISRVEAEDVAVYYCMQAIYWTFGQGTKLEIK,VVMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWTFGQGTKVEIK,MQAIYWT,EVQLVESGGGLVQPGGSLRLSCVASGFMFSDYWMTWVRQAPGKGLEWVANTNQDGSDKHYVYSVRGRFTISRDNTENSLFLEMHSLRPEDTALYYCARGDVNSGDYWGQGTMVTVSS[SEP]FVMTQSPLSLPVTLGQPASISCRSNASLLDSDGNTHLNWFQQRPGQSPRRLLYKVSRRDSGVPDRFSGSGSGTDFTLKISRVEAEDVAVYYCMQAIYWTFGQGTKLEIK,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30


In [85]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases_uns = 0
total_groups_uns = length(merged_unsorted_b_cells_unique_grouped_filtered_o_1_s)

for group in merged_unsorted_b_cells_unique_grouped_filtered_o_1_s
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes_uns = unique(group.general_v_gene_light_no_para)
    if length(unique_genes_uns) == 1
        true_cases_uns += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true_uns = (true_cases_uns / total_groups_uns) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true_uns%")
fraction_true_uns

68.73508353221956

In [None]:
# group by predicted_btype
grouped_by_predicted_btype = groupby(merged_unsorted_b_cells_unique, :predicted_btype)

# Extract the first and second groups into separate DataFrames:
df_pred_memory = DataFrame(grouped_by_predicted_btype[1])
df_pred_naive = DataFrame(grouped_by_predicted_btype[2])

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARGRRLYSGSGGMDVWGQGTTVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,CTCTGCTGAAGAAAACCAGCCCTGCAGCTCTGGGAGAGGAGCCCCAGCCCTGGGATTCCCAGCTGTTTCTGCTTGCTGATCAGGACTGCACACAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTAATAGTGATGGGAGTAGCACAAGCTACGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGCTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCAAGAGGACGCCGGCTCTATAGTGGTTCTGGTGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGT,H,IGHV3-74*01,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTAGTTCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTACTGGATGCACTGGGTCCGCCAAGCTCCAGGGAAGGGGCTGGTGTGGGTCTCACGTATTAATAGTGATGGGAGTAGCACAAGCTACGCGGACTCCGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACACGCTGTATCTGCAAATGAACAGTCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCAAGAGGACGCCGGCTCTATAGTGGTTCTGGTGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARXXXXYSGXXGMDVWGQGTTVTVSS,ARGRRLYSGSGGMDV,GGAGGCTGATCACACCCTGTGCAGGAGTCAGACCCACTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGCTCCCAGGTGCCAGATGTGCCATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCGGGCAAGTCAGGGCATTAGAAATGATTTAGGCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTACAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGCACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTATTACTGTCTACAAGATTACAATTACCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-6*01,GCCATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCGGGCAAGTCAGGGCATTAGAAATGATTTAGGCTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTACAAAGTGGGGTCCCATCAAGGTTCAGCGGCAGTGGATCTGGCACAGATTTCACTCTCACCATCAGCAGCCTGCAGCCTGAAGATTTTGCAACTTATTACTGTCTACAAGATTACAATTACCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC,AIQMTQSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCLQDYNYPYTFGQGTKLEIK,AIQMTQSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCLQDYNYPYTFGQGTKLEIK,LQDYNYPYT,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMHWVRQAPGKGLVWVSRINSDGSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARGRRLYSGSGGMDVWGQGTTVTVSS[SEP]AIQMTQSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCLQDYNYPYTFGQGTKLEIK,,human,390c,"James et al, 2020",65 to 70,IGHV3-74,IGKV1-6,IGHV3-74,IGKV1-6
2,EVQLVESGGGLVQPGGSLKLSCAASGFTFSGSAMHWVRQASGKGLEWVGRIRSKANSYATAYAASVKGRFTISRDDSKNTAYLQMNSLKTEDRAVYYCTRQVGDEGFDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,AGCTCTGGGAGAGGAGCTCCAGCCTTGGGATTCCCAGCTGTCTCCACTCGGTGATCGGCACTGAATACAGGAGACTCACCATGGAGTTTGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAAACTCTCCTGTGCAGCCTCTGGGTTCACCTTCAGTGGCTCTGCTATGCACTGGGTCCGCCAGGCTTCCGGGAAAGGGCTGGAGTGGGTTGGCCGTATTAGAAGCAAAGCTAACAGTTACGCGACAGCATATGCTGCGTCGGTGAAAGGCAGGTTCACCATCTCCAGAGATGATTCAAAGAACACGGCGTATCTGCAAATGAACAGCCTGAAAACCGAGGACAGGGCCGTGTATTACTGTACTAGACAAGTCGGCGACGAGGGCTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-73*02,GAGGTGCAGCTGGTGGAGTCCGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAAACTCTCCTGTGCAGCCTCTGGGTTCACCTTCAGTGGCTCTGCTATGCACTGGGTCCGCCAGGCTTCCGGGAAAGGGCTGGAGTGGGTTGGCCGTATTAGAAGCAAAGCTAACAGTTACGCGACAGCATATGCTGCGTCGGTGAAAGGCAGGTTCACCATCTCCAGAGATGATTCAAAGAACACGGCGTATCTGCAAATGAACAGCCTGAAAACCGAGGACAGGGCCGTGTATTACTGTACTAGACAAGTCGGCGACGAGGGCTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLKLSCAASGFTFSGSAMHWVRQASGKGLEWVGRIRSKANSYATAYAASVKGRFTISRDDSKNTAYLQMNSLKTEDTAVYYCTRXXXXXXFDYWGQGTLVTVSS,TRQVGDEGFDY,GCTGTGCTGTGGGTCCAGGAGGCAGAACTCTGGGTGTCTCACCATGGCCTGGATCCCTCTACTTCTCCCCCTCCTCACTCTCTGCACAGGCTCTGAGGCCTCCTATGAGCTGACACAGCCACCCTCGGTGTCAGTGTCCCCAGGACAGACGGCCAGGATCACCTGCTCTGGAGATGCATTGCCAAAGCAATATGCTTATTGGTACCAGCAGAAGCCAGGCCAGGCCCCTGTGATGGTGATATATAAAGACAGTGAGAGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAGCTCAGGGACAACAGTCACGTTGACCATCAGTGGAGTCCAGGCAGAAGACGAGGCTGACTATTACTGTCAATCAGCAGACAGCAGTAGTACTTATCGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV3-25*03,TCCTATGAGCTGACACAGCCACCCTCGGTGTCAGTGTCCCCAGGACAGACGGCCAGGATCACCTGCTCTGGAGATGCATTGCCAAAGCAATATGCTTATTGGTACCAGCAGAAGCCAGGCCAGGCCCCTGTGATGGTGATATATAAAGACAGTGAGAGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAGCTCAGGGACAACAGTCACGTTGACCATCAGTGGAGTCCAGGCAGAAGACGAGGCTGACTATTACTGTCAATCAGCAGACAGCAGTAGTACTTATCGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVMVIYKDSERPSGIPERFSGSSSGTTVTLTISGVQAEDEADYYCQSADSSSTYRVFGGGTKLTVL,SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVLVIYKDSERPSGIPERFSGSSSGTTVTLTISGVQAEDEADYYCQSADSSGTYRVFGGGTKLTVL,QSADSSSTYRV,EVQLVESGGGLVQPGGSLKLSCAASGFTFSGSAMHWVRQASGKGLEWVGRIRSKANSYATAYAASVKGRFTISRDDSKNTAYLQMNSLKTEDRAVYYCTRQVGDEGFDYWGQGTLVTVSS[SEP]SYELTQPPSVSVSPGQTARITCSGDALPKQYAYWYQQKPGQAPVMVIYKDSERPSGIPERFSGSSSGTTVTLTISGVQAEDEADYYCQSADSSSTYRVFGGGTKLTVL,,human,390c,"James et al, 2020",65 to 70,IGHV3-73,IGLV3-25,IGHV3-73,IGLV3-25
3,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTKSVDTSKNQFSLKLSSVTAADTAVYYCARELRTHNFDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,GATGCTTTCTGAGAGTCATGGACCTCCTGTGCAAGAACATGAAGCACCTGTGGTTCTTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAGTAGTTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTGGATTGGGAGTATCTATTATAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTCACCAAATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGAATTGCGTACCCATAACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV4-39*07,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAGTAGTTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTGGATTGGGAGTATCTATTATAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTCACCAAATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCCGTGTATTACTGTGCGAGAGAATTGCGTACCCATAACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXTXXFDYWGQGTLVTVSS,ARELRTHNFDY,GGCTGGTGGGATCAGTCCTGGTGGTAGCTCAGGAAGCAGAGCCTGGAGCATCTCCACTATGGCCTGGGCTCCACTACTTCTCACCCTCCTCGCTCACTGCACAGGTTCTTGGGCCAATTTTATGCTGACTCAGCCCCACTCTGTGTCGGAGTCTCCGGGGAAGACGGTAACCATCTCCTGCACCCGCAGCAGTGGCAGCATTGCCAGCAACTATGTGCAGTGGTACCAGCAGCGCCCGGGCAGTTCCCCCACCACTGTGATCTATGAGGATAACCAAAGACCCTCTGGGGTCCCTGATCGGTTCTCTGGCTCCATCGACAACTCCTCCAACTCTGCCTCCCTCACCATCTCTGGACTGAAGACTGAGGACGAGGCTGACTACTACTGTCAGTCTTATGATTACCGCAATCGGGTGTTCGGCGGAGGGACCCAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTC,L,IGLV6-57*01,AATTTTATGCTGACTCAGCCCCACTCTGTGTCGGAGTCTCCGGGGAAGACGGTAACCATCTCCTGCACCCGCAGCAGTGGCAGCATTGCCAGCAACTATGTGCAGTGGTACCAGCAGCGCCCGGGCAGTTCCCCCACCACTGTGATCTATGAGGATAACCAAAGACCCTCTGGGGTCCCTGATCGGTTCTCTGGCTCCATCGACAACTCCTCCAACTCTGCCTCCCTCACCATCTCTGGACTGAAGACTGAGGACGAGGCTGACTACTACTGTCAGTCTTATGATTACCGCAATCGGGTGTTCGGCGGAGGGACCCAGCTGACCGTCCTAG,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDNSSNSASLTISGLKTEDEADYYCQSYDYRNRVFGGGTQLTVL,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSNRVFGGGTKLTVL,QSYDYRNRV,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTKSVDTSKNQFSLKLSSVTAADTAVYYCARELRTHNFDYWGQGTLVTVSS[SEP]NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDNSSNSASLTISGLKTEDEADYYCQSYDYRNRVFGGGTQLTVL,,human,390c,"James et al, 2020",65 to 70,IGHV4-39,IGLV6-57,IGHV4-39,IGLV6-57
4,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCASRGRWEEGLYYYYGMDVWGQGTTVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,TGGGGACCCAAAAACCACACCCCTCCTTGGGAGAATCCCCTAGATCACAGCTCCTCACCATGGACTGGACCTGGAGCATCCTTTTCTTGGTGGCAGCAGCAACAGGTGCCCACTCCCAGGTTCAGCTGGTGCAGTCTGGAGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACCAGCTATGGTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAGCGCTTACAATGGTAACACAAACTATGCACAGAAGCTCCAGGGCAGAGTTACCATGACCACAGACACATCCACGAGCACAGCCTACATGGAGCTGAGGAGCCTGAGATCTGACGACACGGCCGTGTATTACTGTGCGAGCCGGGGGAGGTGGGAGGAGGGACTATATTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV1-18*01,CAGGTTCAGCTGGTGCAGTCTGGAGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGTTACACCTTTACCAGCTATGGTATCAGCTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAGCGCTTACAATGGTAACACAAACTATGCACAGAAGCTCCAGGGCAGAGTTACCATGACCACAGACACATCCACGAGCACAGCCTACATGGAGCTGAGGAGCCTGAGATCTGACGACACGGCCGTGTATTACTGTGCGAGCCGGGGGAGGTGGGAGGAGGGACTATATTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCAXXXXWEXXXXYYYGMDVWGQGTTVTVSS,ASRGRWEEGLYYYYGMDV,CGTTTTGTTTTCTTATATGGGGGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACATTGGTGGTTATAACTTTGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTTATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGAAGCAACACTCTGGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACATTGGTGGTTATAACTTTGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTTATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGAAGCAACACTCTGGGGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDIGGYNFVSWYQQHPGKAPKLMIYDVSYRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTRSNTLGVFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTLXVFGGGTKLTVL,SSYTRSNTLGV,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGWISAYNGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCASRGRWEEGLYYYYGMDVWGQGTTVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDIGGYNFVSWYQQHPGKAPKLMIYDVSYRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTRSNTLGVFGGGTKLTVL,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV1-18,IGLV2-14,IGHV1-18,IGLV2-14
5,QMQLVQSGPEVKKPGTSVKVSCKASGFTFTSSAVQWVRQARGQRLEWIGWIVVGSGNTNYAQKFQERVTITRDMSTSTAYMELSSLRSEDTAVYYCAAGDGVLLWFRDGMDVWGQGTTVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,AGCATCATCCAGAAACCACATCCCTCCGCTAGAGAAGCCCCTGACGGCACAGTTCCTCACTATGGACTGGATTTGGAGGATCCTCTTCTTGGTGGGAGCAGCGACAGGTGCCCACTCCCAAATGCAGCTGGTGCAGTCTGGGCCTGAGGTGAAGAAGCCTGGGACCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGATTCACCTTTACTAGCTCTGCTGTGCAGTGGGTGCGACAGGCTCGTGGACAACGCCTTGAGTGGATAGGATGGATCGTCGTTGGCAGTGGTAACACAAACTACGCACAGAAGTTCCAGGAAAGAGTCACCATTACCAGGGACATGTCCACAAGCACAGCCTATATGGAGCTGAGTAGCCTGAGATCCGAGGACACGGCCGTGTATTACTGTGCGGCAGGCGACGGGGTATTACTATGGTTCAGGGACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV1-58*01,CAAATGCAGCTGGTGCAGTCTGGGCCTGAGGTGAAGAAGCCTGGGACCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGATTCACCTTTACTAGCTCTGCTGTGCAGTGGGTGCGACAGGCTCGTGGACAACGCCTTGAGTGGATAGGATGGATCGTCGTTGGCAGTGGTAACACAAACTACGCACAGAAGTTCCAGGAAAGAGTCACCATTACCAGGGACATGTCCACAAGCACAGCCTATATGGAGCTGAGTAGCCTGAGATCCGAGGACACGGCCGTGTATTACTGTGCGGCAGGCGACGGGGTATTACTATGGTTCAGGGACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCA,QMQLVQSGPEVKKPGTSVKVSCKASGFTFTSSAVQWVRQARGQRLEWIGWIVVGSGNTNYAQKFQERVTITRDMSTSTAYMELSSLRSEDTAVYYCAAXXXVLLWFGDGMDVWGQGTTVTVSS,AAGDGVLLWFRDGMDV,TATGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCTCTGGATCCAGTGGGGATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGTAATGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCTCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAACTCCTCATACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-28*01,GATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGTAATGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCTCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAACTCCTCATACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DLVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVSDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPHTFGGGTKVEIK,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPXTFGGGTKVEIK,MQALQTPHT,QMQLVQSGPEVKKPGTSVKVSCKASGFTFTSSAVQWVRQARGQRLEWIGWIVVGSGNTNYAQKFQERVTITRDMSTSTAYMELSSLRSEDTAVYYCAAGDGVLLWFRDGMDVWGQGTTVTVSS[SEP]DLVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVSDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPHTFGGGTKVEIK,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV1-58,IGKV2-28,IGHV1-58,IGKV2-28
6,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGRVAAAGTWSPWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,AGTGCTTTCTGAGAGTCATGGACCTCCTGCACAAGAACATGAAACACCTGTGGTTCTTCCTCCTCCTGGTGGCAGCTCCCAGATGGGTCCTGTCCCAGGTGCAGCTACAGCAGTGGGGCGCAGGACTGTTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGGAAATCAATCATAGTGGAAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTGTATTACTGTGCGAGAGGGCGGGTAGCAGCAGCTGGTACGTGGAGCCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV4-34*01,CAGGTGCAGCTACAGCAGTGGGGCGCAGGACTGTTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGGAAATCAATCATAGTGGAAGCACCAACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCGGACACGGCTGTGTATTACTGTGCGAGAGGGCGGGTAGCAGCAGCTGGTACGTGGAGCCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGXXAAAGTXXPWGQGTLVTVSS,ARGRVAAAGTWSP,TGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTGTGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTGTGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTVVFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTVVFGGGTKLTVL,SSYTSSSTVV,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARGRVAAAGTWSPWGQGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTVVFGGGTKLTVL,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV4-34,IGLV2-14,IGHV4-34,IGLV2-14
7,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARHSPYSSSWYYFDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,TGGGCTTAGATTTCTTATATGGGGGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCACCTGTGGTTCTTCCTCCTGCTGGTGGCGGCTCCCAGATGGGTCCTGTCCCAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAGTAGTTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCTATTATAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTGTATTACTGTGCGAGACATTCCCCCTATAGCAGCAGCTGGTACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV4-39*01,CAGCTGCAGCTGCAGGAGTCGGGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTAGTAGTTACTACTGGGGCTGGATCCGCCAGCCCCCAGGGAAGGGGCTGGAGTGGATTGGGAGTATCTATTATAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTCACCATATCCGTAGACACGTCCAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACCGCCGCAGACACGGCTGTGTATTACTGTGCGAGACATTCCCCCTATAGCAGCAGCTGGTACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARXXXYSSSWYYFDYWGQGTLVTVSS,ARHSPYSSSWYYFDY,TGGGGGCTGGGGTCTCAGGAGGCAGCGCTCTCGGGACGTCTCCACCATGGCCTGGGCTCTGCTATTCCTCACCCTCCTCACTCAGGGCACAGGGTCCTGGGCCCAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCTCGCGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV2-14*03,CAGTCTGCCCTGACTCAGCCTGCCTCCGTGTCTGGGTCTCCTGGACAGTCGATCACCATCTCCTGCACTGGAACCAGCAGTGACGTTGGTGGTTATAACTATGTCTCCTGGTACCAACAACACCCAGGCAAAGCCCCCAAACTCATGATTTATGATGTCAGTAATCGGCCCTCAGGGGTTTCTAATCGCTTCTCTGGCTCCAAGTCTGGCAACACGGCCTCCCTGACCATCTCTGGGCTCCAGGCTGAGGACGAGGCTGATTATTACTGCAGCTCATATACAAGCAGCAGCACTCTCGCGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTLAVFGGGTKLTVL,QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTLXVFGGGTKLTVL,SSYTSSSTLAV,QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYSGSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARHSPYSSSWYYFDYWGQGTLVTVSS[SEP]QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYDVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTLAVFGGGTKLTVL,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV4-39,IGLV2-14,IGHV4-39,IGLV2-14
8,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLEWMGWMNPNSGNTGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCARGSSITIFGVVISDAFDIWGQGTMVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,TGGGGACTCAACAACCACATCTGTCCTCTAGAGAAAACCCTGTGAGCACAGCTCCTCACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCTACAAGTGCCCACTCCCAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGATACACCTTCACCAGTTATGATATCAACTGGGTGCGACAGGCCACTGGACAAGGGCTTGAGTGGATGGGATGGATGAACCCTAACAGTGGTAACACAGGCTATGCACAGAAGTTCCAGGGCAGAGTCACCATGACCAGGAACACCTCCATAAGCACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAGGACACGGCCGTGTATTACTGTGCGAGAGGCAGTTCGATTACGATTTTTGGAGTGGTTATTTCGGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV1-8*01,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGATACACCTTCACCAGTTATGATATCAACTGGGTGCGACAGGCCACTGGACAAGGGCTTGAGTGGATGGGATGGATGAACCCTAACAGTGGTAACACAGGCTATGCACAGAAGTTCCAGGGCAGAGTCACCATGACCAGGAACACCTCCATAAGCACAGCCTACATGGAGCTGAGCAGCCTGAGATCTGAGGACACGGCCGTGTATTACTGTGCGAGAGGCAGTTCGATTACGATTTTTGGAGTGGTTATTTCGGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLEWMGWMNPNSGNTGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCARGXXITIFGVVIXDAFDIWGQGTMVTVSS,ARGSSITIFGVVISDAFDI,TGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCTCTGGATCCAGTGGGGATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGTAATGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAACTCCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-28*01,GATATTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCCTGGAGAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAGAGCCTCCTGCATAGTAATGGATACAACTATTTGGATTGGTACCTGCAGAAGCCAGGGCAGTCTCCACAGCTCCTGATCTATTTGGGTTCTAATCGGGCCTCCGGGGTCCCTGACAGGTTCAGTGGCAGTGGATCAGGCACAGATTTTACACTGAAAATCAGCAGAGTGGAGGCTGAGGATGTTGGGGTTTATTACTGCATGCAAGCTCTACAAACTCCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPWTFGQGTKVEIK,DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPWTFGQGTKVEIK,MQALQTPWT,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLEWMGWMNPNSGNTGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCARGSSITIFGVVISDAFDIWGQGTMVTVSS[SEP]DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYNYLDWYLQKPGQSPQLLIYLGSNRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPWTFGQGTKVEIK,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV1-8,IGKV2-28,IGHV1-8,IGKV2-28
9,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAREGQNNYFDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,GGGAGCATCACCCAGCAACCACATCTGTCCTCTAGAGAATCCCCTGAGAGCTCCGTTCCTCACCATGGACTGGACCTGGAGGATCCTCTTCTTGGTGGCAGCAGCCACAGGAGCCCACTCCCAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGATACACCTTCACCGGCTACTATATGCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACCCTAACAGTGGTGGCACAAACTATGCACAGAAGTTTCAGGGCAGGGTCACCATGACCAGGGACACGTCCATCAGCACAGCCTACATGGAGCTGAGCAGGCTGAGATCTGACGACACGGCCGTGTATTACTGTGCGAGAGAAGGTCAAAACAACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV1-2*02,CAGGTGCAGCTGGTGCAGTCTGGGGCTGAGGTGAAGAAGCCTGGGGCCTCAGTGAAGGTCTCCTGCAAGGCTTCTGGATACACCTTCACCGGCTACTATATGCACTGGGTGCGACAGGCCCCTGGACAAGGGCTTGAGTGGATGGGATGGATCAACCCTAACAGTGGTGGCACAAACTATGCACAGAAGTTTCAGGGCAGGGTCACCATGACCAGGGACACGTCCATCAGCACAGCCTACATGGAGCTGAGCAGGCTGAGATCTGACGACACGGCCGTGTATTACTGTGCGAGAGAAGGTCAAAACAACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARXXXXXYFDYWGQGTLVTVSS,AREGQNNYFDY,CTGGGCCTCAGGAAGCAGCATCGGAGGTGCCTCAGCCATGGCATGGATCCCTCTCTTCCTCGGCGTCCTTGCTTACTGCACAGGATCCGTGGCCTCCTATGAGCTGACTCAGCCACCCTCAGTGTCCGTGTCCCCAGGACAGACAGCCAGCATCACCTGCTCTGGAGATAAATTGGGGGATAAATATGCTTGCTGGTATCAGCAGAAGCCAGGCCAGTCCCCTGTGCTGGTCATCTATCAAGATAGCAAGCGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAACTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGGCTATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGCAGCATGAGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV3-1*01,TCCTATGAGCTGACTCAGCCACCCTCAGTGTCCGTGTCCCCAGGACAGACAGCCAGCATCACCTGCTCTGGAGATAAATTGGGGGATAAATATGCTTGCTGGTATCAGCAGAAGCCAGGCCAGTCCCCTGTGCTGGTCATCTATCAAGATAGCAAGCGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAACTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGGCTATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGCAGCATGAGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDSSMRVFGGGTKLTVL,SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDSSXXVFGGGTKLTVL,QAWDSSMRV,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAREGQNNYFDYWGQGTLVTVSS[SEP]SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDSSMRVFGGGTKLTVL,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV1-2,IGLV3-1,IGHV1-2,IGLV3-1
10,EVQLVESGGGLVKPGGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARDTGRGATITEPKEDPPRDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,GAGAGAGGAGCCTTAGCCCTGGATTCCAAGGCCTATCCACTTGGTGATCAGCACTGAGCACCGAGGATTCACCATGGAACTGGGGCTCCGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCCTGGTCAAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCATCCATTAGTAGTAGTAGTAGTTACATATACTACGCAGACTCAGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGAGATACCGGTCGGGGGGCTACGATCACCGAGCCAAAAGAAGATCCCCCCCGTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-21*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCCTGGTCAAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTCAGTAGCTATAGCATGAACTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCATCCATTAGTAGTAGTAGTAGTTACATATACTACGCAGACTCAGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCTGTGTATTACTGTGCGAGAGATACCGGTCGGGGGGCTACGATCACCGAGCCAAAAGAAGATCCCCCCCGTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVKPGGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXXATXXXXXXXXXXDYWGQGTLVTVSS,ARDTGRGATITEPKEDPPRDY,CTGGGGGAGTCAGTCCCAACCAGGACACAGCATGGACATGAGGGTCCCTGCTCAGCTCCTGGGGCTCCTGCTGCTCTGGCTCTCAGGTGCCAGATGTGACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCAGGCGAGTCAGGACATTAGCAACTATTTAAATTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCTACGATGCATCCAATTTGGAAACAGGGGTCCCATCAAGGTTCAGTGGAAGTGGATCTGGGACAGATTTTACTTTCACCATCAGCAGCCTGCAGCCTGAAGATATTGCAACATATTACTGTCAACAGTATGATAATCTCCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-33*01,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCAGGCGAGTCAGGACATTAGCAACTATTTAAATTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCTACGATGCATCCAATTTGGAAACAGGGGTCCCATCAAGGTTCAGTGGAAGTGGATCTGGGACAGATTTTACTTTCACCATCAGCAGCCTGCAGCCTGAAGATATTGCAACATATTACTGTCAACAGTATGATAATCTCCCGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLPYTFGQGTKLEIK,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLPYTFGQGTKLEIK,QQYDNLPYT,EVQLVESGGGLVKPGGSLRLSCAASGFTFSSYSMNWVRQAPGKGLEWVSSISSSSSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARDTGRGATITEPKEDPPRDYWGQGTLVTVSS[SEP]DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLPYTFGQGTKLEIK,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV3-21,IGKV1-33,IGHV3-21,IGKV1-33


In [89]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
df_pred_memory_grouped = groupby(df_pred_memory, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
df_pred_memory_grouped_filt = filter(g -> nrow(g) > 1, df_pred_memory_grouped)

# filter out every group that has the same entry in Subject
df_pred_memory_grouped_filt_o1s = filter(g -> length(unique(g.Subject)) > 1, df_pred_memory_grouped_filt)



Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,EVQLVESGGGLVQPGESLRLSCEASGITFSSYWMSWVRQAPGKGLEWVANIKKDGSETWYVDSVKGRFTISRDNAKNSLYLQMNNLRVEDAAVYYCGRGSGWLQDYWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGGATTGGGGCTGAGCTGGGTTTTCCTTGTTGCTCTTTTAGAAGTTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGAGTCCCTGAGACTCTCCTGTGAAGCCTCTGGAATCACGTTCAGTAGCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAAAAAGATGGAAGTGAGACATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAACCTGAGAGTCGAGGACGCGGCTGTGTATTACTGTGGGAGAGGGAGTGGCTGGTTACAGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCGACAGCACCCCCCAAGATGGGAACGTGGTCGTCGCATGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGAACGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGAGTCCCTGAGACTCTCCTGTGAAGCCTCTGGAATCACGTTCAGTAGCTATTGGATGAGTTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATAAAAAAAGATGGAAGTGAGACATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAACCTGAGAGTCGAGGACGCGGCTGTGTATTACTGTGGGAGAGGGAGTGGCTGGTTACAGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXSGWXXDYWGQGTLVTVSS,GRGSGWLQDY,GAGCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTGTTTTCTCTAGCTCCAGTAATAAGAACTTCCTAGCTTGGTTCCAGAAGAAACCAGGGCAGCCTCCTAAGTTGCTAATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATCATAGTGGTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTGTTTTCTCTAGCTCCAGTAATAAGAACTTCCTAGCTTGGTTCCAGAAGAAACCAGGGCAGCCTCCTAAGTTGCTAATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAGCAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATCATAGTGGTCCGCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPDSLAVSLGERATVNCKSSQSVFSSSSNKNFLAWFQKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYHSGPLTFGGGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLTFGGGTKVEIK,QQYHSGPLT,EVQLVESGGGLVQPGESLRLSCEASGITFSSYWMSWVRQAPGKGLEWVANIKKDGSETWYVDSVKGRFTISRDNAKNSLYLQMNNLRVEDAAVYYCGRGSGWLQDYWGQGTLVTVSS[SEP]DIVMTQSPDSLAVSLGERATVNCKSSQSVFSSSSNKNFLAWFQKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYHSGPLTFGGGTKVEIK,Multiple-sclerosis,human,,"Ramesh et al, 2020",37.0,IGHV3-7,IGKV4-1,IGHV3-7,IGKV4-1
2,EVQLVESGGGLVQPGGSLRLSCAASGVRLSNYWMSWVRQAPGKGLEWVANMKKDGSEKWYVDSVKGRFTISRDNAENSLFLQMDKLRDDDTAVYYCGRGSGWLQDYWGQGISVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAATTGGGGCTGACCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGAGTCCGCCTCAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAAGAAAGATGGAAGTGAGAAATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATTTCCAGAGACAACGCCGAGAACTCACTGTTTCTGCAAATGGACAAACTGAGAGACGACGACACGGCTGTGTATTACTGCGGGAGGGGCAGTGGCTGGCTACAAGATTACTGGGGCCAGGGAATATCGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTCCAGCCGGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGAGTCCGCCTCAGTAACTATTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAAGAAAGATGGAAGTGAGAAATGGTATGTGGACTCTGTGAAGGGCCGATTCACCATTTCCAGAGACAACGCCGAGAACTCACTGTTTCTGCAAATGGACAAACTGAGAGACGACGACACGGCTGTGTATTACTGCGGGAGGGGCAGTGGCTGGCTACAAGATTACTGGGGCCAGGGAATATCGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGSGWXXXYWGQGTLVTVSS,GRGSGWLQDY,GCTACAACAGGCAGGCAGGGGCAGCAAGATGGTGTTGCAGACCCAGGTCTTCATTTCTCTGTTGCTCTGGATCTCTGGTGCCTACGGGGACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTATTTTATCCAGTTCCAACAATAAGAACTACTTAGCTTGGTTCCACAAGAAACCAGGACAGCCTCCTAAACTACTCATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATGGTGGTCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV4-1*01,GACATCGTGATGACCCAGTCTCCAGACTCCCTGGCTGTGTCTCTGGGCGAGAGGGCCACCGTCAACTGCAAGTCCAGCCAGAGTATTTTATCCAGTTCCAACAATAAGAACTACTTAGCTTGGTTCCACAAGAAACCAGGACAGCCTCCTAAACTACTCATTTCCTGGGCATCTACCCGGGAATCCGGGGTCCCTGACCGATTCAGTGGCAGCGGGTCTGGGACAGATTTCACTCTCACCATCAACAGCCTGCAGGCTGAAGATGTGGCAGTTTATTACTGTCAGCAATATTATGGTGGTCCTCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC,DIVMTQSPDSLAVSLGERATVNCKSSQSILSSSNNKNYLAWFHKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAVYYCQQYYGGPLTFGGGTKVEIK,DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPLTFGGGTKVEIK,QQYYGGPLT,EVQLVESGGGLVQPGGSLRLSCAASGVRLSNYWMSWVRQAPGKGLEWVANMKKDGSEKWYVDSVKGRFTISRDNAENSLFLQMDKLRDDDTAVYYCGRGSGWLQDYWGQGISVTVSS[SEP]DIVMTQSPDSLAVSLGERATVNCKSSQSILSSSNNKNYLAWFHKKPGQPPKLLISWASTRESGVPDRFSGSGSGTDFTLTINSLQAEDVAVYYCQQYYGGPLTFGGGTKVEIK,CMV,human,Donor-4,"Jaffe et al., 2022",50.0,IGHV3-7,IGKV4-1,IGHV3-7,IGKV4-1

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,EVQLAESGGGLVQPGGSLRLSCVVSGFTFSDYWMSWVRQAPGKGLEWVANMNEDGSEKYCLDGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGDVNSGDYWGQGTLVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACTCACCATGGAGTTGGGGCTGAGCTGGGTTTTCCTTGTTGCTATTTTAGAAGGTGTCCAGTGTGAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGTCTCTGGATTCACGTTTAGTGACTACTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAATGAAGATGGAAGTGAGAAATACTGTCTGGACGGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGGAGATGTCAACTCGGGCGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGCCTCCACCAAGGGCCCATCGGTCTTCCCCCTGGCGCCCTGCTCCAGGAGCACCTCCGAGAGCACAGCGGCCCTGGGCTGCCTGGTCAAGGACTACTTCCCCGAACCGGTGACGGTGTCGTGGAACTCAGGCGCTCTGACCAGCGGCGTGCACACCTTCCCGGCTGTCCTACAGTCCTCAGGA,H,IGHV3-7*03,GAGGTGCAGCTGGCGGAGTCTGGGGGAGGCTTGGTCCAGCCTGGGGGGTCCCTGAGACTCTCCTGTGTAGTCTCTGGATTCACGTTTAGTGACTACTGGATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACATGAATGAAGATGGAAGTGAGAAATACTGTCTGGAC---------GGCCGATTCACCATCTCCAGAGACAACGCCAAGAACTCACTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTGTATTACTGTGCGAGAGGAGATGTCAACTCGGGCGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXNXXDYWGQGTLVTVSS,ARGDVNSGDY,GATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCCAGTGGGGATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTACACAGTGATGGAAACATCTACTTGAATTGGCTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTCATACATAGGGTTTCTACCCGGGACTCTGGGGTCCCAGAAAGATTCAGCGGCAGTGGGTCAGGCACTAATTTCACACTGAGAATCAGCAGGGTGGAGGCTGAGGATGTTGGCGTTTATTACTGCATGCAAGGTAAACACTGGACTTTTGGCCAGGGGACCAAGCTGGAGATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*02,GATCTTGTGATGACTCAGTCTCCACTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAGTCAAAGCCTCGTACACAGTGATGGAAACATCTACTTGAATTGGCTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTCATACATAGGGTTTCTACCCGGGACTCTGGGGTCCCAGAAAGATTCAGCGGCAGTGGGTCAGGCACTAATTTCACACTGAGAATCAGCAGGGTGGAGGCTGAGGATGTTGGCGTTTATTACTGCATGCAAGGTAAACACTGGACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC,DLVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNIYLNWLQQRPGQSPRRLIHRVSTRDSGVPERFSGSGSGTNFTLRISRVEAEDVGVYYCMQGKHWTFGQGTKLEIK,DVVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWPFGQGTKLEIK,MQGKHWT,EVQLAESGGGLVQPGGSLRLSCVVSGFTFSDYWMSWVRQAPGKGLEWVANMNEDGSEKYCLDGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARGDVNSGDYWGQGTLVTVSS[SEP]DLVMTQSPLSLPVTLGQPASISCRSSQSLVHSDGNIYLNWLQQRPGQSPRRLIHRVSTRDSGVPERFSGSGSGTNFTLRISRVEAEDVGVYYCMQGKHWTFGQGTKLEIK,SARS-COV-2,human,Patient-15,"Mor et al., 2021",51,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30
2,EVQLVESGGGLVQPGGSLRLSCAGSGFMFSDYWMTWVRQAPGKGLECVAITDQEGNERYSVHSVRGRFTISRDNAKNSLYLEMHSLRAEDTALYYCARGDVNSGDYWGQGTMVTVAS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACGCACCATGGAATTGGGGCTGAGCTGGGTTTTCCTTGTTGTTATTCTAGAAGGTGTCCAGTGTGAGGTGCAATTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGCAGGCTCTGGATTTATGTTTAGCGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGCGTGGCCATCACAGACCAAGAAGGAAATGAGAGATACTCTGTTCACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACGCCAAAAATTCTCTGTATTTGGAAATGCACAGCCTGAGAGCCGAAGACACGGCTCTATATTACTGTGCGAGAGGGGATGTCAATTCGGGGGACTATTGGGGCCAGGGAACCATGGTCACCGTCGCGTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAATTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGCAGGCTCTGGATTTATGTTTAGCGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGCGTGGCCATCACAGACCAAGAAGGAAATGAGAGATACTCTGTTCACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACGCCAAAAATTCTCTGTATTTGGAAATGCACAGCCTGAGAGCCGAAGACACGGCTCTATATTACTGTGCGAGAGGGGATGTCAATTCGGGGGACTATTGGGGCCAGGGAACCATGGTCACCGTCGCGTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXSGDYWGQGTLVTVSS,ARGDVNSGDY,GGGGACTGATCAGGGCTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCTCGTGGGTTTTTTGTGCTGACTCAGTCTCCACTCTCACTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGGGATGGATATACCTACCTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTTCTTTATAAGGTTTCTCACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAGAATCAGTAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAACATACTGGATGTTCGGCCAAGGGACCAAACTGGAGATCAAAGGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,TTGTGCTGACTCAGTCTCCACTCTCACTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGGGATGGATATACCTACCTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTTCTTTATAAGGTTTCTCACCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAGAATCAGTAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAACATACTGGATGTTCGGCCAAGGGACCAAACTGGAGATCAAA,VLTQSPLSLPVTLGQPASISCRSNASLLDRDGYTYLNWFQQRPGQSPRRLLYKVSHRDSGVPDRFSGSGSGTDFTLRISRVEAEDVAVYYCMQATYWMFGQGTKLEIK,VMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWXFGQGTKVEIK,MQATYWM,EVQLVESGGGLVQPGGSLRLSCAGSGFMFSDYWMTWVRQAPGKGLECVAITDQEGNERYSVHSVRGRFTISRDNAKNSLYLEMHSLRAEDTALYYCARGDVNSGDYWGQGTMVTVAS[SEP]VLTQSPLSLPVTLGQPASISCRSNASLLDRDGYTYLNWFQQRPGQSPRRLLYKVSHRDSGVPDRFSGSGSGTDFTLRISRVEAEDVAVYYCMQATYWMFGQGTKLEIK,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30
3,EVQLVESGGGLVQPGGSLRLSCVASGFMFSDYWMTWVRQAPGKGLEWVANTNQDGSDKHYVYSVRGRFTISRDNTENSLFLEMHSLRPEDTALYYCARGDVNSGDYWGQGTMVTVSS,Unsorted-B-Cells,1,Memory-B-Cells,AGGTCTCAGAGAGGAGCCTTAGCCCTGGACTCCAAGGCCTTTCCACTTGGTGATCAGCACTGAGCACAGAGGACGCACCATGGAATTGGGGCTGAACTGGGTTTTCCTTGTTGCTATTCTGGAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGTAGCCTCTGGATTTATGTTTAGTGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACACAAACCAAGATGGGAGTGACAAGCACTATGTCTACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACACCGAAAATTCTCTGTTTCTGGAAATGCACAGCCTGAGACCCGAAGACACGGCTCTATATTATTGTGCGCGAGGGGATGTCAACTCGGGGGACTACTGGGGCCAGGGAACCATGGTCACCGTCTCCTCAGCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGCAGCACCCAGCCAGATGGGAACGTGGTCATCGCCTGCCTGGTCCAGGGCTTCTTCCCCCAGGAGCCACTCAGTGTGACCTGGAGCGAAAGCGGACAGGGCGTGACCGCCAGAAACTTCCC,H,IGHV3-7*01,GAGGTGCAGCTGGTGGAGTCTGGGGGCGGCTTGGTCCAGCCTGGGGGATCCCTGAGACTCTCCTGTGTAGCCTCTGGATTTATGTTTAGTGACTATTGGATGACCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTGGCCAACACAAACCAAGATGGGAGTGACAAGCACTATGTCTACTCTGTGAGGGGCCGCTTCACCATCTCCAGAGACAACACCGAAAATTCTCTGTTTCTGGAAATGCACAGCCTGAGACCCGAAGACACGGCTCTATATTATTGTGCGCGAGGGGATGTCAACTCGGGGGACTACTGGGGCCAGGGAACCATGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYWMSWVRQAPGKGLEWVANIKQDGSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCARXXXXSGDYWGQGTLVTVSS,ARGDVNSGDY,CTGGGGGATCAGGACTCCTCAGTTCACCTTCTCACAATGAGGCTCCCTGCTCAGCTCCTGGGGCTGCTAATGCTCTGGGTCCCAGGATCTCGTGGGTATTTTGTGATGACTCAGTCTCCTCTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGTGATGGGAACACCCACTTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTACTTTATAAGGTTTCTCGCCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAATATACTGGACGTTCGGCCAAGGGACCAAACTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV2-30*01,ATTTTGTGATGACTCAGTCTCCTCTCTCCCTGCCCGTCACCCTTGGACAGCCGGCCTCCATCTCCTGCAGGTCTAATGCCAGCCTCCTCGACAGTGATGGGAACACCCACTTGAATTGGTTTCAGCAGAGGCCAGGCCAATCTCCAAGGCGCCTACTTTATAAGGTTTCTCGCCGGGACTCTGGGGTCCCAGACAGATTCAGCGGCAGTGGGTCAGGCACTGATTTCACACTGAAAATCAGCAGGGTGGAGGCCGAGGATGTTGCAGTTTATTACTGCATGCAAGCAATATACTGGACGTTCGGCCAAGGGACCAAACTGGAAATCAAAC,FVMTQSPLSLPVTLGQPASISCRSNASLLDSDGNTHLNWFQQRPGQSPRRLLYKVSRRDSGVPDRFSGSGSGTDFTLKISRVEAEDVAVYYCMQAIYWTFGQGTKLEIK,VVMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPGQSPRRLIYKVSNRDSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQGTHWTFGQGTKVEIK,MQAIYWT,EVQLVESGGGLVQPGGSLRLSCVASGFMFSDYWMTWVRQAPGKGLEWVANTNQDGSDKHYVYSVRGRFTISRDNTENSLFLEMHSLRPEDTALYYCARGDVNSGDYWGQGTMVTVSS[SEP]FVMTQSPLSLPVTLGQPASISCRSNASLLDSDGNTHLNWFQQRPGQSPRRLLYKVSRRDSGVPDRFSGSGSGTDFTLKISRVEAEDVAVYYCMQAIYWTFGQGTKLEIK,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-7,IGKV2-30,IGHV3-7,IGKV2-30


In [90]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases_uns = 0
total_groups_uns = length(df_pred_memory_grouped_filt_o1s)

for group in df_pred_memory_grouped_filt_o1s
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes_uns = unique(group.general_v_gene_light_no_para)
    if length(unique_genes_uns) == 1
        true_cases_uns += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true_uns = (true_cases_uns / total_groups_uns) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true_uns%")
fraction_true_uns

81.1377245508982

In [91]:
# Group by `general_v_gene_heavy` and `cdr3_aa_heavy`
df_pred_naive_grouped = groupby(df_pred_naive, [:general_v_gene_heavy_no_para, :cdr3_aa_heavy])

# Step 1: Filter out groups with only one row
df_pred_naive_grouped_filt = filter(g -> nrow(g) > 1, df_pred_naive_grouped)

# filter out every group that has the same entry in Subject
df_pred_naive_grouped_filt_o1s = filter(g -> length(unique(g.Subject)) > 1, df_pred_naive_grouped_filt)


Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKAWDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGCTTGGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-23*01,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGCTTGGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXDYWGQGTLVTVSS,AKAWDY,AGAGCTCTGGAGAAGAGCTGCTCAGTTAGGACCCAGAGGGAACCATGGAAACCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACCGGAGAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-20*01,GAAATTGTGTTGACGCAGTCTCCAGGCACCCTGTCTTTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAGCTACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCAGCAGGGCCACTGGCATCCCAGACAGGTTCAGTGGCAGTGGGTCTGGGACAGACTTCACTCTCACCATCAGCAGACTGGAGCCTGAAGATTTTGCAGTGTATTACTGTCAGCAGTATGGTAGCTCACCGTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPWTFGQGTKVEIK,EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPWTFGQGTKVEIK,QQYGSSPWT,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKAWDYWGQGTLVTVSS[SEP]EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSPWTFGQGTKVEIK,,human,Donor-3,"Jaffe et al., 2022",38,IGHV3-23,IGKV3-20,IGHV3-23,IGKV3-20
2,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKAWDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGCCTGGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-23*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGCCTGGGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXDYWGQGTLVTVSS,AKAWDY,TGGGGGAGTCAGTCTCAGTCAGGACACAGCATGGACATGAGGGTCCCCGCTCAGCTCCTGGGGCTCCTGCTACTCTGGCTCCGAGGTGCCAGATGTGACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCGGGCAAGTCAGAGCATTAGCAGCTATTTAAATTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAGGTTCAGTGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGTCTGCAACCTGAAGATTTTGCAACTTACTACTGTCAACAGAGTTACAGTACCCCTTTGTTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV1-39*01,GACATCCAGATGACCCAGTCTCCATCCTCCCTGTCTGCATCTGTAGGAGACAGAGTCACCATCACTTGCCGGGCAAGTCAGAGCATTAGCAGCTATTTAAATTGGTATCAGCAGAAACCAGGGAAAGCCCCTAAGCTCCTGATCTATGCTGCATCCAGTTTGCAAAGTGGGGTCCCATCAAGGTTCAGTGGCAGTGGATCTGGGACAGATTTCACTCTCACCATCAGCAGTCTGCAACCTGAAGATTTTGCAACTTACTACTGTCAACAGAGTTACAGTACCCCTTTGTTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAAC,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPLFTFGPGTKVDIK,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPXFTFGPGTKVDIK,QQSYSTPLFT,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKAWDYWGQGTLVTVSS[SEP]DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPLFTFGPGTKVDIK,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-23,IGKV1-39,IGHV3-23,IGKV1-39

Row,sequence_alignment_aa_heavy,BType,predicted_class,predicted_btype,sequence_heavy,locus_heavy,v_call_heavy,sequence_alignment_heavy,germline_alignment_aa_heavy,cdr3_aa_heavy,sequence_light,locus_light,v_call_light,sequence_alignment_light,sequence_alignment_aa_light,germline_alignment_aa_light,cdr3_aa_light,sequence_alignment_heavy_sep_light,Disease,Species,Subject,Author,Age,general_v_gene_heavy,general_v_gene_light,general_v_gene_heavy_no_para,general_v_gene_light_no_para
Unnamed: 0_level_1,String,String31,Int64,String,String,String1,String15,String,String,String,String,String1,String15,String,String,String,String31,String,String,String7,String15,String31,String15,String,String,String,String
1,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDAGYYFDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGATGCTGGGTACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-23*04,GAGGTGCAGCTGGTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGATGCTGGGTACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXAGXYFDYWGQGTLVTVSS,AKDAGYYFDY,GGGCCTCAGGAAGCAGCATCGGAGGTGCCTCAGCCATGGCATGGATCCCTCTCTTCCTCGGCGTCCTTGCTTACTGCACAGGATCCGTGGCCTCCTATGAGCTGACTCAGCCACCCTCAGTGTCCGTGTCCCCAGGACAGACAGCCAGCATCACCTGCTCTGGAGATAAATTGGGGGATAAATATGCTTGCTGGTATCAGCAGAAGCCAGGCCAGTCCCCTGTGCTGGTCATCTATCAAGATAGCAAGCGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAACTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGGCTATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGCAGCACTGGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAGGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGTGTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGGAGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTA,L,IGLV3-1*01,TCCTATGAGCTGACTCAGCCACCCTCAGTGTCCGTGTCCCCAGGACAGACAGCCAGCATCACCTGCTCTGGAGATAAATTGGGGGATAAATATGCTTGCTGGTATCAGCAGAAGCCAGGCCAGTCCCCTGTGCTGGTCATCTATCAAGATAGCAAGCGGCCCTCAGGGATCCCTGAGCGATTCTCTGGCTCCAACTCTGGGAACACAGCCACTCTGACCATCAGCGGGACCCAGGCTATGGATGAGGCTGACTATTACTGTCAGGCGTGGGACAGCAGCACTGGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG,SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDSSTGVFGGGTKLTVL,SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDSSTGVFGGGTKLTVL,QAWDSSTGV,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDAGYYFDYWGQGTLVTVSS[SEP]SYELTQPPSVSVSPGQTASITCSGDKLGDKYACWYQQKPGQSPVLVIYQDSKRPSGIPERFSGSNSGNTATLTISGTQAMDEADYYCQAWDSSTGVFGGGTKLTVL,CMV,human,Donor-4,"Jaffe et al., 2022",50,IGHV3-23,IGLV3-1,IGHV3-23,IGLV3-1
2,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDAGYYFDYWGQGTLVTVSS,Unsorted-B-Cells,0,Naive-B-Cells,AGCTCTGAGAGAGGAGCCCAGCCCTGGGATTTTCAGGTGTTTTCATTTGGTGATCAGGACTGAACAGAGAGAACTCACCATGGAGTTTGGGCTGAGCTGGCTTTTTCTTGTGGCTATTTTAAAAGGTGTCCAGTGTGAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGATGCGGGATACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCCTTTTCCCCCTCGTCTCCTGTGAGAATTCCCCGTCGGATACGAGCAGCGTG,H,IGHV3-23*01,GAGGTGCAGCTGTTGGAGTCTGGGGGAGGCTTGGTACAGCCTGGGGGGTCCCTGAGACTCTCCTGTGCAGCCTCTGGATTCACCTTTAGCAGCTATGCCATGAGCTGGGTCCGCCAGGCTCCAGGGAAGGGGCTGGAGTGGGTCTCAGCTATTAGTGGTAGTGGTGGTAGCACATACTACGCAGACTCCGTGAAGGGCCGGTTCACCATCTCCAGAGACAATTCCAAGAACACGCTGTATCTGCAAATGAACAGCCTGAGAGCCGAGGACACGGCCGTATATTACTGTGCGAAAGATGCGGGATACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKXXXXYFDYWGQGTLVTVSS,AKDAGYYFDY,AGAGCTCTGGGGAGGAACTGCTCAGTTAGGACCCAGACGGAACCATGGAAGCCCCAGCGCAGCTTCTCTTCCTCCTGCTACTCTGGCTCCCAGATACCACTGGAGAAATAGTGATGACGCAGTCTCCAGCCACCCTGTCTGTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCACCAGGGCCACTGGTATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGAGTTCACTCTCACCATCAGCAGCCTGCAGTCTGAAGATTTTGCAGTTTATTACTGTCAGCAGTATAATAACTGGCCTCCCTTGGCTTTCGGCCCTGGGACCAAAGTGGATATCAAACGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTGCCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGC,K,IGKV3-15*01,GAAATAGTGATGACGCAGTCTCCAGCCACCCTGTCTGTGTCTCCAGGGGAAAGAGCCACCCTCTCCTGCAGGGCCAGTCAGAGTGTTAGCAGCAACTTAGCCTGGTACCAGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATCTATGGTGCATCCACCAGGGCCACTGGTATCCCAGCCAGGTTCAGTGGCAGTGGGTCTGGGACAGAGTTCACTCTCACCATCAGCAGCCTGCAGTCTGAAGATTTTGCAGTTTATTACTGTCAGCAGTATAATAACTGGCCTCCCTTGGCTTTCGGCCCTGGGACCAAAGTGGATATCAAAC,EIVMTQSPATLSVSPGERATLSCRASQSVSSNLAWYQQKPGQAPRLLIYGASTRATGIPARFSGSGSGTEFTLTISSLQSEDFAVYYCQQYNNWPPLAFGPGTKVDIK,EIVMTQSPATLSVSPGERATLSCRASQSVSSNLAWYQQKPGQAPRLLIYGASTRATGIPARFSGSGSGTEFTLTISSLQSEDFAVYYCQQYNNWPPXXFGPGTKVDIK,QQYNNWPPLA,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDAGYYFDYWGQGTLVTVSS[SEP]EIVMTQSPATLSVSPGERATLSCRASQSVSSNLAWYQQKPGQAPRLLIYGASTRATGIPARFSGSGSGTEFTLTISSLQSEDFAVYYCQQYNNWPPLAFGPGTKVDIK,SARS-COV-2,human,Donor-2,"Jaffe et al., 2022",35,IGHV3-23,IGKV3-15,IGHV3-23,IGKV3-15


In [92]:
# alternative: Calculate the fraction of groups where all entries have the same `general_v_gene_light`
true_cases_uns = 0
total_groups_uns = length(df_pred_naive_grouped_filt_o1s)

for group in df_pred_naive_grouped_filt_o1s
    # Check if all entries in the group have the same `general_v_gene_light`
    unique_genes_uns = unique(group.general_v_gene_light_no_para)
    if length(unique_genes_uns) == 1
        true_cases_uns += 1  # Increment the count if all genes are identical
    end
end

# Step 3: Calculate the fraction of "true" cases
fraction_true_uns = (true_cases_uns / total_groups_uns) * 100

println("Percentage of groups where all entries have the same general_v_gene_light: $fraction_true_uns%")
fraction_true_uns

23.809523809523807