In [50]:
import pandas as pd

# Specify path to the VCAb file
datafile = "../VCAb_data/VCAb.csv"

# Read the VCAb csv file into a dataframe
df = pd.read_csv(datafile)

# Simplify the naming conventions in certain columns using str.extract(r"regex_expression").
# The pattern captures the leading run of word characters, stopping at the first
# non-word character such as "(".
# expand=False makes the single capture group come back as a Series, so each result
# maps directly onto one new column instead of going through a one-column DataFrame.
df["H_isotype_clean"] = df["Htype"].str.extract(r"^(\w+)", expand=False)
df["L_isotype_clean"] = df["Ltype"].str.extract(r"^(\w+)", expand=False)

# Reduce dataframe size down to just the key columns (especially the "_clean" ones)
columns_to_keep = ["pdb","Hchain", "Lchain", "H_coordinate_seq", "L_coordinate_seq", 
                   "title", "release_date","method", "resolution", "carbohydrate", 
                   "HC_species", "H_isotype_clean", "L_isotype_clean", "HC_coordinate_seq", 
                   "LC_coordinate_seq", "HV_seq", "LV_seq","disulfide_bond"
]
df_keep = df[columns_to_keep]

# Filter down to rows of interest with conditions. Using wrapped conditions with &
# `[(cond_1) & (cond_2) & ...etc]`.
# na=False: a row whose light-chain isotype is missing is treated as "not kappa"
# explicitly, rather than relying on how NaN happens to be coerced inside the mask.
df_only_human_and_kappa_chain = df_keep[
    (df_keep["HC_species"] == "homo_sapiens") &
    (df_keep["L_isotype_clean"].str.contains("kappa", case=False, na=False))
]

# Visualise the data
print(f"Rows, columns: {df_only_human_and_kappa_chain.shape}")
df_only_human_and_kappa_chain.head(5)


Rows, columns: (3767, 18)


Unnamed: 0,pdb,Hchain,Lchain,H_coordinate_seq,L_coordinate_seq,title,release_date,method,resolution,carbohydrate,HC_species,H_isotype_clean,L_isotype_clean,HC_coordinate_seq,LC_coordinate_seq,HV_seq,LV_seq,disulfide_bond
5,1a4j,B;H,A;L,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLVHSNGNTYLHWYLQKPG...,DIELS ALDER CATALYTIC ANTIBODY GERMLINE PRECURSOR,19980513,X-ray diffraction,2.1,,homo_sapiens,IgG1,kappa,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLVHSNGNTYLHWYLQKPG...,"CYS146(B)-CYS202(B):6.60, CYS139(A)-CYS199(A):..."
6,1a4k,B,A,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...,DIELS ALDER CATALYTIC ANTIBODY WITH TRANSITION...,19980513,X-ray diffraction,2.4,,homo_sapiens,IgG1,kappa,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...,"CYS140(B)-CYS196(B):6.71, CYS134(A)-CYS194(A):..."
7,1a4k,H,L,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...,DIELS ALDER CATALYTIC ANTIBODY WITH TRANSITION...,19980513,X-ray diffraction,2.4,,homo_sapiens,IgG1,kappa,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,QVQLLESGPELKKPGETVKISCKASGYTFTNYGMNWVKQAPGKGLK...,ELVMTQTPLSLPVSLGDQASISCRSSQSLLHSNGNTYLHWYLQKPG...,"CYS140(H)-CYS196(H):6.65, CYS134(L)-CYS194(L):..."
11,1ad0,B;D,A;C,EVQLLESGGGLVQPGGSLRLSCATSGFTFTDYYMNWVRQAPGKGLE...,QTVLTQSPSSLSVSVGDRVTITCRASSSVTYIHWYQQKPGLAPKSL...,FAB FRAGMENT OF ENGINEERED HUMAN MONOCLONAL AN...,19980225,X-ray diffraction,2.5,,homo_sapiens,IgG4,kappa,ASTKGPSVFPLAPCSRSTSESTAALGCLVKDYFPEPVTVSWNSGAL...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,EVQLLESGGGLVQPGGSLRLSCATSGFTFTDYYMNWVRQAPGKGLE...,QTVLTQSPSSLSVSVGDRVTITCRASSSVTYIHWYQQKPGLAPKSL...,"CYS127(B)-CYS213(A):4.41, CYS140(B)-CYS195(B):..."
12,1ad9,B;H,A;L,EIQLVQSGAEVKKPGSSVKVSCKASGYTFTDYYINWMRQAPGQGLE...,DIQMTQSPSTLSASVGDRVTITCRSSKSLLHSNGDTFLYWFQQKPG...,IGG-FAB FRAGMENT OF ENGINEERED HUMAN MONOCLONA...,19980225,X-ray diffraction,2.8,,homo_sapiens,IgG4,kappa,ASTKGPSVFPLAPCSRSTSESTAALGCLVKDYFPEPVTVSWNSGAL...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,EIQLVQSGAEVKKPGSSVKVSCKASGYTFTDYYINWMRQAPGQGLE...,DIQMTQSPSTLSASVGDRVTITCRSSKSLLHSNGDTFLYWFQQKPG...,"CYS127(B)-CYS213(A):5.15, CYS140(B)-CYS196(B):..."


In [51]:
# Select the rows of interest (here by PDB ID; could equally be by heavy-chain type)

# Templates of interest
v_template = "1n8z"  # variable heavy & light template
c_template = "3m8o"  # constant heavy & light template

# Keep only the rows whose pdb value matches one of the two templates.
# NOTE: .copy() makes df_filtered an independent copy of df_only_human_and_kappa_chain
df_filtered = df_only_human_and_kappa_chain.loc[
    df_only_human_and_kappa_chain["pdb"].isin([v_template, c_template])
].copy()

df_filtered

Unnamed: 0,pdb,Hchain,Lchain,H_coordinate_seq,L_coordinate_seq,title,release_date,method,resolution,carbohydrate,HC_species,H_isotype_clean,L_isotype_clean,HC_coordinate_seq,LC_coordinate_seq,HV_seq,LV_seq,disulfide_bond
242,1n8z,B,A,EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL...,Crystal structure of extracellular domain of h...,20030218,X-ray diffraction,2.52,,homo_sapiens,IgG1,kappa,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL...,"CYS147(B)-CYS203(B):6.59, CYS134(A)-CYS194(A):..."
1017,3m8o,H,L,EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLE...,DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPG...,Human IgA1 Fab fragment,20110330,X-ray diffraction,1.55,,homo_sapiens,IgA1,kappa,ASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQ...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLE...,DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPG...,"CYS133(H)-CYS219(L):5.69, CYS145(H)-CYS204(H):..."


In [None]:
# Validation function to check seq lengths actual vs expected
# If present, remove duplicate letter from the V-to-C region boundary in columns "LV_seq" and "HV_seq"

def safely_trim_overlap(df, whole_seq_col, v_region_col, c_region_col, output_col):
    """Write V-region sequences to ``output_col`` with a duplicated V/C boundary residue removed.

    For each row, the V-region and C-region sequences are concatenated and compared
    with the whole-chain sequence. If the concatenation is exactly one residue longer
    and the last V residue equals the first C residue, that last V residue is dropped.
    Every other row keeps its V-region value unchanged, so ``output_col`` always
    exists and never holds NaN for a row that simply needed no trimming.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place. Must contain a "pdb" column (used in the log message).
    whole_seq_col : str
        Column holding the full chain sequence.
    v_region_col : str
        Column holding the V-region sequence.
    c_region_col : str
        Column holding the C-region sequence.
    output_col : str
        Column that receives the (possibly trimmed) V-region sequence.

    Returns
    -------
    None
        ``df`` is modified in place; one message is printed per trimmed row.
    """
    trimmed_values = []

    # Using iterrows for stable behaviour in pandas
    for idx, row in df.iterrows():
        whole, v_seq, c_seq = row[whole_seq_col], row[v_region_col], row[c_region_col]

        # Default: carry the V-region value over untouched.
        result = v_seq

        # Missing values would stringify to "nan" and could be "trimmed" by accident,
        # so only rows with all three sequences present are considered.
        if not (pd.isna(whole) or pd.isna(v_seq) or pd.isna(c_seq)):
            expected, v_text, c_text = str(whole), str(v_seq), str(c_seq)
            one_residue_too_long = len(expected) == len(v_text) + len(c_text) - 1

            # Both regions must be non-empty before their boundary residues are compared.
            if one_residue_too_long and v_text and c_text and v_text[-1] == c_text[0]:
                result = v_text[:-1]  # remove last character from seq
                print(result)
                print(f"Trimmed last residue from {v_region_col} (PDB {row['pdb']}) at {output_col}")

        trimmed_values.append(result)

    # Positional assignment: stays correct even if the index has duplicate labels.
    df[output_col] = trimmed_values

# Trim the V-to-C overlap, first for the heavy chains and then for the light chains.
# Each entry is (whole-chain column, V-region column, C-region column, output column).
for whole_col, v_col, c_col, trimmed_col in (
    ("H_coordinate_seq", "HV_seq", "HC_coordinate_seq", "HV_trimmed"),
    ("L_coordinate_seq", "LV_seq", "LC_coordinate_seq", "LV_trimmed"),
):
    safely_trim_overlap(df_filtered, whole_col, v_col, c_col, trimmed_col)

df_filtered

EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSS
Trimmed last residue from HV_seq (PDB 1n8z) at HV_trimmed
EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLEWVGRIKRNAESDATAYAASMRGRLTISRDDSKNTAFLQMNSLKSDDTAMYYCVIRGDVYNRQWGQGTLVTVSS
Trimmed last residue from HV_seq (PDB 3m8o) at HV_trimmed
DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIK
Trimmed last residue from LV_seq (PDB 1n8z) at LV_trimmed
DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPGQSPQPLIYLGSTRASGVPDRFSGSGSGTDFTLKIIRVEAEDAGTYYCMQNKQTPLTFGQGTRLEIK
Trimmed last residue from LV_seq (PDB 3m8o) at LV_trimmed


Unnamed: 0,pdb,Hchain,Lchain,H_coordinate_seq,L_coordinate_seq,title,release_date,method,resolution,carbohydrate,HC_species,H_isotype_clean,L_isotype_clean,HC_coordinate_seq,LC_coordinate_seq,HV_seq,LV_seq,disulfide_bond,HV_trimmed,LV_trimmed
242,1n8z,B,A,EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL...,Crystal structure of extracellular domain of h...,20030218,X-ray diffraction,2.52,,homo_sapiens,IgG1,kappa,ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL...,"CYS147(B)-CYS203(B):6.59, CYS134(A)-CYS194(A):...",EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKL...
1017,3m8o,H,L,EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLE...,DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPG...,Human IgA1 Fab fragment,20110330,X-ray diffraction,1.55,,homo_sapiens,IgA1,kappa,ASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQ...,RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNA...,EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLE...,DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPG...,"CYS133(H)-CYS219(L):5.69, CYS145(H)-CYS204(H):...",EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLE...,DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPG...


In [None]:
# Extract relevant sequences

# Header should be like: >{pdbID}_{pdb-chain}_{Heavy/Light}_{Constant/Variable}_{IgType/Variable-Region}_{Template/Target}

# For each row in df, take all heavy-chain info, take only V or C info, take sequence, put in a FASTA format, 
# repeat for light-chain, then move onto the next row
def _last_row_for_pdb(df, pdb_id, label):
    """Return the last row of ``df`` whose "pdb" equals ``pdb_id``; raise ValueError if there is none."""
    matches = df[df["pdb"] == pdb_id]
    if matches.empty:
        raise ValueError(f"No row found for {label} template {pdb_id!r}")
    return matches.iloc[-1]


def _required_sequence(row, column):
    """Return ``row[column]`` as a string; raise ValueError if the column is absent or the value is missing."""
    if column not in row.index or pd.isna(row[column]):
        raise ValueError(f"Missing {column!r} for PDB {row['pdb']}")
    return str(row[column])


def _format_fasta(records):
    """Join (header, sequence) pairs into FASTA text, one line each, newline-terminated."""
    return "".join(f"{header}\n{sequence}\n" for header, sequence in records)


def write_fasta_from_df(df, heavy_out_path, light_out_path, v_template, c_template):
    """Write one heavy-chain and one light-chain FASTA file for a V/C template pair.

    Each file receives three records, in this order:
      1. the full chain of the variable-region template (``v_template``),
      2. the full chain of the constant-region template (``c_template``),
      3. a hybrid target: trimmed V region of ``v_template`` followed by the
         C region of ``c_template``.

    Header layout:
    >{pdbID}_{pdb-chain}_{Heavy/Light}_{Constant/Variable}_{IgType/Variable-Region}_{Template/Target}

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide "pdb", "Hchain", "Lchain", "H_coordinate_seq", "L_coordinate_seq",
        "H_isotype_clean", "HC_coordinate_seq", "LC_coordinate_seq", "HV_trimmed"
        and "LV_trimmed".
    heavy_out_path, light_out_path : str
        Destination paths of the heavy- and light-chain FASTA files.
    v_template, c_template : str
        PDB IDs of the variable-region and constant-region templates. If several
        rows share a PDB ID, the last one is used.

    Raises
    ------
    ValueError
        If a template has no row in ``df`` or a required sequence is missing.
        Raised before any file is opened, so existing output is never truncated
        by a call that cannot succeed.

    Notes
    -----
    I/O failures are reported with a printed message rather than raised.
    """
    v_row = _last_row_for_pdb(df, v_template, "variable-region")
    c_row = _last_row_for_pdb(df, c_template, "constant-region")

    # Both rows were selected because they are templates, so the role is fixed.
    role = "Template"
    # The heavy-chain isotype of the constant template labels both chains' records.
    ig_label = str(c_row["H_isotype_clean"])

    # Concatenate trimmed V region + C region to build the hybrid targets
    hybrid_heavy_seq = _required_sequence(v_row, "HV_trimmed") + _required_sequence(c_row, "HC_coordinate_seq")
    hybrid_light_seq = _required_sequence(v_row, "LV_trimmed") + _required_sequence(c_row, "LC_coordinate_seq")

    heavy_records = [
        (f">{v_row['pdb']}_{v_row['Hchain']}_Heavy_V_Var_{role}",
         _required_sequence(v_row, "H_coordinate_seq")),
        (f">{c_row['pdb']}_{c_row['Hchain']}_Heavy_C_{c_row['H_isotype_clean']}_{role}",
         _required_sequence(c_row, "H_coordinate_seq")),
        (f">Heavy_VC_Hybrid_{ig_label}_Target", hybrid_heavy_seq),
    ]
    light_records = [
        (f">{v_row['pdb']}_{v_row['Lchain']}_Light_V_Var_{role}",
         _required_sequence(v_row, "L_coordinate_seq")),
        (f">{c_row['pdb']}_{c_row['Lchain']}_Light_C_{c_row['H_isotype_clean']}_{role}",
         _required_sequence(c_row, "L_coordinate_seq")),
        (f">Light_VC_Hybrid_{ig_label}_Target", hybrid_light_seq),
    ]

    print(hybrid_heavy_seq)
    print(hybrid_light_seq)

    try:
        # Explicit encoding keeps the output independent of the platform default.
        with open(heavy_out_path, "w", encoding="utf-8") as heavy_out, \
                open(light_out_path, "w", encoding="utf-8") as light_out:
            # Write Heavy Chain FASTA
            heavy_out.write(_format_fasta(heavy_records))
            # Write Light Chain FASTA
            light_out.write(_format_fasta(light_records))
    except OSError as e:
        print(f"Error whilst writing FASTA file: {e}")
        return

    print(f"FASTAs written to: {heavy_out_path}, {light_out_path}")

#>1n8z_B_Heavy_V_Var_Template
#>1n8z_A_Light_V_Var_Template

#>3m8o_H_Heavy_C_IgA1_Template
#>3m8o_L_Light_C_IgA1_Template

#>Heavy_VC_Hybrid_IgA1_Target
#>Light_VC_Hybrid_IgA1_Target
# Need "HV/LV_trimmed" + "HC/LC_coordinate_seq"

In [60]:
# Write the heavy- and light-chain FASTA files for the selected template pair
write_fasta_from_df(
    df_filtered,
    heavy_out_path="../FASTA_seqs/heavy_chains_FASTA_out.fasta",
    light_out_path="../FASTA_seqs/light_chains_FASTA_out.fasta",
    v_template=v_template,
    c_template=c_template,
)

EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSSASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFPPSQDASGDLYTTSSQLTLPATQCLAGKSVTCHVKHYTNPSQDVTVPCP
DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
FASTAs written to:  <_io.TextIOWrapper name='../FASTA_seqs/heavy_chains_FASTA_out.fasta' mode='w' encoding='cp1252'> <_io.TextIOWrapper name='../FASTA_seqs/light_chains_FASTA_out.fasta' mode='w' encoding='cp1252'>


In [None]:
# Don't run this cell. THIS DOES NOT WORK because SeqRecord class objects can only hold .id, .desc and .seq.
from Bio import SeqIO

# Read the clustal alignment records and hand them straight to the PIR writer
records = SeqIO.parse("THIS_IS_YOUR_INPUT_FILE.clustal", "clustal")
count = SeqIO.write(records, "THIS_IS_YOUR_OUTPUT_FILE.pir", "pir")
print(f"Converted {count} records")

In [None]:
import os

from Bio import SeqIO

# Clustal-format alignment of the heavy chains (path is relative to the working directory)
MSA_heavy_filepath = "../Alignments/clustalo_alignment_IgA1_heavy - Copy.clustal"

# Parse the alignment records and write them back out in PIR format;
# the value returned by SeqIO.write is reported as the number of converted records.
records = SeqIO.parse(MSA_heavy_filepath, "clustal")
count = SeqIO.write(records, "../Alignments/alignment_IgA1_heavy_copy.pir", "pir")
print(f"Converted {count} records")

# Show the working directory that the relative paths above are resolved against
print(os.getcwd())

Converted 3 records


In [None]:
from Bio import AlignIO

# Convert the clustal alignment at MSA_heavy_filepath to PIR format.
# The output path has no directory or extension, so the file is created under that
# bare name relative to the current working directory.
AlignIO.convert(
    MSA_heavy_filepath,
    "clustal",
    "alignment_IgA1_heavy_alignIO_convert_test1",
    "pir",
)

In [None]:
# The allowed format:
# >P1;3m8o
# structure:pdb_file:.:.:.:.::::
# seq1---/seq2---*
def write_modeller_pir():
    """Placeholder for a writer that emits an alignment in the PIR layout below.

    Target layout (taken from the notes in this cell):
        >P1;3m8o
        structure:pdb_file:.:.:.:.::::
        seq1---/seq2---*

    TODO: not implemented yet - parameters and output handling still need to be defined.

    Raises
    ------
    NotImplementedError
        Always, until the writer is implemented.
    """
    raise NotImplementedError("write_modeller_pir is not implemented yet")

In [77]:
# Print the length and full sequence of the heavy and light chain of every template row
for idx, row in df_filtered.iterrows():
    heavy_seq = row["H_coordinate_seq"]
    light_seq = row["L_coordinate_seq"]
    print(len(heavy_seq), heavy_seq)
    # The extra "\n" argument leaves a blank line between templates
    print(len(light_seq), light_seq, "\n")

220 EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEP
214 DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC 

221 EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLEWVGRIKRNAESDATAYAASMRGRLTISRDDSKNTAFLQMNSLKSDDTAMYYCVIRGDVYNRQWGQGTLVTVSSASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFPPSQDASGDLYTTSSQLTLPATQCLAGKSVTCHVKHYTNPSQDVTVPCP
219 DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPGQSPQPLIYLGSTRASGVPDRFSGSGSGTDFTLKIIRVEAEDAGTYYCMQNKQTPLTFGQGTRLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC 



In [78]:
from Bio.PDB import MMCIFParser
from Bio.SeqUtils import seq1

# Load the 1n8z structure from its mmCIF file
parser = MMCIFParser()
structure = parser.get_structure("1n8z", "../atom_files/1n8z.cif")

# For every chain of every model, print the chain id, sequence length and sequence
for model in structure:
    for chain in model:
        # Join the residue names of the chain and convert them with seq1
        seq = "".join(residue.resname for residue in chain.get_residues())
        seq_1_letter = seq1(seq)
        # Strip every "X" from the converted sequence before reporting it
        # (presumably residues seq1 could not map to a standard letter - TODO confirm)
        clean_seq_1_letter = seq_1_letter.replace("X", "")
        print(chain.id, len(clean_seq_1_letter), clean_seq_1_letter)

A 214 DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
B 220 EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEP
C 581 TQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLELTYLPTNASLSFLQDIQEVQGYVLIAHNQVRQVPLQRLRIVRGTQLFEDNYALAVLDNGDPLSPGGLRELQLRSLTEILKGGVLIQRNPQLCYQDTILWKDIFHKNNQLALTLIDTNRSRACHPCSPMCKGSRCWGESSEDCQSLTRTVCAGGCARCKGPLPTDCCHEQCAAGCTGPKHSDCLACLHFNHSGICELHCPALVTYNTDTFESMPNPEGRYTFGASCVTACPYNYLSTDVGSCTLVCPLHNQEVTATQRCEKCSKPCARVCYGLGMEHLREVRAVTSANIQEFAGCKKIFGSLAFLPESFDSNTAPLQPEQLQVFETLEEITGYLYISAWPDSLPDLSVFQNLQVIRGRILHNGAYSLTLQGLGISWLGLRSLRELGSGLALIHHNTHLCFVHTVPWDQLFRNPHQALLHTANRPEDECVGEGLACHQLCARGHCWGPGPTQCVNCSQFLRGQECVEECRVLQGLPREYVNARHCLPCHPECQPQNGSVTCFGPEADQC



In [79]:
from Bio.PDB import MMCIFParser
from Bio.SeqUtils import seq1

# Load the 3m8o structure from its mmCIF file
parser = MMCIFParser()
structure = parser.get_structure("3m8o", "../atom_files/3m8o.cif")

# For every chain of every model, print the chain id, sequence length and sequence
for model in structure:
    for chain in model:
        # Join the residue names of the chain and convert them with seq1
        seq = "".join(residue.resname for residue in chain.get_residues())
        seq_1_letter = seq1(seq)
        # Strip every "X" from the converted sequence before reporting it
        # (presumably residues seq1 could not map to a standard letter - TODO confirm)
        clean_seq_1_letter = seq_1_letter.replace("X", "")
        print(chain.id, len(clean_seq_1_letter), clean_seq_1_letter)

L 219 DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPGQSPQPLIYLGSTRASGVPDRFSGSGSGTDFTLKIIRVEAEDAGTYYCMQNKQTPLTFGQGTRLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
H 221 EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLEWVGRIKRNAESDATAYAASMRGRLTISRDDSKNTAFLQMNSLKSDDTAMYYCVIRGDVYNRQWGQGTLVTVSSASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFPPSQDASGDLYTTSSQLTLPATQCLAGKSVTCHVKHYTNPSQDVTVPCP




DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC

EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEP
EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEP

DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPGQSPQPLIYLGSTRASGVPDRFSGSGSGTDFTLKIIRVEAEDAGTYYCMQNKQTPLTFGQGTRLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
DIVMTQSPLSLSVTPGEPASISCRSSQSLLRRDGHNDLEWYLQKPGQSPQPLIYLGSTRASGVPDRFSGSGSGTDFTLKIIRVEAEDAGTYYCMQNKQTPLTFGQGTRLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC

EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLEWVGRIKRNAESDATAYAASMRGRLTISRDDSKNTAFLQMNSLKSDDTAMYYCVIRGDVYNRQWGQGTLVTVSSASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFPPSQDASGDLYTTSSQLTLPATQCLAGKSVTCHVKHYTNPSQDVTVPCP
EVQLVESGGGLVQPGGSLKLSCAASGFTLSGSNVHWVRQASGKGLEWVGRIKRNAESDATAYAASMRGRLTISRDDSKNTAFLQMNSLKSDDTAMYYCVIRGDVYNRQWGQGTLVTVSSASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFPPSQDASGDLYTTSSQLTLPATQCLAGKSVTCHVKHYTNPSQDVTVPCP