In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
import pandas as pd
import random
import re
from Bio import SeqIO
from IPython.display import display, HTML
def build_ppi_dataset(dip_file, fasta_file):
    # -------- 1. Load DIP interaction table -------- #
    df = pd.read_csv(dip_file, sep="\t", index_col=False)
    display(df)
    # Extract UniProt IDs
    def extract_uniprot(x):
        match = re.search(r'uniprotkb:([A-Z0-9]+)', str(x))
        return match.group(1) if match else None

    df["protA"] = df["ID interactor A"].apply(extract_uniprot)
    df["protB"] = df["ID interactor B"].apply(extract_uniprot)

    # Keep only valid, unique, non-self interactions
    ppi = df[["protA", "protB"]].dropna().drop_duplicates()
    ppi = ppi[ppi["protA"] != ppi["protB"]]
    ppi["label"] = 1  # positive label

    # -------- 2. Generate Negative Samples -------- #
    positive_pairs = set((a, b) for a, b in zip(ppi["protA"], ppi["protB"]))
    proteins = list(set(ppi["protA"]).union(set(ppi["protB"])))

    negatives = set()
    random.seed(42)
    while len(negatives) < len(ppi):
        a, b = random.sample(proteins, 2)
        if (a, b) not in positive_pairs and (b, a) not in positive_pairs:
            negatives.add((a, b))

    neg_df = pd.DataFrame(list(negatives), columns=["protA", "protB"])
    neg_df["label"] = 0  # negative label

    # -------- 3. Combine Positive + Negative -------- #
    final_df = pd.concat([ppi, neg_df], ignore_index=True)

    # -------- 4. Load FASTA and map UniProt → Sequence -------- #
    uniprot_to_seq = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        header = record.description
        seq = str(record.seq)

        match = re.search(r'uniprot:([A-Z0-9]+)', header)
        if match:
            uniprot = match.group(1)
            uniprot_to_seq[uniprot] = seq

    # -------- 5. Add sequences -------- #
    final_df["seqA"] = final_df["protA"].map(uniprot_to_seq)
    final_df["seqB"] = final_df["protB"].map(uniprot_to_seq)

    # Remove rows where sequences missing
    final_df = final_df.dropna(subset=["seqA", "seqB"]).reset_index(drop=True)

    return final_df

In [None]:
final_df = build_ppi_dataset("Scere20170205.txt", "fasta20171201.seq")
final_df

  df = pd.read_csv(dip_file, sep="\t", index_col=False)


Unnamed: 0,ID interactor A,ID interactor B,Alt. ID interactor A,Alt. ID interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s),Processing Status,Unnamed: 16
0,DIP-25N|refseq:NP_012903|uniprotkb:P09798,DIP-25N|refseq:NP_012903|uniprotkb:P09798,-,-,-,-,MI:0019(coimmunoprecipitation)|MI:0018(two hyb...,-,pubmed:7925276|pubmed:DIP-246S|pubmed:7925276|...,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0218(physical interaction)|MI:0218(physical...,MI:0465(dip),DIP-137E,dip-quality-status:core,dip:0002(small scale)|dip:0002(small scale),
1,DIP-343N|refseq:NP_009971|uniprotkb:P23255,DIP-80N|refseq:NP_523805|uniprotkb:P20227,-,-,-,-,MI:0045(experimental interaction detection)|MI...,-,pubmed:8178153|pubmed:DIP-31S|pubmed:8178153|p...,taxid:4932(Saccharomyces cerevisiae),taxid:7227(Drosophila melanogaster),MI:0218(physical interaction)|MI:0218(physical...,MI:0465(dip),DIP-163E,dip-quality-status:core,dip:0002(small scale)|dip:0002(small scale),
2,DIP-551N|refseq:NP_012231|uniprotkb:P07278,DIP-548N|refseq:NP_012371|uniprotkb:P06244,-,-,-,-,MI:0018(two hybrid)|MI:0018(two hybrid)|MI:039...,-,pubmed:16924114|pubmed:DIP-17259S|pubmed:16924...,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0915(physical association)|MI:0915(physical...,MI:0465(dip),DIP-244E,dip-quality-status:core,dip:0004(small scale)|dip:0004(small scale)|di...,
3,DIP-18N|refseq:NP_010765|uniprotkb:P06782,DIP-780N|refseq:NP_014142|uniprotkb:P38717,-,-,-,-,MI:0018(two hybrid),-,pubmed:8127709|pubmed:DIP-255S,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0218(physical interaction),MI:0465(dip),DIP-436E,dip-quality-status:core,dip:0002(small scale),
4,DIP-1104N|refseq:NP_013826|uniprotkb:P07342,DIP-671N|refseq:NP_009918|uniprotkb:P25605,-,-,-,-,MI:0013(biophysical)|MI:0071(molecular sieving...,-,pubmed:10213630|pubmed:DIP-2092S|pubmed:114633...,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0218(physical interaction)|MI:0218(physical...,MI:0465(dip),DIP-125E,dip-quality-status:core,dip:0002(small scale)|dip:0002(small scale)|di...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22972,DIP-4637N|refseq:NP_013525|uniprotkb:O13563,DIP-6817N|refseq:NP_013180|uniprotkb:P38634,-,-,-,-,MI:0096(pull down),-,pubmed:26912900|pubmed:DIP-18124S,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0915(physical association),MI:0465(dip),DIP-201142E,dip-quality-status:core,dip:0004(small scale),
22973,DIP-1294N|refseq:NP_011314|uniprotkb:P53091,DIP-2405N|refseq:NP_013204|uniprotkb:Q08032,-,-,-,-,MI:0007(anti tag coimmunoprecipitation),-,pubmed:26854664|pubmed:DIP-18092S,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0915(physical association),MI:0465(dip),DIP-201270E,dip-quality-status:core,dip:0004(small scale),
22974,DIP-1294N|refseq:NP_011314|uniprotkb:P53091,DIP-1812N|refseq:NP_010296|uniprotkb:Q12488,-,-,-,-,MI:0007(anti tag coimmunoprecipitation),-,pubmed:26854664|pubmed:DIP-18092S,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0915(physical association),MI:0465(dip),DIP-201271E,dip-quality-status:core,dip:0004(small scale),
22975,DIP-2492N|refseq:NP_116650|uniprotkb:P07560,DIP-963N|refseq:NP_014824|uniprotkb:Q12446,-,-,-,-,"MI:0096(pull down)|MI:0809,(bimolecular fluore...",-,pubmed:27526190|pubmed:DIP-18183S|pubmed:27526...,taxid:4932(Saccharomyces cerevisiae),taxid:4932(Saccharomyces cerevisiae),MI:0407(direct interaction)|MI:0915(physical a...,MI:0465(dip),DIP-201456E,dip-quality-status:core,dip:0004(small scale)|dip:0004(small scale),


Unnamed: 0,protA,protB,label,seqA,seqB
0,P23255,P20227,1,MMSFSKNATPRAIVSESSTLHEMKFRNFRVAHEKISLDIDLATHCI...,MDQMLSPNFSIPSIGTPLHQMEADQQIVANPVYHPPAVSQPDSLMP...
1,P07278,P06244,1,MVSSLPKESQAELQLFQNEINAANPSDFLQFSANYFNKRLEQQRAF...,MSTEEQNGGGQKSLDDRQGEESQKGETSERETTATESGNESKSVEK...
2,P06782,P38717,1,MSSNNNTNTAPANANSSHHHHHHHHHHHHHGHGGSNSTLNNPKSSL...,MSVHGRDPKKRQLRLISVAFKEASIDSPSFRASVNFFQTRVDALED...
3,P07342,P25605,1,MIRQSTLKNFAIKRCFQHIAYRNTPAMRSVALAQRFYSSSSRYYSA...,MLRSLLQSGHRRVVASSCATMVRCSSSSTSALAYKQMHRHATRPPL...
4,P38930,P15790,1,MGSRSENVGTVTREGSRVEQDDVLMDDDSDSSEYVDMWIDLFLGRK...,MKCRVWSEARVYTNINKQRTEEYWDYENTVIDWSTNTKDYEIENKV...
...,...,...,...,...,...
45061,P41832,P05221,0,MLKNSGSKHSNSKESHSNSSSGIFQNLKRLANSNATNSNTGSPTYA...,MASTVSNTSKLEKPVSLIWGCELNEQNKTFEFKVEDDEEKCEHQLA...
45062,P40060,Q08231,0,MPSKDPESVIDKEIRKISARNDELIKQDGTLKREYTTLLRKVSSVI...,MDMANQLLDELAHGNFSHLTLNLSQNGREIAILQKQLTGFDDKQLE...
45063,Q7LIF1,Q99303,0,MIRYSSRNRSAREVPVRRHPIFQVQHWKTSNEHSYHYSLCITFRSN...,MESQQLHQNPHSLHGSAAASVTSKEVPSNQDPLAVSASNLPEFDRD...
45064,P14680,Q07589,0,MNSSNNNDSSSSNSNMNNSLSPTLVTHSDASMGSGRASPDNSHMGR...,MSRILVIGAGGVGVITALSLWLKKESDVSLVVRSDYDRVLKHGYTI...


In [None]:
final_df = build_ppi_dataset("Hsapi20170205.txt", "fasta20171201.seq")
final_df

  df = pd.read_csv(dip_file, sep="\t", index_col=False)


Unnamed: 0,protA,protB,label,seqA,seqB
0,Q62311,P21675,1,MAEEKKLKLSNTVLPSESMKVVAESMGIAQIQEETCQLLTDEVSYR...,MGPGCDLLLRTAATITAAAIMSDTDSDEDSAGGGPFSLAGFLFGNI...
1,P29375,P06400,1,MAGVGPGGYAAEFVPPPECPVFEPSWEEFTDPLSFIGRIRPLAEKT...,MPPKTPRKTAATAAAAAAEPPAPPPPPPPEEDPEQDSGPEDLPLVR...
2,P20226,P09086,1,MDQNNSLPPYAQGLASPQGAMTPGIPIFSPMMPYGTGLTPQPIQNT...,MVHSSMGAPEIRMSKPLEAEKQGLDSPSEHTDTERNGPDTNHQNPQ...
3,P22681,P46108,1,MAGNVKKSSGAGGGSGSGGSGSGGLIGLMKDAFQPHHHHHHHLSPH...,MAGNFDSEERSSWYWGRLSRQEAVALLQGQRHGVFLVRDSSTSPGD...
4,P27986,P06213,1,MSAEGYQYRALYDYKKEREEDIDLHLGDILTVNKGSLVALGFSDGQ...,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...
...,...,...,...,...,...
13761,Q9WU84,Q9BYF1,0,MASKSGDGGTVCALEFAVQMSCQSCVDAVHKTLKGVAGVQNVDVQL...,MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLA...
13762,P19138,Q96MS0,0,MSGPVPSRARVYTDVNTHRPREYWDYESHVVEWGNQDDYQLVRKLG...,MLRYLLKTLLQMNLFADSLAGDISNSSELLLGFNSSLAALNHTLLP...
13763,P14142,P61007,0,MPSGFQQIGSDDGEPPRQRVTGTLVLAVFSAVLGSLQFGYNIGVIN...,MAKTYDYLFKLLLIGDSGVGKTCVLFRFSEDAFNSTFISTIGIDFK...
13764,P30273,Q15116,0,MIPAVVLLLLLLVEQAAALGEPQLCYILDAILFLYGIVLTLLYCRL...,MQIPQAPWPVVWAVLQLGWRPGWFLDSPDRPWNPPTFSPALLVVTE...


In [None]:
!git clone https://github.com/gmthu66/AbAgIPA.git

Cloning into 'AbAgIPA'...
remote: Enumerating objects: 980, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 980 (delta 5), reused 64 (delta 4), pack-reused 915 (from 1)[K
Receiving objects: 100% (980/980), 358.39 MiB | 31.11 MiB/s, done.
Resolving deltas: 100% (182/182), done.
Updating files: 100% (893/893), done.


In [None]:
path = "https://raw.githubusercontent.com/gmthu66/AbAgIPA/refs/heads/main/SabDab/SabDabdatabase/positive_StdRecord.csv"
pos_df =pd.read_csv(path)

In [None]:
dict(zip(pos_df[["Ab_name",'Abseq']].iloc[:10].values.ravel()))

ValueError: dictionary update sequence element #0 has length 1; 2 is required

In [None]:
for name in pos_df["Ab_name"].unique():
    # pos_df[pos_df['Ab_name'].isin([name])]['Abseq'].values[0]
    if pos_df[pos_df['Ab_name'].isin([name])].shape[0]>1:
        display(pos_df[pos_df['Ab_name'].isin([name])]['Abseq'].values[0])
        break

'QVQLQQPGAELVRPGASVKLSCKASGYTLTTYWMNWFKQRPDQGLEWIGRIDPYDSETHYNQKFKDKAILTVDRSSSTAYMQLSSLTSEDSAVYYCTRFLQITTIIYGMDYWGQGTSVTVSSDVVMTQTPLSLPVSLGDQASISCRSSQTLVHSNGNTYLHWYLQKPGQSPKLLIYKVSNRFSGVPDRFSGSGSGTDFTLKISRVEAEDLGVYFCSQNTHVPYTFGGGTKLEIK'

In [None]:
pos_df[pos_df['Ab_name']==name]

Unnamed: 0,sudo_at,sudo_ab,merged_index,At_name,Ab_name,Hseq,Lseq,Abseq,len_0,len_1,cdr_fr_info,init_ab_mergeobj,ab_mergeobj,ab_preDir,init_at_mergeobj,Atseq,at_mergeobj,at_preDir,pos_dataid
3890,6s3d_E_F_O,6s3d_E_F_O,3891,6s3d_O,6s3d_EF,VQLVQSGAEVKKPGSSVMVSCQASGGPLRNYIINWLRQAPGQGPEW...,DIQMTQSPSSLSAAVGDRVTITCQASQDIVNYLNWYQQKPGKAPKL...,VQLVQSGAEVKKPGSSVMVSCQASGGPLRNYIINWLRQAPGQGPEW...,125.0,107.0,"{'V0_fr': array([[ 0, 24],\n [ 30, 50...",6s3d_E_F_O,6s3d_E_F_O,/data/gm_data/AbAtInteraction/AbAtIPA/abatInte...,6s3d_E_F_O,SPCDKQKNYIDKQLLPIVNKAGCSRPEEVEERIRRALKKMGDTSCF...,6s3d_E_F_O,/data/gm_data/AbAtInteraction/AbAtIPA/abatInte...,3799.0


In [None]:
map_seq =list(zip([(name,pos_df[pos_df['Ab_name'].isin([name])]['Abseq'].values[0]) for name in pos_df["Ab_name"].unique()]))

In [None]:
pd.DataFrame(map_seq)

Unnamed: 0,0
0,"(4k3j_HL, EVQLVESGGGLVQPGGSLRLSCAASGYTFTSYWLHW..."
1,"(4ers_HL, QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHW..."
2,"(3tt1_IM, QVQLQQSGAELARPGASVKLSCKASGYTFTDYYINW..."
3,"(1qfu_HL, QVQLQQPGAELVRPGASVKLSCKASGYTLTTYWMNW..."
4,"(2vir_BA, QVQLKESGPGLVAPSQSLSITCTVSGFLLISNGVHW..."
...,...
900,"(4uu9_HL, EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSW..."
901,"(6b0s_HL, EVQLQESGGGLVKPGGSLRLSCAASGFTFSSYSMNW..."
902,"(5wk3_WV, EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGW..."
903,"(5dhv_HL, QEQLVESGGRLVTPGTALTLTCKVSGFSLSGFWLNW..."


In [None]:
import pandas as pd
import random

def get_sabdab(path):
    df_raw = pd.read_csv(path)

    # -------------------------------------------------------
    # 1. Build UNIQUE dictionaries for all sequences
    # -------------------------------------------------------
    # Ab side
    ab_unique = df_raw[["Ab_name", "Abseq", "Hseq", "Lseq"]].dropna().drop_duplicates("Ab_name")
    abseq_map = dict(zip(ab_unique["Ab_name"], ab_unique["Abseq"]))
    hseq_map  = dict(zip(ab_unique["Ab_name"], ab_unique["Hseq"]))
    lseq_map  = dict(zip(ab_unique["Ab_name"], ab_unique["Lseq"]))

    # Antigen side
    at_unique = df_raw[["At_name", "Atseq"]].dropna().drop_duplicates("At_name")
    atseq_map = dict(zip(at_unique["At_name"], at_unique["Atseq"]))

    # -------------------------------------------------------
    # 2. Create positive pairs
    # -------------------------------------------------------
    pos_df = df_raw[["Ab_name", "At_name"]].dropna().drop_duplicates()
    pos_df = pos_df[pos_df["Ab_name"] != pos_df["At_name"]]
    pos_df["label"] = 1

    # -------------------------------------------------------
    # 3. Negative sampling
    # -------------------------------------------------------
    positive_pairs = set(zip(pos_df["Ab_name"], pos_df["At_name"]))
    proteins = list(set(ab_unique["Ab_name"]).union(set(at_unique["At_name"])))

    negatives = set()
    random.seed(42)
    while len(negatives) < len(pos_df):
        a, b = random.sample(proteins, 2)
        if a != b and (a, b) not in positive_pairs and (b, a) not in positive_pairs:
            negatives.add((a, b))

    neg_df = pd.DataFrame(list(negatives), columns=["Ab_name", "At_name"])
    neg_df["label"] = 0

    # -------------------------------------------------------
    # 4. Combine datasets
    # -------------------------------------------------------
    final_df = pd.concat([pos_df, neg_df], ignore_index=True)

    # -------------------------------------------------------
    # 5. Map all sequences like map_seq method
    # -------------------------------------------------------
    final_df["Abseq"] = final_df["Ab_name"].map(abseq_map)
    final_df["Atseq"] = final_df["At_name"].map(atseq_map)
    final_df["Hseq"]  = final_df["Ab_name"].map(hseq_map)
    final_df["Lseq"]  = final_df["Ab_name"].map(lseq_map)

    # Drop rows where sequences could not be mapped
    final_df = final_df.dropna(subset=["Abseq", "Atseq", "Hseq", "Lseq"]).reset_index(drop=True)

    return final_df


In [None]:
df = get_sabdab("https://raw.githubusercontent.com/gmthu66/AbAgIPA/refs/heads/main/SabDab/SabDabdatabase/positive_StdRecord.csv")

In [None]:
df

Unnamed: 0,Ab_name,At_name,label,Abseq,Atseq,Hseq,Lseq
0,4k3j_HL,4k3j_B,1,EVQLVESGGGLVQPGGSLRLSCAASGYTFTSYWLHWVRQAPGKGLE...,MKYQLPNFTAETPIQNVILHEHHIFLGATNYIYVLNEEDLQKVAEY...,EVQLVESGGGLVQPGGSLRLSCAASGYTFTSYWLHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCKSSQSLLYTSSQKNYLAWYQQKP...
1,4ers_HL,4ers_A,1,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,VMDFLFEKWKLYGDQCHHNLSLLPPPTELVCNRTFDKYSCWPDTPA...,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...
2,3tt1_IM,3tt1_A,1,QVQLQQSGAELARPGASVKLSCKASGYTFTDYYINWMKQRTGQGLE...,REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...,QVQLQQSGAELARPGASVKLSCKASGYTFTDYYINWMKQRTGQGLE...,DIVLTQSPASLAVSLGQRATISCKASQSVDYDGDSYMNWYQQKPGQ...
3,1qfu_HL,1qfu_A,1,QVQLQQPGAELVRPGASVKLSCKASGYTLTTYWMNWFKQRPDQGLE...,STATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN...,QVQLQQPGAELVRPGASVKLSCKASGYTLTTYWMNWFKQRPDQGLE...,DVVMTQTPLSLPVSLGDQASISCRSSQTLVHSNGNTYLHWYLQKPG...
4,2vir_BA,1qfu_A,1,QVQLKESGPGLVAPSQSLSITCTVSGFLLISNGVHWVRQPPGKGLE...,STATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN...,QVQLKESGPGLVAPSQSLSITCTVSGFLLISNGVHWVRQPPGKGLE...,QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...
...,...,...,...,...,...,...,...
4669,3w2d_HL,4dw2_U,0,DVQLQESGPGLVRPSQSLSLTCTVTGYSLTSDFAWNWIRQFPGNKL...,FTTIENQPWFAAIYRRVTYVCGGSLISPCWVISATHCFIDYPKKED...,DVQLQESGPGLVRPSQSLSLTCTVTGYSLTSDFAWNWIRQFPGNKL...,ENVLTQSPAIMSASPGETVTMTCRATSSVSSTYLHWYQQKSGASPK...
4670,3wkm_HL,4xmp_G,0,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYLHWVKQRPEHGLE...,WKEAETTLFCASDAKAYETEKHNVWATHACVPTDPNPQEIHLENVT...,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYLHWVKQRPEHGLE...,YIVLTQSPVSLAVSLGQRATISCRASESVDSYGDSFMHWYQQKPGQ...
4671,6pis_IM,4hf5_A,0,QLQLQESGPGLVKPSQSLSLACSVTGFSLSTGGYQWTWIRQFPGKK...,PGDQICIGYHANNSTEKVDTILERNVTVTHAKDILEKTHNGKLCKL...,QLQLQESGPGLVKPSQSLSLACSVTGFSLSTGGYQWTWIRQFPGKK...,DIQLTQLPSFLSVSPGDKVTITCKASQNINQYLHWYQQKPEEAPKL...
4672,5czx_HL,1i9r_B,0,QVQLVQSGAEVKKPGSSVKVSCKASGGTFRTYAMHWVRQAPGQGLE...,NPQIAAHVISEASSKTTSVLQWAEKGYYTMSNNLVTLENGKQLTVK...,QVQLVQSGAEVKKPGSSVKVSCKASGGTFRTYAMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSIASYLAWYQQKPGKAPKL...


In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,3774
0,3774


In [None]:
df[df['label']==1].sample(1).to_dict()

{'Ab_name': {2662: '6gku_HL'},
 'At_name': {2662: '6glx_B'},
 'label': {2662: 1},
 'Abseq': {2662: 'QLQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAINSGGGSTSYADSVKGRFTISRDNAKNTLYLQMNSLKPEDTAVYYCATPGDRLWYYRYDYWGQGTQVTVSSQAGLTQPPSVSGTLGKAVTISCAGTSSDIGYGNYVSWYQQLPGTAPKLLIYKVSRRASGVPDRFSGSKSGNTASLSISGLQSEDEADYYCASYRYRNNVVFGGGTHLTVL'},
 'Atseq': {2662: 'SLLPVPYTEAASLSTGSTVTIKGRPLACFLNEPYLQVDFHTEMKEESDIVFHFQVCFGRRVVMNSREYGAWKQQVESKNMPFQDGQEFELSISVLPDKYQVMVNGQSSYTFDHRIKPEAVKMVQVWRDISLTKFNVS'},
 'Hseq': {2662: 'QLQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAINSGGGSTSYADSVKGRFTISRDNAKNTLYLQMNSLKPEDTAVYYCATPGDRLWYYRYDYWGQGTQVTVSS'},
 'Lseq': {2662: 'QAGLTQPPSVSGTLGKAVTISCAGTSSDIGYGNYVSWYQQLPGTAPKLLIYKVSRRASGVPDRFSGSKSGNTASLSISGLQSEDEADYYCASYRYRNNVVFGGGTHLTVL'}}

In [None]:
!sudo apt install cd-hit clustalw


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  clustalx seaview
The following NEW packages will be installed:
  cd-hit clustalw
0 upgraded, 2 newly installed, 0 to remove and 41 not upgraded.
Need to get 796 kB of archives.
After this operation, 1,901 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 cd-hit amd64 4.8.1-4 [521 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 clustalw amd64 2.1+lgpl-7 [275 kB]
Fetched 796 kB in 1s (762 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfi

In [None]:
import os
import pandas as pd
import requests
from Bio import SeqIO
from Bio.PDB import PDBList
import subprocess
import random
from sklearn.model_selection import train_test_split

##########################################################################
# 1. Load dataset
##########################################################################

df = pd.read_csv("/content/20251129_0374397_summary.tsv",sep = '\t')   # your file

# required columns from your sample:
# pdb, Hchain, Lchain, antigen_chain

df = df.dropna(subset=["pdb", "Hchain", "antigen_chain"])

print("Total complexes:", len(df))


##########################################################################
# 2. Download sequences for heavy, light, antigen chains (from PDBe API)
##########################################################################

def fetch_chain_sequence(pdb_id, chain_id):
    """Fetch chain sequence from PDBe REST API."""
    url = f"https://www.ebi.ac.uk/pdbe/api/pdb/entry/residue_listing/{pdb_id.lower()}"
    r = requests.get(url)

    if r.status_code != 200:
        return None

    data = r.json()
    try:
        reslist = data[pdb_id.lower()]["molecules"]
    except:
        return None

    for mol in reslist:
        for chain in mol["chains"]:
            if chain["chain_id"] == chain_id:
                seq = "".join([r["residue_name"] for r in chain["residues"]])
                # 3-letter AA → 1-letter AA
                from Bio.Data.IUPACData import protein_letters_3to1_extended
                out = ""
                for aa in seq.split():
                    out += protein_letters_3to1_extended.get(aa, "X")
                return out
    return None


# Fetch sequences
df["heavy_seq"]  = df.apply(lambda r: fetch_chain_sequence(r["pdb"], r["Hchain"])
                            if r["Hchain"] != "NA" else None, axis=1)

df["light_seq"]  = df.apply(lambda r: fetch_chain_sequence(r["pdb"], r["Lchain"])
                            if r["Lchain"] != "NA" else "", axis=1)

df["antigen_seq"] = df.apply(lambda r: fetch_chain_sequence(r["pdb"], r["antigen_chain"]), axis=1)

df = df.dropna(subset=["heavy_seq", "antigen_seq"])
print("After sequence extraction:", len(df))


##########################################################################
# 3. Filter antigen length ≥ 50
##########################################################################

df = df[df["antigen_seq"].str.len() >= 50]
print("After antigen length filter:", len(df))  # expect ~1489 type


##########################################################################
# 4. Build antibody sequence (heavy + light)
##########################################################################

df["antibody_seq"] = df["heavy_seq"] + df["light_seq"]


##########################################################################
# 5. CD-HIT antibody clustering @ 0.98
##########################################################################

with open("antibody.fasta", "w") as f:
    for i, r in df.iterrows():
        f.write(f">{r['pdb']}_{r['Hchain']}\n{r['antibody_seq']}\n")

subprocess.run(["cd-hit", "-i", "antibody.fasta", "-o", "ab98", "-c", "0.98"])

# Parse CD-HIT clusters
cluster_map = {}
current = None

with open("ab98.clstr") as f:
    for line in f:
        if line.startswith(">Cluster"):
            current = int(line.split()[1])
        else:
            name = line.split(">")[1].split("...")[0]
            cluster_map[name] = current

df["ab_cluster"] = df.apply(lambda r: cluster_map.get(f"{r['pdb']}_{r['Hchain']}"), axis=1)


##########################################################################
# 6. CD-HIT antigen clustering @ 0.90
##########################################################################

with open("antigen.fasta", "w") as f:
    for i, r in df.iterrows():
        f.write(f">{r['pdb']}_{r['antigen_chain']}\n{r['antigen_seq']}\n")

subprocess.run(["cd-hit", "-i", "antigen.fasta", "-o", "ag90", "-c", "0.90"])

ag_map = {}
current = None

with open("ag90.clstr") as f:
    for line in f:
        if line.startswith(">Cluster"):
            current = int(line.split()[1])
        else:
            name = line.split(">")[1].split("...")[0]
            ag_map[name] = current

df["ag_cluster"] = df.apply(lambda r: ag_map.get(f"{r['pdb']}_{r['antigen_chain']}"), axis=1)

print("Antibody clusters:", df["ab_cluster"].nunique())
print("Antigen clusters:", df["ag_cluster"].nunique())  # expect ≈ 408


##########################################################################
# 7. Generate Positive Samples
##########################################################################

positive = []
for i, r in df.iterrows():
    positive.append({
        "antibody_seq": r["antibody_seq"],
        "antigen_seq": r["antigen_seq"],
        "label": 1,
        "ag_cluster": r["ag_cluster"]
    })

print("Positive samples:", len(positive))   # expect ~3892


##########################################################################
# 8. Generate Negative Samples (antibodies paired to antigens from other clusters)
##########################################################################

clusters = df["ag_cluster"].unique()
negative = []

for p in positive:
    wrong_cluster = random.choice([c for c in clusters if c != p["ag_cluster"]])
    neg_antigen = df[df["ag_cluster"] == wrong_cluster].sample(1).iloc[0]["antigen_seq"]

    negative.append({
        "antibody_seq": p["antibody_seq"],
        "antigen_seq": neg_antigen,
        "label": 0,
        "ag_cluster": wrong_cluster
    })

print("Negative samples:", len(negative))


##########################################################################
# 9. Combine dataset
##########################################################################

dataset = pd.DataFrame(positive + negative)


##########################################################################
# 10. Train/Test split: 4:1
##########################################################################

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

train.to_csv("train_antibody_antigen.csv", index=False)
test.to_csv("test_antibody_antigen.csv", index=False)

print("Train:", train.shape)
print("Test :", test.shape)


Total complexes: 3276


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-3848509825.py", line 56, in <cell line: 0>
    df["heavy_seq"]  = df.apply(lambda r: fetch_chain_sequence(r["pdb"], r["Hchain"])
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/core/frame.py", line 10374, in apply
    return op.apply().__finalize__(self, method="apply")
           ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/core/apply.py", line 916, in apply
    return self.apply_standard()
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/core/apply.py", line 1063, in apply_standard
    results, res_index = self.apply_series_generator()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/pyth

TypeError: object of type 'NoneType' has no len()

# Option 4

In [None]:
import pandas as pd
import subprocess
import random
import numpy as np
# ----------------------------------------------------------------------
# Load dataset (your exact format)
# ----------------------------------------------------------------------
df = pd.read_csv("https://raw.githubusercontent.com/gmthu66/AbAgIPA/refs/heads/main/SabDab/SabDabdatabase/positive_StdRecord.csv")
df['label'] =1
# Abseq exists but we also have Hseq and Lseq explicitly
df["AntibodySeq"] = df["Hseq"] + df["Lseq"]


# ----------------------------------------------------------------------
# 1. CD-HIT on Antibodies (H + L sequences)
# ----------------------------------------------------------------------
with open("antibody_seqs.fasta", "w") as f:
    for i, row in df.iterrows():
        f.write(f">{row['Ab_name']}\n{row['AntibodySeq']}\n")

subprocess.run([
    "cd-hit",
    "-i", "antibody_seqs.fasta",
    "-o", "antibody_clustered",
    "-c", "0.98",
    "-n", "5"
])


# Parse CD-HIT cluster file
ab_cluster_map = {}
cluster_id = None
with open("antibody_clustered.clstr") as f:
    for line in f:
        if line.startswith(">Cluster"):
            cluster_id = int(line.split()[1])
        else:
            name = line.split(">")[1].split("...")[0]
            ab_cluster_map[name] = cluster_id

df["Ab_cluster"] = df["Ab_name"].map(ab_cluster_map)


# ----------------------------------------------------------------------
# 2. CD-HIT on Antigens (Atseq)
# ----------------------------------------------------------------------
with open("antigen_seqs.fasta", "w") as f:
    for i, row in df.iterrows():
        f.write(f">{row['At_name']}\n{row['Atseq']}\n")

subprocess.run([
    "cd-hit",
    "-i", "antigen_seqs.fasta",
    "-o", "antigen_clustered",
    "-c", "0.90",
    "-n", "5"
])

# Parse clusters
ag_cluster_map = {}
cluster_id = None
with open("antigen_clustered.clstr") as f:
    for line in f:
        if line.startswith(">Cluster"):
            cluster_id = int(line.split()[1])
        else:
            name = line.split(">")[1].split("...")[0]
            ag_cluster_map[name] = cluster_id

df["Ag_cluster"] = df["At_name"].map(ag_cluster_map)


# ----------------------------------------------------------------------
# 3. Generate positive samples (label = 1 from original data)
# ----------------------------------------------------------------------
positive = df[df["label"] == 1].copy()

positive_samples = []
for i, row in positive.iterrows():
    positive_samples.append({
        "Abseq": row["AntibodySeq"],
        "Atseq": row["Atseq"],
        "label": 1,
        "Ag_cluster": row["Ag_cluster"],
        "Ab_cluster": row["Ab_cluster"],
        'Hseq':row['Hseq'],
        'Lseq':row['Lseq']
    })


# ----------------------------------------------------------------------
# 4. Generate negative samples (Ag_cluster ≠ Ab_cluster)
# 1:1 ratio with positives
# ----------------------------------------------------------------------
negative_samples = []

all_clusters = df["Ag_cluster"].dropna().unique()

for _, row in positive.iterrows():

    # randomly choose a different antigen cluster
    different_clusters = [c for c in all_clusters if (c != row["Ag_cluster"])]
    neg_cluster = random.choice(different_clusters)

    # pick random antigen from that cluster
    neg_antigen = df[df["Ag_cluster"] == neg_cluster].sample(1).iloc[0]

    negative_samples.append({
        "Abseq": row["AntibodySeq"],
        "Atseq": neg_antigen["Atseq"],
        "label": 0,
        "Ag_cluster": neg_cluster,
        "Ab_cluster": row["Ab_cluster"],
        'Hseq':row['Hseq'],
        'Lseq':row['Lseq']
    })


# ----------------------------------------------------------------------
# 5. Build final dataset
# ----------------------------------------------------------------------
final_df = pd.DataFrame(positive_samples + negative_samples)

final_df.to_csv("positive_negative_dataset.csv", index=False)

print("Positive:", len(positive_samples))
print("Negative:", len(negative_samples))
print("Saved → positive_negative_dataset.csv")


Positive: 3891
Negative: 3891
Saved → positive_negative_dataset.csv


In [None]:
final_df[['Abseq','Atseq']].drop_duplicates()

Unnamed: 0,Abseq,Atseq
0,EVQLVESGGGLVQPGGSLRLSCAASGYTFTSYWLHWVRQAPGKGLE...,MKYQLPNFTAETPIQNVILHEHHIFLGATNYIYVLNEEDLQKVAEY...
1,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,VMDFLFEKWKLYGDQCHHNLSLLPPPTELVCNRTFDKYSCWPDTPA...
2,QVQLQQSGAELARPGASVKLSCKASGYTFTDYYINWMKQRTGQGLE...,REHWATRLGLILAMAGNAVGLGNFLRFPVQAAENGGGAFMIPYIIA...
3,QVQLQQPGAELVRPGASVKLSCKASGYTLTTYWMNWFKQRPDQGLE...,STATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN...
4,QVQLKESGPGLVAPSQSLSITCTVSGFLLISNGVHWVRQPPGKGLE...,STATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN...
...,...,...
7777,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,SQPDPKPDELHKSSKFTGLMENMKVLYDDNHVSAINVKSIDQFLYF...
7778,EVQLQESGGGLVKPGGSLRLSCAASGFTFSSYSMNWVRQAPGKGLE...,GEFSVCDSVSVWVGDKTTATDIKGKEVMVLGEVNINNSVFKQYFFE...
7779,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLE...,NPKLYFLSTFVVTYILWFTGAYLSFSSTYSGIYMLIMLPGLMAPFI...
7780,QEQLVESGGRLVTPGTALTLTCKVSGFSLSGFWLNWVRQAPGKGLE...,SKWVFEHPETLYAWEGACVWIPCTYRLESFILFHNPEYNKQTSKFD...


In [None]:
final_df.drop_duplicates(['Abseq','Atseq'])['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,3847
1,3266


In [None]:
final_df[final_df['label']==1].drop_duplicates(['Abseq','Atseq']).sample().to_dict()

{'Abseq': {2248: 'EVKLVESGGGLVKPGGSLKLSCAASGFTFSSYAMSWVRQTPEKRLEWVATISSGGTYTYYPDSVKGRFTISRDNAENTLYLQMSSLRSEDTAMYYCVRDGNSMDYWGQGTSVTVSSDIQMTQTTSSLSASLGDRVTIICRASQDINNYLNWYQQKPDGTVKLLIYYTSRLHSGVPSRFSGSGSGSDYSLTISNLEQEDIATYFCQQANTLPPTFGAGTKLELR'},
 'Atseq': {2248: 'VECDFSPLLSGTPPQVYNFKRLVFTNCNYNLTKLLSLFSVNDFTCSQISPAAIASNCYSSLILDYFSYPLSMKSDLSVSSAGPISQFNYKQSFSNPTCLILATVPHNLTTITKPLKYSYINKCSRLLSDDRTEVPQLVNANQYSPCVSIVPSTVWEDGDYYRKQLSPLEGGGWLVASGSTVAMTEQLQMGFGITVQYGTDTNSVCPKL'},
 'label': {2248: 1},
 'Ag_cluster': {2248: 212.0},
 'Ab_cluster': {2248: 697.0},
 'Hseq': {2248: 'EVKLVESGGGLVKPGGSLKLSCAASGFTFSSYAMSWVRQTPEKRLEWVATISSGGTYTYYPDSVKGRFTISRDNAENTLYLQMSSLRSEDTAMYYCVRDGNSMDYWGQGTSVTVSS'},
 'Lseq': {2248: 'DIQMTQTTSSLSASLGDRVTIICRASQDINNYLNWYQQKPDGTVKLLIYYTSRLHSGVPSRFSGSGSGSDYSLTISNLEQEDIATYFCQQANTLPPTFGAGTKLELR'}}

In [None]:
final_df[final_df['label']==0].drop_duplicates(['Abseq','Atseq']).sample().to_dict()

{'Abseq': {5154: 'VQLVESGPGLVKPLETLSLTCAVPGGSIRRNYWSWIRQPPGKGLEWIGHSYGSGGSTNYNPSLESRVTLSVDTSKNLFSLKLTSVTAADTAVYYCARTVWYYTSGTHYFDHWGQGVLVTVSSVLTQPPSVSAAPGQKVTISCSGSSSNIGRSYVSWYQQVPGAAPKLLIYDTNKRPSGVSDRFSGSKSGSSASLAITGLQTGDEADYYCGAWDGSLNVHIFGSGTKLTVL'},
 'Atseq': {5154: 'VFLGFLGAAGSTMGAASMTLTVQARNLLSGTVWGIKQLQARVLAVERYLRDQQLLGIWGCSGKLICCTNVPWNSSWSNRNLSEIWDNMTWLQWDKEISNYTQIIYGLLEESQNQQEKNEQDLLALD'},
 'label': {5154: 0},
 'Ag_cluster': {5154: 318.0},
 'Ab_cluster': {5154: 261.0},
 'Hseq': {5154: 'VQLVESGPGLVKPLETLSLTCAVPGGSIRRNYWSWIRQPPGKGLEWIGHSYGSGGSTNYNPSLESRVTLSVDTSKNLFSLKLTSVTAADTAVYYCARTVWYYTSGTHYFDHWGQGVLVTVSS'},
 'Lseq': {5154: 'VLTQPPSVSAAPGQKVTISCSGSSSNIGRSYVSWYQQVPGAAPKLLIYDTNKRPSGVSDRFSGSKSGSSASLAITGLQTGDEADYYCGAWDGSLNVHIFGSGTKLTVL'}}

In [None]:
import pandas as pd
import subprocess
import random
import numpy as np

df = pd.read_csv("https://raw.githubusercontent.com/gmthu66/AbAgIPA/refs/heads/main/SabDab/SabDabdatabase/positive_StdRecord.csv")
df['label'] = 1

df["AntibodySeq"] = df["Hseq"] + df["Lseq"]


In [None]:
positive = df.copy()

positive_samples = [
    {
        "Abseq": row["AntibodySeq"],
        "Atseq": row["Atseq"],
        "label": 1,
        # Kept only for optional analysis — will be dropped for ML model!
        "Ab_cluster": row["Ab_cluster"],
        "Ag_cluster": row["Ag_cluster"],
        "Hseq": row["Hseq"],
        "Lseq": row["Lseq"]
    }
    for _, row in positive.iterrows()
]
negative_samples = []

all_antigens = df["At_name"].unique()

for _, row in positive.iterrows():

    # all antigens except the true one
    candidate_antigens = df[df["At_name"] != row["At_name"]]

    neg_antigen = candidate_antigens.sample(1).iloc[0]

    negative_samples.append({
        "Abseq": row["AntibodySeq"],
        "Atseq": neg_antigen["Atseq"],
        "label": 0,
        "Ab_cluster": row["Ab_cluster"],
        "Ag_cluster": neg_antigen["Ag_cluster"],   # kept for analysis only
        "Hseq": row["Hseq"],
        "Lseq": row["Lseq"]
    })

# Optionally exclude antigens > 90% identical to the true antigen
candidate_antigens = candidate_antigens[
    candidate_antigens["Ag_cluster"] != row["Ag_cluster"]
]

final_df = pd.DataFrame(positive_samples + negative_samples)

final_df.to_csv("raw_dataset_with_clusters.csv", index=False)

# drop cluster columns for model
train_df = final_df.drop(columns=["Ag_cluster", "Ab_cluster"])

train_df.to_csv("positive_negative_dataset.csv", index=False)

print("Final dataset saved → positive_negative_dataset.csv")
