In [8]:
import pandas as pd
import random

# Seed Python's built-in RNG so the random.choice draws made later in this
# notebook are repeatable from a fresh kernel.
random.seed(42)
# Snapshot the freshly seeded generator state so it can be restored later.
st = random.getstate()
# Restoring the state that was just captured has no effect at this point;
# re-running random.setstate(st) later rewinds the generator to this spot.
random.setstate(st)

In [24]:
# Input table plus the two output files written for the pair dataset.
# All three live under the user's home directory ("~").
# NOTE: CSV_PATH and TSV_PATH are rebound by the "Ranking set" cell further
# down, so their value depends on which cell ran last.
DATA_PATH="~/projects/mint/downstream/GeneralPPI/antibodies/antibody_antigen_sequences.parquet"
CSV_PATH="~/projects/mint/downstream/GeneralPPI/antibodies/seqs.csv"
TSV_PATH="~/projects/mint/downstream/GeneralPPI/antibodies/test.txt"

# Source table. The cells below read the columns "Name",
# "antigen uniprot id", "Heavy chain full sequence" and "antigen sequence".
df = pd.read_parquet(DATA_PATH)

In [11]:
# Pairing the identifiers
#
# For every row, draw one antigen id at random (rejection sampling over the
# whole "antigen uniprot id" column, so frequent antigens are drawn more
# often) that is NOT a known partner of that row's "Name".
# Result: negative_pairs maps Name -> sampled non-partner antigen id.

# Plain lists give positional access regardless of the DataFrame index;
# df[col][i] would treat the loop counter as an index *label*.
ab_names = df["Name"].tolist()
antigen_ids = df["antigen uniprot id"].tolist()

# Every antigen each Name is paired with anywhere in the table. A Name that
# appears on several rows must not receive any of its own antigens as the
# negative, because negative_pairs keeps a single entry per Name.
known_antigens = {}
for ab, ag in zip(ab_names, antigen_ids):
    known_antigens.setdefault(ab, set()).add(ag)

all_antigens = set(antigen_ids)

negative_pairs = {}

for ab in ab_names:
    excluded = known_antigens[ab]
    # Without this guard the rejection loop below would spin forever.
    if all_antigens <= excluded:
        raise ValueError(
            f"Cannot sample a negative antigen for {ab!r}: "
            "every antigen in the table is a known partner."
        )
    new_ag = random.choice(antigen_ids)
    while new_ag in excluded:
        new_ag = random.choice(antigen_ids)
    negative_pairs[ab] = new_ag

In [12]:
# Lookup table: antigen id -> antigen sequence.
# The two columns are zipped positionally, so the mapping does not depend on
# the DataFrame's index labels (df[col][i] is a label lookup, not a
# positional one). If an id occurs on several rows, the last row wins.
antigen_sequences = dict(zip(df["antigen uniprot id"], df["antigen sequence"]))

In [5]:
# Column-oriented accumulator for the pair dataset. Each row of df
# contributes two consecutive entries: its own pair (label 1) followed by the
# sampled mismatched pair taken from negative_pairs (label 0).
antibody_dataset = {
    "seq1id": [],
    "seq2id": [],
    "seq1": [],
    "seq2": [],
    "labels": []
}

# Walk the four columns in lockstep by position; indexing with df[col][i]
# would look the loop counter up as an index label instead.
paired_rows = zip(
    df["Name"],
    df["antigen uniprot id"],
    df["Heavy chain full sequence"],
    df["antigen sequence"],
)

for ab, pos_ag, heavy_seq, pos_ag_seq in paired_rows:
    neg_ag = negative_pairs[ab]
    # The positive pair
    antibody_dataset["seq1id"].append(ab)
    antibody_dataset["seq2id"].append(pos_ag)
    antibody_dataset["seq1"].append(heavy_seq)
    antibody_dataset["seq2"].append(pos_ag_seq)
    antibody_dataset["labels"].append(1)
    # The negative pair: same heavy chain, sequence of the sampled antigen
    antibody_dataset["seq1id"].append(ab)
    antibody_dataset["seq2id"].append(neg_ag)
    antibody_dataset["seq1"].append(heavy_seq)
    antibody_dataset["seq2"].append(antigen_sequences[neg_ag])
    antibody_dataset["labels"].append(0)

In [6]:
# One DataFrame column per key of antibody_dataset (ids, sequences, labels).
full_df = pd.DataFrame(antibody_dataset)

In [8]:
# Independent (deep) copy, so trimming columns for the CSV leaves full_df intact.
csv_df = full_df.copy(deep=True)

In [9]:
# Sequence-only export: keep the two sequence columns and the label, leaving
# out the identifier columns. Selecting the wanted columns (rather than
# dropping in place) lets this cell be re-run without a KeyError, and it
# still fails loudly if csv_df does not hold the expected columns.
csv_df = csv_df[["seq1", "seq2", "labels"]]
csv_df.to_csv(CSV_PATH, index=False)

In [17]:
# Identifier-only view: rename the id/label columns and leave out the
# sequence columns. From here on full_df is rebound to this view.
# Written without inplace=True so re-running the cell is harmless: rename
# ignores names that are already gone, and the selection is repeatable.
full_df = full_df.rename(
    columns={"seq1id": "protein1", "seq2id": "protein2", "labels": "label"}
)
full_df = full_df[["protein1", "protein2", "label"]]

In [21]:
# Write full_df as a tab-separated file; the row index is left out.
full_df.to_csv(TSV_PATH, index=False, sep="\t")

In [34]:
# Ranking set
#
# For every row of df, emit its own pair (label 1) followed by one label-0
# pair for each distinct antigen id that is not a known partner of that
# row's "Name".
#
# NOTE: this rebinds CSV_PATH / TSV_PATH. Re-running an earlier export cell
# after this one would write to the ranking paths below.
CSV_PATH="~/projects/gLM/data/antibodies_rank/seqs.csv"
TSV_PATH="~/projects/gLM/data/antibodies_rank/test.txt"

antigen_uniprots = df["antigen uniprot id"].unique()

# Every antigen each Name is paired with anywhere in the table, so a Name
# occurring on several rows never gets one of its own antigens labelled 0.
rank_known_antigens = {}
for ab, ag in zip(df["Name"], df["antigen uniprot id"]):
    rank_known_antigens.setdefault(ab, set()).add(ag)

antibody_rank_dataset = {
    "seq1id": [],
    "seq2id": [],
    "seq1": [],
    "seq2": [],
    "label": []
}

# Iterate the columns positionally; df[col][i] would use the loop counter as
# an index label.
rank_rows = zip(
    df["Name"],
    df["antigen uniprot id"],
    df["Heavy chain full sequence"],
    df["antigen sequence"],
)

for ab, pos_ag, heavy_seq, pos_ag_seq in rank_rows:
    # The positive pair
    antibody_rank_dataset["seq1id"].append(ab)
    antibody_rank_dataset["seq2id"].append(pos_ag)
    antibody_rank_dataset["seq1"].append(heavy_seq)
    antibody_rank_dataset["seq2"].append(pos_ag_seq)
    antibody_rank_dataset["label"].append(1)
    partners = rank_known_antigens[ab]
    for ag in antigen_uniprots:
        if ag not in partners:
            # The negative pairs
            antibody_rank_dataset["seq1id"].append(ab)
            antibody_rank_dataset["seq2id"].append(ag)
            antibody_rank_dataset["seq1"].append(heavy_seq)
            antibody_rank_dataset["seq2"].append(antigen_sequences[ag])
            antibody_rank_dataset["label"].append(0)

antibody_rank_dataset_df = pd.DataFrame.from_dict(antibody_rank_dataset)

# Sequence-only CSV: sequences and label, no identifier columns.
antibody_rank_dataset_csv_df = antibody_rank_dataset_df.drop(columns=["seq1id", "seq2id"])
antibody_rank_dataset_csv_df.to_csv(CSV_PATH, index=False)

# Identifier-only TSV: renamed id columns and label, no sequence columns.
antibody_rank_dataset_df = (
    antibody_rank_dataset_df
    .rename(columns={"seq1id": "protein1", "seq2id": "protein2"})
    .drop(columns=["seq1", "seq2"])
)
antibody_rank_dataset_df.to_csv(TSV_PATH, sep="\t", index=False)