In [60]:
import os
import pandas as pd

### Default format of PECAN csvs

In [61]:
# All paths are resolved relative to the current working directory:
# <cwd>/data/PECAN/{train,val,test}_set.csv
parapred_dir = os.path.abspath("")
data_dir = os.path.join(parapred_dir, "data")
PECAN_dir = os.path.join(data_dir, "PECAN")

train_csv, val_csv, test_csv = (
    os.path.join(PECAN_dir, file_name)
    for file_name in ("train_set.csv", "val_set.csv", "test_set.csv")
)

# The files are read with header=None, so the column names are supplied here.
pecan_columns = ["pdb", "Lchain", "Hchain", "antigen_chain"]
train_df, val_df, test_df = (
    pd.read_csv(split_path, header=None, names=pecan_columns)
    for split_path in (train_csv, val_csv, test_csv)
)

test_df.head()

Unnamed: 0,pdb,Lchain,Hchain,antigen_chain
0,1CIC,A,B,D;C
1,1IAI,L,H,M;I
2,2EIZ,A,B,C
3,2LTQ,E,F,D
4,2XRA,L,H,A


### Desired data format

In [62]:
# Reference dataset; its column layout is the one the PECAN splits are converted to.
default_dataset_csv = os.path.join(data_dir, "dataset.csv")
default_dataset_df = pd.read_csv(filepath_or_buffer=default_dataset_csv)

# Preview the first five rows.
default_dataset_df.head(5)

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain
0,4bz1,H,L,0,A
1,3gbm,I,M,0,D | C
2,2qqn,H,L,0,A
3,5mes,H,L,0,A
4,2ypv,H,L,0,A


### Converting to desired format

In [63]:
def pecan_to_default_format(pecan_df, column_order):
    """Return a converted copy of a PECAN split in the default dataset layout.

    The conversion:
      * adds a constant ``model`` column set to 0,
      * lower-cases the ``pdb`` identifiers,
      * rewrites the ``antigen_chain`` separator from ";" to " | ",
      * selects/reorders the columns to ``column_order``.

    Parameters
    ----------
    pecan_df : pd.DataFrame
        Frame containing string columns ``pdb`` and ``antigen_chain`` plus
        any other columns named in ``column_order``.
    column_order : iterable of str
        Column names of the returned frame, in the desired order.

    Returns
    -------
    pd.DataFrame
        A new frame; ``pecan_df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``column_order`` names a column that is neither in ``pecan_df``
        nor added by this function.
    """
    converted = pecan_df.assign(
        model=0,
        pdb=pecan_df["pdb"].str.lower(),
        # Literal (non-regex) replacement: "D;C" -> "D | C".
        antigen_chain=pecan_df["antigen_chain"].str.replace(";", " | ", regex=False),
    )
    # Selecting with a list of labels both reorders the columns and returns a new frame.
    return converted[list(column_order)]


# Column layouts before and after conversion. The previous version read
# `df.columns` here, before any `df` was defined, so the cell only ran when a
# `df` variable had leaked from an earlier execution.
old_col_order = train_df.columns
new_col_order = default_dataset_df.columns

new_train_df, new_val_df, new_test_df = (
    pecan_to_default_format(split_df, new_col_order)
    for split_df in (train_df, val_df, test_df)
)

dfs = [new_train_df, new_val_df, new_test_df]

new_test_df.head()

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain
0,1cic,B,A,0,D | C
1,1iai,H,L,0,M | I
2,2eiz,B,A,0,C
3,2ltq,F,E,0,D
4,2xra,H,L,0,A


### Save new csvs

In [64]:
# Output files for the converted splits, all placed directly in data_dir.
csvs = [
    os.path.join(data_dir, file_name)
    for file_name in ("dataset_train.csv", "dataset_val.csv", "dataset_test.csv")
]
new_train_csv, new_val_csv, new_test_csv = csvs

# Write each converted frame to its matching path; index=False keeps the
# row index out of the saved files.
for csv, df in zip(csvs, dfs):
    df.to_csv(csv, index=False)