In [1]:
import os
import pandas as pd

### Default format of the PECAN / Expanded dataset CSVs

In [2]:
# Directory layout, rooted at the current working directory:
#   data/PECAN/             and   data/Expanded_dataset/
parapred_dir = os.path.abspath("")
data_dir = os.path.join(parapred_dir, "data")
PECAN_dir = os.path.join(data_dir, "PECAN")
Expanded_data_dir = os.path.join(data_dir, "Expanded_dataset")

# The splits are read from the expanded dataset. To convert the PECAN splits
# instead, build these three paths from PECAN_dir (the file names are the same).
train_csv, val_csv, test_csv = (
    os.path.join(Expanded_data_dir, file_name)
    for file_name in ("train_set.csv", "val_set.csv", "test_set.csv")
)

# The split files are read with header=None (first line is treated as data),
# so the column names are supplied explicitly.
raw_columns = ["pdb", "Lchain", "Hchain", "antigen_chain"]
train_df, val_df, test_df = (
    pd.read_csv(csv_path, header=None, names=raw_columns)
    for csv_path in (train_csv, val_csv, test_csv)
)

test_df.head()

Unnamed: 0,pdb,Lchain,Hchain,antigen_chain
0,7JMO,L,H,A
1,6AL5,L,H,A
2,6MQR,L,H,A
3,1FBI,L,H,X
4,6JJP,B,A,C


### Desired data format

In [3]:
# Load the reference dataset; its column layout is the format the converted
# splits should end up in.
default_dataset_csv = os.path.join(data_dir, "dataset.csv")
default_dataset_df = pd.read_csv(filepath_or_buffer=default_dataset_csv)

default_dataset_df.head(n=5)

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain
0,4bz1,H,L,0,A
1,3gbm,I,M,0,D | C
2,2qqn,H,L,0,A
3,5mes,H,L,0,A
4,2ypv,H,L,0,A


### Converting to desired format

In [4]:
def to_default_format(split_df, column_order):
    """Return a copy of ``split_df`` laid out like the reference dataset.

    The input frame is left untouched. The returned frame has:

    * a constant ``model`` column set to 0,
    * ``pdb`` identifiers lower-cased,
    * ``antigen_chain`` entries with ";" separators rewritten as " | ",
    * exactly the columns in ``column_order``, in that order (any other
      column is dropped).

    Parameters
    ----------
    split_df : pandas.DataFrame
        Frame with at least the string columns ``pdb`` and ``antigen_chain``.
    column_order : iterable of str
        Target column names, in the desired order.

    Raises
    ------
    KeyError
        If ``column_order`` names a column that is not present after the
        ``model`` column has been added.

    Notes
    -----
    The vectorized ``.str`` accessors are used instead of a row-wise
    ``apply``; they also work on a zero-row frame, and a missing value in
    ``pdb`` / ``antigen_chain`` is passed through as NaN rather than raising.
    """
    converted = split_df.assign(
        model=0,
        pdb=split_df["pdb"].str.lower(),
        # Literal replacement: equivalent to " | ".join(value.split(";")).
        antigen_chain=split_df["antigen_chain"].str.replace(";", " | ", regex=False),
    )
    # Selecting with a list of labels reorders the columns in one step.
    return converted[list(column_order)]


# Target layout is taken from the reference dataset loaded above.
new_col_order = default_dataset_df.columns

new_train_df, new_val_df, new_test_df = (
    to_default_format(split_df, new_col_order)
    for split_df in (train_df, val_df, test_df)
)
dfs = [new_train_df, new_val_df, new_test_df]

new_test_df.head()

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain
0,7jmo,H,L,0,A
1,6al5,H,L,0,A
2,6mqr,H,L,0,A
3,1fbi,H,L,0,X
4,6jjp,A,B,0,C


### Save the converted CSVs

In [5]:
# Output files for the expanded dataset carry a "_BIG" suffix. When converting
# the PECAN splits instead, use the un-suffixed names: "dataset_train.csv",
# "dataset_val.csv" and "dataset_test.csv".
csvs = [
    os.path.join(data_dir, "dataset_train_BIG.csv"),
    os.path.join(data_dir, "dataset_val_BIG.csv"),
    os.path.join(data_dir, "dataset_test_BIG.csv"),
]
new_train_csv, new_val_csv, new_test_csv = csvs

# Write each converted split to its matching path, without the index column.
for converted_df, out_path in zip(dfs, csvs):
    converted_df.to_csv(out_path, index=False)