In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from biopandas.pdb import PandasPdb


In [None]:
# Directory holding the small example inputs, relative to the notebook's working directory.
DUMMY_DATA_DIR = Path("../data/dummy_data")
DUMMY_REC_FILE = DUMMY_DATA_DIR / "rec.pdb"
DUMMY_MATCHING_SPHERES_FILE = DUMMY_DATA_DIR / "matching_spheres.sph"
# Backward-compatible alias: the original name was misspelled ("MATHCING") and
# may still be referenced by other cells.
DUMMY_MATHCING_SPHERES_FILE = DUMMY_MATCHING_SPHERES_FILE


In [None]:
# Parse the dummy PDB file with biopandas; later cells read its tables via `ppdb.df`.
ppdb = PandasPdb().read_pdb(DUMMY_REC_FILE.as_posix())

In [None]:
# Show which keys (e.g. "ATOM", used below) are available in `ppdb.df`.
ppdb.df.keys()

In [None]:
# Display the table stored under the "ATOM" key.
ppdb.df["ATOM"]

In [None]:
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix

# GetAdjacencyMatrix operates on an RDKit Mol, not on a pandas DataFrame, so
# the structure is parsed with RDKit directly from the PDB file first.
# removeHs=False keeps hydrogens instead of stripping them on load.
rec_mol = Chem.MolFromPDBFile(DUMMY_REC_FILE.as_posix(), removeHs=False)
if rec_mol is None:
    # RDKit reports a failed parse by returning None rather than raising.
    raise ValueError(f"RDKit could not parse {DUMMY_REC_FILE}")

GetAdjacencyMatrix(rec_mol)

In [None]:
# Inspect the first row of the "ATOM" table to see which fields each row carries.
ppdb.df["ATOM"].iloc[0]

In [None]:
# Sorted list of the distinct atom names present in the "ATOM" table.
sorted(ppdb.df["ATOM"]["atom_name"].unique())

In [None]:
# Sorted list of the distinct residue names present in the "ATOM" table.
sorted(ppdb.df["ATOM"]["residue_name"].unique())

In [None]:
# Columns of the "ATOM" table that are retained; every other column is dropped.
COLUMNS_TO_KEEP = [
    "atom_name", "residue_name",
    "x_coord", "y_coord", "z_coord",
    "occupancy", "b_factor",
    "element_symbol",
]

# Drop by label so the surviving columns keep the order they have in the source table.
df = ppdb.df["ATOM"].drop(
    columns=[name for name in ppdb.df["ATOM"].columns if name not in COLUMNS_TO_KEEP]
)

df

In [None]:
def one_hot_encode_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Replace ``column`` with one float indicator column per distinct value.

    The distinct non-missing values of ``df[column]`` are sorted, and for the
    i-th value (1-based) a column named ``f"{column}_{i}"`` is appended holding
    1.0 where the row equals that value and 0.0 elsewhere. The original column
    is then removed. Rows whose value is missing get 0.0 in every indicator.

    Args:
        df: Input frame; it is copied, never modified in place.
        column: Name of the column to encode.

    Returns:
        A new DataFrame with ``column`` removed and the indicator columns
        appended after the remaining columns.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    encoded = df.copy()
    # Missing values are dropped before sorting: sorted() cannot compare NaN/None
    # with strings and would raise TypeError on a column that mixes them.
    categories = sorted(encoded[column].dropna().unique())
    for position, category in enumerate(categories, start=1):
        encoded[f"{column}_{position}"] = (encoded[column] == category).astype(float)
    del encoded[column]

    return encoded

In [None]:
# Categorical columns that get expanded into one-hot indicator columns.
COLUMNS_TO_ONE_HOT_ENCODE = ["atom_name", "residue_name", "element_symbol"]

# Encode one column per pass; each pass rebinds `df` to the newly encoded frame.
for column in COLUMNS_TO_ONE_HOT_ENCODE:
    df = one_hot_encode_column(df, column=column)

df

## Note: the remaining numerical columns (coordinates, occupancy, and B-factor) should also be normalized; we defer that until we construct the real dataset.