# Creating distance matrices

Importing necessary libraries:

In [8]:
from pathlib import Path
import plotly.express as px
import torch
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from plotly.subplots import make_subplots
import plotly.graph_objects as go

Creating DataFrames of embeddings for the wild-type and mutated proteins:

In [14]:
def load_data(folder_path: "str | Path", layer: int = 33) -> pd.DataFrame:
    """
    Build a DataFrame of per-protein embeddings from a folder of ``.pt`` files.

    Each ``*.pt`` file directly inside ``folder_path`` is loaded with
    ``torch.load`` and is expected to hold a mapping with a
    ``"mean_representations"`` entry indexed by layer number.

    Parameters
    ----------
    folder_path : str or pathlib.Path
        Directory containing the ``.pt`` files (not searched recursively).
    layer : int, default 33
        Key into ``"mean_representations"`` selecting which embedding to keep.

    Returns
    -------
    pd.DataFrame
        Columns ``"name"`` (file name without extension) and ``"embedding"``
        (the loaded object for ``layer``), one row per file, ordered by file
        path. Empty (but with both columns) when no ``.pt`` file is found.

    Raises
    ------
    KeyError
        If a file lacks ``"mean_representations"`` or the requested ``layer``.
    """

    # Accept plain strings as well as Path objects.
    folder = Path(folder_path)
    # Sorting makes the row order reproducible across file systems, and a
    # concrete list lets tqdm display a total instead of a bare counter.
    files = sorted(folder.glob("*.pt"))

    # NOTE: torch.load unpickles data -- only point this at trusted files.
    # map_location="cpu" keeps loading independent of the device used to save.
    records = [
        (file.stem, torch.load(file, map_location="cpu")["mean_representations"][layer])
        for file in tqdm(files)
    ]
    return pd.DataFrame(records, columns=["name", "embedding"])


# Data directory, given relative to the notebook's working directory.
path = Path("..") / "data"
# Load the wild-type embeddings first, then the mutant embeddings,
# each from its own sub-folder.
wt, mut = (load_data(path / subfolder) for subfolder in ("wt", "mut"))

0it [00:00, ?it/s]
0it [00:00, ?it/s]


Creating dictionary with names and embeddings for wild-type proteins:

In [6]:
# Map each wild-type protein name to its embedding for direct lookup.
wt_dict = {name: embedding for name, embedding in zip(wt["name"], wt["embedding"])}

Parsing the UniProt ID and the mutation (original residue, position, new residue) out of each mutant's name:

In [5]:
# Names are "|"-separated: the first field is used as the UniProt ID and the
# last field as the mutation string. Split once and reuse the result.
name_parts = mut["name"].str.split("|")
mut["mut"] = name_parts.str[-1]
mut["uniprot_id"] = name_parts.str[0]

# The mutation string is read as <first char><integer position><last char>.
# Vectorised .str accessors replace the per-row lambdas, and astype(int) keeps
# "pos" an integer column even when the frame has no rows.
mut["from"] = mut["mut"].str[0]
mut["to"] = mut["mut"].str[-1]
mut["pos"] = mut["mut"].str[1:-1].astype(int)

Getting distances between embeddings for mutated protein and wild-type protein:

In [14]:
def get_dist(row, wt_embeddings=None):
    """
    Distance between a mutant's embedding and its wild-type embedding.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row["uniprot_id"]`` (key of the wild-type protein) and
        ``row["embedding"]`` (the mutant embedding tensor).
    wt_embeddings : mapping, optional
        Lookup from UniProt ID to wild-type embedding. When omitted, the
        notebook-level ``wt_dict`` is used, so ``df.apply(get_dist, axis=1)``
        keeps working unchanged.

    Returns
    -------
    float
        ``(row["embedding"] - wild_type).norm()`` as a Python float.

    Raises
    ------
    KeyError
        If no wild-type embedding exists for the row's UniProt ID.
    """

    # Fall back to the global only when no lookup was passed explicitly
    # (an empty mapping is a valid explicit argument).
    lookup = wt_dict if wt_embeddings is None else wt_embeddings
    uniprot_id = row["uniprot_id"]

    # wild-type protein
    try:
        parent = lookup[uniprot_id]
    except KeyError:
        raise KeyError(
            f"no wild-type embedding found for UniProt ID {uniprot_id!r}"
        ) from None
    return (row["embedding"] - parent).norm().item()

Adding a new column to the DataFrame with distances:

In [15]:
# Compute one distance per mutant and assign it as an explicit float Series.
# `mut.apply(get_dist, axis=1)` can return a DataFrame instead of a Series
# when `mut` has no rows, which makes the single-column assignment fail;
# building the Series directly also works for an empty frame.
mut["dist"] = pd.Series(
    [get_dist(row) for _, row in mut.iterrows()],
    index=mut.index,
    dtype=float,
)

Saving our results to a CSV file:

In [17]:
# Write every column except the embedding tensors and the raw name
# (already split into "uniprot_id" and "mut") to disk, without the index.
(
    mut
    .drop(columns=["embedding", "name"])
    .to_csv("../data/dist_data.csv", index=False)
)