In [None]:
from datasets import load_dataset
import torch
import torchmetrics
import torch.nn.functional as F
import esm
from torch.utils.data import Dataset
import pytorch_lightning as pl
import plotly.express as px # plotting
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import pandas as pd
import umap
from typing import List, Tuple
from pathlib import Path

In [None]:
def random_seq_emb(p: Path, layer: int) -> List[torch.Tensor]:
    """Load one layer's mean representation from every ``*.pt`` file in a directory.

    Each file is read with ``torch.load`` and is expected to contain a mapping
    with a ``"mean_representations"`` entry that is itself indexed by layer.
    (Presumably these files were written by the ESM embedding-extraction
    script -- confirm against how the directory was produced.)

    Args:
        p: Directory searched (non-recursively) for ``*.pt`` files.
        layer: Key used to index ``"mean_representations"``.

    Returns:
        One entry per file, ordered by sorted file path. An empty list is
        returned when the directory contains no ``*.pt`` files.

    Raises:
        KeyError: If a file has no ``"mean_representations"`` entry or that
            entry has no value for ``layer``.
    """
    # glob() yields entries in arbitrary filesystem order; sorting makes the
    # row order reproducible across runs and lets tqdm display a total.
    files = sorted(p.glob("*.pt"))

    embeddings = []
    for file in tqdm(files, desc=p.name):
        # NOTE(review): torch.load unpickles the file -- only point this at
        # trusted directories. map_location="cpu" lets tensors that were saved
        # from a GPU be loaded on a CPU-only kernel.
        payload = torch.load(file, map_location="cpu")
        embeddings.append(payload["mean_representations"][layer])
    return embeddings

In [None]:
# Root directory holding the precomputed embedding files, one sub-directory
# per sequence variant.
EMB_ROOT = Path("/shared/homology/esm2_t33_650M_UR50D")
# Layer whose mean representation is loaded for all three sequence sets.
REPR_LAYER = 33

p_normal = EMB_ROOT / "sequence_normal"
p_shuffled = EMB_ROOT / "sequence_shuffled"
p_shuffled_by_every_aminoacids = EMB_ROOT / "sequence_shuffled_by_every_aminoacids"


def _embedding_frame(p: Path, layer: int) -> pd.DataFrame:
    """Load the embeddings found in ``p`` into a numeric DataFrame, one row per file.

    Raises:
        FileNotFoundError: If ``p`` contains no embedding files.
    """
    tensors = random_seq_emb(p, layer)
    if not tensors:
        raise FileNotFoundError(f"no *.pt embedding files found in {p}")
    # Stack into a single 2-D array before handing the data to pandas: a bare
    # list of tensors becomes an object-dtype frame of 0-dim tensors, which is
    # slow to build and not numeric.
    # Assumes every embedding is a 1-D tensor of the same length -- TODO confirm.
    return pd.DataFrame(torch.stack([t.detach().cpu() for t in tensors]).numpy())


emb_normal = _embedding_frame(p_normal, REPR_LAYER)
emb_shuffled = _embedding_frame(p_shuffled, REPR_LAYER)
emb_shuffled_by_every_aminoacids = _embedding_frame(p_shuffled_by_every_aminoacids, REPR_LAYER)

# ignore_index=True gives the combined frame a unique 0..n-1 row index instead
# of three overlapping ranges.
emb = pd.concat(
    [emb_normal, emb_shuffled, emb_shuffled_by_every_aminoacids],
    ignore_index=True,
)

In [None]:
# Project the combined embeddings down to two dimensions with UMAP
# (fixed random_state).
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=5,
    min_dist=0.1,
    metric="euclidean",
    random_state=42,
)
embeds = pd.DataFrame(reducer.fit_transform(emb), columns=["x", "y"])

# Label every row with the set it came from; the order here must match the
# order in which the three frames were concatenated into `emb`.
labelled_frames = (
    ("normal", emb_normal),
    ("shuffled", emb_shuffled),
    ("shuffled_by_every_aminoacid", emb_shuffled_by_every_aminoacids),
)
embeds["source"] = [
    label
    for label, frame in labelled_frames
    for _ in range(frame.shape[0])
]

In [None]:
# Scatter plot of the 2-D projection, coloured by the set each point came from.
fig = px.scatter(
    embeds,
    x="x",
    y="y",
    color="source",
    width=800,
    height=600,
    opacity=0.8,
)
fig.update_traces(marker={"size": 5}, textposition="top center")

# Save a static PNG next to the notebook, then render the interactive figure.
fig.write_image("normal_vs_shuffled_33.png", scale=5)
fig.show()