In [None]:
import pandas as pd
from rindti.data import PreTrainDataset
from collections import defaultdict
from torch_geometric.loader import DataLoader
from rindti.models import PfamModel
import random
from pytorch_lightning import Trainer
import torch
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.express as px
import seaborn as sns
import numpy as np
from umap import UMAP
from rindti.losses import SoftNearestNeighborLoss, GeneralisedLiftedStructureLoss

In [None]:
# Load the preprocessed pre-training dataset from the pickle file.
# NOTE(review): absolute, machine-specific path — will not resolve on other hosts.
ds = PreTrainDataset("/scratch/SCRATCH_NVME/ilya/pretrain_data/pfam_label_none_proc.pkl")
# Group dataset positions by each entry's `fam` attribute:
# fams[<fam value>] -> list of positions in `ds` carrying exactly that value.
fams = defaultdict(list)
for idx, prot in enumerate(ds):
    fams[prot.fam].append(idx)

In [None]:
def get_top_fam_ids(fams, k=5, sample=None):
    """Collect the dataset indices that belong to the ``k`` largest families.

    Args:
        fams: Mapping of family label -> list of dataset indices.
        k: Number of families to keep, ranked by how many indices they hold.
        sample: If truthy, the number of indices to draw at random from the
            pooled indices. Drawing is done *without* replacement, so no index
            is returned twice; the draw is capped at the number of pooled
            indices. If falsy (``None`` or ``0``), every pooled index is
            returned.

    Returns:
        A list of dataset indices. Without ``sample`` the indices are ordered
        family by family, largest family first.
    """
    # Comprehension variables are named explicitly so they do not shadow `k`.
    fam_lens = pd.Series({fam: len(ids) for fam, ids in fams.items()})
    good_fams = fam_lens.sort_values(ascending=False).head(k).index
    res = []
    for fam in good_fams:
        res.extend(fams[fam])
    if sample:
        # random.choices() would sample with replacement (duplicate entries)
        # and raise on an empty pool; random.sample() avoids both.
        return random.sample(res, k=min(sample, len(res)))
    return res

In [None]:
# `subset` is what gets wrapped in the DataLoader further down; here it is
# simply the full dataset (no family-based filtering is applied).
subset = ds

In [None]:
class TestModel(PfamModel):
    """PfamModel subclass whose prediction step exposes the encoder output."""

    def predict_step(self, data, *args):
        """Encode one batch and return the embeddings with their labels.

        Returns a dict with keys ``embeds`` (encoder output, detached and
        moved to CPU), ``fam`` and ``id`` (taken unchanged from ``data``).
        """
        embedding = self.encoder(data).detach().cpu()
        return {"embeds": embedding, "fam": data.fam, "id": data.id}

In [None]:
import yaml

# Read the Pfam model configuration and override the feature/edge settings.
# NOTE(review): yaml.load with FullLoader should only be used on trusted files.
# NOTE(review): `config` is not passed to load_from_checkpoint in the cells
# visible here — confirm whether it is still needed.
with open("config/pfam.yaml", "r") as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
config.update({"feat_dim": 20, "edge_type": "none", "feat_type": "label"})

In [None]:
# Restore the trained model from its checkpoint and put it in evaluation mode.
model = TestModel.load_from_checkpoint(
    checkpoint_path="./tb_logs/pfam/version_1/checkpoints/epoch=282-step=282999.ckpt"
)
model.eval()
# Switch off the encoder's `return_nodes` flag before predicting.
# NOTE(review): presumably this makes the encoder return one embedding per graph — confirm.
model.encoder.return_nodes = False
encoder = model.encoder

In [None]:
# Run the model's predict_step over `subset` in fixed (unshuffled) order so the
# collected outputs line up batch by batch.
# NOTE(review): newer PyTorch Lightning releases replace `gpus=` with
# `accelerator`/`devices` — confirm against the installed version.
trainer = Trainer(gpus=1)
dl = DataLoader(subset, shuffle=False, batch_size=128)
prediction = trainer.predict(model, dl)

In [None]:
# Concatenate the per-batch embedding tensors into a single tensor.
embeds = torch.cat([batch_out['embeds'] for batch_out in prediction])
# Flatten the per-batch `id` and `fam` collections, keeping the batch order
# used for the embeddings above.
batch_id = [entry for batch_out in prediction for entry in batch_out['id']]
batch_fam = [entry for batch_out in prediction for entry in batch_out['fam']]

In [None]:
# Project the embeddings with t-SNE and wrap the coordinates in a DataFrame.
# t-SNE is stochastic: a fixed random_state makes the layout (and the TSV
# written later) reproducible across Restart & Run All.
tsne = TSNE(random_state=42)
x = pd.DataFrame(data=tsne.fit_transform(embeds))

In [None]:
# Name the two coordinate columns, then attach the family and id labels
# collected from the prediction batches.
x.columns = ["x", "y"]
x = x.assign(fam=batch_fam, id=batch_id)

In [None]:
# Count how often each individual family label occurs. A single `fam` string
# may hold several labels separated by ";".
top_fams = defaultdict(int)
# The loop variable is deliberately NOT called `fams`: that name holds the
# family -> indices dict built when the dataset was loaded, and reusing it
# here would silently overwrite it.
for fam_string in batch_fam:
    for fam in fam_string.split(";"):
        top_fams[fam] += 1

# Labels of the (at most) 500 most frequent families.
good_guys = pd.Series(top_fams).sort_values().tail(500).index


def check_presence(fams):
    """Return the first label in ``fams`` that is among the frequent families.

    Args:
        fams: A string of family labels separated by ";".

    Returns:
        The first label (in string order) contained in ``good_guys``,
        or ``False`` if none of them is.
    """
    for fam in fams.split(";"):
        if fam in good_guys:
            return fam
    return False


# Colour each point by its first frequent family, or "other" if it has none.
# check_presence is evaluated once per row; `or` supplies the fallback.
x['color'] = x['fam'].apply(lambda fam_string: check_presence(fam_string) or "other")

In [None]:
# Interactive scatter plot of the 2-D embedding: colour and marker symbol both
# follow the `color` column, with the "other" group drawn in gray.
fig = px.scatter(
    x,
    x="x",
    y="y",
    color="color",
    symbol="color",
    hover_name="id",
    hover_data=["fam"],
    opacity=0.4,
    width=1000,
    height=800,
    color_discrete_map={"other": "gray"},
    color_discrete_sequence=px.colors.qualitative.Light24,
)
fig.update_traces(marker={"size": 8, "line": {"width": 0, "color": "black"}})
# Save a standalone HTML copy, then render the figure inline.
fig.write_html("test.html")
fig.show()

In [None]:
# The `color` column was only needed for plotting; remove it before export.
x = x.drop(columns="color")

In [None]:
# Write the coordinates and labels as a tab-separated file; the DataFrame's
# index is written as the first column.
x.to_csv("data/embeddings.tsv", sep="\t")

In [None]:
# Read the exported table back, using the first column of the file as the index.
df = pd.read_csv("data/embeddings.tsv", sep="\t", index_col=0)

In [None]:
# Show the reloaded table as the cell output (bare last expression).
df

In [None]:
# Overwrite the same TSV, this time with the `id` column as the index.
# NOTE: after this, re-running the read_csv cell above yields a frame whose
# index is `id`, so `set_index("id")` would fail on a second pass.
df.set_index("id").to_csv("data/embeddings.tsv", sep="\t")