In [None]:
import torch
import pandas as pd
import plotly.express as px
from umap import UMAP
from sklearn.manifold import TSNE
import numpy as np
from src.utils import Plotter
from tqdm import tqdm

In [None]:
from src.models.gpt import GPTModel

# Name of the directory under artifacts/ that holds the checkpoint to load.
MODEL_NAME = "model-h7iskggx:v0"

# Restore the model from the checkpoint file inside that directory.
checkpoint_path = f"artifacts/{MODEL_NAME}/model.ckpt"
model = GPTModel.load_from_checkpoint(checkpoint_path)

In [None]:
# Load the climbs and keep only those with at least 5 ascents that are set at 40 degrees.
climbs_raw = pd.read_csv("data/raw/climbs.csv")
keep = (climbs_raw["ascensionist_count"] >= 5) & (climbs_raw["angle"] == 40)
# .copy() gives an independent frame, so the columns added to `df` later on do not
# trigger pandas' SettingWithCopyWarning (a boolean-filtered frame is a slice).
df = climbs_raw[keep].copy()

# Encode every climb 20 times with shuffle=True, padded to 64 tokens, and stack the
# encodings so each climb contributes one tensor.
data = []
for _, row in df.iterrows():
    frames = row["frames"]
    angle = row["angle"]
    font_grade = row["font_grade"]
    stack = [
        model.tokenizer.encode(frames, angle, font_grade, pad=64, shuffle=True)
        for _ in range(20)
    ]
    data.append(torch.stack(stack))
# One leading entry per climb, in the same order as the rows of `df`.
data = torch.stack(data)

In [None]:
# Collapse each climb's stack of encodings into a single embedding vector.
model.eval()
# Follow the model's own device instead of hard-coding "cuda", so the cell also
# works when the checkpoint was loaded onto the CPU.
device = next(model.parameters()).device
embedded_data = []
with torch.no_grad():
    for climb_tokens in tqdm(data):
        # Average the model's embedding over its first two dimensions
        # (presumably the shuffled copies and the token positions — TODO confirm).
        climb_vec = model.embed(climb_tokens.to(device)).mean(dim=[0, 1])
        # No .detach() needed: nothing is tracked by autograd under no_grad.
        embedded_data.append(climb_vec.cpu())
# NOTE: this rebinds `data` from token encodings to embeddings, so re-running this
# cell without first re-running the encoding cell feeds embeddings back into the model.
data = torch.stack(embedded_data)

In [None]:
# Project the per-climb embedding vectors down to 2-D for plotting.
# random_state pins UMAP's stochastic layout so the coordinates, which are later
# written to data/dash.csv, are reproducible across runs. Per the UMAP docs a
# fixed seed disables its multi-threaded code path.
umap = UMAP(
    n_components=2,
    n_neighbors=50,
    min_dist=0.1,
    metric="euclidean",
    random_state=42,
)
embedded = umap.fit_transform(data)
# Store the two projected coordinates next to the climb metadata.
df["x_gpt"] = embedded[:, 0]
df["y_gpt"] = embedded[:, 1]

In [None]:
# Interactive scatter of the 2-D climb projection; hovering shows name and grade.
fig = px.scatter(
    data_frame=df,
    x="x_gpt",
    y="y_gpt",
    hover_data=["name", "font_grade"],
    opacity=0.7,
    width=1200,
    height=800,
    # color="difficulty_average",  # optional colouring, currently disabled
)
# Hide the colour bar, then render the figure.
fig.update_layout(coloraxis_showscale=False).show()

In [None]:
# Write the current frame (including any columns added above) to disk without the index.
df.to_csv(path_or_buf="data/dash.csv", index=False)

In [None]:
# Look up the input embedding of every vocabulary token in a single batched call.
# Rows follow encode_map's iteration order, i.e. the same order as its keys.
token_ids = torch.tensor(list(model.tokenizer.encode_map.values()))
wte = model.model.transformer.wte
with torch.no_grad():
    # Run on whatever device the embedding layer lives on rather than assuming CUDA,
    # and call the module itself (not .forward) so registered hooks still fire.
    wte_device = next(wte.parameters()).device
    embeddings = wte(token_ids.to(wte_device)).cpu()

In [None]:
# Project the token embeddings to 2-D (UMAP's default n_components) using cosine distance.
# random_state makes the layout reproducible between runs; per the UMAP docs a
# fixed seed disables its multi-threaded code path.
umap = UMAP(n_neighbors=50, min_dist=0.1, metric="cosine", random_state=42)
tok_emb = umap.fit_transform(embeddings)

In [None]:
# One row per vocabulary token: its 2-D coordinates plus the token string.
# NOTE: this rebinds `df`, replacing the climbs frame used in the earlier cells.
df = pd.DataFrame(tok_emb, columns=["x_emb", "y_emb"]).assign(
    token=list(model.tokenizer.encode_map.keys())
)

In [None]:
def get_role(token):
    """Classify a vocabulary token by its leading character.

    Returns "hold" for tokens starting with "p", "color" for "r", "angle" for
    "a", "grade" for "f", and "special" for anything else.
    """
    prefix_roles = (
        ("p", "hold"),
        ("r", "color"),
        ("a", "angle"),
        ("f", "grade"),
    )
    # The first matching prefix wins; fall back to "special" when none matches.
    return next(
        (role for prefix, role in prefix_roles if token.startswith(prefix)),
        "special",
    )

# Tag every token row with the role derived from its prefix.
df["role"] = df["token"].map(get_role)

In [None]:
# Token-embedding map: one labelled marker per token, coloured by role.
fig = px.scatter(
    data_frame=df,
    x="x_emb",
    y="y_emb",
    color="role",
    text="token",
    hover_name="token",
    width=1200,
    height=800,
)
# Larger, semi-transparent markers; as the cell's last expression this also displays the figure.
fig.update_traces(marker={"size": 12, "opacity": 0.5})
