In [None]:
# ライブラリのインポート

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

In [None]:
# Device selection: use the GPU when CUDA is available, otherwise fall back to the CPU.

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the dataset from CSV into the DataFrame `df`.

df = pd.read_csv(
    filepath_or_buffer="../data/processed/amino-acid-genotypes-to-brightness.csv"
)

In [None]:
# Load the pretrained causal language model and its tokenizer.

model_name_or_path = "hugohrban/progen2-base"

# trust_remote_code=True is passed for both the model and the tokenizer, which
# lets transformers use code shipped alongside the checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
model = model.to(device)  # move the weights to the selected device
model = model.eval()  # switch to evaluation mode

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# Batches are padded on the right, reusing the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Log-likelihood computation

@torch.no_grad()
def log_likelihood(sequences, batch_size=32, first_tok=5, last_tok=29):
    """Score each sequence by its mean per-token log-likelihood under ``model``.

    Sequences are tokenized in batches (right-padded, no special tokens added),
    passed through the module-level ``model``, and scored with next-token
    cross-entropy. The softmax is taken only over the token-id window
    ``[first_tok, last_tok]``, so probabilities are renormalized over that
    window rather than over the full vocabulary. The first token of each
    sequence is never a prediction target and therefore is not scored.

    Args:
        sequences: List of sequence strings accepted by the module-level
            ``tokenizer``.
        batch_size: Number of sequences tokenized and scored per forward pass.
        first_tok: Lowest token id (inclusive) kept in the softmax window.
        last_tok: Highest token id (inclusive) kept in the softmax window.
            NOTE(review): the defaults 5 and 29 presumably correspond to the
            min/max tokenizer ids of the letters "ACDEFGHIKLMNPQRSTVWYBXZUO"
            (per the derivation that used to be commented out here) -- confirm
            against the tokenizer vocabulary.

    Returns:
        A list of Python floats, one per input sequence and in input order:
        the negated mean cross-entropy over that sequence's non-padding
        target positions. A sequence with fewer than two tokens has no
        target positions and is expected to yield NaN.

    Raises:
        ValueError: If a non-padding target token id lies outside
            ``[first_tok, last_tok]``. Without this check the shifted class
            index would be out of range inside ``cross_entropy``.
    """
    n_classes = last_tok - first_tok + 1
    lls = []
    for i in tqdm(range(0, len(sequences), batch_size)):
        batch_sequences = sequences[i : i + batch_size]
        inputs = tokenizer(
            batch_sequences, return_tensors="pt", padding=True, add_special_tokens=False
        ).to(device)
        input_ids = inputs.input_ids  # (B, L)
        attention_mask = inputs.attention_mask  # (B, L)

        # NOTE(review): the attention mask is not passed to the model. With
        # right-side padding and a causal model this presumably cannot affect
        # the scored (non-padding) positions -- confirm for this architecture.
        logits = model(input_ids).logits  # (B, L, V)

        # Position t predicts token t + 1: drop the last logit and the first
        # target, and keep only the logits inside the token-id window.
        logits = logits[:, :-1, first_tok : last_tok + 1]  # (B, L-1, n_classes)
        targets = input_ids[:, 1:] - first_tok  # (B, L-1), shifted to window-relative ids
        masks = attention_mask[:, 1:].bool()  # (B, L-1), True on non-padding targets

        # Fail early with a clear message instead of an out-of-range class index.
        out_of_range = masks & ((targets < 0) | (targets >= n_classes))
        if out_of_range.any():
            bad_rows = torch.nonzero(out_of_range.any(dim=1)).flatten().tolist()
            bad_indices = [i + row for row in bad_rows]
            raise ValueError(
                f"Sequences at indices {bad_indices} contain token ids outside "
                f"[{first_tok}, {last_tok}]"
            )

        for logit, target, mask in zip(logits, targets, masks):
            # Mean over this sequence's real (non-padding) target positions only.
            ll = -F.cross_entropy(logit[mask], target[mask], reduction="mean")
            lls.append(ll.item())
    return lls


# Score every sequence, then derive the exponentiated score and the
# difference of each score from the first row's score.
df["log_likelihood"] = log_likelihood(
    df["sequence"].tolist(),
    batch_size=32,
)
df["likelihood"] = df["log_likelihood"].pipe(np.exp)
df["delta"] = df["log_likelihood"].sub(df["log_likelihood"].iloc[0])

In [None]:
# Results: correlation and linear fit of brightness against delta.

X, y = df["delta"].values, df["brightness"].values

# Covariance with ddof=0, consistent with np.std's default normalization below.
cov = np.cov(X, y, ddof=0)[0, 1]

# Pearson correlation coefficient.
r = cov / (np.std(X) * np.std(y))
print(f"R: {r:.3f}")

# Ordinary least-squares fit; the estimator expects a 2-D feature matrix,
# so X is viewed as a single column.
lr = LinearRegression()
lr.fit(X[:, None], y)
r2 = lr.score(X[:, None], y)
print(f"R^2: {r2:.3f}")

# Fitted values, used for the regression line in the plot.
y_pred = lr.predict(X[:, None])

In [None]:
# Plot: brightness against the delta log-likelihood, with the fitted regression line.

fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
ax.scatter(X, y, s=10)

# Sort by x so the fitted line is drawn from left to right. X is already 1-D,
# so no flatten/copy is needed before indexing.
indices = np.argsort(X)
ax.plot(
    X[indices],
    y_pred[indices],
    linewidth=2,
    label=f"R^2 = {r2:.3f}",
    color="red",
)

# X holds df["delta"] (log-likelihood minus the first row's log-likelihood),
# not the raw likelihood, so the axis is labelled accordingly.
ax.set_xlabel("Delta log-likelihood (relative to first sequence)")
ax.set_ylabel("Brightness")
ax.legend()
fig.tight_layout()
fig.savefig("../figures/eda1/likelihood_vs_brightness.png")
plt.show()