### Feat2LLM

In [None]:
import os
import Feat2LLM
from Feat2LLM.load_data import SmallMolTraj

# Name of the molecule handed to SmallMolTraj.
mol = "ethanol"
# Build the dataset wrapper for the chosen molecule.
smallMol = SmallMolTraj(mol)
# NOTE(review): presumably loads/fetches the trajectory data -- confirm in Feat2LLM.load_data.
smallMol.get_data()
# Generate the feature representation, requesting 10 components.
# NOTE(review): presumably this fills `smallMol.results`, which later cells read -- confirm.
smallMol.gen_representation(n_components=10)
# NOTE(review): where and in what format this persists is decided inside save() -- not visible here.
smallMol.save()

##..

In [None]:
# Inspection only: builds a tuple of R, R's shape, and E. When this runs as a notebook
# cell only the last expression of the cell is echoed, so this line displays nothing.
smallMol.R, smallMol.R.shape, smallMol.E
# Echo the results container (its "cMBDF_trans" and "y" entries are read in a later cell).
smallMol.results

##..

In [None]:
from Feat2LLM.vec2str import ZipFeaturizer
from sklearn.model_selection import train_test_split


# Features and regression targets as stored on smallMol.results.
X, y = smallMol.results["cMBDF_trans"], smallMol.results["y"]

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# n_bins is a tunable knob: try a different value if 300 does not work for your data.
converter = ZipFeaturizer(n_bins=300)

# NOTE(review): bin_vectors is invoked separately for each split; if it derives its bin
# edges from the input it receives, train and test would be binned inconsistently --
# confirm in Feat2LLM.vec2str.
X_train, X_test = converter.bin_vectors(X_train), converter.bin_vectors(X_test)

In [None]:
# Inspection only: echo the binned test features.
X_test

In [None]:
from Feat2LLM.roberta_finetuning import write_data_to_json, load_JSON_data, MoleculeDataset

# Persist both splits as JSON; change the filenames to match the dataset in use.
for _features, _targets, _json_path in (
    (X_train, y_train, 'train.json'),
    (X_test, y_test, 'test.json'),
):
    write_data_to_json(_features, _targets, _json_path)

# Read the training split back from disk for the fine-tuning step.
data = load_JSON_data("train.json")

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaModel, AdamW 

# Carve a 20% evaluation subset out of the loaded training JSON (fixed seed, reproducible).
# NOTE(review): the 'test.json' file written earlier is not read here -- if the data is
# already split, load that file as the test set instead of re-splitting.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# A single tokenizer instance is shared by both dataset wrappers.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataset, test_dataset = (
    MoleculeDataset(subset, tokenizer) for subset in (train_data, test_data)
)
# Define the custom model with a regression head
class RobertaForRegression(nn.Module):
    """RoBERTa encoder topped with a single-output linear regression head.

    Args:
        model_name: Checkpoint name or path handed to
            ``RobertaModel.from_pretrained``. Defaults to ``'roberta-base'``,
            so ``RobertaForRegression()`` behaves as before.
    """

    def __init__(self, model_name='roberta-base'):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        # Size the head from the encoder's own config so other checkpoints
        # with a different hidden size fit without further changes.
        self.regression_head = nn.Linear(self.roberta.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per input sequence.

        Args:
            input_ids: Token ids, passed to the encoder unchanged.
            attention_mask: Attention mask, passed to the encoder unchanged.

        Returns:
            The regression head output; its last dimension has size 1.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 of every sequence as its pooled representation.
        sequence_output = outputs.last_hidden_state[:, 0, :]
        return self.regression_head(sequence_output)

# Set device: Apple/NVIDIA/CPU
# Torch builds that predate the MPS backend have no `torch.backends.mps` attribute;
# look it up defensively so selection falls through to CUDA/CPU instead of raising
# AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Instantiate the regression model on the selected device and hand its parameters to AdamW.
# NOTE(review): `transformers.AdamW` is deprecated upstream in favor of
# `torch.optim.AdamW` -- confirm against the installed transformers version.
model = RobertaForRegression().to(device)
optimizer = AdamW(model.parameters(), lr=1e-6)

# DataLoader setup: only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
)

# Training loop
model.train()
# The loss module holds no state, so a single instance serves every batch.
criterion = nn.MSELoss()
for epoch in range(2):  # Number of epochs
    for batch in train_loader:
        optimizer.zero_grad()
        # Move the three tensors the model and loss need onto the compute device.
        inputs, mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )
        # Drop the model's trailing size-1 dimension before comparing with the labels.
        outputs = model(inputs, mask).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Reported once per batch, tagged with the current epoch index.
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Evaluate the model
model.eval()
total_loss = 0.0  # running sum of per-sample squared errors (batch mean * batch size)
num_samples = 0
with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        mask = batch['attention_mask'].to(device)
        outputs = model(inputs, mask).squeeze(-1)
        # `loss` stays the mean over this batch, as the default MSELoss reduction gives.
        loss = nn.MSELoss()(outputs, labels)
        # Weight each batch mean by its size so a shorter final batch does not
        # skew the reported average.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        num_samples += batch_size
    if num_samples:
        print(f"Test Loss: {total_loss / num_samples}")
    else:
        # An empty loader would otherwise end in a ZeroDivisionError.
        print("Test Loss: n/a (test set is empty)")

# Save model and optimizer state
def save_model(model, optimizer, epoch, loss, filepath):
    """Write a training checkpoint to ``filepath`` with ``torch.save``.

    The checkpoint is a dict with the keys ``model_state_dict``,
    ``optimizer_state_dict``, ``epoch`` and ``loss``.

    Args:
        model: Object exposing ``state_dict()`` (e.g. an ``nn.Module``).
        optimizer: Object exposing ``state_dict()`` (e.g. a torch optimizer).
        epoch: Epoch value, stored as given.
        loss: Loss value, stored as given.
        filepath: Destination path or file-like object. For path-like
            destinations, missing parent directories are created first.
    """
    # torch.save does not create directories. Only touch the filesystem for real
    # paths; file-like targets (e.g. io.BytesIO) are passed through untouched.
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filepath))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss,
    }
    torch.save(checkpoint, filepath)

# Assuming you want to save the model after training
model.eval()

# Create the output directory (a no-op when it already exists) and write the
# checkpoint inside it, so the file does not land in the current working directory.
save_dir = 'save_models'
os.makedirs(save_dir, exist_ok=True)

# `epoch` is the last value of the training loop index; `loss` is the loss tensor
# from the most recently processed batch.
save_model(model, optimizer, epoch, loss.item(), os.path.join(save_dir, "regression.pth"))