In [1]:
import numpy as np
import pandas as pd
from utils import DatasetLoader
from model import ModelBuilder

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load dataset
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support


loader = DatasetLoader(dataset_name="dnagpt/dna_core_promoter", text_column="sequence", label_column="label")
dataset = loader.load()

# Build model and pipeline
model_name = "zhangtaolab/dnabert2-promoter" 
builder = ModelBuilder(model_name=model_name)
tokenizer, model = builder.load()

# This can take time depending on dataset and GPU availability
trainer = builder.train_model(
    dataset,
    output_dir="./results",
    epochs=10,           # Increase for better results
    batch_size=8,
    learning_rate=2e-5
)

# Evaluate after training
metrics = trainer.evaluate()
print("Training metrics:", metrics)

pipe = builder.build_pipeline(top_k=1)  # top_k=1 for single-label classification

# Predict on Test Set and Compute Metrics
test_texts = dataset["validation"]["sequence"]  # Using validation as test
true_labels = dataset["validation"]["label"]

# Get predictions
pred_labels = []
for text in test_texts:
    pred = pipe(text)[0]["label"]
    # Map string labels to integers (SST-2: 'POSITIVE'=1, 'NEGATIVE'=0)
    pred_labels.append(1 if pred.upper() == "POSITIVE" else 0)

# Compute metrics
acc = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average="macro")

print(f"Test Accuracy: {acc:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1: {f1:.4f}")


# Save Report
report = pd.DataFrame({
    "text": test_texts,
    "true_label": true_labels,
    "pred_label": pred_labels
})

# Create results folder if it doesn't exist
os.makedirs("results", exist_ok=True)

report_file = "results/test_predictions_report.csv"
report.to_csv(report_file, index=False)
print(f"Report saved to {report_file}")


# Summary Metrics

summary_metrics = pd.DataFrame([{
    "accuracy": acc,
    "precision": precision,
    "recall": recall,
    "f1": f1
}])

summary_file = "results/summary_metrics.csv"
summary_metrics.to_csv(summary_file, index=False)
print(f"Summary metrics saved to {summary_file}")


  trainer = Trainer(


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']