In [31]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import (
    BertTokenizerFast,
    BertPreTrainedModel,
    BertModel,
    TrainingArguments,
    Trainer
)

# ----------- GloVe loading functions ----------- #
def load_glove_embeddings(glove_path):
    """
    Loads GloVe embeddings from a .txt file into a dict: word -> np.array(float32)
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

def text_to_avg_glove(text, embeddings_dict, embed_dim=300):
    """
    Splits text on whitespace, looks up each token in embeddings_dict,
    and returns the average embedding. If no known tokens, returns a zero vector.
    """
    tokens = text.split()
    vectors = []
    for token in tokens:
        if token in embeddings_dict:
            vectors.append(embeddings_dict[token])
    if len(vectors) == 0:
        return np.zeros(embed_dim, dtype=np.float32)
    else:
        return np.mean(vectors, axis=0)


In [34]:
# ----- Load GloVe (42B 300d, e.g. glove.42B.300d.txt) -----
GLOVE_PATH = "glove.42B.300d.txt"
print("Loading GloVe (this might take a while)...")
glove_dict = load_glove_embeddings(GLOVE_PATH)
print("Done loading GloVe!")

# ----- Load labeled data -----
with open("QTL_text.json", "r", encoding="utf-8") as f:
    data = json.load(f)
df_labeled = pd.DataFrame(data)

# Minimal text preprocessing
df_labeled["text"] = (
    df_labeled["Title"].fillna("").str.lower() + " " +
    df_labeled["Abstract"].fillna("").str.lower()
)
df_labeled["Category"] = df_labeled["Category"].astype(int)

# For debug: check distribution
print("Label distribution in entire dataset:")
print(df_labeled["Category"].value_counts())

# ----- Load unlabeled test data -----
df_test = pd.read_csv("test_unlabeled.tsv", sep="\t", dtype={"PMID": str})
df_test["text"] = (
    df_test["Title"].fillna("").str.lower() + " " +
    df_test["Abstract"].fillna("").str.lower()
)


Loading GloVe (this might take a while)...
Done loading GloVe!
Label distribution in entire dataset:
Category
0    10271
1     1007
Name: count, dtype: int64


In [35]:
X = df_labeled["text"].values
y = df_labeled["Category"].values

X_train, X_dev, y_train, y_dev = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train distribution:", pd.Series(y_train).value_counts())
print("Dev distribution:", pd.Series(y_dev).value_counts())


Train distribution: 0    8216
1     806
Name: count, dtype: int64
Dev distribution: 0    2055
1     201
Name: count, dtype: int64


In [36]:
from torch.utils.data import Dataset

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

class BertGloveDataset(Dataset):
    def __init__(self, texts, labels=None, glove_dict=None, tokenizer=None,
                 max_length=256, glove_dim=300):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.glove_dim = glove_dim
        
        # Precompute GloVe embeddings
        self.glove_embs = [
            text_to_avg_glove(t, glove_dict, embed_dim=glove_dim) for t in self.texts
        ]
        
        # BERT tokenization
        self.encodings = self.tokenizer(
            list(self.texts),
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        item = {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "glove_emb": torch.tensor(self.glove_embs[idx], dtype=torch.float)
        }
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [37]:
def collate_fn(batch):
    # batch is a list of dicts from __getitem__()
    input_ids = torch.stack([x["input_ids"] for x in batch])
    attention_mask = torch.stack([x["attention_mask"] for x in batch])
    glove_emb = torch.stack([x["glove_emb"] for x in batch])
    
    labels = None
    if "labels" in batch[0]:
        labels = torch.stack([x["labels"] for x in batch])
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "glove_emb": glove_emb,
        "labels": labels
    }


In [23]:
# class BertWithGlove(BertPreTrainedModel):
#     def __init__(self, config, glove_dim=300):
#         super().__init__(config)
#         self.bert = BertModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
#         # BERT hidden size is config.hidden_size (for bert-base-uncased, 768)
#         combined_dim = config.hidden_size + glove_dim
#         self.classifier = nn.Linear(combined_dim, config.num_labels)
        
#         self.post_init()  # For transformers >=4.20; otherwise use self.init_weights()

#     def forward(
#         self,
#         input_ids=None,
#         attention_mask=None,
#         glove_emb=None,
#         labels=None
#     ):
#         # Standard BERT pass
#         outputs = self.bert(
#             input_ids=input_ids,
#             attention_mask=attention_mask
#         )
#         # outputs: (last_hidden_state, pooler_output, hidden_states, attentions)
#         pooled_output = outputs.pooler_output  # (batch_size, hidden_size)
#         pooled_output = self.dropout(pooled_output)
        
#         # Concat GloVe
#         if glove_emb is None:
#             # If for some reason it's missing, fallback
#             glove_emb = torch.zeros(pooled_output.size(0), 300).to(pooled_output.device)
#         combined = torch.cat((pooled_output, glove_emb), dim=1)  # shape (batch_size, 768+300)
        
#         logits = self.classifier(combined)
        
#         loss = None
#         if labels is not None:
#             loss_fct = nn.CrossEntropyLoss()
#             loss = loss_fct(logits, labels)
        
#         return {
#             "loss": loss,
#             "logits": logits
#         }
from transformers import BertPreTrainedModel, BertModel
import torch
import torch.nn as nn

class BertWithGlove(BertPreTrainedModel):
    def __init__(self, config, glove_dim=300):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        combined_dim = config.hidden_size + glove_dim  # 768 + 300
        self.classifier = nn.Linear(combined_dim, config.num_labels)
        
        self.post_init()  # For Transformers >=4.20

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        glove_emb=None,
        labels=None
    ):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output  # shape: (batch_size, 768)
        pooled_output = self.dropout(pooled_output)
        
        if glove_emb is None:
            glove_emb = torch.zeros(pooled_output.size(0), 300).to(pooled_output.device)

        combined = torch.cat((pooled_output, glove_emb), dim=1)  # shape: (batch_size, 1068)
        logits = self.classifier(combined)

        loss = None
        if labels is not None:
            # Weighted CrossEntropy: class 0 = 1.0, class 1 = 3.0
            class_weights = torch.tensor([1.0, 3.0]).to(logits.device)
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits, labels)

        return {
            "loss": loss,
            "logits": logits
        }


In [24]:
model = BertWithGlove.from_pretrained(
    "bert-base-uncased",
    glove_dim=300,   # pass the extra argument
    num_labels=2
)

training_args = TrainingArguments(
    output_dir="./bert_glove_results",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    weight_decay=0.01,
    load_best_model_at_end=True
)

# Basic metric function
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,   # Optional for classification, but recommended
    data_collator=collate_fn,
    compute_metrics=compute_metrics
)


Some weights of BertWithGlove were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [25]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.45,0.476006,0.911348,0.666667,0.00995,0.019608
2,0.3907,0.21838,0.929521,0.563253,0.930348,0.701689
3,0.3574,0.291374,0.906472,0.486034,0.865672,0.62254
4,0.2649,0.346541,0.953014,0.791411,0.641791,0.708791
5,0.2282,0.267686,0.956117,0.755,0.751244,0.753117
6,0.172,0.246005,0.959663,0.777778,0.766169,0.77193


TrainOutput(global_step=6768, training_loss=0.31051809117021856, metrics={'train_runtime': 749.5018, 'train_samples_per_second': 72.224, 'train_steps_per_second': 9.03, 'total_flos': 7121413712424960.0, 'train_loss': 0.31051809117021856, 'epoch': 6.0})

In [26]:
pred_out = trainer.predict(dev_dataset)
dev_logits = pred_out.predictions
y_pred_dev = np.argmax(dev_logits, axis=1)

print("=== Dev Set Results (argmax) ===")
print(classification_report(y_dev, y_pred_dev))


=== Dev Set Results (argmax) ===
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      2055
           1       0.56      0.93      0.70       201

    accuracy                           0.93      2256
   macro avg       0.78      0.93      0.83      2256
weighted avg       0.95      0.93      0.94      2256



# or

In [15]:
dev_probs = torch.softmax(torch.tensor(dev_logits), dim=1).numpy()[:, 1]
threshold = 0.22  # you can tune
y_pred_dev_custom = (dev_probs >= threshold).astype(int)

print(f"=== Dev Set Results (threshold={threshold}) ===")
print(classification_report(y_dev, y_pred_dev_custom))


=== Dev Set Results (threshold=0.22) ===
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2055
           1       0.80      0.78      0.79       201

    accuracy                           0.96      2256
   macro avg       0.89      0.88      0.88      2256
weighted avg       0.96      0.96      0.96      2256



In [27]:
test_dataset = BertGloveDataset(
    df_test["text"].values,
    labels=[0] * len(df_test),
    glove_dict=glove_dict,
    tokenizer=tokenizer
)
test_out = trainer.predict(test_dataset)
test_logits = test_out.predictions
test_probs = torch.softmax(torch.tensor(test_logits), dim=1).numpy()[:, 1]

# Use the threshold that works best for you
final_threshold = 0.3
df_test["Label"] = (test_probs >= final_threshold).astype(int)

df_test[["PMID", "Label"]].to_csv("bert_glove_test_2.csv", index=False)
print("Saved predictions to 'bert_glove_test_2.csv'!")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved predictions to 'bert_glove_test_2.csv'!


In [28]:
for t in [0.2, 0.25, 0.3, 0.35]:
    pred_labels = (test_probs >= t).astype(int)
    print(f"Threshold {t:.2f}: {sum(pred_labels)} predicted 1s out of {len(pred_labels)}")


Threshold 0.20: 162 predicted 1s out of 1097
Threshold 0.25: 161 predicted 1s out of 1097
Threshold 0.30: 161 predicted 1s out of 1097
Threshold 0.35: 161 predicted 1s out of 1097


In [29]:
from sklearn.metrics import f1_score

best_f1 = 0
best_t = 0.5
for t in np.arange(0.05, 0.5, 0.01):
    y_pred = (dev_probs >= t).astype(int)
    f1 = f1_score(y_dev, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_t = t

print(f"Best threshold on dev = {best_t:.2f} with F1 = {best_f1:.4f}")


Best threshold on dev = 0.12 with F1 = 0.7905
