In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
from safetensors.torch import load_file
import joblib
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [None]:
# Download required NLTK data.
# `word_tokenize` relies on the Punkt sentence tokenizer. Older NLTK releases
# load it from the 'punkt' package, while newer releases look it up as
# 'punkt_tab' and raise LookupError if only 'punkt' is installed. Fetch both so
# the notebook runs on either; `nltk.download` is a no-op when the resource is
# already present.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

In [None]:
# Pick the computation device: use the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Model paths and hyperparameters
# NOTE(review): despite its name, LSTM_MODEL_DIR is the directory passed to
# BertTokenizer.from_pretrained / BertForSequenceClassification.from_pretrained
# in the initialisation cell; the LSTM weights come from 'best_lstm_model.pt'.
LSTM_MODEL_DIR = 'utbert'
# GloVe text file read by load_glove_embeddings and load_glove_dict.
GLOVE_PATH = "glove.6B.100d.txt"
# Default number of token ids per text produced by preprocess_text.
MAX_LEN = 100
# Width of the embedding vectors; presumably must match the GloVe file — confirm.
EMBEDDING_DIM = 100
# LSTM hidden size passed to LSTMClassifier.
HIDDEN_DIM = 256
# Number of output logits passed to LSTMClassifier.
OUTPUT_DIM = 2

class TextDataset(Dataset):
    """Map-style dataset over a collection of raw texts.

    Args:
        texts: Any iterable of texts (list, tuple, NumPy array, pandas
            Series, generator, ...).

    The input is copied into a list so that ``dataset[i]`` always means "the
    i-th text by position". Without the copy, a pandas Series whose index is
    not 0..n-1 (for example after ``train_test_split``) would be indexed by
    label and either raise ``KeyError`` or return the wrong row.
    """
    def __init__(self, texts):
        # Materialise once: guarantees len() and positional indexing work.
        self.texts = list(texts)

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text at position ``idx``."""
        return self.texts[idx]

class LSTMClassifier(nn.Module):
    """Single-layer LSTM text classifier with an optional pre-trained embedding table.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden state size.
        output_dim: Number of output logits.
        embeddings: Optional tensor that replaces the randomly initialised
            embedding weights; it stays trainable.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        if embeddings is not None:
            # Swap in the supplied table and keep it trainable.
            pretrained = nn.Parameter(embeddings)
            pretrained.requires_grad = True
            self.embedding.weight = pretrained
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map a batch-first tensor of token indices to class logits."""
        token_vectors = self.embedding(text)
        _, (final_hidden, _) = self.lstm(token_vectors)
        # Classify from the last layer's final hidden state, after dropout.
        last_layer_state = final_hidden[-1]
        return self.fc(self.dropout(last_layer_state))

def load_glove_embeddings(glove_path, word_to_idx, embedding_dim):
    """Build the embedding matrix for a vocabulary from a GloVe text file.

    Args:
        glove_path: Path to a GloVe file with one ``word v1 v2 ...`` entry per line.
        word_to_idx: Dict mapping each vocabulary word to its row index.
        embedding_dim: Number of components per vector.

    Returns:
        A float32 tensor of shape ``(len(word_to_idx), embedding_dim)``. Rows
        for words found in the file hold their GloVe vector, the ``<UNK>`` row
        holds small random noise, and every other row (including padding)
        stays zero.
    """
    embeddings = np.zeros((len(word_to_idx), embedding_dim), dtype=np.float32)
    # Locate the <UNK> row through the vocabulary (as preprocess_text does)
    # rather than assuming it is row 1; fall back to 1 if the key is absent.
    unk_idx = word_to_idx.get("<UNK>", 1)
    if 0 <= unk_idx < len(embeddings):
        embeddings[unk_idx] = np.random.normal(0, 0.1, embedding_dim)  # <UNK>
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            row = word_to_idx.get(values[0])
            if row is None:
                # Skip the float parsing for words outside the vocabulary.
                continue
            embeddings[row] = np.array(values[1:], dtype=np.float32)
    return torch.tensor(embeddings, dtype=torch.float32)

def preprocess_text(text, word_to_idx, max_len=MAX_LEN):
    """Encode one text as a ``(1, max_len)`` tensor of vocabulary indices.

    The text is lower-cased and tokenised, unknown tokens map to the
    ``<UNK>`` index, and the sequence is truncated or right-padded with the
    ``<PAD>`` index to ``max_len``. The result is moved to ``device``.
    """
    token_ids = []
    for token in word_tokenize(str(text).lower()):
        token_ids.append(word_to_idx.get(token, word_to_idx["<UNK>"]))
    token_ids = token_ids[:max_len]

    shortfall = max_len - len(token_ids)
    if shortfall > 0:
        token_ids.extend([word_to_idx["<PAD>"]] * shortfall)

    encoded = torch.tensor([token_ids], dtype=torch.long)
    return encoded.to(device)

def get_bert_predictions(texts, batch_size=16):
    """Score texts with the BERT classifier, one batch at a time.

    Each batch is tokenised with padding and truncation to 128 tokens, run
    through ``bert_model`` without gradients, and the softmax probability of
    class index 1 is collected.

    Returns:
        NumPy array with one class-1 probability per input text, in order.
    """
    loader = DataLoader(TextDataset(texts), batch_size=batch_size, shuffle=False)
    positive_probs = []

    bert_model.eval()
    with torch.no_grad():
        for batch_texts in loader:
            encoded = tokenizer(
                batch_texts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=128,
            )
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
            logits = bert_model(**encoded).logits
            class_probs = torch.softmax(logits, dim=1)
            positive_probs.extend(class_probs[:, 1].cpu().numpy())
            # Release cached GPU blocks between batches.
            torch.cuda.empty_cache()
    return np.array(positive_probs)

def load_glove_dict(glove_path):
    """Read a GloVe text file into a ``{word: float32 vector}`` dictionary.

    Each line is split on whitespace; the first field is the word and the
    remaining fields are its vector components. A word that appears more than
    once keeps its last vector.
    """
    word_vectors = {}
    with open(glove_path, 'r', encoding='utf-8') as glove_file:
        for fields in map(str.split, glove_file):
            word_vectors[fields[0]] = np.array(fields[1:], dtype=np.float32)
    return word_vectors

def texts_to_glove_embeddings(texts, glove):
    """Represent each text as the mean of its tokens' GloVe vectors.

    Texts are lower-cased and tokenised. Tokens missing from ``glove``
    contribute a zero vector to the mean, and a text with no tokens keeps an
    all-zero row.

    Returns:
        Array of shape ``(len(texts), EMBEDDING_DIM)``.
    """
    features = np.zeros((len(texts), EMBEDDING_DIM))
    for row, text in enumerate(texts):
        word_vectors = []
        for token in word_tokenize(str(text).lower()):
            word_vectors.append(glove.get(token, np.zeros(EMBEDDING_DIM)))
        if not word_vectors:
            continue
        features[row] = np.mean(word_vectors, axis=0)
    return features

def get_logreg_predictions(texts, glove, lr_model):
    """Return the logistic-regression probability of class index 1 for each text.

    Texts are first converted to averaged GloVe features, then scored with
    ``lr_model.predict_proba``.
    """
    class_probs = lr_model.predict_proba(texts_to_glove_embeddings(texts, glove))
    return class_probs[:, 1]

def get_lstm_predictions(texts, batch_size=32):
    """Score texts with the LSTM classifier, one batch at a time.

    Args:
        texts: Collection of raw texts.
        batch_size: Number of texts per forward pass.

    Returns:
        NumPy array with one class-1 probability per input text, in order.
    """
    dataset = TextDataset(texts)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    all_probs = []

    lstm_model.eval()
    with torch.no_grad():
        for batch_texts in dataloader:
            # There is no batch-level `preprocess_texts` helper; the previous
            # call to it raised NameError. `preprocess_text` encodes ONE text
            # as a (1, max_len) tensor, so encode each text and concatenate
            # along the batch dimension.
            inputs = torch.cat(
                [preprocess_text(text, word_to_idx) for text in batch_texts],
                dim=0,
            )
            logits = lstm_model(inputs)
            probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
            all_probs.extend(probs)
            torch.cuda.empty_cache()
    return np.array(all_probs)

def evaluate_model(y_true, y_pred, y_prob, model_name):
    """Print accuracy, precision, recall, F1, AUC-ROC and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Predicted scores for the positive class (used for AUC-ROC).
        model_name: Name shown in the printed header.

    ``zero_division=0`` is applied uniformly to precision, recall, F1 and the
    report, so a degenerate prediction (e.g. no positive predictions) yields 0
    without an UndefinedMetricWarning for some metrics but not others.
    """
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1-Score": f1_score(y_true, y_pred, zero_division=0),
        "AUC-ROC": roc_auc_score(y_true, y_prob)
    }
    print(f"\n{model_name} Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print(classification_report(
        y_true, y_pred, target_names=['Unbiased', 'Biased'], zero_division=0
    ))

def stacking_predict(texts, bert_batch_size=16, lstm_batch_size=32):
    """Classify texts with the stacking ensemble.

    The class-1 probabilities from BERT, logistic regression and the LSTM are
    stacked as columns (in that order) and passed to ``meta_model``.

    Returns:
        Tuple ``(predicted labels, class-1 probabilities)`` from the meta-model.
    """
    base_outputs = (
        get_bert_predictions(texts, bert_batch_size),
        get_logreg_predictions(texts, glove, lr_model),
        get_lstm_predictions(texts, lstm_batch_size),
    )
    meta_features = np.column_stack(base_outputs)
    labels = meta_model.predict(meta_features)
    bias_probs = meta_model.predict_proba(meta_features)[:, 1]
    return labels, bias_probs

In [None]:
# Initialize models and resources
# NOTE(review): LSTM_MODEL_DIR is used here as the BERT checkpoint directory
# (tokenizer + sequence classifier), despite its name.
tokenizer = BertTokenizer.from_pretrained(LSTM_MODEL_DIR)
bert_model = BertForSequenceClassification.from_pretrained(LSTM_MODEL_DIR).to(device).eval()
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code — only load these artifacts from a trusted source.
lr_model = joblib.load('logreg_model.pkl')
word_to_idx = joblib.load('word_to_idx.pkl')
# GloVe-initialised table used to construct the embedding layer; presumably
# load_state_dict below replaces it with the trained weights — confirm.
embeddings = load_glove_embeddings(GLOVE_PATH, word_to_idx, EMBEDDING_DIM)
lstm_model = LSTMClassifier(len(word_to_idx), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, embeddings)
# NOTE(review): torch.load is pickle-based unless weights-only loading is in
# effect; treat the checkpoint as trusted input or load it weights-only.
lstm_model.load_state_dict(torch.load('best_lstm_model.pt', map_location=device))
lstm_model.to(device).eval()
# Second pass over the GloVe file: word -> vector dict used for the
# logistic-regression features.
glove = load_glove_dict(GLOVE_PATH)

In [None]:
# Load the labelled data: drop the first CSV column and give the '0'/'1'
# columns readable names.
data = pd.read_csv('train.csv')
data = data.drop(columns=data.columns[0])
data = data.rename(columns={'0': 'text', '1': 'label'})

# Stratified, seeded 80/20 split so the result is reproducible.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=data['label'],
)

In [None]:
# Unpack texts and labels from both splits.
# The "val" names refer to the 80% `train` split, which is what the meta-model
# is fitted on; the 20% `test` split is held out for the final evaluation.
# NOTE(review): if the base models were themselves trained on train.csv, their
# predictions on these rows are in-sample — confirm.
val_texts = train['text'].tolist()
val_labels = train['label'].values
test_texts = test['text'].tolist()
test_labels = test['label'].values

In [None]:
# Generate meta-features
# One class-1 probability per base model for every text in the "val" split.
bert_preds_val = get_bert_predictions(val_texts)
logreg_preds_val = get_logreg_predictions(val_texts, glove, lr_model)
lstm_preds_val = get_lstm_predictions(val_texts)
# Column order (BERT, logistic regression, LSTM) matches stacking_predict and
# the test-set cell; the fitted meta-model depends on it.
meta_features_val = np.column_stack((bert_preds_val, logreg_preds_val, lstm_preds_val))

In [None]:
# Train meta-model
# Random forest over the three base-model probabilities. random_state is fixed
# for reproducibility; class_weight='balanced_subsample' reweights classes
# within each tree's bootstrap sample.
meta_model = RandomForestClassifier(
    n_estimators=250,
    max_depth=12,
    min_samples_split=5,
    class_weight='balanced_subsample',
    random_state=42
)

In [None]:
# Fit the meta-model on the stacked base-model probabilities of the "val" split.
meta_model.fit(meta_features_val, val_labels)

In [None]:
# Test set predictions
# Same steps as stacking_predict, but the per-model probabilities are kept as
# separate variables.
bert_preds_test = get_bert_predictions(test_texts)
logreg_preds_test = get_logreg_predictions(test_texts, glove, lr_model)
lstm_preds_test = get_lstm_predictions(test_texts)
# Columns must be in the same order used when fitting the meta-model.
meta_features_test = np.column_stack((bert_preds_test, logreg_preds_test, lstm_preds_test))
final_preds = meta_model.predict(meta_features_test)
# Column 1 of predict_proba is the probability of class index 1.
final_probs = meta_model.predict_proba(meta_features_test)[:, 1]

In [None]:
# Evaluate stacking ensemble
# Prints the metrics and classification report for the held-out test split.
evaluate_model(test_labels, final_preds, final_probs, "Stacking Ensemble")

In [None]:
# Example inference
# Placeholder input; replace with the text to classify.
sample_text = "Sample Text"
# stacking_predict expects a collection of texts, so wrap the single string in a list.
preds, probs = stacking_predict([sample_text])
print(f"\nText: {sample_text}")
# Label 1 is reported as 'Biased', anything else as 'Unbiased'.
print(f"Predicted Label: {'Biased' if preds[0] == 1 else 'Unbiased'}")
print(f"Probability of Bias: {probs[0]:.4f}")