In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pickle
from collections import Counter
import joblib

In [None]:
# Download necessary NLTK tokenizers
# 'punkt' covers older NLTK releases. NLTK 3.9 and later load the sentence/word
# tokenizer models from the 'punkt_tab' resource instead, so both are requested;
# on a release that does not know one of them, nltk.download just reports the
# failure and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Path of the pre-trained GloVe vectors text file; it is read by build_vocab and
# load_glove_embeddings.
# NOTE(review): the file name suggests 100-dimensional vectors, which would match
# embedding_dim — confirm if the file is swapped for another GloVe variant.
glove_path = 'glove.6B.100d.txt'

In [None]:
# Read the raw CSV, discard its first column (a leftover index) and give the
# columns named '0' and '1' descriptive names.
df = pd.read_csv('train.csv', index_col=None)
df = df.drop(columns=df.columns[0]).rename(columns={'0': 'comment_text', '1': 'label'})

# Stratified 80/20 split: both partitions keep the label proportions of `df`.
train_df_balanced, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    shuffle=True,
    random_state=42,
)

In [None]:
# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Hyperparameters
# Width of each word vector; handed to load_glove_embeddings and LSTMClassifier.
embedding_dim = 100
# Size of the LSTM hidden state, which is also the input width of the final linear layer.
hidden_dim = 256
# Number of output logits; the evaluation and inference code reads index 1 as the "biased" class.
output_dim = 2
# Number of token ids per example: TextDataset truncates or pads every text to this length.
max_len = 100

In [None]:
# Function to build vocabulary from training data and GloVe embeddings
def build_vocab(texts, glove_path, min_freq=1):
    """Build a word -> index mapping from training texts plus every GloVe token.

    Indices are assigned in this order: '<PAD>' (0), '<UNK>' (1), the lower-cased
    tokens of `texts` that occur at least `min_freq` times (first-seen order),
    then every remaining first token found in the GloVe file (file order).

    Args:
        texts: iterable of raw texts; each item is passed through `str()`,
            lower-cased and tokenized with `word_tokenize`.
        glove_path: path of a GloVe text file, one "<word> <v1> <v2> ..." per line.
        min_freq: minimum corpus frequency for a training token to be kept.

    Returns:
        dict mapping each word to a unique, contiguous integer index.
    """
    word_freq = Counter()
    for text in texts:
        word_freq.update(word_tokenize(str(text).lower()))

    word_to_idx = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in word_freq.items():
        if freq >= min_freq:
            # len() is evaluated before insertion, so a new word gets the next free index.
            word_to_idx.setdefault(word, len(word_to_idx))

    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Only the leading token is needed; maxsplit avoids splitting the whole vector.
            head = line.split(maxsplit=1)
            if not head:
                # Blank or whitespace-only line (e.g. a trailing newline): nothing to add.
                continue
            word_to_idx.setdefault(head[0], len(word_to_idx))

    return word_to_idx

# Function to load pre-trained GloVe embeddings
def load_glove_embeddings(glove_path, word_to_idx, embedding_dim):
    """Build an embedding matrix aligned with `word_to_idx` from a GloVe text file.

    Rows of words that have no usable vector in the file stay all-zero (this
    includes '<PAD>'). The '<UNK>' row, when the vocabulary defines one, is
    drawn from N(0, 0.1) using NumPy's global random state.

    Lines that do not consist of exactly one token followed by `embedding_dim`
    values (blank lines, "count dim" header lines, truncated rows) are skipped;
    otherwise they would either raise or be silently broadcast across a row.

    Args:
        glove_path: path of a GloVe text file, one "<word> <v1> ... <vD>" per line.
        word_to_idx: dict mapping words to row indices of the matrix.
        embedding_dim: number of values per vector (D).

    Returns:
        torch.FloatTensor of shape (len(word_to_idx), embedding_dim).
    """
    embeddings = np.zeros((len(word_to_idx), embedding_dim), dtype=np.float32)

    # Look the <UNK> row up instead of assuming it sits at index 1.
    unk_idx = word_to_idx.get('<UNK>')
    if unk_idx is not None:
        embeddings[unk_idx] = np.random.normal(0, 0.1, embedding_dim)  # Initialize <UNK> embeddings

    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) != embedding_dim + 1:
                continue  # malformed, blank or header line
            idx = word_to_idx.get(values[0])
            if idx is not None:
                embeddings[idx] = np.array(values[1:], dtype=np.float32)

    # The array is local and already float32, so it can back the tensor without a copy.
    return torch.from_numpy(embeddings)

# Custom dataset class for handling text data
class TextDataset(Dataset):
    """Map-style dataset yielding (token-id tensor, label tensor) pairs.

    Texts are read from the 'comment_text' column and labels from the 'label'
    column of the given dataframe. Tokenization happens lazily, per item.
    """

    def __init__(self, dataframe, word_to_idx, max_len=100):
        """Keep the frame, the vocabulary and the fixed sequence length."""
        self.word_to_idx = word_to_idx
        self.max_len = max_len
        self.dataframe = dataframe.reset_index(drop=True)
        # Plain arrays give positional access regardless of the frame's index.
        self.texts = dataframe['comment_text'].values
        self.labels = dataframe['label'].values

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the `idx`-th example as two long tensors: ids of length `max_len`, and the label."""
        words = word_tokenize(str(self.texts[idx]).lower())
        # Unknown words fall back to the <UNK> id; anything past max_len is dropped.
        token_ids = [self.word_to_idx.get(w, self.word_to_idx["<UNK>"]) for w in words]
        token_ids = token_ids[:self.max_len]
        shortfall = self.max_len - len(token_ids)
        if shortfall > 0:
            # Right-pad short sequences with the <PAD> id.
            token_ids.extend([self.word_to_idx["<PAD>"]] * shortfall)
        ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return ids_tensor, label_tensor

# Define LSTM model class
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> dropout -> linear classifier."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embeddings=None):
        """Create the layers; `embeddings`, when given, replaces the embedding weights.

        The supplied embedding matrix stays trainable (requires_grad is True).
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        if embeddings is not None:
            self.embedding.weight = nn.Parameter(embeddings, requires_grad=True)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
            dropout=0.0,
        )
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Return class logits computed from the LSTM's final hidden state.

        `text` is a batch-first tensor of token ids. No packing or masking is
        applied, so the final state is taken after every position of the
        sequence, padding included.
        """
        _, (final_hidden, _) = self.lstm(self.embedding(text))
        # final_hidden[-1] is the last layer's state for each sequence in the batch.
        return self.fc(self.dropout(final_hidden[-1]))

In [None]:
# Build the vocabulary from the training split (plus the GloVe tokens), then
# load the embedding matrix whose rows follow that vocabulary's indices.
word_to_idx = build_vocab(texts=train_df_balanced['comment_text'], glove_path=glove_path)
embeddings = load_glove_embeddings(
    glove_path=glove_path,
    word_to_idx=word_to_idx,
    embedding_dim=embedding_dim,
)

In [None]:
# Wrap both splits in datasets and batch them; only the training loader shuffles.
train_dataset = TextDataset(dataframe=train_df_balanced, word_to_idx=word_to_idx, max_len=max_len)
val_dataset = TextDataset(dataframe=val_df, word_to_idx=word_to_idx, max_len=max_len)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [None]:
# Instantiate the classifier with the GloVe matrix as its embedding weights and
# move it to the selected device.
vocab_size = len(word_to_idx)
model = LSTMClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    embeddings=embeddings,
)
model = model.to(device)

In [None]:
# Evaluation function
def evaluate_model(model, val_loader, device, criterion):
    """Print the loss and binary-classification metrics of `model` on `val_loader`.

    Args:
        model: module mapping a batch of token ids to logits with two columns;
            column 1 is treated as the positive class.
        val_loader: iterable of batches where item 0 holds the inputs and item 1
            the integer labels.
        device: device the batches are moved to before the forward pass.
        criterion: loss called as `criterion(logits, labels)`.

    Returns:
        None. The results are printed. The reported loss is the mean of the
        per-batch loss values.
    """
    model.eval()
    val_loss = 0.0
    num_batches = 0  # counted while iterating so loaders without __len__ work too
    all_preds, all_labels, all_probs = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch[0].to(device)
            labels = batch[1].to(device).long()
            outputs = model(input_ids)
            val_loss += criterion(outputs, labels).item()
            num_batches += 1
            probs = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(probs)

    if num_batches == 0:
        # Nothing was evaluated; avoid a division by zero and metric errors on empty input.
        print("Validation loader produced no batches; nothing to evaluate.")
        return

    avg_val_loss = val_loss / num_batches
    accuracy = accuracy_score(all_labels, all_preds)
    # zero_division=0 on all three keeps them consistent when a class is never predicted/present.
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    # ROC AUC is undefined when the labels contain a single class; report NaN instead of failing.
    if len(set(all_labels)) > 1:
        auc = roc_auc_score(all_labels, all_probs)
    else:
        auc = float('nan')

    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")

In [None]:
# Evaluate
# `model` has only just been constructed at this point: its embedding layer holds
# the GloVe vectors, but the LSTM and linear layers are still randomly initialised.
# Load the trained checkpoint first so the metrics describe the trained classifier
# rather than an untrained network.
model.load_state_dict(torch.load('lstm_bias_model.pth', map_location=device, weights_only=True))
criterion = nn.CrossEntropyLoss()
evaluate_model(model, val_loader, device, criterion)

In [None]:
def preprocess_text(text, word_to_idx, max_len=100):
    """Convert one text into a (1, max_len) tensor of token ids on the global `device`.

    The text is lower-cased, tokenized with `word_tokenize`, mapped through
    `word_to_idx` (unknown tokens become '<UNK>'), truncated to `max_len` and
    right-padded with '<PAD>'.

    Non-string, empty or whitespace-only input is encoded as a single '<UNK>'
    id followed by padding. The id is emitted directly: routing the literal
    "<UNK>" through lower-casing and tokenization would break it into separate
    tokens that no longer match the vocabulary entry.

    Args:
        text: the raw text (any other type is treated as empty).
        word_to_idx: vocabulary containing at least '<PAD>' and '<UNK>'.
        max_len: fixed output sequence length.

    Returns:
        torch.LongTensor of shape (1, max_len).
    """
    unk_idx = word_to_idx["<UNK>"]
    pad_idx = word_to_idx["<PAD>"]

    if isinstance(text, str) and text.strip():
        tokens = word_tokenize(text.lower())
        indices = [word_to_idx.get(token, unk_idx) for token in tokens[:max_len]]
    else:
        indices = [unk_idx][:max_len]

    # A non-positive multiplier yields an empty list, so full sequences are left untouched.
    indices += [pad_idx] * (max_len - len(indices))
    return torch.tensor([indices], dtype=torch.long).to(device)

def identify_biased_sentences_lstm(text, model, word_to_idx, threshold=0.3, max_len=100):
    """Score each sentence of `text` and collect those classified as biased.

    The text is split with `sent_tokenize`; every sentence is encoded with
    `preprocess_text` and run through `model`. A sentence is labelled 1 when
    its softmax probability for class 1 is at least `threshold`, else 0.

    Returns:
        (biased_sentences, scores, labels): the sentences labelled 1, and the
        class-1 probability and 0/1 label of every sentence, in order.
        Any exception (including a text with no sentences) is reported with
        print() and results in three empty lists.
    """
    try:
        pieces = sent_tokenize(text)
        if not pieces:
            raise ValueError("No sentences detected in the input text.")

        flagged, probabilities, decisions = [], [], []

        model.eval()
        with torch.no_grad():
            for piece in pieces:
                encoded = preprocess_text(piece, word_to_idx, max_len)
                class_probs = torch.softmax(model(encoded), dim=1)
                p_biased = class_probs[0, 1].item()  # Class 1 = biased
                decision = 1 if p_biased >= threshold else 0

                probabilities.append(p_biased)
                decisions.append(decision)
                if decision == 1:
                    flagged.append(piece)

        return flagged, probabilities, decisions
    except Exception as e:
        print(f"Error during inference: {e}")
        return [], [], []

# Load the trained weights used for inference.
# Failures are raised rather than handled with exit(): exit() is an interactive
# helper, and inside a notebook it asks the kernel to shut down, discarding the
# session. Raising stops the run at this cell and keeps the full traceback
# (for example the size-mismatch details reported by load_state_dict).
try:
    model.load_state_dict(torch.load('lstm_bias_model.pth', map_location=device, weights_only=True))
except FileNotFoundError as err:
    raise FileNotFoundError("Model file 'lstm_bias_model.pth' not found.") from err
model.to(device)

In [None]:
# Sample inference on a placeholder text
sample_text = '''Test text'''

biased_sentences, scores, labels = identify_biased_sentences_lstm(
    text=sample_text,
    model=model,
    word_to_idx=word_to_idx,
    threshold=0.3,
)

# Per-sentence report: the text is split again here so each sentence can be
# paired with its score and label.
print("\n=== Inference Results ===")
for sentence, score, label in zip(sent_tokenize(sample_text), scores, labels):
    print(
        f"Sentence: '{sentence}'",
        f"Score (Prob Biased): {score:.4f}, Label: {'Biased' if label == 1 else 'Unbiased'}",
        "",
        sep="\n",
    )

print("Biased Sentences Only:")
for sentence in biased_sentences:
    print(f"- {sentence}")

In [None]:
# Save model and vocabulary
# NOTE(review): the weights are written to 'lstm_model.pth' while the notebook loads
# 'lstm_bias_model.pth' — confirm the two different file names are intentional.
torch.save(model.state_dict(), 'lstm_model.pth')
joblib.dump(word_to_idx, 'word_to_idx.pkl')

# Provide download links for saved models
# `display` is imported explicitly instead of relying on the name IPython injects
# into the notebook namespace.
from IPython.display import FileLink, display
display(FileLink('lstm_model.pth'))
display(FileLink('word_to_idx.pkl'))