# NLP Coursework

- Rohit Midha (rm1623)
- Harsh Agarwal (hra23)
- Utsav Rai (ur23)

## Install and Imports

In [None]:
!pip install -q datasets transformers[torch]
!pip install accelerate -U -q
!pip install -q huggingface_hub
!pip install -q wandb

In [None]:
import torch

import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from torch.utils.data import DataLoader

import torch.optim as optim
import numpy as np

import sys
import os
import os.path
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from urllib import request
import csv
import pandas as pd
from datasets import Dataset, DatasetDict

In [None]:
!huggingface-cli login --token=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

# Model 2: Hybrid LSTM

## Experiment: Unprocessed and Downsampled + Keyword Text

### Load Dataset

In [None]:

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the unprocessed, downsampled dataset; its "train" and "valid" splits
# are the ones used further down.
data = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed_downsampled")

In [None]:
# Show the loaded dataset object as the cell output.
data

### Tokenization

In [None]:
from transformers import RobertaTokenizer

# Tokenizer of the 'roberta-base' checkpoint. It is used to turn text into
# input_ids, and its vocab_size later sizes the model's embedding table.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_function(examples):
    """Tokenize a batch of examples with the community field prepended.

    Every entry of ``examples['community']`` is joined to the matching entry
    of ``examples['text']`` with a single space. The joined strings are then
    passed to the module-level ``tokenizer``, padded and truncated to exactly
    512 tokens.

    Args:
        examples: mapping with parallel 'community' and 'text' sequences.

    Returns:
        Whatever ``tokenizer`` returns for the joined strings.
    """
    communities = examples['community']
    texts = examples['text']

    merged = []
    for community, text in zip(communities, texts):
        merged.append(community + " " + text)

    return tokenizer(merged, padding="max_length", truncation=True, max_length=512)


In [None]:
# Tokenize every split of the dataset; batched=True hands tokenize_function
# whole batches of examples rather than single rows.
tokenized_data = data.map(tokenize_function, batched=True)

In [None]:
# Show the tokenized dataset object as the cell output.
tokenized_data

In [None]:
# Expose only 'input_ids' and 'label', as torch tensors. The return value is
# not used, so this relies on set_format changing tokenized_data itself.
tokenized_data.set_format(type='torch', columns=['input_ids', 'label'])

In [None]:


# Wrap the tokenized splits in DataLoaders.
batch_size = 8  # Adjust based on your GPU memory and model requirements

train_dataset, valid_dataset = tokenized_data["train"], tokenized_data["valid"]

# Only the training split is shuffled. Note that `test_loader` iterates the
# *validation* split, despite its name.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)


### Model

In [None]:

class HybridLSTM(nn.Module):
    """Binary classifier: embedding -> Conv1d -> max-pool -> two BiLSTMs -> MLP head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
        pretrained_embeddings: optional NumPy array copied into the embedding
            table; the embeddings remain trainable.

    ``forward`` takes a (batch, seq_len) tensor of token ids and returns one
    raw logit per example, shape (batch, 1). No sigmoid is applied.
    """

    def __init__(self, vocab_size, embedding_dim, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        if pretrained_embeddings is not None:
            initial_weights = torch.from_numpy(pretrained_embeddings)
            self.embedding.weight.data.copy_(initial_weights)
            self.embedding.weight.requires_grad = True

        # Convolutional front end: 64 filters over pairs of adjacent tokens,
        # then the sequence is shortened 4x by max-pooling.
        self.conv1d = nn.Conv1d(embedding_dim, 64, kernel_size=2, stride=1, padding='valid')
        self.dropout1 = nn.Dropout(0.5)
        self.maxpool = nn.MaxPool1d(kernel_size=4, stride=4)

        # The first BiLSTM emits 2 * 64 = 128 features per step, which is the
        # input width of the second one.
        self.bidirectional_lstm1 = nn.LSTM(64, 64, batch_first=True, bidirectional=True)
        self.bidirectional_lstm2 = nn.LSTM(128, 32, batch_first=True, bidirectional=True)

        # Head input is 64 wide: the 32-d forward and 32-d backward final
        # hidden states of the second BiLSTM, concatenated.
        self.dense1 = nn.Linear(64, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.Linear(64, 1)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)  # (batch, seq_len, embedding_dim)

        # Conv1d expects (batch, channels, length).
        conv_input = embedded.transpose(1, 2)
        features = self.maxpool(self.dropout1(F.relu(self.conv1d(conv_input))))

        # The LSTMs are batch_first, so move channels back to the last axis.
        lstm_input = features.transpose(1, 2)
        sequence_output, _ = self.bidirectional_lstm1(lstm_input)
        _, (hidden, _) = self.bidirectional_lstm2(sequence_output)

        # Join the last forward and last backward hidden states.
        summary = torch.cat((hidden[-2], hidden[-1]), dim=1)

        head = self.dropout2(F.relu(self.dense1(summary)))
        return self.dense2(head)


In [None]:

# Build the model, optimizer and loss for the run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The embedding table gets one row per entry of the tokenizer's vocabulary.
vocab_size = tokenizer.vocab_size
embedding_dim = 256

model = HybridLSTM(vocab_size, embedding_dim)
model = model.to(device)

optimizer = optim.Adam(params=model.parameters(), lr=1e-4)
# The model emits raw logits, so the sigmoid is folded into the loss.
loss_fn = nn.BCEWithLogitsLoss()


In [None]:
from sklearn.metrics import f1_score

def train(model, train_loader, optimizer, loss_fn, device, epoch):
    """Run one training epoch and log the mean batch loss to Weights & Biases.

    Args:
        model: module called as ``model(input_ids)``.
        train_loader: sized iterable of batches; each batch is a mapping with
            'input_ids' and 'label' tensors.
        optimizer: optimizer stepped once per batch.
        loss_fn: callable applied to ``(outputs, labels)``, where labels are
            cast to float and reshaped to (batch, 1).
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index; reported as ``epoch + 1``.

    Returns:
        None. Progress is printed every 100 batches and the epoch average is
        sent to ``wandb.log``.
    """
    model.train()
    running_loss = 0

    for step, batch in enumerate(train_loader, start=1):
        input_ids = batch['input_ids'].to(device)
        # Float targets with a trailing singleton axis, matching the logits.
        labels = batch['label'].float().unsqueeze(1).to(device)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if step % 100 == 0:  # Print after every 100 batches
            print(f'Epoch: {epoch+1}, Batch: {step}/{len(train_loader)}, Loss: {loss.item()}')

    avg_loss = running_loss / len(train_loader)
    print(f"End of Epoch {epoch+1}, Training Loss: {avg_loss}")
    wandb.log({"Avg Training Loss": avg_loss, "Epoch": epoch+1})


In [None]:
def evaluate(model, test_loader, loss_fn, device, epoch):
    """Score the model on a loader and log loss, accuracy and F1 to W&B.

    Args:
        model: module called as ``model(input_ids)``; put into eval mode.
        test_loader: sized iterable of batches; each batch is a mapping with
            'input_ids' and 'label' tensors.
        loss_fn: callable applied to ``(outputs, labels)``, where labels are
            cast to float and reshaped to (batch, 1).
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index; reported as ``epoch + 1``.

    Returns:
        None. The metrics are printed and sent to ``wandb.log``.

    Accuracy is computed over the examples that were actually scored, so it
    stays correct when the loader drops or subsamples examples.
    """
    model.eval()
    total_loss = 0
    num_examples = 0
    correct_predictions = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].float().unsqueeze(1).to(device)
            outputs = model(input_ids)
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()

            # Sigmoid maps logits to [0, 1]; rounding gives the hard class.
            predictions = torch.round(torch.sigmoid(outputs))
            correct_predictions += torch.sum(predictions == labels).item()
            num_examples += labels.size(0)

            # Collect all labels and predictions for F1 score calculation
            all_labels.extend(labels.view(-1).cpu().numpy())
            all_predictions.extend(predictions.view(-1).cpu().numpy())

    avg_loss = total_loss / len(test_loader)
    accuracy = correct_predictions / num_examples
    f1 = f1_score(all_labels, all_predictions)  # Compute F1 score
    print(f"End of Epoch {epoch+1}, Validation Loss: {avg_loss}, Accuracy: {accuracy}, F1 Score: {f1}")
    wandb.log({"Validation Loss": avg_loss, "Accuracy": accuracy, "F1 Score": f1, "Epoch": epoch+1})

### Metrics

### Training

In [None]:
# Read the Hugging Face token from the environment instead of storing it in
# the notebook; any token that was previously committed should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if not HF_TOKEN:
    print("HF_TOKEN is not set; export it before logging in to the Hugging Face Hub.")

In [None]:
# Log in to the Hugging Face Hub from the shell; {HF_TOKEN} is filled in from
# the Python variable of that name before the command runs.
!huggingface-cli login --token={HF_TOKEN}

In [None]:
import wandb

# Authenticate with Weights & Biases using a key from the WANDB_API_KEY
# environment variable, so that no credential is stored in the notebook. Any
# key that was previously committed here should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:

# Start the W&B run for this experiment. The hyperparameters are passed via
# `config=` so they are recorded with the run; assigning a dict to
# `wandb.config` only rebinds the module attribute.
run = wandb.init(
    project='nlp_cw',
    name="LSTM - unprocessed + keyword downsampled text",
    config={
        "learning_rate": 1e-4,
        "epochs": 100,
        "batch_size": 8,
    },
)

In [None]:
# Training and Evaluation Loop: one pass over train_loader followed by one
# scoring pass over test_loader per epoch. Both helpers log to W&B themselves.
# NOTE(review): test_loader wraps the "valid" split, not a held-out test set.
num_epochs = 100  # Adjust as needed
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train(model, train_loader, optimizer, loss_fn, device,epoch)
    evaluate(model, test_loader, loss_fn, device,epoch)

In [None]:
# Save the trained network to disk.
# NOTE(review): this serialises the whole module object rather than
# model.state_dict(); loading it presumably needs the HybridLSTM class to be
# defined at load time - confirm what downstream loaders expect.
model_path = "LSTM_unprocessed_keyword.pth"
torch.save(model, model_path)

### Evaluation for model trained on unprocessed and downsampled + keyword text

In [None]:
def labels2file(p, outf_path):
    """Write one label per line to ``outf_path``.

    Args:
        p: iterable of labels; each one is written as ``str(label)``.
        outf_path: destination file, overwritten if it already exists. Any
            missing parent directory is created first, so callers do not have
            to prepare e.g. ``ref/`` themselves.
    """
    parent_dir = os.path.dirname(outf_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(outf_path, 'w') as outf:
        outf.writelines(str(pi) + '\n' for pi in p)

In [None]:

def generate_predictions(model, data_loader, device):
    """Return hard 0/1 predictions for every example yielded by a loader.

    Args:
        model: module called as ``model(input_ids)``; put into eval mode.
        data_loader: iterable of batches, each a mapping with an 'input_ids'
            tensor.
        device: device the input tensors are moved to.

    Returns:
        A flat list of NumPy integers, in loader order: the sigmoid of each
        model output, rounded.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch['input_ids'].to(device))
            hard_labels = torch.sigmoid(logits).round()  # Assuming binary classification
            collected += list(hard_labels.view(-1).cpu().numpy().astype(int))

    return collected

# Predict on the loader built from the "valid" split, using the model and
# device set up earlier, and store the labels where the scoring cell looks.
predictions = generate_predictions(model, test_loader, device)

# Ensure the output directory exists
results_dir = 'res/'
os.makedirs(results_dir, exist_ok=True)

labels2file(predictions, os.path.join(results_dir, 'LSTM_unprocessed_keyword_task1.txt'))


In [None]:

# Score the predictions stored in res/ against the gold validation labels and
# write precision / recall / F1 to a text file in output_dir.
from datasets import load_dataset

task1_file = 'LSTM_unprocessed_keyword_task1.txt'

input_dir = "./"
output_dir = "./"

# define gold data path
ref_dir = os.path.join(input_dir, 'ref')

# define submission data path
submission_dir = os.path.join(input_dir, 'res')

# Make sure ref/ exists before the gold labels are written into it.
os.makedirs(ref_dir, exist_ok=True)

dataset = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed_downsampled")
valid_dataset = dataset["valid"]

gold = dataset["valid"]["label"]
labels2file(gold, os.path.join(ref_dir, task1_file))

files = os.listdir(submission_dir)

# evaluating on task 1
# The scores file is opened with a context manager so that it is closed even
# if reading or scoring fails part-way.
with open(os.path.join(output_dir, 'LSTM_unprocessed_keyword_task1_scores.txt'), 'w') as outf:
    if task1_file in files:
        with open(os.path.join(submission_dir, task1_file)) as f:
            task1_res = [int(line.strip()) for line in f]
        with open(os.path.join(ref_dir, task1_file)) as f:
            task1_gold = [int(line.strip()) for line in f]
        # task 1 scores
        t1p = precision_score(task1_gold, task1_res)
        t1r = recall_score(task1_gold, task1_res)
        t1f = f1_score(task1_gold, task1_res)
        # task1
        outf.write('task1_precision:'+str(t1p)+'\n')
        outf.write('task1_recall:'+str(t1r)+'\n')
        outf.write('task1_f1:'+str(t1f)+'\n')
    else:
        # Make a missing submission visible instead of silently leaving an
        # empty scores file behind.
        print(f"{task1_file} not found in {submission_dir}; no scores written.")

## Experiment: Processed and Downsampled + Keyword Text

In [None]:
# Load the processed, downsampled dataset for the second experiment. This
# rebinds `data`, which previously held the unprocessed dataset.
data = load_dataset("ImperialIndians23/nlp_cw_data_processed_downsampled")

In [None]:
# Tokenize every split of the processed dataset with the same
# tokenize_function as before; this rebinds `tokenized_data`.
tokenized_data = data.map(tokenize_function, batched=True)

In [None]:
# Expose only 'input_ids' and 'label', as torch tensors. The return value is
# not used, so this relies on set_format changing tokenized_data itself.
tokenized_data.set_format(type='torch', columns=['input_ids', 'label'])

In [None]:

# Rebuild the DataLoaders on top of the processed splits.
batch_size = 8  # Adjust based on your GPU memory and model requirements

train_dataset, valid_dataset = tokenized_data["train"], tokenized_data["valid"]

# Only the training split is shuffled. Note that `test_loader` iterates the
# *validation* split, despite its name.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Start the W&B run for the processed-text experiment. The hyperparameters
# are passed via `config=` so they are recorded with the run; assigning a
# dict to `wandb.config` only rebinds the module attribute.
run = wandb.init(
    project='nlp_cw',
    name="LSTM - processed + keyword downsampled text",
    config={
        "learning_rate": 1e-4,
        "epochs": 100,
        "batch_size": 8,
    },
)

### Training

In [None]:
# Start this experiment from freshly initialised weights and optimizer state.
# Without this the loop would keep training the network left over from the
# unprocessed-text experiment, so the two runs would not be comparable.
model = HybridLSTM(vocab_size, embedding_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training and Evaluation Loop
num_epochs = 100  # Adjust as needed
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train(model, train_loader, optimizer, loss_fn, device, epoch)
    evaluate(model, test_loader, loss_fn, device, epoch)

In [None]:
# Save the network trained in this experiment to disk.
# NOTE(review): this serialises the whole module object rather than
# model.state_dict(); loading it presumably needs the HybridLSTM class to be
# defined at load time - confirm what downstream loaders expect.
model_path = "LSTM_processed_keyword.pth"
torch.save(model, model_path)

### Evaluation for model trained on processed and downsampled + keyword text

In [None]:

# generate_predictions() is already defined in the evaluation section of the
# first experiment; it is reused here instead of being defined a second time.
predictions = generate_predictions(model, test_loader, device)

# Ensure the output directory exists
os.makedirs(os.path.join('res/'), exist_ok=True)

labels2file(predictions, os.path.join('res/', 'LSTM_processed_keyword_task1.txt'))


In [None]:
import sys
import os
import os.path
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import numpy as np


# Score the predictions stored in res/ against the gold validation labels and
# write precision / recall / F1 to a text file in output_dir.
from datasets import load_dataset

task1_file = 'LSTM_processed_keyword_task1.txt'

input_dir = "./"
output_dir = "./"

# define gold data path
ref_dir = os.path.join(input_dir, 'ref')

# define submission data path
submission_dir = os.path.join(input_dir, 'res')

# Make sure ref/ exists before the gold labels are written into it.
os.makedirs(ref_dir, exist_ok=True)

dataset = load_dataset("ImperialIndians23/nlp_cw_data_processed_downsampled")
valid_dataset = dataset["valid"]

gold = dataset["valid"]["label"]
labels2file(gold, os.path.join(ref_dir, task1_file))

files = os.listdir(submission_dir)

# evaluating on task 1
# The scores file is opened with a context manager so that it is closed even
# if reading or scoring fails part-way.
with open(os.path.join(output_dir, 'LSTM_processed_keyword_task1_scores.txt'), 'w') as outf:
    if task1_file in files:
        with open(os.path.join(submission_dir, task1_file)) as f:
            task1_res = [int(line.strip()) for line in f]
        with open(os.path.join(ref_dir, task1_file)) as f:
            task1_gold = [int(line.strip()) for line in f]
        # task 1 scores
        t1p = precision_score(task1_gold, task1_res)
        t1r = recall_score(task1_gold, task1_res)
        t1f = f1_score(task1_gold, task1_res)
        # task1
        outf.write('task1_precision:'+str(t1p)+'\n')
        outf.write('task1_recall:'+str(t1r)+'\n')
        outf.write('task1_f1:'+str(t1f)+'\n')
    else:
        # Make a missing submission visible instead of silently leaving an
        # empty scores file behind.
        print(f"{task1_file} not found in {submission_dir}; no scores written.")