<a href="https://colab.research.google.com/github/junaid537/pytorch/blob/main/finalHW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
if not os.path.exists('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print("Google Drive is already mounted.")


Mounted at /content/drive


In [None]:
train_file = "/content/drive/MyDrive/dataHW4csci544/data/train"

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
train_file = "/content/drive/MyDrive/dataHW4csci544/data/train"
dev_file = "/content/drive/MyDrive/dataHW4csci544/data/dev"

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import os

# Paths
train_file = "/content/drive/MyDrive/hw4csci544/data/train"
dev_file = "/content/drive/MyDrive/hw4csci544/data/dev"
test_file = "/content/drive/MyDrive/hw4csci544/data/test"
output_dir = "/content/drive/MyDrive/hw4csci544/"

# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
LSTM_LAYERS = 1
DROPOUT = 0.25  # Adjusted dropout for better precision-recall balance
LINEAR_OUT_DIM = 128
BATCH_SIZE = 48  # Adjusted batch size for better updates
LEARNING_RATE = 0.007
MOMENTUM = 0.99
WEIGHT_DECAY = 1e-4
EPOCHS = 20
STEP_SIZE = 7  # Learning rate scheduler step size
GAMMA = 0.3  # Learning rate decay factor

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load dataset
def load_data(filename, is_test=False):
    sentences, labels = [], []
    with open(filename, 'r', encoding='utf-8') as f:
        sentence, label = [], []
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue
            parts = line.split()
            if len(parts) < 2:
                continue
            sentence.append(parts[1])
            if not is_test:
                label.append(parts[2] if len(parts) > 2 else "O")
            else:
                label.append("O")
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Load train, dev, and test data
train_sentences, train_labels = load_data(train_file)
dev_sentences, dev_labels = load_data(dev_file)
test_sentences, _ = load_data(test_file, is_test=True)

# Build vocabulary
word_to_ix = {"PAD": 0, "UNK": 1}
tag_to_ix = {}
for sentence in train_sentences:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

for labels in train_labels:
    for tag in labels:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

ix_to_tag = {v: k for k, v in tag_to_ix.items()}

# Dataset class
class NERDataset(Dataset):
    def __init__(self, sentences, labels, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.labels = labels
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        word_indices = [self.word_to_ix.get(w, 1) for w in sentence]  # Default to UNK
        label_indices = [self.tag_to_ix.get(l, 0) for l in label]  # Default to O-tag
        return torch.tensor(word_indices), torch.tensor(label_indices)

# Custom collate function
def collate_fn(batch):
    words, labels = zip(*batch)
    words_padded = pad_sequence(words, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-1)
    return words_padded, labels_padded

# Create datasets and loaders
train_dataset = NERDataset(train_sentences, train_labels, word_to_ix, tag_to_ix)
dev_dataset = NERDataset(dev_sentences, dev_labels, word_to_ix, tag_to_ix)
test_dataset = NERDataset(test_sentences, [["O"] * len(s) for s in test_sentences], word_to_ix, tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Define BLSTM Model
class BLSTM_NER(nn.Module):
    def __init__(self, vocab_size, tagset_size):
        super(BLSTM_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=0)
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=LSTM_LAYERS,
                            bidirectional=True, dropout=DROPOUT, batch_first=True)
        self.fc = nn.Linear(HIDDEN_DIM * 2, LINEAR_OUT_DIM)
        self.elu = nn.ELU()
        self.classifier = nn.Linear(LINEAR_OUT_DIM, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        x = self.elu(x)
        x = self.classifier(x)
        return x

# Initialize model, loss, optimizer, and scheduler
model = BLSTM_NER(len(word_to_ix), len(tag_to_ix))
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for words, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(words)

        # Reshape for loss computation
        outputs = outputs.view(-1, outputs.shape[-1])
        labels = labels.view(-1).long()

        loss = criterion(outputs, labels)
        loss.backward()

        # ✅ Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

        optimizer.step()
        total_loss += loss.item()

    # Update learning rate
    scheduler.step()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss:.4f}, LR: {scheduler.get_last_lr()[0]}")

# Save model
torch.save(model.state_dict(), os.path.join(output_dir, "blstm1.pt"))
print("Model saved!")

# Evaluation function
def evaluate(model, data_loader, sentences, output_file):
    model.eval()
    predictions = []
    batch_offset = 0

    with torch.no_grad():
        for words, _ in data_loader:
            outputs = model(words)
            outputs = torch.argmax(outputs, dim=-1)

            for batch_idx, preds in enumerate(outputs):
                sentence = sentences[batch_offset]
                for word_idx, pred in enumerate(preds[:len(sentence)]):
                    predictions.append(f"{word_idx + 1} {sentence[word_idx]} {ix_to_tag[pred.item()]}\n")
                predictions.append("\n")
                batch_offset += 1

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(predictions)

    print(f"Saved predictions to {output_file}")

# Evaluate on dev set
evaluate(model, dev_loader, dev_sentences, os.path.join(output_dir, "dev.out"))

# Evaluate on test set
evaluate(model, test_loader, test_sentences, os.path.join(output_dir, "test.out"))

print("Training completed. Model and outputs saved!")




Epoch 1/20, Loss: 279.8087, LR: 0.007
Epoch 2/20, Loss: 185.5096, LR: 0.007
Epoch 3/20, Loss: 142.2065, LR: 0.007
Epoch 4/20, Loss: 110.7185, LR: 0.007
Epoch 5/20, Loss: 87.2723, LR: 0.007
Epoch 6/20, Loss: 69.5188, LR: 0.007
Epoch 7/20, Loss: 54.3767, LR: 0.0021
Epoch 8/20, Loss: 40.3082, LR: 0.0021
Epoch 9/20, Loss: 35.5232, LR: 0.0021
Epoch 10/20, Loss: 31.9133, LR: 0.0021
Epoch 11/20, Loss: 29.2835, LR: 0.0021
Epoch 12/20, Loss: 26.2779, LR: 0.0021
Epoch 13/20, Loss: 23.8541, LR: 0.0021
Epoch 14/20, Loss: 21.9110, LR: 0.0006299999999999999
Epoch 15/20, Loss: 18.7126, LR: 0.0006299999999999999
Epoch 16/20, Loss: 17.7542, LR: 0.0006299999999999999
Epoch 17/20, Loss: 17.2455, LR: 0.0006299999999999999
Epoch 18/20, Loss: 16.7165, LR: 0.0006299999999999999
Epoch 19/20, Loss: 16.1416, LR: 0.0006299999999999999
Epoch 20/20, Loss: 15.7052, LR: 0.0006299999999999999
Model saved!
Saved predictions to /content/drive/MyDrive/hw4csci544/dev.out
Saved predictions to /content/drive/MyDrive/hw4csc

In [3]:
!python /content/drive/MyDrive/hw4csci544/eval.py -p /content/drive/MyDrive/hw4csci544/dev.out -g /content/drive/MyDrive/hw4csci544/data/dev


processed 51578 tokens with 5942 phrases; found: 5906 phrases; correct: 4186.
accuracy:  94.28%; precision:  70.88%; recall:  70.45%; FB1:  70.66
              LOC: precision:  84.65%; recall:  78.33%; FB1:  81.37  1700
             MISC: precision:  79.22%; recall:  68.22%; FB1:  73.31  794
              ORG: precision:  66.67%; recall:  59.21%; FB1:  62.72  1191
              PER: precision:  59.61%; recall:  71.88%; FB1:  65.17  2221


In [None]:
#python eval.py -p /content/drive/MyDrive/dataHW4csci544/dev1.out -g /content/drive/MyDrive/dataHW4csci544/data/dev


## Task 2

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import os
import numpy as np
import gzip

# Paths
train_file = "/content/drive/MyDrive/hw4csci544/data/train"
dev_file = "/content/drive/MyDrive/hw4csci544/data/dev"
test_file = "/content/drive/MyDrive/hw4csci544/data/test"
glove_path = "/content/drive/MyDrive/hw4csci544/glove.6B.100d.gz"  # GloVe embeddings file
output_dir = "/content/drive/MyDrive/hw4csci544/"

# Hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
LSTM_LAYERS = 1
DROPOUT = 0.1
LINEAR_OUT_DIM = 128
BATCH_SIZE = 32
LEARNING_RATE = 0.02
MOMENTUM = 0.99
WEIGHT_DECAY = 1e-5
EPOCHS = 15

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Extract GloVe embeddings from .gz file
def load_glove_embeddings(glove_path, embedding_dim=100):
    glove_embeddings = {}
    with gzip.open(glove_path, "rt", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            glove_embeddings[word] = vector
    return glove_embeddings

glove_embeddings = load_glove_embeddings(glove_path, EMBEDDING_DIM)

# Load dataset
def load_data(filename, is_test=False):
    sentences, labels = [], []
    with open(filename, 'r', encoding='utf-8') as f:
        sentence, label = [], []
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue
            parts = line.split()
            if len(parts) < 2:
                continue
            sentence.append(parts[1])
            if not is_test:
                label.append(parts[2] if len(parts) > 2 else "O")
            else:
                label.append("O")
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Load train, dev, and test data
train_sentences, train_labels = load_data(train_file)
dev_sentences, dev_labels = load_data(dev_file)
test_sentences, _ = load_data(test_file, is_test=True)

# Build vocabulary with GloVe embeddings
word_to_ix = {"<PAD>": 0, "<UNK>": 1}
tag_to_ix = {}

# Step 1: Create an embedding matrix
#embedding_matrix = np.random.uniform(-0.1, 0.1, (len(word_to_ix), EMBEDDING_DIM))# Extra space for new words
embedding_matrix = np.random.uniform(-0.1, 0.1, (len(word_to_ix), EMBEDDING_DIM))

for sentence in train_sentences:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

# Assign GloVe vectors to words (handling case sensitivity)
for word, idx in word_to_ix.items():
    lowercase_word = word.lower()
    if lowercase_word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[lowercase_word]
    elif word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]

# Map entity labels to indexes
for labels in train_labels:
    for tag in labels:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

ix_to_tag = {v: k for k, v in tag_to_ix.items()}

# Dataset class
class NERDataset(Dataset):
    def __init__(self, sentences, labels, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.labels = labels
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        word_indices = [self.word_to_ix.get(w, 1) for w in sentence]
        label_indices = [self.tag_to_ix.get(l, 0) for l in label]
        return torch.tensor(word_indices), torch.tensor(label_indices)

# Custom collate function
def collate_fn(batch):
    words, labels = zip(*batch)
    words_padded = pad_sequence(words, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-1)
    return words_padded, labels_padded

# Create datasets and loaders
train_dataset = NERDataset(train_sentences, train_labels, word_to_ix, tag_to_ix)
dev_dataset = NERDataset(dev_sentences, dev_labels, word_to_ix, tag_to_ix)
test_dataset = NERDataset(test_sentences, [["O"] * len(s) for s in test_sentences], word_to_ix, tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Define BLSTM Model with GloVe Embeddings
class BLSTM_NER(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_matrix):
        super(BLSTM_NER, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False)
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, num_layers=LSTM_LAYERS,
                            bidirectional=True, dropout=DROPOUT, batch_first=True)
        self.fc = nn.Linear(HIDDEN_DIM * 2, LINEAR_OUT_DIM)
        self.elu = nn.ELU(alpha=0.01)
        self.classifier = nn.Linear(LINEAR_OUT_DIM, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        x = self.elu(x)
        x = self.classifier(x)
        return x

# Initialize model, loss, optimizer, and scheduler
model = BLSTM_NER(len(word_to_ix), len(tag_to_ix), embedding_matrix)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
#optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.85, nesterov=True, weight_decay=WEIGHT_DECAY)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY, nesterov=True)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.8)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.5)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for words, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(words)
        outputs = outputs.view(-1, outputs.shape[-1])
        labels = labels.view(-1).long()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    scheduler.step()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, LR: {scheduler.get_last_lr()[0]}")

# Save model
torch.save(model.state_dict(), os.path.join(output_dir, "blstm2.pt"))
print("Model saved!")

# **Save predictions as dev2 and test2**
def evaluate(model, data_loader, sentences, output_file):
    model.eval()
    predictions = []
    batch_offset = 0

    with torch.no_grad():
        for words, _ in data_loader:
            outputs = model(words)
            outputs = torch.argmax(outputs, dim=-1)

            for batch_idx, preds in enumerate(outputs):
                sentence = sentences[batch_offset]
                for word_idx, pred in enumerate(preds[:len(sentence)]):
                    predictions.append(f"{word_idx + 1} {sentence[word_idx]} {ix_to_tag[pred.item()]}\n")
                predictions.append("\n")
                batch_offset += 1

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(predictions)

# Save outputs as dev2 and test2
evaluate(model, dev_loader, dev_sentences, os.path.join(output_dir, "dev2.out"))
evaluate(model, test_loader, test_sentences, os.path.join(output_dir, "test2.out"))

print("Training completed!")




Epoch 1, Loss: 220.7482, LR: 0.02
Epoch 2, Loss: 70.0817, LR: 0.02
Epoch 3, Loss: 45.0103, LR: 0.02
Epoch 4, Loss: 33.4547, LR: 0.01
Epoch 5, Loss: 24.5399, LR: 0.01
Epoch 6, Loss: 20.6272, LR: 0.01
Epoch 7, Loss: 18.6101, LR: 0.01
Epoch 8, Loss: 15.7267, LR: 0.005
Epoch 9, Loss: 12.4048, LR: 0.005
Epoch 10, Loss: 10.9942, LR: 0.005
Epoch 11, Loss: 10.1961, LR: 0.005
Epoch 12, Loss: 9.5262, LR: 0.0025
Epoch 13, Loss: 8.1570, LR: 0.0025
Epoch 14, Loss: 7.3447, LR: 0.0025
Epoch 15, Loss: 6.9792, LR: 0.0025
Model saved!
Training completed!


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [3]:
!python /content/drive/MyDrive/hw4csci544/eval.py -p /content/drive/MyDrive/hw4csci544/dev2.out -g /content/drive/MyDrive/hw4csci544/data/dev


processed 51578 tokens with 5942 phrases; found: 5456 phrases; correct: 4880.
accuracy:  96.92%; precision:  89.44%; recall:  82.13%; FB1:  85.63
              LOC: precision:  93.82%; recall:  87.53%; FB1:  90.57  1714
             MISC: precision:  87.20%; recall:  78.31%; FB1:  82.51  828
              ORG: precision:  84.69%; recall:  78.37%; FB1:  81.41  1241
              PER: precision:  89.60%; recall:  81.38%; FB1:  85.29  1673


In [None]:
file_path = "/content/drive/MyDrive/hw4csci544/eval.py"

# Check if file exists
if os.path.exists(file_path):
    print("✅ File exists!")
else:
    print("❌ File does NOT exist!")

✅ File exists!


## Task 3

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import os
import numpy as np
import gzip

# Paths
train_file = "/content/drive/MyDrive/hw4csci544/data/train"
dev_file = "/content/drive/MyDrive/hw4csci544/data/dev"
test_file = "/content/drive/MyDrive/hw4csci544/data/test"
glove_path = "/content/drive/MyDrive/hw4csci544/glove.6B.100d.gz"
output_dir = "/content/drive/MyDrive/hw4csci544/"

# Hyperparameters
EMBEDDING_DIM = 100  # Word embedding dimension
CHAR_EMBEDDING_DIM = 30  # Character embedding dimension
HIDDEN_DIM = 256  # LSTM hidden dimension
LSTM_LAYERS = 1  # Number of LSTM layers
CNN_OUT_DIM = 50  # CNN output dimension
CNN_KERNEL_SIZE = 3  # CNN kernel size
DROPOUT = 0.1  # Dropout rate
LINEAR_OUT_DIM = 128  # Linear layer output dimension
BATCH_SIZE = 32  # Batch size
LEARNING_RATE = 0.025  # Updated learning rate
MOMENTUM = 0.99  # Momentum for SGD
WEIGHT_DECAY = 1e-5  # Weight decay
EPOCHS = 20  # Number of epochs
ALPHA_ELU = 0.01
# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load GloVe embeddings
def load_glove_embeddings(glove_path, embedding_dim=100):
    glove_embeddings = {}
    with gzip.open(glove_path, "rt", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            glove_embeddings[word] = vector
    return glove_embeddings

glove_embeddings = load_glove_embeddings(glove_path, EMBEDDING_DIM)

# Load dataset
def load_data(filename, is_test=False):
    sentences, labels = [], []
    with open(filename, 'r', encoding='utf-8') as f:
        sentence, label = [], []
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue
            parts = line.split()
            if len(parts) < 2:
                continue
            sentence.append(parts[1])
            if not is_test:
                label.append(parts[2] if len(parts) > 2 else "O")
            else:
                label.append("O")
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Load train, dev, and test data
train_sentences, train_labels = load_data(train_file)
dev_sentences, dev_labels = load_data(dev_file)
test_sentences, _ = load_data(test_file, is_test=True)

# Build vocabulary and character vocabulary
word_to_ix = {"<PAD>": 0, "<UNK>": 1}
char_to_ix = {"<PAD>": 0, "<UNK>": 1}

tag_to_ix = {}

# Create word and character vocabularies
for sentence in train_sentences:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
        for char in word:
            if char not in char_to_ix:
                char_to_ix[char] = len(char_to_ix)

# Create embedding matrix for words
embedding_matrix = np.random.uniform(-0.1, 0.1, (len(word_to_ix), EMBEDDING_DIM))

for word, idx in word_to_ix.items():
    lowercase_word = word.lower()
    if lowercase_word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[lowercase_word]
    elif word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]

# Map entity labels to indexes
for labels in train_labels:
    for tag in labels:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

ix_to_tag = {v: k for k, v in tag_to_ix.items()}

# Dataset class
class NERDataset(Dataset):
    def __init__(self, sentences, labels, word_to_ix, char_to_ix, tag_to_ix):
        self.sentences = sentences
        self.labels = labels
        self.word_to_ix = word_to_ix
        self.char_to_ix = char_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        word_indices = [self.word_to_ix.get(w, 1) for w in sentence]
        char_indices = [[self.char_to_ix.get(c, 1) for c in word] for word in sentence]
        label_indices = [self.tag_to_ix.get(l, 0) for l in label]
        return torch.tensor(word_indices), char_indices, torch.tensor(label_indices)

# Custom collate function
def collate_fn(batch):
    words, chars, labels = zip(*batch)

    # Pad word sequences
    words_padded = pad_sequence(words, batch_first=True, padding_value=0)

    # Pad label sequences
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-1)

    # Pad character sequences
    max_word_len = max([max([len(word) for word in sentence]) for sentence in chars])  # Longest word in the batch
    max_sentence_len = max([len(sentence) for sentence in chars])  # Longest sentence in the batch

    # Initialize a tensor for padded character sequences
    chars_padded = torch.zeros((len(chars), max_sentence_len, max_word_len), dtype=torch.long)

    # Fill the tensor with character indices
    for i, sentence in enumerate(chars):
        for j, word in enumerate(sentence):
            chars_padded[i, j, :len(word)] = torch.tensor(word, dtype=torch.long)

    return words_padded, chars_padded, labels_padded

# Create datasets and loaders
train_dataset = NERDataset(train_sentences, train_labels, word_to_ix, char_to_ix, tag_to_ix)
dev_dataset = NERDataset(dev_sentences, dev_labels, word_to_ix, char_to_ix, tag_to_ix)
test_dataset = NERDataset(test_sentences, [["O"] * len(s) for s in test_sentences], word_to_ix, char_to_ix, tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Define LSTM-CNN Model
class LSTM_CNN_NER(nn.Module):
    def __init__(self, vocab_size, char_vocab_size, tagset_size, embedding_matrix, char_embedding_dim=30):
        super(LSTM_CNN_NER, self).__init__()
        # Word embeddings
        self.word_embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), freeze=False)
        # Character embeddings
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim, padding_idx=0)
        # CNN for character-level features
        self.cnn = nn.Conv1d(char_embedding_dim, CNN_OUT_DIM, CNN_KERNEL_SIZE, padding=(CNN_KERNEL_SIZE // 2))

        # BLSTM for word-level features
        self.lstm = nn.LSTM(EMBEDDING_DIM + CNN_OUT_DIM, HIDDEN_DIM, num_layers=LSTM_LAYERS,
                            bidirectional=True, dropout=DROPOUT, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(HIDDEN_DIM * 2, LINEAR_OUT_DIM)
        self.elu = nn.ELU(alpha=ALPHA_ELU)

        # Output layer
        self.classifier = nn.Linear(LINEAR_OUT_DIM, tagset_size)

    def forward(self, words, chars):
        # Word embeddings
        word_embeds = self.word_embedding(words)
        # Character embeddings
        batch_size, seq_len, word_len = chars.size()
        chars = chars.view(batch_size * seq_len, word_len)
        char_embeds = self.char_embedding(chars)
        char_embeds = char_embeds.permute(0, 2, 1)
        # CNN for character-level features
        char_features = torch.relu(self.cnn(char_embeds))  # Apply ReLU
        char_features = nn.functional.normalize(char_features, p=2, dim=2)  # Normalize features
        char_features, _ = torch.max(char_features, dim=2)  # Max pooling

        char_features = char_features.view(batch_size, seq_len, -1)
        # Concatenate word and character features
        combined_embeds = torch.cat((word_embeds, char_features), dim=2)
        # BLSTM
        lstm_out, _ = self.lstm(combined_embeds)
        # Fully connected layer
        fc_out = self.fc(lstm_out)
        fc_out = self.elu(fc_out)
        # Output layer
        logits = self.classifier(fc_out)
        return logits

# Initialize model, loss, optimizer, and scheduler
model = LSTM_CNN_NER(len(word_to_ix), len(char_to_ix), len(tag_to_ix), embedding_matrix)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY, nesterov=True)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.4)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for words, chars, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(words, chars)
        outputs = outputs.view(-1, outputs.shape[-1])
        labels = labels.view(-1).long()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    scheduler.step()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}, LR: {scheduler.get_last_lr()[0]}")

# Save model
torch.save(model.state_dict(), os.path.join(output_dir, "blstm3.pt"))
print("Model saved!")

# Evaluation function
def evaluate(model, data_loader, sentences, output_file):
    model.eval()
    predictions = []
    batch_offset = 0

    with torch.no_grad():
        for words, chars, _ in data_loader:
            outputs = model(words, chars)
            outputs = torch.argmax(outputs, dim=-1)

            for batch_idx, preds in enumerate(outputs):
                sentence = sentences[batch_offset]
                for word_idx, pred in enumerate(preds[:len(sentence)]):
                    predictions.append(f"{word_idx + 1} {sentence[word_idx]} {ix_to_tag[pred.item()]}\n")
                predictions.append("\n")
                batch_offset += 1

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(predictions)

# Save outputs as dev3.out and test3.out
evaluate(model, dev_loader, dev_sentences, os.path.join(output_dir, "dev3.out"))
evaluate(model, test_loader, test_sentences, os.path.join(output_dir, "test3.out"))

print("Training completed!")



Epoch 1, Loss: 190.5833, LR: 0.025
Epoch 2, Loss: 47.7642, LR: 0.025
Epoch 3, Loss: 31.7900, LR: 0.025
Epoch 4, Loss: 25.1390, LR: 0.025
Epoch 5, Loss: 19.3169, LR: 0.010000000000000002
Epoch 6, Loss: 13.2859, LR: 0.010000000000000002
Epoch 7, Loss: 11.2691, LR: 0.010000000000000002
Epoch 8, Loss: 9.5929, LR: 0.010000000000000002
Epoch 9, Loss: 8.4127, LR: 0.010000000000000002
Epoch 10, Loss: 7.2778, LR: 0.004000000000000001
Epoch 11, Loss: 5.5381, LR: 0.004000000000000001
Epoch 12, Loss: 4.7083, LR: 0.004000000000000001
Epoch 13, Loss: 4.2993, LR: 0.004000000000000001
Epoch 14, Loss: 3.9853, LR: 0.004000000000000001
Epoch 15, Loss: 3.5638, LR: 0.0016000000000000005
Epoch 16, Loss: 3.1695, LR: 0.0016000000000000005
Epoch 17, Loss: 2.9620, LR: 0.0016000000000000005
Epoch 18, Loss: 2.8661, LR: 0.0016000000000000005
Epoch 19, Loss: 2.7542, LR: 0.0016000000000000005
Epoch 20, Loss: 2.6139, LR: 0.0006400000000000003
Model saved!
Training completed!


In [5]:
!python /content/drive/MyDrive/hw4csci544/eval.py -p /content/drive/MyDrive/hw4csci544/dev3.out -g /content/drive/MyDrive/hw4csci544/data/dev


processed 51578 tokens with 5942 phrases; found: 6110 phrases; correct: 5253.
accuracy:  97.90%; precision:  85.97%; recall:  88.40%; FB1:  87.17
              LOC: precision:  94.04%; recall:  91.83%; FB1:  92.92  1794
             MISC: precision:  76.00%; recall:  82.10%; FB1:  78.94  996
              ORG: precision:  76.80%; recall:  83.45%; FB1:  79.99  1457
              PER: precision:  90.71%; recall:  91.75%; FB1:  91.23  1863
