In [1]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn-crfsuite-0.5.0


In [2]:
import math
import os
import json
import re
import random

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from collections import defaultdict

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
train = pd.read_csv('/content/drive/My Drive/Datasets/Coleridge/datasets/train.csv')
train_items = train.sample(n=500, random_state=42)

X_train, X_test = train_test_split(train_items, test_size=0.1, random_state=42)
train_papers = {}
test_papers = {}

for i in range(len(X_train)):
    curr_path = os.path.join(
        os.getcwd(),
        'drive',
        'My Drive',
        'Datasets',
        'Coleridge',
        'datasets',
        'train',
        X_train.iloc[i]['Id'] + '.json')
    with open(curr_path, 'r') as file:
        curr_json = json.load(file)
        train_papers[X_train.iloc[i]['Id']] = curr_json

for i in range(len(X_test)):
    curr_path = os.path.join(
        os.getcwd(),
        'drive',
        'My Drive',
        'Datasets',
        'Coleridge',
        'datasets',
        'train',
        X_test.iloc[i]['Id'] + '.json')
    with open(curr_path, 'r') as file:
        curr_json = json.load(file)
        test_papers[X_test.iloc[i]['Id']] = curr_json

In [5]:
# Hyperparameters
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

MAX_SAMPLE = None # set a small number for experimentation, set None for production.


def clean_training_text(txt):
    """
    similar to the default clean_text function but without lowercasing.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

def shorten_sentences(sentences):
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

def find_sublist(big_list, small_list):
    all_positions = []
    for i in range(len(big_list) - len(small_list) + 1):
        if small_list == big_list[i:i+len(small_list)]:
            all_positions.append(i)

    return all_positions

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    sentence_words = sentence.split()

    if labels is not None and any(re.findall(f'\\b{label}\\b', sentence)
                                  for label in labels): # positive sample
        nes = ['O'] * len(sentence_words)
        for label in labels:
            label_words = label.split()

            all_pos = find_sublist(sentence_words, label_words)
            for pos in all_pos:
                nes[pos] = 'B'
                for i in range(pos+1, pos+len(label_words)):
                    nes[i] = 'I'

        return True, list(zip(sentence_words, nes))

    else: # negative sample
        nes = ['O'] * len(sentence_words)
        return False, list(zip(sentence_words, nes))

In [6]:
import re
from tqdm import tqdm
import random

cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []

pbar = tqdm(total=len(train_items))
for i, id, dataset_label in X_train[['Id', 'dataset_label']].itertuples():
    # paper
    paper = train_papers[id]

    # labels
    labels = dataset_label.split('|')
    labels = [clean_training_text(label) for label in labels]

    # sentences
    sentences = set([clean_training_text(sentence) for section in paper
                 for sentence in section['text'].split('.')
                ])
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

    # positive sample
    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']):
            ner_data.append(tags)
            cnt_neg += 1

    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

# shuffling
random.shuffle(ner_data)

Training data size: 1153 positives + 18588 negatives:  90%|█████████ | 450/500 [00:08<00:00, 79.02it/s]

In [15]:
print(len(sentences[2]))
print(sentences[2])

11
1 a b and c


In [99]:
def build_mappings(data):
    word_set = set()
    tag_set = set()

    # Iterate through the dataset to collect all unique words and tags
    for sentence in data:
        for word, tag in sentence:
            word_set.add(word)
            tag_set.add(tag)

    # Create word2idx mapping, starting with special tokens
    word2idx = {'<PAD>': 0, '<UNK>': 1}
    for idx, word in enumerate(word_set, start=2):  # Start indexing from 2 to leave 0 and 1 for special tokens
        word2idx[word] = idx

    # Create tag2idx mapping
    tag2idx = {'O': 0}  # Often 'O' is the default label, so start indexing from 0
    for idx, tag in enumerate(tag_set):  # Start from 1 because 0 is reserved for 'O'
        if tag != 'O':  # Ensure 'O' is always mapped to 0
            tag2idx[tag] = idx

    return word2idx, tag2idx


word2idx, tag2idx = build_mappings(ner_data)
print(word2idx)
print(tag2idx)

{'O': 0, 'I': 0, 'B': 1}


In [85]:
positive_samples = [sample for sample in ner_data if any(tag in ['B', 'I'] for _, tag in sample)]
negative_samples = [sample for sample in ner_data if all(tag == 'O' for _, tag in sample)]

In [87]:
class NERDataset(Dataset):
    def __init__(self, data, word2idx, tag2idx):
        """
        Args:
            data (list of list of tuples): The dataset where each element is a list of (word, tag) tuples.
            word2idx (dict): Mapping from words to indices.
            tag2idx (dict): Mapping from tags to indices.
            max_len (int): Maximum length for padding.
        """
        self.data = data
        self.word2idx = word2idx
        self.tag2idx = tag2idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]

        # Separate words and tags from the tuples
        words = [w for w, t in sentence]
        tags = [t for w, t in sentence]

        # Convert words and tags to their corresponding indices
        word_ids = [self.word2idx.get(w, self.word2idx['<UNK>']) for w in words]
        tag_ids = [self.tag2idx.get(t, self.tag2idx['O']) for t in tags]

        # Convert to tensors
        return torch.tensor(word_ids), torch.tensor(tag_ids)

def collate_fn(batch):
    words, tags = zip(*batch)

    words_padded = pad_sequence(words, batch_first=True, padding_value=word2idx['<PAD>'])
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag2idx['O'])

    return words_padded, tags_padded

In [88]:
batch_size = 32
batches = []

# Loop through positive samples and add balanced batches
for i in range(0, len(positive_samples), batch_size // 2):
    pos_batch = positive_samples[i:i + batch_size // 2]
    neg_batch = negative_samples[i:i + batch_size // 2]
    batch = pos_batch + neg_batch
    random.shuffle(batch)
    batches.append(batch)


dataloaders = []
for batch in batches:
    dataset = NERDataset(batch, word2idx, tag2idx)
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
    dataloaders.append(dataloader)

In [59]:
class NERLSTM(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=128, hidden_dim=128):
        super(NERLSTM, self).__init__()

        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx['<PAD>'])

        # LSTM
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        # x: (batch_size, max_len)
        embeds = self.embedding_layer(x)
        # print('embeds: ', embeds)

        lstm_out, _  = self.lstm(embeds)
        # print('lstm out: ', lstm_out)

        tag_scores = self.fc(lstm_out)
        # print('tag scores', tag_scores)

        return tag_scores


def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0.01)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                param.data.fill_(0)

In [101]:
import torch
import math

embedding_dim = 128
hidden_dim = 32
vocab_size = len(word2idx)
tagset_size = len(tag2idx)
max_len = 50

model = NERLSTM(vocab_size, tagset_size, embedding_dim, hidden_dim)
model.apply(init_weights)
# model = SimpleNERModel(vocab_size, tagset_size, embedding_dim)

class_weights = torch.tensor([1.0, 10.0, 10.0])  # Example weights for 'O', 'B', 'I'
loss_function = nn.CrossEntropyLoss(weight=class_weights, ignore_index=word2idx['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

dataset = NERDataset(ner_data, word2idx, tag2idx)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

epochs = 10
for epoch in range(epochs):
    total_loss = 0.0
    for batch in dataloader:
        words, tags = batch

        if torch.all(words == word2idx['<PAD>']):
            print("All words in this batch are padding!")
            continue  # Skip this batch

        if torch.isnan(words).any():
            print("NaN detected in input!")

        optimizer.zero_grad()

        outputs = model(words)

        outputs = outputs.view(-1, tagset_size)
        tags = tags.view(-1)

        loss = loss_function(outputs, tags)

        # Check for NaNs in loss
        if math.isnan(loss.item()):
            # print(words.values(), tags.values())
            print("NaN loss encountered, exiting.")
            break

        loss.backward()

        for name, param in model.named_parameters():
            if param.grad is not None and torch.isnan(param.grad).any():
                print(f"NaN gradient found in {name}")

        # Clip gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)

        optimizer.step()

        loss_item = loss.item()

        # Check for NaNs in model parameters
        for name, param in model.named_parameters():
            if torch.isnan(param.grad).any():
                print(f"NaN in gradients of {name}")

        total_loss += loss_item

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader)}")


NaN loss encountered, exiting.
Epoch 1/10, Loss: 0.0038712244589351914
NaN loss encountered, exiting.
Epoch 2/10, Loss: 0.030452446644360195
NaN loss encountered, exiting.
Epoch 3/10, Loss: 0.8995907611445702
NaN loss encountered, exiting.
Epoch 4/10, Loss: 0.12569370277491201
NaN loss encountered, exiting.
Epoch 5/10, Loss: 0.19587250049060217
NaN loss encountered, exiting.
Epoch 6/10, Loss: 0.27960007264004555
NaN loss encountered, exiting.
Epoch 7/10, Loss: 0.05816899514892726
NaN loss encountered, exiting.
Epoch 8/10, Loss: 0.6942051977015622
NaN loss encountered, exiting.
Epoch 9/10, Loss: 0.3267792370712873
NaN loss encountered, exiting.
Epoch 10/10, Loss: 0.35727347486613253


In [94]:
cnt_pos_test, cnt_neg_test = 0, 0  # number of sentences that contain/not contain labels
ner_data_test = []

pbar = tqdm(total=len(X_test))
for i, id, dataset_label in X_test[['Id', 'dataset_label']].itertuples():
    # paper
    paper = test_papers[id]

    # labels
    labels = dataset_label.split('|')
    labels = [clean_training_text(label) for label in labels]

    # sentences
    sentences = set([clean_training_text(sentence) for section in paper
                 for sentence in section['text'].split('.')
                ])
    sentences = shorten_sentences(sentences)  # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10]  # only accept sentences with length > 10 chars

    # positive sample
    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            cnt_pos_test += 1
            ner_data_test.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']):
            ner_data_test.append(tags)
            cnt_neg_test += 1

    # process bar
    pbar.update(1)
    pbar.set_description(f"Test data size: {cnt_pos_test} positives + {cnt_neg_test} negatives")

# Shuffle the test data
random.shuffle(ner_data_test)

Test data size: 93 positives + 2049 negatives: 100%|██████████| 50/50 [21:53<00:00, 26.28s/it]
Test data size: 93 positives + 2049 negatives: 100%|██████████| 50/50 [00:00<00:00, 81.78it/s]

In [95]:
# Create the test dataset and dataloader
test_dataset = NERDataset(ner_data_test, word2idx, tag2idx)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

In [97]:
from sklearn.metrics import classification_report

# Assuming test set is loaded similarly as the training set
model.eval()
true_tags, pred_tags = [], []
with torch.no_grad():
    for batch in test_dataloader:
        words, tags = batch
        outputs = model(words)
        predictions = torch.argmax(outputs, dim=2)  # (batch_size, max_len)

        true_tags.extend(tags.view(-1).tolist())
        pred_tags.extend(predictions.view(-1).tolist())

# Filter out padding tokens and calculate metrics
true_tags = [tag for tag in true_tags if tag != word2idx['<PAD>']]
pred_tags = [tag for tag in pred_tags if tag != word2idx['<PAD>']]


In [102]:
from sklearn.metrics import classification_report

def evaluate_model(model, dataloader, word2idx, tag2idx):
    model.eval()  # Set the model to evaluation mode
    all_true_tags = []
    all_pred_tags = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for batch in dataloader:
            words, true_tags = batch

            # Get the model predictions
            outputs = model(words)
            predictions = torch.argmax(outputs, dim=-1)  # Get the tag with the highest score

            # Convert the predictions and true tags to lists
            true_tags = true_tags.view(-1).cpu().numpy()
            predictions = predictions.view(-1).cpu().numpy()

            # Mask out padding tokens (ignore_index for the '<PAD>' token)
            mask = (words.view(-1) != word2idx['<PAD>']).cpu().numpy()

            true_tags = true_tags[mask]
            predictions = predictions[mask]

            # Collect all true and predicted tags
            all_true_tags.extend(true_tags)
            all_pred_tags.extend(predictions)

    # Print classification report
    print(classification_report(all_true_tags, all_pred_tags, target_names=[key for key in tag2idx.keys()]))

evaluate_model(model, test_dataloader, word2idx, tag2idx)

              precision    recall  f1-score   support

           O       1.00      0.30      0.46     54266
           I       0.00      0.33      0.00       102
           B       0.00      0.00      0.00         0

    accuracy                           0.30     54368
   macro avg       0.33      0.21      0.16     54368
weighted avg       0.99      0.30      0.46     54368



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
