In [51]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import numpy as np
import stanza

In [52]:
def extract_text(path):
    with open(path, "r") as f:
        train = f.read().split("\n\n")
        train = [x.split("\n") for x in train]

    sentences = []
    labels = []

    for i in range(len(train)):
        sentence = train[i][0].split(" ")
        label = train[i][1].split(" ")
        if len(sentence) == len(label):
            sentences.append(sentence)
            labels.append(label)
    
    return sentences, labels

In [53]:
# Initialize Stanza NLP pipeline
nlp = stanza.Pipeline('id', processors='tokenize,pos')

# Data Sample
sentences, labels = extract_text("TAGGED REVISI DIKIT.txt")


# Create Vocabulary for words, POS tags, and labels
word2idx = defaultdict(lambda: len(word2idx))
pos2idx = defaultdict(lambda: len(pos2idx))
label2idx = defaultdict(lambda: len(label2idx))

# Add special tokens
word2idx["<PAD>"] = 0
pos2idx["<PAD>"] = 0
label2idx["O"] = 0

2024-11-27 22:54:26 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-27 22:54:26 INFO: Downloaded file to C:\Users\jonat\stanza_resources\resources.json
2024-11-27 22:54:26 INFO: Loading these models for language: id (Indonesian):
| Processor | Package    |
--------------------------
| tokenize  | gsd        |
| mwt       | gsd        |
| pos       | gsd_charlm |

2024-11-27 22:54:26 INFO: Using device: cuda
2024-11-27 22:54:26 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-27 22:54:26 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-27 22:54:26 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2024-11-27 22:54:26 INFO: Done loading processors!


In [54]:

# Preprocess training data for initial vocabulary
def preprocess_training_data(sentences, labels):
    processed_data = []
    for sentence, sent_labels in zip(sentences, labels):
        # Use Stanza for POS tagging
        doc = nlp(" ".join(sentence))
        
        processed_words = []
        processed_pos = []
        processed_labels = []
        
        for sent in doc.sentences:
            for word in sent.words:
                processed_words.append(word.text)
                processed_pos.append(word.upos)
        
        # Match labels to processed words (assuming same order)
        processed_labels = sent_labels[:len(processed_words)]
        
        processed_data.append((processed_words, processed_pos, processed_labels))
        
        # Update vocabularies
        for word in processed_words:
            word2idx[word]
        for pos in processed_pos:
            pos2idx[pos]
        for label in processed_labels:
            label2idx[label]
    
    return processed_data

# Preprocess the initial training data
processed_training_data = preprocess_training_data(sentences, labels)

In [55]:
# Convert words, POS tags, and labels to indices
input_words = [[word2idx[word] for word in sentence[0]] for sentence in processed_training_data]
input_pos = [[pos2idx[pos] for pos in sentence[1]] for sentence in processed_training_data]
label_data = [[label2idx[label] for label in sentence[2]] for sentence in processed_training_data]

# Dynamic Padding: Use the length of the longest sentence in the data
MAX_LEN = max([len(sentence) for sentence in input_words])

# Padding the sequences
input_words = [sentence + [0]*(MAX_LEN - len(sentence)) for sentence in input_words]
input_pos = [pos + [0]*(MAX_LEN - len(pos)) for pos in input_pos]
label_data = [label + [0]*(MAX_LEN - len(label)) for label in label_data]

In [56]:
# Dataset Class
class NERDataset(Dataset):
    def __init__(self, words, pos, labels):
        self.words = words
        self.pos = pos
        self.labels = labels

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        return self.words[idx], self.pos[idx], self.labels[idx]
    
# Sorting by length (this is necessary for pack_padded_sequence)
def collate_fn(batch):
    words, pos, labels = zip(*batch)
    
    # Sorting by length of the words (descending order)
    lengths = torch.tensor([len(w) for w in words])
    sorted_idx = torch.argsort(lengths, descending=True)
    
    words = [words[i] for i in sorted_idx]
    pos = [pos[i] for i in sorted_idx]
    labels = [labels[i] for i in sorted_idx]
    lengths = lengths[sorted_idx]

    # Padding the sequences
    words_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(w) for w in words], batch_first=True, padding_value=0)
    pos_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(p) for p in pos], batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(l) for l in labels], batch_first=True, padding_value=0)

    return words_padded, pos_padded, labels_padded, lengths

# Create dataset and dataloader
dataset = NERDataset(input_words, input_pos, label_data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

In [57]:
# Enhanced NER Model with POS tag input
class NERModel(nn.Module):
    def __init__(self, vocab_size, pos_size, tagset_size, 
                 embedding_dim=50, pos_embedding_dim=20, hidden_dim=100, dropout=0.3):
        super(NERModel, self).__init__()
        # Embedding layers for words and POS tags
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(pos_size, pos_embedding_dim)
        
        # Combine word and POS embeddings
        combined_dim = embedding_dim + pos_embedding_dim
        
        # LSTM layer with dropout
        self.lstm = nn.LSTM(combined_dim, hidden_dim, batch_first=True, dropout=dropout)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, words, pos):
        # Embed words and POS tags
        word_emb = self.word_embedding(words)
        pos_emb = self.pos_embedding(pos)
        
        # Concatenate word and POS embeddings
        combined_emb = torch.cat((word_emb, pos_emb), dim=2)
        
        # LSTM and classification
        lstm_out, _ = self.lstm(combined_emb)
        output = self.fc(lstm_out)
        return output

In [58]:
# Set model parameters
vocab_size = len(word2idx)
pos_size = len(pos2idx)
tagset_size = len(label2idx)

# Create model
model = NERModel(vocab_size, pos_size, tagset_size)

# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding in loss
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [59]:
# Training loop
for epoch in range(100):  # for simplicity, we use 10 epochs
    model.train()  # Set model to training mode
    total_loss = 0
    for words, pos, labels, lengths in dataloader:
        # Forward pass
        outputs = model(words, pos)

        # Flatten the outputs and labels for the loss function
        outputs = outputs.view(-1, tagset_size)
        labels = labels.view(-1)

        # Compute loss, for all tokens (including padding)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')


Epoch 1, Loss: 1.9562294483184814
Epoch 2, Loss: 1.9049075841903687
Epoch 3, Loss: 1.8546748161315918
Epoch 4, Loss: 1.8050137758255005
Epoch 5, Loss: 1.7554737329483032
Epoch 6, Loss: 1.7056702375411987
Epoch 7, Loss: 1.6552882194519043
Epoch 8, Loss: 1.6040890216827393
Epoch 9, Loss: 1.551914095878601
Epoch 10, Loss: 1.498694658279419
Epoch 11, Loss: 1.4444619417190552
Epoch 12, Loss: 1.3893502950668335
Epoch 13, Loss: 1.3335877656936646
Epoch 14, Loss: 1.2774722576141357
Epoch 15, Loss: 1.2213279008865356
Epoch 16, Loss: 1.1654452085494995
Epoch 17, Loss: 1.110026240348816
Epoch 18, Loss: 1.05516517162323
Epoch 19, Loss: 1.0008927583694458
Epoch 20, Loss: 0.947274386882782
Epoch 21, Loss: 0.8945138454437256
Epoch 22, Loss: 0.8429822325706482
Epoch 23, Loss: 0.7931394577026367
Epoch 24, Loss: 0.7453895807266235
Epoch 25, Loss: 0.6999677419662476
Epoch 26, Loss: 0.6569318175315857
Epoch 27, Loss: 0.6162347793579102
Epoch 28, Loss: 0.5778046250343323
Epoch 29, Loss: 0.5415812134742737


In [60]:
# Predict function
def predict(sentence):
    # Use Stanza for POS tagging
    doc = nlp(" ".join(sentence))
    
    # Extract words and POS tags
    processed_words = []
    processed_pos = []
    
    for sent in doc.sentences:
        for word in sent.words:
            processed_words.append(word.text)
            processed_pos.append(word.upos)
    
    # Convert to indices, handling out-of-vocabulary words
    input_words = [word2idx.get(word, word2idx["<PAD>"]) for word in processed_words]
    input_pos = [pos2idx.get(pos, pos2idx["<PAD>"]) for pos in processed_pos]

    print(input_pos)
    
    # Pad sequences
    input_words = input_words + [0] * (MAX_LEN - len(input_words))
    input_pos = input_pos + [0] * (MAX_LEN - len(input_pos))
    
    # Convert to tensors
    input_words = torch.tensor([input_words]).long()
    input_pos = torch.tensor([input_pos]).long()

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_words, input_pos)
        _, predicted = torch.max(outputs, dim=2)

    print(outputs)
    
    # Convert indices back to labels
    predicted_labels = [list(label2idx.keys())[i] for i in predicted[0]]

    # Truncate to original sentence length
    predicted_labels = predicted_labels[:len(processed_words)]

    return predicted_labels

In [61]:
# Test the model
test_sentence = ["Maka", "Les", "Parisiens", "dituntut", "bangkit", "meski", "punya", "catatan", "negatif", "saat", "bertemu", "Bayern", "Munich"]
predictions = predict(test_sentence)
print("Predicted Labels:", predictions)

# Print out vocabularies for reference
print("\nWord to Index:", dict(word2idx))
print("\nPOS to Index:", dict(pos2idx))
print("\nLabel to Index:", dict(label2idx))

[7, 2, 2, 3, 3, 7, 3, 8, 6, 7, 3, 2, 2]
tensor([[[-0.6450, -0.3318, -0.2371,  0.9593,  0.4024, -0.9449, -0.6222],
         [-1.4061, -2.0003, -0.7597,  5.2985,  0.6206, -2.0661, -1.7204],
         [-2.4557, -2.7283, -1.0173,  0.5376,  5.6073, -1.4381, -1.2368],
         [-2.1081, -2.8489, -2.1193,  0.5886,  4.2446,  1.0029, -0.8788],
         [-1.4612, -1.6668, -2.7151, -0.8080,  2.5049,  2.5858, -0.3093],
         [-0.9785, -1.6390, -2.2709,  0.3266,  1.3683,  0.9302,  1.0329],
         [-1.3084, -1.8412, -2.0618,  0.9541,  0.9491,  0.0946,  0.9662],
         [-1.3093, -1.8284, -2.0625,  1.9060,  0.9044, -0.4293,  0.5342],
         [-1.3405, -1.0176, -2.1933,  2.8393,  0.8701, -0.3405, -0.6533],
         [-1.4826, -0.6979, -2.7498,  3.4376,  0.6182, -0.6211, -1.4563],
         [-1.9147, -1.4211, -1.6628,  4.3297,  0.6069, -1.1083, -2.1393],
         [-2.0505, -2.4103, -1.5526,  6.5060,  0.9951, -1.8541, -2.2841],
         [-2.9026, -3.1289, -1.7211,  0.7781,  6.0488, -1.1341, -1.4557]

In [62]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import stanza

def extract_text(path):
    with open(path, "r") as f:
        train = f.read().split("\n\n")
        train = [x.split("\n") for x in train]

    sentences = []
    labels = []

    for i in range(len(train)):
        print(train[i])
        sentence = train[i][0].split(" ")
        label = train[i][1].split(" ")
        if len(sentence) == len(label):
            sentences.append(sentence)
            labels.append(label)
    
    return sentences, labels

# Initialize Stanza NLP pipeline
nlp = stanza.Pipeline('id', processors='tokenize,pos')

# Data Sample
sentences, labels = extract_text("TAGGED REVISI DIKIT.txt")
print(sentences)

# Create Vocabulary for POS tags and labels
pos2idx = defaultdict(lambda: len(pos2idx))
label2idx = defaultdict(lambda: len(label2idx))

# Add special tokens
pos2idx["<PAD>"] = 0
label2idx["O"] = 0

# Preprocess training data for initial vocabulary
def preprocess_training_data(sentences, labels):
    processed_data = []
    for sentence, sent_labels in zip(sentences, labels):
        # Use Stanza for POS tagging
        doc = nlp(" ".join(sentence))
        
        processed_pos = []
        processed_labels = []
        
        for sent in doc.sentences:
            for word in sent.words:
                processed_pos.append(word.upos)
        
        # Match labels to processed POS tags (assuming same order)
        processed_labels = sent_labels[:len(processed_pos)]
        
        processed_data.append((processed_pos, processed_labels))
        
        # Update vocabularies
        for pos in processed_pos:
            pos2idx[pos]
        for label in processed_labels:
            label2idx[label]
    
    return processed_data

# Preprocess the initial training data
processed_training_data = preprocess_training_data(sentences, labels)

# Convert POS tags and labels to indices
input_pos = [[pos2idx[pos] for pos in sentence[0]] for sentence in processed_training_data]
label_data = [[label2idx[label] for label in sentence[1]] for sentence in processed_training_data]

# Dynamic Padding: Use the length of the longest sentence in the data
MAX_LEN = max([len(sentence) for sentence in input_pos])

# Padding the sequences
input_pos = [pos + [0]*(MAX_LEN - len(pos)) for pos in input_pos]
label_data = [label + [0]*(MAX_LEN - len(label)) for label in label_data]

# Dataset Class
class NERDataset(Dataset):
    def __init__(self, pos, labels):
        self.pos = pos
        self.labels = labels

    def __len__(self):
        return len(self.pos)

    def __getitem__(self, idx):
        return self.pos[idx], self.labels[idx]
    
# Sorting by length (this is necessary for pack_padded_sequence)
def collate_fn(batch):
    pos, labels = zip(*batch)
    
    # Sorting by length of the POS tags (descending order)
    lengths = torch.tensor([len(p) for p in pos])
    sorted_idx = torch.argsort(lengths, descending=True)
    
    pos = [pos[i] for i in sorted_idx]
    labels = [labels[i] for i in sorted_idx]
    lengths = lengths[sorted_idx]

    # Padding the sequences
    pos_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(p) for p in pos], batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence([torch.tensor(l) for l in labels], batch_first=True, padding_value=0)

    return pos_padded, labels_padded, lengths

# Create dataset and dataloader
dataset = NERDataset(input_pos, label_data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Enhanced NER Model with POS tag input
class NERModel(nn.Module):
    def __init__(self, pos_size, tagset_size, pos_embedding_dim=100, hidden_dim=100, dropout=0):
        super(NERModel, self).__init__()
        # Embedding layer for POS tags
        self.pos_embedding = nn.Embedding(pos_size, pos_embedding_dim)
        
        # LSTM layer with dropout
        self.lstm = nn.LSTM(pos_embedding_dim, hidden_dim, batch_first=True, dropout=dropout)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, pos):
        # Embed POS tags
        pos_emb = self.pos_embedding(pos)
        
        # LSTM and classification
        lstm_out, _ = self.lstm(pos_emb)
        output = self.fc(lstm_out)
        return output

# Set model parameters
pos_size = len(pos2idx)
tagset_size = len(label2idx)

# Create model
model = NERModel(pos_size, tagset_size)

# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding in loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(100):  # for simplicity, we use 100 epochs
    model.train()  # Set model to training mode
    total_loss = 0
    for pos, labels, lengths in dataloader:
        # Forward pass
        outputs = model(pos)

        # Flatten the outputs and labels for the loss function
        outputs = outputs.view(-1, tagset_size)
        labels = labels.view(-1)

        # Create mask to ignore padding tokens in the loss calculation
        mask = labels != 0  # Only valid tokens, ignore padding (0)

        # Check if there are any valid tokens to compute loss
        if mask.sum() == 0:  # No valid tokens to calculate loss
            continue
        
        # Apply the mask to the outputs and labels (flattened)
        outputs = outputs[mask]
        labels = labels[mask]

        # Compute loss, only for valid tokens (non-padding)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')

# Predict function (using only POS tags as input)
def predict(sentence):
    # Use Stanza for POS tagging
    doc = nlp(" ".join(sentence))
    
    # Extract POS tags
    processed_pos = [word.upos for sent in doc.sentences for word in sent.words]
    
    # Convert to indices, handling out-of-vocabulary POS tags
    input_pos = [pos2idx.get(pos, pos2idx["<PAD>"]) for pos in processed_pos]
    
    # Pad sequences
    input_pos = input_pos + [0] * (MAX_LEN - len(input_pos))
    
    # Convert to tensor
    input_pos = torch.tensor([input_pos]).long()

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_pos)
        _, predicted = torch.max(outputs, dim=2)

    # Convert indices back to labels
    predicted_labels = [list(label2idx.keys())[i] for i in predicted[0]]

    # Truncate to original sentence length
    predicted_labels = predicted_labels[:len(processed_pos)]

    return predicted_labels

# Test the model
test_sentence = ["Maka", "Les", "Parisiens", "dituntut", "bangkit", "meski", "punya", "catatan", "negatif", "saat", "bertemu", "Bayern", "Munich"]
predictions = predict(test_sentence)
print("Predicted Labels:", predictions)

# Print out vocabularies for reference
print("\nPOS to Index:", dict(pos2idx))
print("\nLabel to Index:", dict(label2idx))


2024-11-27 22:54:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-27 22:54:27 INFO: Downloaded file to C:\Users\jonat\stanza_resources\resources.json
2024-11-27 22:54:28 INFO: Loading these models for language: id (Indonesian):
| Processor | Package    |
--------------------------
| tokenize  | gsd        |
| mwt       | gsd        |
| pos       | gsd_charlm |

2024-11-27 22:54:28 INFO: Using device: cuda
2024-11-27 22:54:28 INFO: Loading: tokenize
2024-11-27 22:54:28 INFO: Loading: mwt
2024-11-27 22:54:28 INFO: Loading: pos
2024-11-27 22:54:28 INFO: Done loading processors!


['ï»¿Luis Enrique tahu start Paris Saint Germain di Liga Champions tidak bagus', 'B-PER I-PER O O B-ORG I-ORG I-ORG O B-LEAGUE I-LEAGUE O O']
['Maka Les Parisiens dituntut bangkit meski punya catatan negatif saat bertemu Bayern Munich', 'O B-ORG I-ORG O O O O O O O O B-ORG I-ORG']
[['ï»¿Luis', 'Enrique', 'tahu', 'start', 'Paris', 'Saint', 'Germain', 'di', 'Liga', 'Champions', 'tidak', 'bagus'], ['Maka', 'Les', 'Parisiens', 'dituntut', 'bangkit', 'meski', 'punya', 'catatan', 'negatif', 'saat', 'bertemu', 'Bayern', 'Munich']]
Epoch 1, Loss: 1.9498374462127686
Epoch 2, Loss: 1.8568024635314941
Epoch 3, Loss: 1.7719604969024658
Epoch 4, Loss: 1.694122314453125
Epoch 5, Loss: 1.622365951538086
Epoch 6, Loss: 1.5561259984970093
Epoch 7, Loss: 1.4950679540634155
Epoch 8, Loss: 1.438909649848938
Epoch 9, Loss: 1.3872342109680176
Epoch 10, Loss: 1.3393231630325317
Epoch 11, Loss: 1.294113278388977
Epoch 12, Loss: 1.2503514289855957
Epoch 13, Loss: 1.20686674118042
Epoch 14, Loss: 1.162819743156