In [104]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import numpy as np
import stanza

In [105]:
# Initialize Stanza NLP pipeline
nlp = stanza.Pipeline('id', processors='tokenize,pos')

# Create Vocabulary for POS tags and labels
pos2idx = defaultdict(lambda: len(pos2idx))
label2idx = defaultdict(lambda: len(label2idx))

# Add special tokens
pos2idx["<PAD>"] = 0
label2idx["O"] = 0

2024-11-30 19:38:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-30 19:38:08 INFO: Downloaded file to C:\Users\jonat\stanza_resources\resources.json
2024-11-30 19:38:09 INFO: Loading these models for language: id (Indonesian):
| Processor | Package    |
--------------------------
| tokenize  | gsd        |
| mwt       | gsd        |
| pos       | gsd_charlm |

2024-11-30 19:38:09 INFO: Using device: cuda
2024-11-30 19:38:09 INFO: Loading: tokenize
2024-11-30 19:38:09 INFO: Loading: mwt
2024-11-30 19:38:09 INFO: Loading: pos
2024-11-30 19:38:09 INFO: Done loading processors!


In [106]:
def calculate_class_weights(labels):
    """
    Menghitung bobot untuk setiap kelas dengan sangat menurunkan bobot kelas 'O'
    """
    # Hitung frekuensi label
    label_counts = defaultdict(int)
    for sentence_labels in labels:
        for label in sentence_labels:
            label_counts[label] += 1
    
    # Temukan label 'O'
    total_labels = sum(label_counts.values())
    
    # Hitung bobot terbalik dari frekuensi
    class_weights = {}
    for label, count in label_counts.items():
        if label == 'O':
            # Kurangi bobot untuk label 'O' secara drastis
            class_weights[label] = (total_labels / (len(label_counts) * count)) * 1
        else:
            # Naikkan bobot untuk label entitas
            class_weights[label] = (total_labels / (len(label_counts) * count)) * 1
    
    # Konversi ke tensor
    ordered_weights = [class_weights.get(label, 1.0) for label in label2idx.keys()]
    return torch.FloatTensor(ordered_weights)

In [107]:
def preprocess_training_data_from_file(file_path):
    pos_tags = []
    labels = []
    match = 0
    mismatch = 0
    
    with open(file_path, "r", encoding="utf-8") as f:
        # Read file content
        content = f.read().strip().split('\n\n')
        
        for block in content:
            # Split block into text and labels
            lines = block.split('\n')
            if len(lines) != 2:
                continue
            
            text_line = lines[0]
            label_line = lines[1]
            
            # Tokenize text and labels
            text_tokens = text_line.split()
            label_tokens = label_line.split()
            
            # Ensure tokens and labels match
            if len(text_tokens) != len(label_tokens):
                print(f"mismatch in tokens and labels: {text_tokens}, {label_tokens}")
                match += 1
                continue
            # Use Stanza for POS tagging
            doc = nlp(text_line)
            mismatch += 1
            processed_pos = []
            processed_labels = []
            
            for sent in doc.sentences:
                for word in sent.words:
                    processed_pos.append(word.upos)
            
            # Match labels to processed POS tags
            processed_labels = label_tokens[:len(processed_pos)]
            
            # Add to lists
            pos_tags.append(processed_pos)
            labels.append(processed_labels)
            
            # Update vocabularies
            for pos in processed_pos:
                pos2idx[pos]
            for label in processed_labels:
                label2idx[label]
                
    print(f"match : {match}")
    print(f"mismatch : {mismatch}")
    
    return pos_tags, labels

In [108]:
# Simplified NER Model using only POS tags
class NERModel(nn.Module):
    def __init__(self, pos_size, tagset_size, 
                 pos_embedding_dim=20, hidden_dim=50):
        super(NERModel, self).__init__()
        # Embedding layer for POS tags
        self.pos_embedding = nn.Embedding(pos_size, pos_embedding_dim)
        
        # LSTM layer
        self.lstm = nn.LSTM(pos_embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)  # Bidirectional, so multiply hidden_dim by 2

    def forward(self, pos):
        # Embed POS tags
        pos_emb = self.pos_embedding(pos)
        
        # LSTM and classification
        lstm_out, _ = self.lstm(pos_emb)
        output = self.fc(lstm_out)
        return output

In [109]:
# Prepare training data
def prepare_training_data(pos_tags, labels, max_len=100):
    # Convert POS tags and labels to indices
    input_pos = [[pos2idx[pos] for pos in sentence_pos] for sentence_pos in pos_tags]
    label_data = [[label2idx[label] for label in sentence_labels] for sentence_labels in labels]

    # Padding the sequences
    input_pos = [pos + [0]*(max_len - len(pos)) for pos in input_pos]
    label_data = [label + [0]*(max_len - len(label)) for label in label_data]

    return input_pos, label_data

In [110]:
# Dataset Class
class NERDataset(Dataset):
    def __init__(self, pos, labels):
        self.pos = torch.tensor(pos)
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.pos)

    def __getitem__(self, idx):
        return self.pos[idx], self.labels[idx]


In [111]:
file_path = "TAGGED.txt"

# Preprocess the training data from the file
pos_tags, labels = preprocess_training_data_from_file(file_path)

# Prepare training data
input_pos, label_data = prepare_training_data(pos_tags, labels)

# Hitung bobot kelas
class_weights = calculate_class_weights(labels)

# Create dataset and dataloader
dataset = NERDataset(input_pos, label_data)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

mismatch in tokens and labels: ['Bahkan', 'tidak', 'satu', 'gol', 'pun', 'dibuat', 'PSG', 'yang', 'membuat', 'ini', 'jadi', 'periode', 'terlama', 'mereka', 'nirgol', 'menghadapi', 'lawan', 'di', 'Eropa'], ['O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PER', 'O', 'O', 'O', 'O', 'LOC']
mismatch in tokens and labels: ['Dalam', 'daftarnya', 'cuma', 'ada', 'tiga', 'pemain', 'naturalisasi', 'yang', 'familiar', 'Rafael', 'Struick', 'Brisbane', 'Roar', 'Ivar', 'Jenner', 'FC', 'Utrecht', 'dan', 'Justin', 'Hubner', 'Wolverhampton', 'Wanderers', 'yang', 'sudah', 'membantu', 'Indonesia', 'mengalahkan', 'Vietnam', 'di', 'kualifikasi', 'Piala', 'Dunia', '2026'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'PER', 'PER', 'ORG', 'ORG', 'PER', 'PER', 'ORG', 'ORG', 'O', 'PER', 'PER', 'ORG', 'ORG', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'O', 'O', 'EVT', 'EVT', 'EVT', 'DATE']
mismatch in tokens and labels: ['Keduanya', 'akan', 'berlaga', 'di', 'partai', 'terakhir', 'China', 'Masters

In [112]:
# Set model parameters
pos_size = len(pos2idx)
tagset_size = len(label2idx)

# Create model
model = NERModel(pos_size, tagset_size)

# Definisikan loss dengan bobot
loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [113]:
# Training loop
print("Starting training...")
for epoch in range(2000):
    total_loss = 0
    for pos, labels in dataloader:
        # Forward pass
        outputs = model(pos)
        
        # Flatten the outputs and labels for the loss function
        outputs = outputs.view(-1, tagset_size)
        labels = labels.view(-1)
        
        # Compute loss
        loss = loss_fn(outputs, labels)
        total_loss += loss.item()
        
        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}, Average Loss: {total_loss/len(dataloader)}')


Starting training...
Epoch 1, Average Loss: 1.1176768448587853
Epoch 2, Average Loss: 0.9007847544727559
Epoch 3, Average Loss: 0.8515986486232465
Epoch 4, Average Loss: 0.8075205526439492
Epoch 5, Average Loss: 0.7691613425951951
Epoch 6, Average Loss: 0.7311564905319324
Epoch 7, Average Loss: 0.6914130998775363
Epoch 8, Average Loss: 0.6488355271825471
Epoch 9, Average Loss: 0.6046420742191144
Epoch 10, Average Loss: 0.5663187470210275
Epoch 11, Average Loss: 0.5288950032794598
Epoch 12, Average Loss: 0.49176005489605756
Epoch 13, Average Loss: 0.4517088083828771
Epoch 14, Average Loss: 0.4188415575004423
Epoch 15, Average Loss: 0.3870085069343217
Epoch 16, Average Loss: 0.3672342345492972
Epoch 17, Average Loss: 0.3286065709604354
Epoch 18, Average Loss: 0.30331746126841946
Epoch 19, Average Loss: 0.27930881812707664
Epoch 20, Average Loss: 0.26085426281542357
Epoch 21, Average Loss: 0.24867917641219636
Epoch 22, Average Loss: 0.22410942001997963
Epoch 23, Average Loss: 0.2012333592

In [114]:
# Prediction function
def predict(sentence):
    doc = nlp(sentence)
    processed_pos = []
    
    for sent in doc.sentences:
        for word in sent.words:
            processed_pos.append(word.upos)
    
    # Convert to indices
    input_pos = [pos2idx.get(pos, pos2idx["<PAD>"]) for pos in processed_pos]
    
    # Pad sequences
    max_len = 50
    input_pos = input_pos + [0] * (max_len - len(input_pos))
    
    # Convert to tensors
    input_pos = torch.tensor([input_pos]).long()

    # Get model predictions
    with torch.no_grad():
        outputs = model(input_pos)
        _, predicted = torch.max(outputs, dim=2)
    
    # Convert indices back to labels
    predicted_labels = [list(label2idx.keys())[i] for i in predicted[0]]
    predicted_labels = predicted_labels[:len(processed_pos)]
    
    return list(zip([word.text for sent in doc.sentences for word in sent.words], 
                    predicted_labels))

In [115]:
# Test predictions
print("\nTesting predictions:")
test_sentences = [
    "Celtics menjadi penguasa wilayah timur sejak regular season",
    "Final NBA 2024 akan mulai digulirkan pada 7 Juni 2024"
]

for sentence in test_sentences:
    predictions = predict(sentence)
    print("\nSentence:", sentence)
    for word, label in predictions:
        print(f"{word}: {label}")


Testing predictions:

Sentence: Celtics menjadi penguasa wilayah timur sejak regular season
Celtics: ORG
menjadi: O
penguasa: O
wilayah: O
timur: O
sejak: O
regular: O
season: O

Sentence: Final NBA 2024 akan mulai digulirkan pada 7 Juni 2024
Final: EVT
NBA: LEAGUE
2024: LEAGUE
akan: O
mulai: O
digulirkan: O
pada: O
7: DATE
Juni: LEAGUE
2024: DATE


In [116]:
def prepare_test_data(path):
    with open(path,"r") as f:
        test = f.read().split("\n\n\n")
    mismatch = 0
    sentences = []
    tags = []

    for i in range(len(test)):
        if len(test[i].split("\n")[0].split(" ")) != len(test[i].split("\n")[1].split(" ")):
            mismatch += 1
            print(test[i])
            print("\n")
            continue
        sentences.append(test[i].split("\n")[0])
        tags.append(test[i].split("\n")[1])

    print(mismatch)
    return sentences,tags


test_sentences,test_tags = prepare_test_data("test.txt")

0


In [117]:
test_sentences

['ï»¿Geovane Magno berposisi sebagai striker dan juga bisa bermain sebagai gelandang serang Pemain berusia 30 tahun itu telah catatkan 102 penampilan di Liga Vietnam dengan mengemas 32 gol dan 36 assist',
 'Magno tertarik untuk jadi warga negara Vietnam Meski begitu federasi sepakbola setempat masih mempertimbangkan untuk menaturalisasi Magno',
 'Magno dinilai sebagai salah satu pemain asing terbaik dalam tiga tahun terakhir di Liga Vietnam',
 'Keluarganya pun sudah betah tinggal di sana',
 'Ganda putra andalan Indonesia Marcus Fernaldi Gideon dan Kevin Sanjaya Sukamuljo kembali mengukir prestasi gemilang dengan menjuarai All England 2024',
 'Dalam pertandingan final yang berlangsung di Birmingham Arena pasangan berjuluk The Minions ini berhasil mengalahkan pasangan China Li Junhui dan Liu Yuchen',
 'Dengan skor 21 18 22 20 dalam pertandingan yang berlangsung selama 45 menit',
 'Pertandingan dimulai dengan dominasi Marcus Kevin yang langsung unggul jauh di awal gim pertama',
 'Kecepata

In [118]:

hasil = []
for sentence in test_sentences:
    predictions = predict(sentence)
    hasil.append(label)
    print("\nSentence:", sentence)
    for word, label in predictions:
        print(f"{word}: {label}")


Sentence: ï»¿Geovane Magno berposisi sebagai striker dan juga bisa bermain sebagai gelandang serang Pemain berusia 30 tahun itu telah catatkan 102 penampilan di Liga Vietnam dengan mengemas 32 gol dan 36 assist
ï»: O
¿Geovane: PER
Magno: PER
berposisi: O
sebagai: O
striker: O
dan: O
juga: O
bisa: O
bermain: O
sebagai: O
gelandang: O
serang: O
Pemain: O
berusia: O
30: O
tahun: O
itu: O
telah: O
catatkan: O
102: O
penampilan: LEAGUE
di: LEAGUE
Liga: O
Vietnam: O
dengan: O
mengemas: O
32: O
gol: O
dan: O
36: O
assist: O

Sentence: Magno tertarik untuk jadi warga negara Vietnam Meski begitu federasi sepakbola setempat masih mempertimbangkan untuk menaturalisasi Magno
Magno: ORG
tertarik: O
untuk: O
jadi: O
warga: O
negara: PER
Vietnam: ORG
Meski: O
begitu: O
federasi: O
sepakbola: O
setempat: O
masih: O
mempertimbangkan: O
untuk: O
menaturalisasi: ORG
Magno: ORG

Sentence: Magno dinilai sebagai salah satu pemain asing terbaik dalam tiga tahun terakhir di Liga Vietnam
Magno: PER
dinilai: O

In [119]:
n_tag = 0
accuracy = 0

for i in range(len(hasil)):
    n_tag += len(hasil[i])
    for j in range(len(hasil[i])):
        if hasil[i][j] == test_tags[i][j]:
            accuracy += 1
print(f"accuracy : {accuracy/n_tag}")

accuracy : 0.22695035460992907
