# Named Entity Recognition using CRF model
Total per tag

* B-PER 2508
* I-PER 3111
* B-ADJ 402 
* I-ADJ 442
* B-ANM 2556
* I-ANM 2478
* B-GODS 467
* I-GODS 549
* B-OBJ 1661
* I-OBJ 986
* O 74768

#### Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score, flat_classification_report
from sklearn_crfsuite import scorers
from sklearn.metrics import make_scorer
import scipy.stats

In [2]:
# Membaca file 
df = pd.read_excel('ner_dataset.xlsx')

In [3]:
#Display first 10 rows
df.head(10)

Unnamed: 0,StoryID,Sentence,SentenceID,Word,Tag
0,0,anak_ririh,0,Pan,B-PER
1,0,anak_ririh,0,Karsa,I-PER
2,0,anak_ririh,0,ajaka,O
3,0,anak_ririh,0,pianakne,B-OBJ
4,0,anak_ririh,0,muani,I-OBJ
5,0,anak_ririh,0,nanggap,O
6,0,anak_ririh,0,upah,O
7,0,anak_ririh,0,ngae,O
8,0,anak_ririh,0,semer,O
9,0,anak_ririh,0,di,O


In [4]:
df.describe()

Unnamed: 0,StoryID,SentenceID
count,89939.0,89939.0
mean,80.056527,37.846396
std,50.128946,38.093724
min,0.0,0.0
25%,34.0,13.0
50%,71.0,27.0
75%,132.0,50.0
max,168.0,289.0


In [14]:
# Menghitung jumlah kalimat berdasarkan StoryID dan SentenceID
sentence_counts = df.groupby(['StoryID', 'SentenceID']).size().reset_index(name='JumlahKata')
total_sentence_per_story = sentence_counts.groupby('StoryID')['SentenceID'].nunique().reset_index(name='JumlahSentence')
# Menghitung total semua jumlah kalimat (unik berdasarkan SentenceID)
total_kalimat = sentence_counts.groupby('StoryID')['SentenceID'].nunique().sum()
print(total_sentence_per_story)
print(f'Total Semua Jumlah Kalimat: {total_kalimat}')

     StoryID  JumlahSentence
0          0              29
1          1              44
2          2              14
3          3              38
4          4              34
..       ...             ...
119      164             111
120      165               1
121      166               1
122      167               1
123      168               2

[124 rows x 2 columns]
Total Semua Jumlah Kalimat: 6641


In [19]:
#Displaying the unique Tags
df['Tag'].unique()

array(['B-PER', 'I-PER', 'O', 'B-OBJ', 'I-OBJ', 'B-ANM', 'I-ANM',
       'B-GODS', 'I-GODS', 'B-ADJ', 'I-ADJ'], dtype=object)

In [22]:
# Memproses dataset untuk mengelompokkan per kalimat
class SentenceGetter:
    def __init__(self, data):
        self.data = data
        self.grouped = data.groupby("SentenceID").apply(
            lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                    s["Tag"].values.tolist())]
        )
        self.sentences = [sentence for sentence in self.grouped]

In [23]:
# Inisialisasi SentenceGetter
getter = SentenceGetter(df)
sentences = getter.sentences

# Menampilkan salah satu kalimat untuk verifikasi
print("Contoh kalimat pertama:", sentences[0])

Contoh kalimat pertama: [('Pan', 'B-PER'), ('Karsa', 'I-PER'), ('ajaka', 'O'), ('pianakne', 'B-OBJ'), ('muani', 'I-OBJ'), ('nanggap', 'O'), ('upah', 'O'), ('ngae', 'O'), ('semer', 'O'), ('di', 'O'), ('sisin', 'O'), ('rurunge', 'O'), ('gede', 'O'), ('.', 'O'), ('Ada', 'O'), ('katuturan', 'O'), ('satua', 'O'), ('I', 'B-ANM'), ('Angsa', 'I-ANM'), ('masawitra', 'O'), ('teken', 'O'), ('I', 'B-ANM'), ('Kerkuak', 'B-ANM'), ('.', 'O'), ('Kacerita', 'O'), ('Sang', 'B-GODS'), ('Hyang', 'I-GODS'), ('Indra', 'I-GODS'), ('kalintang', 'O'), ('sungsut', 'O'), ('santukan', 'O'), ('Ida', 'O'), ('mireng', 'O'), ('orti', 'O'), ('Indraloka', 'O'), ('pacang', 'O'), ('kagebugin', 'O'), ('olih', 'O'), ('Niwatakawaca', 'B-GODS'), ('ratun', 'O'), ('raksasane', 'O'), ('ring', 'O'), ('Manimantaka', 'O'), ('.', 'O'), ('Wenten', 'O'), ('kocap', 'O'), ('katuturan', 'O'), ('satua', 'O'), ('saking', 'O'), ('jagat', 'O'), ('Jembrana', 'O'), ('sane', 'O'), ('sampun', 'O'), ('lumbrah', 'O'), ('kabaosang', 'O'), ('antuk'

  self.grouped = data.groupby("SentenceID").apply(


#### Ekstraksi Fitur
fitur bawaan yang digunakan oleh NER di nltk.

In [24]:
# Fungsi ekstraksi fitur dari setiap kata

def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.istitle()': word.istitle(),
    }
    
    if i > 0:
        word1 = sent[i - 1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# Ekstraksi fitur untuk satu kalimat
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for _, label in sent]

def sent2tokens(sent):
    return [token for token, _ in sent]

# Ekstraksi fitur untuk kalimat pertama
features_first_sentence = sent2features(sentences[0])

# Tampilkan hasil ekstraksi fitur
for i, feature in enumerate(features_first_sentence):
    print(f"Kata: {sentences[0][i][0]}")
    for k, v in feature.items():
        print(f"  {k}: {v}")
    print()

Kata: Pan
  bias: 1.0
  word.lower(): pan
  word[-3:]: Pan
  word[-2:]: an
  word.istitle(): True
  BOS: True
  +1:word.lower(): karsa
  +1:word.istitle(): True

Kata: Karsa
  bias: 1.0
  word.lower(): karsa
  word[-3:]: rsa
  word[-2:]: sa
  word.istitle(): True
  -1:word.lower(): pan
  -1:word.istitle(): True
  +1:word.lower(): ajaka
  +1:word.istitle(): False

Kata: ajaka
  bias: 1.0
  word.lower(): ajaka
  word[-3:]: aka
  word[-2:]: ka
  word.istitle(): False
  -1:word.lower(): karsa
  -1:word.istitle(): True
  +1:word.lower(): pianakne
  +1:word.istitle(): False

Kata: pianakne
  bias: 1.0
  word.lower(): pianakne
  word[-3:]: kne
  word[-2:]: ne
  word.istitle(): False
  -1:word.lower(): ajaka
  -1:word.istitle(): False
  +1:word.lower(): muani
  +1:word.istitle(): False

Kata: muani
  bias: 1.0
  word.lower(): muani
  word[-3:]: ani
  word[-2:]: ni
  word.istitle(): False
  -1:word.lower(): pianakne
  -1:word.istitle(): False
  +1:word.lower(): nanggap
  +1:word.istitle(): Fals

In [25]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
crf_best = CRF(algorithm='lbfgs', 
               max_iterations=200, 
               all_possible_transitions=True, 
               c1=0.2846, 
               c2=0.0043)
crf_best.fit(X_train, y_train)

In [28]:
# Prediksi dan evaluasi
y_pred = crf_best.predict(X_test)

#### Evaluating the model performance.
We will use precision, recall and f1-score metrics to evaluate the performance of the model since the accuracy is not a good metric for this dataset because we have an unequal number of data points in each class.

In [29]:
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

0.9548197452341741


In [30]:
report = flat_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       B-ADJ       0.99      0.94      0.96        81
       B-ANM       0.88      0.84      0.86       495
      B-GODS       0.78      0.76      0.77        82
       B-OBJ       0.78      0.65      0.71       331
       B-PER       0.89      0.85      0.87       505
       I-ADJ       0.99      0.94      0.96        98
       I-ANM       0.89      0.87      0.88       475
      I-GODS       0.78      0.81      0.79       108
       I-OBJ       0.59      0.49      0.54       157
       I-PER       0.90      0.92      0.91       634
           O       0.97      0.98      0.98     14301

    accuracy                           0.96     17267
   macro avg       0.86      0.82      0.84     17267
weighted avg       0.95      0.96      0.95     17267



In [31]:
import joblib

# Menyimpan model
joblib.dump(crf_best, 'nercrf_model.pkl')

['nercrf_model.pkl']

This looks quite nice.

In [33]:
# Fungsi untuk mengetes model dengan input kalimat baru
def test_model(sentence, crf_model):
    words = sentence.split()
    
    sent_features = [
        {'bias': 1.0, 
         'word.lower()': word.lower(),
         'word[-3:]': word[-3:],
         'word[-2:]': word[-2:],
         'word.istitle()': word.istitle(),} 
        for word in words
    ]
    
    for i, word_features in enumerate(sent_features):
        if i > 0:
            word_features.update({
                '-1:word.lower()': words[i-1].lower(),
                '-1:word.istitle()': words[i-1].istitle(),
            })
        else:
            word_features['BOS'] = True
        
        if i < len(words) - 1:
            word_features.update({
                '+1:word.lower()': words[i+1].lower(),
                '+1:word.istitle()': words[i+1].istitle(),
            })
        else:
            word_features['EOS'] = True

    predicted_tags = crf_model.predict([sent_features])[0]
    result = list(zip(words, predicted_tags))
    return result

kalimat_input = "I Raksasa nyagjang, jaga nyedayang Ida Sang Rama."
hasil = test_model(kalimat_input, crf_best)

print("Hasil Prediksi:")
for kata, tag in hasil:
    print(f"{kata}\t{tag}")

Hasil Prediksi:
I	B-GODS
Raksasa	I-GODS
nyagjang,	O
jaga	O
nyedayang	O
Ida	B-PER
Sang	I-PER
Rama.	I-PER
