# Named Entity Recognition using CRF model
Total per tag

* B-PER 2508
* I-PER 3111
* B-ADJ 402 
* I-ADJ 442
* B-ANM 2556
* I-ANM 2478
* B-GODS 467
* I-GODS 549
* B-OBJ 1661
* I-OBJ 986
* O 74768

#### Importing Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score, flat_classification_report
from sklearn_crfsuite import scorers
from sklearn.metrics import make_scorer
import scipy.stats

In [4]:
# Membaca file 
df = pd.read_excel('ner_dataset.xlsx')

In [5]:
#Display first 10 rows
df.head(10)

Unnamed: 0,StoryID,Sentence,SentenceID,Word,Tag
0,0,anak_ririh,0,Pan,B-PER
1,0,anak_ririh,0,Karsa,I-PER
2,0,anak_ririh,0,ajaka,O
3,0,anak_ririh,0,pianakne,B-OBJ
4,0,anak_ririh,0,muani,I-OBJ
5,0,anak_ririh,0,nanggap,O
6,0,anak_ririh,0,upah,O
7,0,anak_ririh,0,ngae,O
8,0,anak_ririh,0,semer,O
9,0,anak_ririh,0,di,O


In [6]:
df.describe()

Unnamed: 0,StoryID,SentenceID
count,89939.0,89939.0
mean,80.056527,37.846396
std,50.128946,38.093724
min,0.0,0.0
25%,34.0,13.0
50%,71.0,27.0
75%,132.0,50.0
max,168.0,289.0


In [7]:
#Displaying the unique Tags
df['Tag'].unique()

array(['B-PER', 'I-PER', 'O', 'B-OBJ', 'I-OBJ', 'B-ANM', 'I-ANM',
       'B-GODS', 'I-GODS', 'B-ADJ', 'I-ADJ'], dtype=object)

In [8]:
#Checking null values, if any.
df.isnull().sum()

StoryID       0
Sentence      0
SentenceID    0
Word          0
Tag           0
dtype: int64

There are lots of missing values in 'Sentence #' attribute. So we will use pandas fillna technique and use 'ffill' method which propagates last valid observation forward to next.

In [9]:
df = df.fillna(method = 'ffill')

  df = df.fillna(method = 'ffill')


In [10]:
# Memproses dataset untuk mengelompokkan per kalimat
class SentenceGetter:
    def __init__(self, data):
        self.data = data
        self.grouped = data.groupby("SentenceID").apply(
            lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                    s["Tag"].values.tolist())]
        )
        self.sentences = [sentence for sentence in self.grouped]

In [11]:
# Inisialisasi SentenceGetter
getter = SentenceGetter(df)
sentences = getter.sentences

# Menampilkan salah satu kalimat untuk verifikasi
print("Contoh kalimat pertama:", sentences[0])

Contoh kalimat pertama: [('Pan', 'B-PER'), ('Karsa', 'I-PER'), ('ajaka', 'O'), ('pianakne', 'B-OBJ'), ('muani', 'I-OBJ'), ('nanggap', 'O'), ('upah', 'O'), ('ngae', 'O'), ('semer', 'O'), ('di', 'O'), ('sisin', 'O'), ('rurunge', 'O'), ('gede', 'O'), ('.', 'O'), ('Ada', 'O'), ('katuturan', 'O'), ('satua', 'O'), ('I', 'B-ANM'), ('Angsa', 'I-ANM'), ('masawitra', 'O'), ('teken', 'O'), ('I', 'B-ANM'), ('Kerkuak', 'B-ANM'), ('.', 'O'), ('Kacerita', 'O'), ('Sang', 'B-GODS'), ('Hyang', 'I-GODS'), ('Indra', 'I-GODS'), ('kalintang', 'O'), ('sungsut', 'O'), ('santukan', 'O'), ('Ida', 'O'), ('mireng', 'O'), ('orti', 'O'), ('Indraloka', 'O'), ('pacang', 'O'), ('kagebugin', 'O'), ('olih', 'O'), ('Niwatakawaca', 'B-GODS'), ('ratun', 'O'), ('raksasane', 'O'), ('ring', 'O'), ('Manimantaka', 'O'), ('.', 'O'), ('Wenten', 'O'), ('kocap', 'O'), ('katuturan', 'O'), ('satua', 'O'), ('saking', 'O'), ('jagat', 'O'), ('Jembrana', 'O'), ('sane', 'O'), ('sampun', 'O'), ('lumbrah', 'O'), ('kabaosang', 'O'), ('antuk'

  self.grouped = data.groupby("SentenceID").apply(


Getting all the sentences in the dataset.

#### Feature Preparation
These are the default features used by the NER in nltk. We can also modify it for our customization.

In [12]:
# Fungsi ekstraksi fitur dari setiap kata
# Hanya memproses word dan tag saja, postag dihapus

def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    
    if i > 0:
        word1 = sent[i - 1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True  # End of sentence

    return features

# Ekstraksi fitur untuk satu kalimat
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for _, label in sent]

def sent2tokens(sent):
    return [token for token, _ in sent]


In [13]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Hyperparameter tuning menggunakan RandomizedSearchCV
crf = CRF(algorithm='lbfgs', max_iterations=200, all_possible_transitions=True)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(flat_f1_score, average='weighted')

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

rs.fit(X_train, y_train)

# Model terbaik
crf_best = rs.best_estimator_

print("Best params:", rs.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best params: {'c1': np.float64(0.28458916688293656), 'c2': np.float64(0.004311116039164944)}


In [16]:
# Prediksi dan evaluasi
y_pred = crf_best.predict(X_test)

#### Evaluating the model performance.
We will use precision, recall and f1-score metrics to evaluate the performance of the model since the accuracy is not a good metric for this dataset because we have an unequal number of data points in each class.

In [17]:
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

0.9549887618396182


In [18]:
report = flat_classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       B-ADJ       0.99      0.94      0.96        81
       B-ANM       0.88      0.83      0.86       495
      B-GODS       0.78      0.76      0.77        82
       B-OBJ       0.78      0.65      0.71       331
       B-PER       0.90      0.86      0.88       505
       I-ADJ       0.99      0.94      0.96        98
       I-ANM       0.88      0.87      0.88       475
      I-GODS       0.77      0.81      0.79       108
       I-OBJ       0.60      0.50      0.54       157
       I-PER       0.91      0.92      0.91       634
           O       0.97      0.98      0.98     14301

    accuracy                           0.96     17267
   macro avg       0.86      0.82      0.84     17267
weighted avg       0.95      0.96      0.95     17267



In [19]:
import joblib

# Menyimpan model
joblib.dump(crf_best, 'nercrf_model.pkl')

['nercrf_model.pkl']

This looks quite nice.

In [1]:
# Fungsi untuk mengetes model dengan input kalimat baru
def test_model(sentence, crf_model):
    words = sentence.split()
    
    sent_features = [
        {'bias': 1.0, 
         'word.lower()': word.lower(),
         'word[-3:]': word[-3:],
         'word[-2:]': word[-2:],
         'word.isupper()': word.isupper(),
         'word.istitle()': word.istitle(),
         'word.isdigit()': word.isdigit()} 
        for word in words
    ]
    
    for i, word_features in enumerate(sent_features):
        if i > 0:
            word_features.update({
                '-1:word.lower()': words[i-1].lower(),
                '-1:word.istitle()': words[i-1].istitle(),
                '-1:word.isupper()': words[i-1].isupper(),
            })
        else:
            word_features['BOS'] = True
        
        if i < len(words) - 1:
            word_features.update({
                '+1:word.lower()': words[i+1].lower(),
                '+1:word.istitle()': words[i+1].istitle(),
                '+1:word.isupper()': words[i+1].isupper(),
            })
        else:
            word_features['EOS'] = True

    predicted_tags = crf_model.predict([sent_features])[0]
    result = list(zip(words, predicted_tags))
    return result

kalimat_input = "I raksasa nyagjang, jaga nyedayang Ida Sang Rama."
hasil = test_model(kalimat_input, crf_best)

print("Hasil Prediksi:")
for kata, tag in hasil:
    print(f"{kata}\t{tag}")

NameError: name 'crf_best' is not defined