In [1]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read the data file and parse the XML. Tag names are matched in lower case
# below ("document", "textwithnamedentities", "namedentityintext").
with codecs.open("reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

# docs is a list of documents; each document is a list of (word, label)
# pairs where label is "N" (part of a named entity) or "I" (irrelevant).
docs = []
for elem in soup.find_all("document"):
    container = elem.find("textwithnamedentities")
    if container is None:
        # find() returns None when the element is missing; skip the document
        # instead of failing with AttributeError on `.children`.
        continue

    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in container.children:
        # Only element nodes carry labelled text; bare strings are ignored.
        if not isinstance(c, Tag):
            continue

        if c.name == "namedentityintext":
            label = "N"  # part of a named entity
        else:
            label = "I"  # irrelevant word

        # Split on single spaces and drop the empty strings produced by
        # consecutive spaces.
        for w in c.text.split(" "):
            if len(w) > 0:
                texts.append((w, label))
    docs.append(texts)

In [2]:
import nltk
# nltk.download('averaged_perceptron_tagger')  # already run once
# data is a list of documents; each document is a list of
# (word, POS tag, label) triples.
data = []
for doc in docs:

    # Obtain the list of tokens in the document
    tokens = [token for token, _ in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label. The word is taken from `doc`;
    # only the tag is taken from the tagger output.
    data.append([(w, pos, label) for (w, label), (_, pos) in zip(doc, tagged)])

In [3]:
def word2features(doc, i):
    """Build the list of string features for the token at position ``i``.

    Each entry of ``doc`` is indexed as ``entry[0]`` (the word) and
    ``entry[1]`` (its POS tag); any further fields are ignored.

    The result always starts with the features of the token itself, followed
    by either the previous token's features (prefixed ``-1:``) or ``'BOS'``,
    and then either the next token's features (prefixed ``+1:``) or ``'EOS'``.
    """

    def neighbour_features(offset, entry):
        # Features describing an adjacent token, tagged with its offset.
        text, tag = entry[0], entry[1]
        return [
            offset + ':word.lower=' + text.lower(),
            offset + ':word.istitle=' + str(text.istitle()),
            offset + ':word.isupper=' + str(text.isupper()),
            offset + ':word.isdigit=' + str(text.isdigit()),
            offset + ':postag=' + tag,
        ]

    token = doc[i][0]
    token_pos = doc[i][1]

    # Features every token gets, regardless of position
    features = ['bias']
    features.append('word.lower=' + token.lower())
    features.append('word[-3:]=' + token[-3:])
    features.append('word[-2:]=' + token[-2:])
    features.append('word.isupper=' + str(token.isupper()))
    features.append('word.istitle=' + str(token.istitle()))
    features.append('word.isdigit=' + str(token.isdigit()))
    features.append('postag=' + token_pos)

    # Left context, or the 'beginning of a document' marker
    if i > 0:
        features += neighbour_features('-1', doc[i - 1])
    else:
        features.append('BOS')

    # Right context, or the 'end of a document' marker
    if i < len(doc) - 1:
        features += neighbour_features('+1', doc[i + 1])
    else:
        features.append('EOS')

    return features

In [4]:
from sklearn.model_selection import train_test_split

# Turn a document into one feature list per token position
def extract_features(doc):
    """Return ``word2features(doc, i)`` for every position ``i`` in ``doc``."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the third field (the label) of every 3-tuple in ``doc``."""
    collected = []
    for _token, _postag, tag in doc:
        collected.append(tag)
    return collected

# One feature sequence and one label sequence per document
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. The seed is fixed so that the
# split, and therefore every metric reported further down, is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [5]:
import pycrfsuite
# Model hyper-parameters, kept together so they are easy to tune
crf_params = {
    'c1': 0.1,                # coefficient for L1 penalty
    'c2': 0.01,               # coefficient for L2 penalty
    'max_iterations': 200,    # maximum number of iterations
    # whether to include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}

trainer = pycrfsuite.Trainer(verbose=True)

# Feed every training document (feature sequence + label sequence) to the trainer
for train_features, train_labels in zip(X_train, y_train):
    trainer.append(train_features, train_labels)

trainer.set_params(crf_params)

# The file name passed to train() is where the model is saved
# once training has finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13767
Seconds required: 0.112

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5380.591310
Feature norm: 1.000000
Error norm: 5874.711290
Active features: 13351
Line search trials: 1
Line search step: 0.000041
Seconds required for this iteration: 0.021

***** Iteration #2 *****
Loss: 4402.027587
Feature norm: 0.854095
Error norm: 5301.262964
Active features: 13383
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #3 *****
Loss: 4228.078040
Feature norm: 0.779915
Error norm: 13351.353162
Active features: 8439
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #53 *****
Loss: 218.139257
Feature norm: 44.306259
Error norm: 6.654440
Active features: 2100
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #54 *****
Loss: 217.816320
Feature norm: 44.329131
Error norm: 13.928853
Active features: 2056
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #55 *****
Loss: 217.410178
Feature norm: 44.258069
Error norm: 18.359476
Active features: 2029
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #56 *****
Loss: 217.078172
Feature norm: 44.330783
Error norm: 12.660862
Active features: 1987
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #57 *****
Loss: 216.820724
Feature norm: 44.313669
Error norm: 6.353304
Active features: 1970
Line search trials: 1
Line search step: 1.000000
Seconds required for this 

***** Iteration #106 *****
Loss: 213.408818
Feature norm: 44.949581
Error norm: 10.444865
Active features: 1769
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #107 *****
Loss: 213.375021
Feature norm: 44.956788
Error norm: 4.330374
Active features: 1768
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #108 *****
Loss: 213.362168
Feature norm: 44.964357
Error norm: 9.604947
Active features: 1770
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #109 *****
Loss: 213.331766
Feature norm: 44.969188
Error norm: 5.259147
Active features: 1770
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #110 *****
Loss: 213.330280
Feature norm: 44.973657
Error norm: 11.625461
Active features: 1768
Line search trials: 1
Line search step: 1.000000
Seconds required for t

***** Iteration #155 *****
Loss: 212.528865
Feature norm: 45.025334
Error norm: 3.851652
Active features: 1717
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #156 *****
Loss: 212.523522
Feature norm: 45.024326
Error norm: 6.221698
Active features: 1719
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #157 *****
Loss: 212.511322
Feature norm: 45.023540
Error norm: 5.295774
Active features: 1717
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #158 *****
Loss: 212.503432
Feature norm: 45.025088
Error norm: 6.501078
Active features: 1717
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #159 *****
Loss: 212.491586
Feature norm: 45.023477
Error norm: 4.431878
Active features: 1714
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [6]:
# Load the trained model and predict a label sequence for every test document
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one fixed sample in the testing set.
# The loop variables are deliberately not called `x`/`y`, so the notebook-level
# `y` (the list of label sequences) is not overwritten by this cell.
sample_index = 12
if sample_index < len(X_test):
    # The second feature of every token is 'word.lower=<word>'; split only on
    # the first '=' so that words containing '=' are printed in full.
    sample_words = [feats[1].split("=", 1)[1] for feats in X_test[sample_index]]
    for predicted_label, word in zip(y_pred[sample_index], sample_words):
        print("%s (%s)" % (word, predicted_label))
else:
    print("Test set has only %d documents; no sample at index %d"
          % (len(X_test), sample_index))

britain (N)
intends (I)
to (I)
negotiate (I)
further (I)
agreements (I)
on (I)
international (I)
securities (I)
regulation (I)
to (I)
match (I)
those (I)
now (I)
in (I)
force (I)
with (I)
the (I)
u.s (N)
. (I)
and (I)
japan (N)
, (I)
government (I)
sources (I)
said. (I)
the (I)
department (I)
of (I)
trade (N)
and (N)
industry (N)
said (I)
earlier (I)
it (I)
had (I)
concluded (I)
a (I)
memorandum (I)
of (I)
understanding (I)
with (I)
japans (N)
finance (N)
ministry (N)
to (I)
swap (I)
supervisory (I)
and (I)
investigatory (I)
information. (I)
the (I)
agreement (I)
covers (I)
stocks, (I)
shares (I)
and (I)
government (I)
bonds, (I)
but (I)
not (I)
commodity (I)
futures, (I)
whose (I)
regulation (I)
in (I)
japan (N)
does (I)
not (I)
come (I)
under (I)
the (I)
finance (N)
ministry (N)
. (I)
it (I)
therefore (I)
does (I)
not (I)
cover (I)
some (I)
transactions (I)
on (I)
the (I)
london (N)
international (N)
financial (N)
futures (N)
exchange (N)
and (I)
the (I)
london (N)
commodities (I)
an

In [7]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Derive the display names from the mapping, ordered by index, so they cannot
# drift out of sync with it. Passing `labels=` explicitly keeps the names
# aligned with the indices even if one class is absent from the test split.
target_names = sorted(labels, key=labels.get)
label_indices = [labels[name] for name in target_names]

# Print out the classification report
print(classification_report(
    truths, predictions,
    labels=label_indices,
    target_names=target_names))

              precision    recall  f1-score   support

           I       0.98      0.98      0.98      2774
           N       0.88      0.90      0.89       432

    accuracy                           0.97      3206
   macro avg       0.93      0.94      0.94      3206
weighted avg       0.97      0.97      0.97      3206

