In [1]:
import pandas as pd
import ast
import pycrfsuite

In [2]:
# Load the training and testing splits, discarding the index column that was
# written into the CSV files.
training = pd.read_csv('./data/training.csv').drop(columns='Unnamed: 0')
testing = pd.read_csv('./data/testing.csv').drop(columns='Unnamed: 0')

# 'features' and 'y' are stored as Python-literal strings in the CSV; parse
# them back into Python objects.
for frame in (training, testing):
    for column in ('features', 'y'):
        frame[column] = frame[column].apply(ast.literal_eval)

testing.head()

Unnamed: 0,features,y
0,"[[Về, E, 0.0, 0, 0], [cơ_cấu, N, 0.06368156800...","[1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, ..."
1,"[[Theo, V, 0.0, 0, 0], [đó, P, 0.0295018618683...","[1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, ..."
2,"[[Ông, Nc, 0.0, 0, 1], [Trần_Kinh_Doanh_Tổng_g...","[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, ..."
3,"[[Giá, N, 0.0, 0, 1], [khởi_điểm, V, 0.0445162...","[1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ..."
4,"[[Đồng_thời, C, 0.0, 0, 0], [PNJ, Ny, 0.0, 1, ...","[1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, ..."


In [4]:
# Pull the per-document token lists out of both frames as plain Python lists.
training_data, testing_data = (
    frame['features'].tolist() for frame in (training, testing)
)

In [3]:
def convert(y):
    """Map a sequence of numeric labels to the tag strings used by the CRF.

    Parameters
    ----------
    y : iterable
        Label sequence. An element equal to ``0`` becomes ``'N'``; any other
        value becomes ``'I'``.

    Returns
    -------
    list of str
        A new list with one tag per element of ``y``. ``y`` is not modified.
    """
    return ['N' if label == 0 else 'I' for label in y]

# Add the string-tag version of the labels to both splits.
for frame in (training, testing):
    frame['y_convert'] = frame['y'].apply(convert)

In [5]:
def _token_features(token, prefix=''):
    """Build the five per-token feature strings, each starting with ``prefix``.

    ``token`` is indexed positionally: 0 = word, 1 = POS tag, 2 = tf-idf value,
    3 = value emitted as ``word.isner``, 4 = value emitted as ``word.istitle``.
    ``prefix`` is ``''`` for the current token and ``'-1:'`` / ``'+1:'`` for
    its neighbours.
    """
    word, postag, tfidf, ner, title = token[0], token[1], token[2], token[3], token[4]
    # NOTE(review): the tf-idf value is formatted into the feature string, so
    # every distinct (6-decimal) value yields a separate feature name. If a
    # real-valued feature is intended, confirm whether python-crfsuite's
    # {name: weight} feature form should be used instead.
    return [
        prefix + 'word.lower=' + word.lower(),
        prefix + 'word.isner=%s' % ner,
        prefix + 'tfidf=%f' % tfidf,
        prefix + 'word.istitle=%s' % title,
        prefix + 'postag=' + postag,
    ]


def word2features(doc, i):
    """Return the list of feature strings for token ``i`` of ``doc``.

    Parameters
    ----------
    doc : sequence
        Tokens of one document; each token is a sequence indexed as
        (word, POS tag, tf-idf, ner, title).
    i : int
        Position of the token to featurize.

    Returns
    -------
    list of str
        ``'bias'``, the five features of the token itself, then either the
        five ``'-1:'`` features of the previous token or ``'BOS'``, then
        either the five ``'+1:'`` features of the next token or ``'EOS'``.
        The order is fixed; index 1 is always the ``'word.lower=...'`` feature.
    """
    # Features shared by every token
    features = ['bias']
    features.extend(_token_features(doc[i]))

    # Left context, or a marker that this is the first token of the document
    if i > 0:
        features.extend(_token_features(doc[i - 1], '-1:'))
    else:
        features.append('BOS')

    # Right context, or a marker that this is the last token of the document
    if i < len(doc) - 1:
        features.extend(_token_features(doc[i + 1], '+1:'))
    else:
        features.append('EOS')

    return features

def extract_features(doc):
    """Return one feature list per token of ``doc``, in token order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

In [6]:
# Feature sequences (one list of per-token feature lists per document)
X_train = list(map(extract_features, training_data))
X_test = list(map(extract_features, testing_data))

# Matching tag sequences
y_train, y_test = (frame['y_convert'].tolist() for frame in (training, testing))

In [7]:
# Training parameters passed to the trainer
crf_params = {
    'c1': 0.1,                             # coefficient for L1 penalty
    'c2': 0.01,                            # coefficient for L2 penalty
    'max_iterations': 200,                 # maximum number of iterations
    'feature.possible_transitions': True,  # include transitions that are possible, but not observed
}

trainer = pycrfsuite.Trainer(verbose=True)

# Feed every (feature sequence, tag sequence) pair to the trainer
for feature_seq, tag_seq in zip(X_train, y_train):
    trainer.append(feature_seq, tag_seq)

trainer.set_params(crf_params)

# The trained model is written to this file when training finishes
trainer.train('./model/crf.model')

required for this iteration: 0.054

***** Iteration #106 *****
Loss: 2134.468202
Feature norm: 204.012150
Error norm: 6.188053
Active features: 9606
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.056

***** Iteration #107 *****
Loss: 2134.441325
Feature norm: 204.016807
Error norm: 4.637565
Active features: 9600
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.067

***** Iteration #108 *****
Loss: 2134.413354
Feature norm: 204.028006
Error norm: 4.414719
Active features: 9597
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.059

***** Iteration #109 *****
Loss: 2134.379525
Feature norm: 204.037259
Error norm: 2.119729
Active features: 9576
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.057

***** Iteration #110 *****
Loss: 2134.348941
Feature norm: 204.046059
Error norm: 8.035007
Active features: 9568
Line search trials: 1
Line 

In [8]:
tagger = pycrfsuite.Tagger()
tagger.open('./model/crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Inspect one fixed sample of the testing set: print each token next to its
# predicted tag.
i = 12
# The token text is recovered from the 'word.lower=<token>' feature stored at
# index 1 of every feature list. Split on the first '=' only, so that tokens
# which themselves contain '=' are printed in full.
sample_words = [feats[1].split("=", 1)[1] for feats in X_test[i]]
for word, tag in zip(sample_words, y_pred[i]):
    print("%s (%s)" % (word, tag))

việc (N)
ông (N)
lê_trí_thông (I)
vào (N)
hội_đồng_quản_trị (N)
hđqt (N)
công_ty (I)
cổ_phần (I)
vàng_bạc (I)
đá_quý (I)
phú_nhuận (I)
– (N)
pnj (I)
đã (I)
được (I)
cổ_đông (I)
thông_qua (I)
tại (I)
đại_hội (N)
cổ_đông (I)
thường_niên (N)
2017 (N)
diễn (N)
ra (N)
ngày (I)
27/4 (N)
cổ_đông (I)
chất_vấn (N)
về (I)
kế_hoạch (I)
kinh_doanh (I)
của (I)
pnj (I)
theo (I)
tài_liệu (I)
trình (N)
cổ_đông (I)
năm (I)
2016 (N)
tổng (N)
doanh_thu (I)
của (I)
pnj (I)
đạt (N)
8 (N)
720 (N)
tỷ (I)
đồng (I)
tăng (I)
12% (N)
so (N)
với (I)
năm (I)
2015 (N)
riêng (N)
doanh_thu (I)
trang_sức (I)
bán_lẻ (N)
tăng (I)
26% (N)
con_số (I)
này (N)
góp_phần (N)
đưa (N)
lợi_nhuận (I)
gộp (N)
cả (N)
năm (I)
lên (I)
khoảng (I)
1 (N)
381 (N)
tỷ (I)
đồng (I)
tăng (I)
21% (N)
so (N)
với (I)
2015 (N)
; (N)
lợi_nhuận (I)
trước (I)
thuế (N)
đạt (N)
608 (N)
tỷ (I)
đồng (I)
tăng (I)
220% (N)
so (N)
với (I)
năm (I)
trước (I)
đó (N)
pnj (I)
cũng (I)
vừa (N)
kết_thúc (I)
hoạt_động (I)
kinh_doanh (I)
quý (I)
1/2017 (N)
với (I)

In [9]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of tag names to integer class ids
labels = {"N": 0, "I": 1}

# Flatten the per-document tag sequences into 1-dimensional arrays of class ids
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report. `labels` is passed explicitly so each
# class id is tied to its display name; without it scikit-learn raises a
# ValueError when a class is absent from both arrays, because the number of
# classes found no longer matches the length of `target_names`.
print(classification_report(
    truths, predictions,
    labels=[labels["N"], labels["I"]],
    target_names=["N", "I"]))

              precision    recall  f1-score   support

           N       0.99      0.99      0.99     12860
           I       1.00      0.99      0.99     21110

    accuracy                           0.99     33970
   macro avg       0.99      0.99      0.99     33970
weighted avg       0.99      0.99      0.99     33970



In [10]:
# Tag the training documents with the trained model to measure the fit on the
# training split.
y_train_pred = [tagger.tag(xseq) for xseq in X_train]

# Flatten the per-document tag sequences into 1-dimensional arrays of class ids
predictions = np.array([labels[tag] for row in y_train_pred for tag in row])
truths = np.array([labels[tag] for row in y_train for tag in row])

# `labels` is passed explicitly so each class id is tied to its display name;
# without it scikit-learn raises a ValueError when a class is absent from both
# arrays, because the class count no longer matches `target_names`.
print(classification_report(
    truths, predictions,
    labels=[labels["N"], labels["I"]],
    target_names=["N", "I"]))

              precision    recall  f1-score   support

           N       1.00      1.00      1.00     48149
           I       1.00      1.00      1.00     84389

    accuracy                           1.00    132538
   macro avg       1.00      1.00      1.00    132538
weighted avg       1.00      1.00      1.00    132538



In [13]:
# New test
newtest = pd.read_csv('./data/newtest.csv')[['features', 'y']]
newtest['features'] = newtest['features'].apply(lambda x: ast.literal_eval(x))
newtest['y'] = newtest['y'].apply(lambda x: ast.literal_eval(x))
newtest.head()

Unnamed: 0,features,y
0,"[[Tổng_Công_ty, N, 0.0, 1, 0], [Khí, Np, 0.0, ...","[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, ..."
1,"[[Tên, N, 0.0, 0, 0], [người, N, 0.07284574411...","[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, ..."
2,"[[Trong, E, 0.0, 0, 0], [quý, N, 0.03915081706...","[1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, ..."
3,"[[Tính, V, 0.0, 0, 0], [riêng, A, 0.1191558601...","[1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, ..."
4,"[[Cụ_thể, A, 0.0, 0, 0], [GAS, Ny, 0.0, 0, 1],...","[1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, ..."


In [14]:
# Feature sequences for the additional test set
newtest_data = newtest['features'].tolist()
X_newtest = [extract_features(doc) for doc in newtest_data]

# Convert the numeric labels to tag strings once, store them on the frame, and
# take the evaluation targets from that stored column (the same way y_train and
# y_test are taken from 'y_convert') instead of converting a second time.
newtest['y_convert'] = newtest['y'].apply(convert)
y_newtest = newtest['y_convert'].tolist()

In [15]:
# Open the saved model and tag every document of the additional test set.
tagger = pycrfsuite.Tagger()
tagger.open('./model/crf.model')
y_newpred = list(map(tagger.tag, X_newtest))

In [16]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of tag names to integer class ids
labels = {"N": 0, "I": 1}

# Flatten the per-document tag sequences into 1-dimensional arrays of class ids
predictions = np.array([labels[tag] for row in y_newpred for tag in row])
truths = np.array([labels[tag] for row in y_newtest for tag in row])

# Print out the classification report. `labels` is passed explicitly so each
# class id is tied to its display name; without it scikit-learn raises a
# ValueError when a class is absent from both arrays, because the number of
# classes found no longer matches the length of `target_names`.
print(classification_report(
    truths, predictions,
    labels=[labels["N"], labels["I"]],
    target_names=["N", "I"]))

              precision    recall  f1-score   support

           N       0.98      0.99      0.99      7222
           I       1.00      0.99      0.99     10681

    accuracy                           0.99     17903
   macro avg       0.99      0.99      0.99     17903
weighted avg       0.99      0.99      0.99     17903

