In [1]:
!pip install ndjson --quiet
!pip install underthesea --quiet
!pip install sklearn_crfsuite --quiet


[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import ndjson
import underthesea
import nltk
import re
from sklearn_crfsuite import CRF
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

# Load dataset
[link data set on github](https://github.com/VinAIResearch/PhoNER_COVID19)
## **COVID-19 Named Entity Recognition for Vietnamese**
PhoNER_COVID19 is a dataset for recognizing COVID-19 related named entities in Vietnamese, consisting of 35K entities over 10K sentences. We define 10 entity types with the aim of extracting key information related to COVID-19 patients, which are especially useful in downstream applications. In general, these entity types can be used in the context of not only the COVID-19 pandemic but also in other future epidemics:

![](https://user-images.githubusercontent.com/2412555/112276540-741ca100-8cb3-11eb-8191-9ba6cb83c72c.png)

In [5]:
path = "D:/04. code/nlp/phoNER_covid19/PhoNER_COVID19/data/word/"
file_names = [
    "dev_word.json",
    "test_word.json",
    "train_word.json"
]

In [7]:
with open(path + file_names[0], encoding='utf-8') as devfile:
    dev = ndjson.load(devfile)

with open(path + file_names[1], encoding='utf-8') as testfile:
    test = ndjson.load(testfile)

with open(path + file_names[2], encoding='utf-8') as trainfile:
    train = ndjson.load(trainfile)

In [8]:
len(dev), len(test), len(train)

(2000, 3000, 5027)

In [9]:
all_sents = []
for para in dev:
    all_sents.append(list(zip(para["words"], para["tags"])))

for para in test:
    all_sents.append(list(zip(para["words"], para["tags"])))

for para in train:
    all_sents.append(list(zip(para["words"], para["tags"])))

In [10]:
print(len(all_sents))

10027


---

In [11]:
train_set, test_set = train_test_split(all_sents, test_size=0.2, shuffle=False)

In [12]:
print("trainset size",len(train_set))
print("testset size",len(test_set))

trainset size 8021
testset size 2006


In [13]:
test_word = []
test_tag = []
for sent in test_set:
    for word, tag in sent:
        test_word.append(word)
        test_tag.append(tag)

In [14]:
len(test_word)

53209

In [43]:
train_set[:2]
# 🥲🥲🥲🥲🥲🥲🥲🥲🥲

[[('Bác_sĩ', 'O'),
  ('Nguyễn_Trung_Nguyên', 'O'),
  (',', 'O'),
  ('Giám_đốc', 'O'),
  ('Trung_tâm', 'B-ORGANIZATION'),
  ('Chống', 'I-ORGANIZATION'),
  ('độc', 'I-ORGANIZATION'),
  (',', 'I-ORGANIZATION'),
  ('Bệnh_viện', 'I-ORGANIZATION'),
  ('Bạch_Mai', 'I-ORGANIZATION'),
  (',', 'O'),
  ('cho', 'O'),
  ('biết', 'O'),
  ('bệnh_nhân', 'O'),
  ('được', 'O'),
  ('chuyển', 'O'),
  ('đến', 'O'),
  ('bệnh_viện', 'O'),
  ('ngày', 'O'),
  ('7/3', 'B-DATE'),
  (',', 'O'),
  ('chẩn_đoán', 'O'),
  ('ngộ_độc', 'B-SYMPTOM_AND_DISEASE'),
  ('thuốc', 'I-SYMPTOM_AND_DISEASE'),
  ('điều_trị', 'O'),
  ('sốt_rét', 'O'),
  ('chloroquine', 'O'),
  ('.', 'O')],
 [('"', 'O'),
  ('Bệnh_nhân', 'O'),
  ('812', 'B-PATIENT_ID'),
  ('"', 'O'),
  (',', 'O'),
  ('nam', 'B-GENDER'),
  (',', 'O'),
  ('62', 'B-AGE'),
  ('tuổi', 'O'),
  (',', 'O'),
  ('là', 'O'),
  ('nhân_viên', 'B-JOB'),
  ('giao', 'I-JOB'),
  ('bánh', 'I-JOB'),
  ('tiệm', 'B-LOCATION'),
  ('pizza', 'I-LOCATION'),
  ('phố', 'I-LOCATION'),
  ('Trần_

# Modeling
## 1. HMM

In [15]:
tagger = nltk.HiddenMarkovModelTagger.train(train_set)

In [16]:
y_pred_hmm_ner = [tag for _,tag in tagger.tag(test_word)]

In [17]:
f1_score(y_pred=y_pred_hmm_ner, y_true=test_tag,average='weighted')

0.9411453046546298

In [18]:
accuracy_score(y_pred=y_pred_hmm_ner, y_true=test_tag)

0.9391080456313782

# 2. CRF

In [19]:
def features(sentence,index):
    return {
        # 'is_first_capital':int(sentence[index][0].isupper()),
        'is_first_word': int(index==0),
        'is_last_word':int(index==len(sentence)-1),
        'is_complete_capital': int(sentence[index].upper()==sentence[index]),
        'prev_word':'' if index==0 else sentence[index-1],
        'next_word':'' if index==len(sentence)-1 else sentence[index+1],
        'is_numeric':int(sentence[index].isdigit()),
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
        # 'word_has_hyphen': 1 if '-' in sentence[index] else 0
    }
  
def prepareData(tagged_sentences):
    X=[]
    for sentences in tagged_sentences:
        X.append([features(sentences, index) for index in range(len(sentences))])
    return X

In [20]:
sentences = []
tags = []
for sent in train_set:
    sent_word = []
    sent_tag = []
    for word, tag in sent:
        sent_word.append(word)
        sent_tag.append(tag)
    
    sentences.append(sent_word)
    tags.append(sent_tag)

In [21]:
len(sentences)

8021

In [22]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, tags, test_size=0.2)

In [23]:
X_train = prepareData(train_sentences)
y_train = train_tags
X_test = prepareData(test_sentences)
y_test =  test_tags

In [24]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

In [25]:
y_pred=crf.predict(X_test)
y_pred_train=crf.predict(X_train)

In [26]:
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

#F1 score test
print(metrics.flat_f1_score(y_test, y_pred,average='weighted',labels=crf.classes_))
#Accuracy score test
print(metrics.flat_accuracy_score(y_test,y_pred))

0.943701969574168
0.9456862523965265


In [27]:
#F1 score train
print(metrics.flat_f1_score(y_train, y_pred_train,average='weighted',labels=crf.classes_))
#Accuracy score train
print(metrics.flat_accuracy_score(y_train,y_pred_train))

0.9635470757254156
0.9646070718032194


In [28]:
labels1=crf.classes_
labels1

['O',
 'B-PATIENT_ID',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'B-LOCATION',
 'B-GENDER',
 'B-AGE',
 'B-SYMPTOM_AND_DISEASE',
 'I-SYMPTOM_AND_DISEASE',
 'B-DATE',
 'I-LOCATION',
 'B-TRANSPORTATION',
 'I-TRANSPORTATION',
 'B-NAME',
 'B-JOB',
 'I-JOB',
 'I-DATE',
 'I-NAME',
 'I-PATIENT_ID',
 'I-AGE']

## 3. Logistic Regression

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [30]:
Encoder = LabelEncoder()
Tfidf_vect = TfidfVectorizer()

In [31]:
all_tags = []
all_words = []

for sent in all_sents:
    sent_word = []
    sent_tag = []
    for word, tag in sent:
        sent_word.append(word)
        sent_tag.append(tag)
    
    all_words.extend(sent_word)
    all_tags.extend(sent_tag)

In [32]:
len(all_words), len(all_tags)

(274472, 274472)

In [33]:
wordvec = Tfidf_vect.fit_transform(all_words)
wordvec.shape

(274472, 6493)

In [34]:
len(Tfidf_vect.vocabulary_)
# print(Tfidf_vect.vocabulary_)

6493

In [35]:
Train_X, Test_X, Train_Y, Test_Y = train_test_split(wordvec, all_tags,test_size=0.2, random_state = 5)

In [36]:
classifier_lgr = LogisticRegression()
classifier_lgr.fit(Train_X,Train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
predictions_LGR = classifier_lgr.predict(Test_X)
accuracy_score(predictions_LGR, Test_Y)

0.8633573185171691

In [38]:
from sklearn.metrics import classification_report

In [39]:
print(classification_report(predictions_LGR, Test_Y))

  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

                B-AGE       0.40      0.56      0.47       218
               B-DATE       0.66      0.65      0.65      1082
             B-GENDER       0.94      0.74      0.83       321
                B-JOB       0.26      0.67      0.37        36
           B-LOCATION       0.69      0.64      0.66      2769
               B-NAME       0.10      1.00      0.18        16
       B-ORGANIZATION       0.67      0.68      0.67       476
         B-PATIENT_ID       0.57      0.83      0.68       889
B-SYMPTOM_AND_DISEASE       0.82      0.79      0.80       649
     B-TRANSPORTATION       0.41      0.95      0.58        42
                I-AGE       0.00      0.00      0.00         0
               I-DATE       0.00      0.67      0.00         3
                I-JOB       0.00      0.00      0.00         0
           I-LOCATION       0.49      0.66      0.56      2008
               I-NAME       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
