In [5]:
##資料來源：https://medium.com/analytics-vidhya/pos-tagging-using-conditional-random-fields-92077e5eaa31
#pip install nltk
import nltk
#nltk.download()
import re
#pip install sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
#pip install scikit-learn
from sklearn.model_selection import train_test_split

## Load dataset

In [7]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/jiaping/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [8]:
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')
print("Number of Tagged Sentences ",len(tagged_sentence))

tagged_words = [tup for sent in tagged_sentence for tup in sent]
print("Total Number of Tagged words", len(tagged_words))

vocab = set([word for word,tag in tagged_words])
print("Vocabulary of the Corpus",len(vocab))

tags = set([tag for word,tag in tagged_words])
print("Number of Tags in the Corpus ",len(tags))

Number of Tagged Sentences  3914
Total Number of Tagged words 100676
Vocabulary of the Corpus 12408
Number of Tags in the Corpus  12


In [18]:
tagged_sentence[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [21]:
tagged_words[0:10]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET')]

In [30]:
# To print part of set
list({k: 1 for k in vocab})[:10]

['romance',
 'here',
 'scarce',
 'food',
 'split',
 'conduct',
 'anti-abortionists',
 '257',
 'closed-end',
 'distinct']

In [31]:
list({k: 1 for k in tags})[:10]

['.', 'CONJ', 'PRT', 'PRON', 'NOUN', 'VERB', 'ADJ', 'ADP', 'NUM', 'DET']

## Training/Testing set split

In [9]:
train_set, test_set = train_test_split(tagged_sentence,test_size=0.2,random_state=1234)
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  3131
Number of Sentences in Testing Data  783


In [17]:
train_set[0]

[('On', 'ADP'),
 ('Wall', 'NOUN'),
 ('Street', 'NOUN'),
 ('men', 'NOUN'),
 ('and', 'CONJ'),
 ('women', 'NOUN'),
 ('walk', 'VERB'),
 ('with', 'ADP'),
 ('great', 'ADJ'),
 ('purpose', 'NOUN'),
 (',', '.'),
 ('*-2', 'X'),
 ('noticing', 'VERB'),
 ('one', 'NUM'),
 ('another', 'DET'),
 ('only', 'ADV'),
 ('when', 'ADV'),
 ('they', 'PRON'),
 ('jostle', 'VERB'),
 ('for', 'ADP'),
 ('cabs', 'NOUN'),
 ('*T*-1', 'X'),
 ('.', '.')]

In [32]:
test_set[0]

[('Ruth', 'NOUN'),
 ('K.', 'NOUN'),
 ('Nelson', 'NOUN'),
 ('Cullowhee', 'NOUN'),
 (',', '.'),
 ('N.C', 'NOUN'),
 ('.', '.')]

## 準備 training data

In [11]:
import re
def features(sentence,index):
    ### sentence is of the form [w1,w2,w3,..], index is the position of the word in the sentence
    return {
        'is_first_capital':int(sentence[index][0].isupper()),
        'is_first_word': int(index==0),
        'is_last_word':int(index==len(sentence)-1),
        'is_complete_capital': int(sentence[index].upper()==sentence[index]),
        'prev_word':'' if index==0 else sentence[index-1],
        'next_word':'' if index==len(sentence)-1 else sentence[index+1],
        'is_numeric':int(sentence[index].isdigit()),
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
        'prefix_1':sentence[index][0],
        'prefix_2': sentence[index][:2],
        'prefix_3':sentence[index][:3],
        'prefix_4':sentence[index][:4],
        'suffix_1':sentence[index][-1],
        'suffix_2':sentence[index][-2:],
        'suffix_3':sentence[index][-3:],
        'suffix_4':sentence[index][-4:],
        'word_has_hyphen': 1 if '-' in sentence[index] else 0  
         }
def untag(sentence):
    return [word for word,tag in sentence]


def prepareData(tagged_sentences):
    X,y=[],[]
    for sentences in tagged_sentences:
        X.append([features(untag(sentences), index) for index in range(len(sentences))])
        y.append([tag for word,tag in sentences])
    return X,y

In [12]:
X_train,y_train = prepareData(train_set)
X_test,y_test = prepareData(test_set)

In [14]:
X_train[0][0]

{'is_first_capital': 1,
 'is_first_word': 1,
 'is_last_word': 0,
 'is_complete_capital': 0,
 'prev_word': '',
 'next_word': 'Wall',
 'is_numeric': 0,
 'is_alphanumeric': 0,
 'prefix_1': 'O',
 'prefix_2': 'On',
 'prefix_3': 'On',
 'prefix_4': 'On',
 'suffix_1': 'n',
 'suffix_2': 'On',
 'suffix_3': 'On',
 'suffix_4': 'On',
 'word_has_hyphen': 0}

In [16]:
y_train[0]

['ADP',
 'NOUN',
 'NOUN',
 'NOUN',
 'CONJ',
 'NOUN',
 'VERB',
 'ADP',
 'ADJ',
 'NOUN',
 '.',
 'X',
 'VERB',
 'NUM',
 'DET',
 'ADV',
 'ADV',
 'PRON',
 'VERB',
 'ADP',
 'NOUN',
 'X',
 '.']

In [33]:
X_test[0][0]

{'is_first_capital': 1,
 'is_first_word': 1,
 'is_last_word': 0,
 'is_complete_capital': 0,
 'prev_word': '',
 'next_word': 'K.',
 'is_numeric': 0,
 'is_alphanumeric': 0,
 'prefix_1': 'R',
 'prefix_2': 'Ru',
 'prefix_3': 'Rut',
 'prefix_4': 'Ruth',
 'suffix_1': 'h',
 'suffix_2': 'th',
 'suffix_3': 'uth',
 'suffix_4': 'Ruth',
 'word_has_hyphen': 0}

In [35]:
y_test[0]

['NOUN', 'NOUN', 'NOUN', 'NOUN', '.', 'NOUN', '.']

## Train

In [36]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.01, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

## Evaluation

In [37]:
y_pred = crf.predict(X_test)

print("F1 score on Test Data ")
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=crf.classes_))

print("F score on Training Data ")
y_pred_train = crf.predict(X_train)
metrics.flat_f1_score(y_train, y_pred_train, average='weighted', labels=crf.classes_)

### Look at class wise score
print(metrics.flat_classification_report(
    y_test, y_pred, labels=crf.classes_, digits=3
))

F1 score on Test Data 
0.9738471726864286
F score on Training Data 
              precision    recall  f1-score   support

         ADP      0.979     0.985     0.982      1869
        NOUN      0.966     0.977     0.972      5606
        CONJ      0.994     0.994     0.994       480
        VERB      0.964     0.960     0.962      2722
         ADJ      0.911     0.874     0.892      1274
           .      1.000     1.000     1.000      2354
           X      1.000     0.997     0.998      1278
         NUM      0.991     0.993     0.992       671
         DET      0.994     0.995     0.994      1695
         ADV      0.927     0.909     0.918       585
        PRON      0.998     0.998     0.998       562
         PRT      0.979     0.982     0.980       614

    accuracy                          0.974     19710
   macro avg      0.975     0.972     0.974     19710
weighted avg      0.974     0.974     0.974     19710



