# CRF sequence tagging for Movie Queries

In [None]:
import os
import sys

from copy import deepcopy
from collections import Counter
from nltk.tag import CRFTagger
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import numpy as np

import re
import unicodedata
import nltk

# Fetch NLTK data packages up front.
nltk.download('punkt')  # presumably the tokenizer models behind word_tokenize — verify
nltk.download('averaged_perceptron_tagger')  # NOTE(review): no visible cell uses this tagger; confirm it is needed
nltk.download('wordnet')  # presumably the corpus behind WordNetLemmatizer — verify

In [None]:
def get_raw_data_from_bio_file(fpath):
    """Read a one-word-per-line BIO (Beginning, Inside, Outside) tagged corpus.

    Every non-blank line holds a tag and a word separated by a tab, and each
    example sentence/text is separated from the next one by a blank line.
    The data is already tokenized in a simple way.
    e.g. (``<tab>`` stands for one tab character):

    O<tab>a
    O<tab>great
    O<tab>lunch
    O<tab>spot
    O<tab>but
    B-Hours<tab>open
    I-Hours<tab>till
    I-Hours<tab>2
    I-Hours<tab>a
    I-Hours<tab>m
    B-Restaurant_Name<tab>passims
    I-Restaurant_Name<tab>kitchen

    :param fpath: path of the BIO file to read
    :return: a list of sentences, each a list of (word, tag) tuples
    :raises IndexError: if a non-blank line contains no tab separator
    """
    data = []  # the data, a list of lists of (word, tag) tuples
    current_sent = []  # data for current sentence/example
    # The context manager closes the file even if a malformed line raises.
    with open(fpath, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:  # a blank line ends the current instance
                data.append(current_sent)
                current_sent = []
                continue
            line_data = line.split("\t")
            # file order is tag<tab>word; stored order is (word, tag)
            current_sent.append((line_data[1], line_data[0]))
    # A file that does not end with a blank line would otherwise lose its
    # last sentence.
    if current_sent:
        data.append(current_sent)
    return data

In [None]:
# Load the training corpus: a list of sentences, each a list of (word, tag) tuples.
raw_training_data = get_raw_data_from_bio_file("engtrain.bio.txt") 

In [None]:
# have a look at the first example (one sentence as a list of (word, tag) tuples)
print(raw_training_data[0])

In [None]:
# Report corpus size: number of sentences, then total number of words.
instance_count = len(raw_training_data)
word_count = sum(map(len, raw_training_data))
print(instance_count, "instances")
print(word_count, "words")

In [None]:
_POS_TAGGER_MODEL = "crf_pos.tagger"  # model file of the CRF POS tagger
_pos_tagger_cache = []  # holds the POS tagger once loaded, so the model is read only once


def _get_pos_tagger():
    """Return the shared CRF POS tagger, loading its model file on first use."""
    if not _pos_tagger_cache:
        tagger = CRFTagger()
        tagger.set_model_file(_POS_TAGGER_MODEL)
        _pos_tagger_cache.append(tagger)
    return _pos_tagger_cache[0]


def preProcess(example, pos_tagger=None):
    """Attach a POS tag to every word of one example.

    Function takes in list of (word, bio-tag) pairs, e.g.:
        [('what', 'O'), ('movies', 'O'), ('star', 'O'), ('bruce', 'B-ACTOR'), ('willis', 'I-ACTOR')]
    and returns new (token, bio-tag) pairs in which every token has the form
    "word!POS".

    The whole sentence is tagged in one call so the POS tagger sees each word
    in context. The words are used exactly as given: the corpus is already
    tokenized, and re-tokenizing single words could yield zero or several
    tokens for one word, which would break the 1:1 alignment with the
    bio-tags.

    :param example: list of (word, bio-tag) pairs for one sentence
    :param pos_tagger: optional object with a ``tag(words)`` method returning
        one (word, pos) pair per input word; defaults to the CRF POS tagger
        loaded from "crf_pos.tagger"
    :return: list of ("word!POS", bio-tag) pairs, same length and order as
        ``example``
    """
    if not example:
        return []
    if pos_tagger is None:
        pos_tagger = _get_pos_tagger()

    words = [word for word, _ in example]
    tagged = pos_tagger.tag(words)

    # Pair each original (word, bio-tag) with the POS tag at the same position.
    return [
        (word + '!' + pos, bio_tag)
        for (word, bio_tag), (_, pos) in zip(example, tagged)
    ]

In [None]:
# Run preProcess over every raw training sentence.
training_data = list(map(preProcess, raw_training_data))

In [None]:
# check the effect of pre-processing (first sentence as returned by preProcess)
print(training_data[0])

In [None]:
_pattern = re.compile(r"\d")  # to recognize numbers/digits

# Unicode general categories that count as punctuation
_PUNC_CATEGORIES = frozenset(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
_POS_SEPARATOR = "!"  # separates word and POS tag in "word!POS" tokens
_MAX_AFFIX_LEN = 5  # longest suffix/prefix emitted as a feature
_lemmatizer_cache = []  # holds the lemmatizer once created, so it is built only once


def _get_lemmatizer():
    """Return the shared WordNetLemmatizer, creating it on first use."""
    if not _lemmatizer_cache:
        _lemmatizer_cache.append(WordNetLemmatizer())
    return _lemmatizer_cache[0]


# Adapted from the 'out-of-the-box' get_features function of the nltk CRF tagger
def get_features(tokens, idx):
    """
    Extract features for the token at position ``idx``.

    Tokens are expected in the "word!POS" form; a token without the
    separator is treated as a bare word with no POS tag. All word-level
    features are computed on the word part only, never on the POS tag:
         - Is Capitalized ?
         - Has Number ?
         - Is all Punctuation ?
         - Suffixes up to length 5 (of the lemmatized word)
         - Prefixes up to length 5 (of the lemmatized word)
         - Current Word (lemmatized)
         - POS tag (when present)

    :param tokens: the tokens of one sentence
    :param idx: index of the token to extract features for
    :return : a list which contains the features
    :rtype : list(str)

    """
    token = tokens[idx]
    feature_list = []

    if not token:
        return feature_list

    # Split on the LAST separator so a word that itself contains "!" stays
    # intact.
    word, sep, pos = token.rpartition(_POS_SEPARATOR)
    if not sep:  # no POS annotation on this token
        word, pos = token, ""

    if word:
        # Capitalization
        if word[0].isupper():
            feature_list.append("CAPITALIZATION")

        # Number
        if _pattern.search(word) is not None:
            feature_list.append("HAS_NUM")

        # Punctuation
        if all(unicodedata.category(ch) in _PUNC_CATEGORIES for ch in word):
            feature_list.append("PUNCTUATION")

        # Lemmatization
        lemma = _get_lemmatizer().lemmatize(word)

        # Suffixes, then prefixes; an affix is emitted only when it is
        # strictly shorter than the lemma.
        for size in range(1, _MAX_AFFIX_LEN + 1):
            if len(lemma) > size:
                feature_list.append("SUF_" + lemma[-size:])
        for size in range(1, _MAX_AFFIX_LEN + 1):
            if len(lemma) > size:
                feature_list.append("PRE_" + lemma[:size])

        feature_list.append("WORD_" + lemma)

    if pos:
        feature_list.append("POS_" + pos)  # add POS tagging
    return feature_list


In [None]:
#pip install python-crfsuite

In [None]:
# Train the CRF BIO-tag tagger
TAGGER_PATH = "crf_nlu.tagger"  # path to the tagger- it will save/access the model from here
ct = CRFTagger(feature_func=get_features)  # initialize tagger with get_features function

print("training tagger...")
# training_data holds the sentences returned by preProcess: lists of ("word!POS", bio-tag) pairs
ct.train(training_data, TAGGER_PATH)
print("done")

In [None]:
# Re-create the tagger and point it at the model file saved by training.
ct = CRFTagger(feature_func=get_features)
ct.set_model_file(TAGGER_PATH)

# Read the test corpus and preprocess it the same way as the training data.
raw_test_data = get_raw_data_from_bio_file("engtest.bio.txt")
test_data = list(map(preProcess, raw_test_data))
test_word_count = sum(map(len, test_data))
print(len(test_data), "instances")
print(test_word_count, "words")

In [None]:
print("testing tagger...")
# Gold BIO tags of every test sentence, flattened into one list.
y_test = [pair[1] for sentence in test_data for pair in sentence]
# Tags predicted by the tagger for the same tokens, in the same order.
preds = [
    tagged[1]
    for sentence in test_data
    for tagged in ct.tag([pair[0] for pair in sentence])
]
print("done")

In [None]:
# Output the classification report (which you should save each time for comparing your models)
# y_test holds the true tags and preds the predicted tags, both flattened across all test sentences.
print(classification_report(y_test, preds))

In [None]:
def confusion_matrix_heatmap(y_test, preds):
    """Plot a confusion matrix of true tags against predicted tags.

    Rows are true labels and columns are predicted labels. The label set is
    the sorted union of the labels in ``y_test`` and ``preds``, so the axis
    order is the same on every run and predictions of a label that never
    occurs in ``y_test`` still get a column.

    :param y_test: flat sequence of true labels
    :param preds: flat sequence of predicted labels, aligned with ``y_test``
    :return: None; the figure is displayed with ``plt.show()``
    """
    labels = sorted(set(y_test) | set(preds))
    # labels must be passed by keyword: scikit-learn made it keyword-only.
    cm = confusion_matrix(y_test, preds, labels=labels)
    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    tick_positions = np.arange(len(labels))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_yticklabels(labels)

    # Write each cell's count on top of the heatmap (x = column, y = row).
    n_rows, n_cols = cm.shape
    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(j, i, cm[i, j],
                    ha="center", va="center", color="w")

    plt.xlabel('Predicted')
    plt.ylabel('True')
    #fig.tight_layout()
    plt.show()

In [None]:
# Plot the confusion matrix of true test tags (y_test) against predicted tags (preds).
confusion_matrix_heatmap(y_test, preds)

# Feature experimentation

Experiment with different features by further adjusting the `get_features` function, and modifying it to get the best results in terms of `macro average f-score` (i.e. average f-score across all classes) on your 20% development data. 

In [None]:
# We adjust get_features by extending the suffix features from length 3 to length 5, adding prefix features up to length 5, and adding tokenization and lemmatization.

In [None]:
# Load the test corpus as a list of sentences of (word, tag) tuples.
# NOTE(review): raw_testing_data is not referenced by any other visible cell; a later
# cell reads the same file again into raw_test_data — confirm whether this cell is needed.
raw_testing_data = get_raw_data_from_bio_file("engtest.bio.txt") 

In [None]:
# Train the CRF BIO-tag tagger
# NOTE(review): this cell repeats the earlier training cell; it fits on all of training_data,
# which includes the last 20% that a later cell slices off as test_dataset.
TAGGER_PATH = "crf_nlu.tagger"  # path to the tagger- it will save/access the model from here
ct = CRFTagger(feature_func=get_features)  # initialize tagger with get_features function

print("training tagger...")
ct.train(training_data, TAGGER_PATH)
print("done")

In [None]:
# Re-create the tagger and point it at the model file saved by training.
ct = CRFTagger(feature_func=get_features)
ct.set_model_file(TAGGER_PATH)

# Read the test corpus and preprocess it the same way as the training data.
raw_test_data = get_raw_data_from_bio_file("engtest.bio.txt")
test_data = list(map(preProcess, raw_test_data))
test_word_count = sum(map(len, test_data))
print(len(test_data), "instances")
print(test_word_count, "words")

In [None]:
print("testing tagger...")
# Gold BIO tags of every test sentence, flattened into one list.
y_test = [pair[1] for sentence in test_data for pair in sentence]
# Tags predicted by the tagger for the same tokens, in the same order.
preds = [
    tagged[1]
    for sentence in test_data
    for tagged in ct.tag([pair[0] for pair in sentence])
]
print("done")

In [None]:

# Split the preprocessed training data: first 80% for training, last 20% held out.
percentage = 0.8
data_sample = len(training_data)  # total number of sentences
training_sample = int(percentage * data_sample)  # number of sentences in the training portion
train_dataset = training_data[:training_sample]
test_dataset = training_data[training_sample:]

# Re-train on the 80% portion only. A tagger fitted on all of training_data has
# already seen every sentence in test_dataset, so its scores on that held-out
# portion would be inflated. A separate model file keeps the full-data model intact.
DEV_TAGGER_PATH = "crf_nlu_dev.tagger"
ct = CRFTagger(feature_func=get_features)
print("training tagger on the 80% split...")
ct.train(train_dataset, DEV_TAGGER_PATH)
print("done")
    


In [None]:
print("testing tagger...")
# Gold BIO tags of the held-out sentences, flattened into one list.
# (Despite its name, y_predict holds the true tags, not predictions.)
y_predict = [pair[1] for sentence in test_dataset for pair in sentence]
# BIO tags the tagger assigns to the same tokens, in the same order.
predict_result = [
    tagged[1]
    for sentence in test_dataset
    for tagged in ct.tag([pair[0] for pair in sentence])
]
print("done")

In [None]:
# Classification report on the held-out 20%: y_predict holds the true tags,
# predict_result the tags predicted by the tagger.
print(classification_report(y_predict,predict_result)) 

In [None]:
# We can see that the macro average improves over Q4 after extending the suffix features from length 3 to length 5, adding prefix features up to length 5, and adding tokenization and lemmatization.

In [None]:
# Plot the confusion matrix for the held-out 20% (true tags first, predicted tags second).
confusion_matrix_heatmap(y_predict,predict_result)