In [1]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter


ModuleNotFoundError: No module named 'sklearn_crfsuite'

**Introduction**

In this work, we will use a CRF classifier for POS tagging. The dataset we will use is the Penn Treebank corpus, with the universal tag set. This tag set has 12 unique POS tags.


In [None]:
# The Treebank sample and the universal tag-set mapping are separate NLTK data
# packages; without them the corpus reader raises LookupError on a fresh
# environment. Fetch them first (quiet=True suppresses the progress log).
nltk.download('treebank', quiet=True)
nltk.download('universal_tagset', quiet=True)

# Each sentence is a sequence of (word, tag) pairs using the universal tag set.
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')

In [None]:
# Corpus-level summary: sentences, tokens, distinct words and distinct tags.
print("Number of Tagged Sentences ", len(tagged_sentence))

# Flatten the per-sentence lists into one list of (word, tag) pairs.
tagged_words = [pair for sentence in tagged_sentence for pair in sentence]
print("Total Number of Tagged words", len(tagged_words))

# Distinct word forms; the comparison is case-sensitive.
vocab = {token for token, _tag in tagged_words}
print("Vocabulary of the Corpus", len(vocab))

# Distinct POS tags that occur in the corpus.
tags = {label for _token, label in tagged_words}
print("Number of Tags in the Corpus ", len(tags))

#### Splitting Data into train and test set - 80-20 split

In [None]:
# Hold out 20% of the sentences for testing; random_state pins the shuffle so
# the same split is produced on every run.
train_set, test_set = train_test_split(
    tagged_sentence,
    random_state=1234,
    test_size=0.2,
)
print("Number of Sentences in Training Data ", len(train_set))
print("Number of Sentences in Testing Data ", len(test_set))

### Define the feature function. The following features can be used 
1. Is the first letter capitalised.
2. Is it the first word in the sentence?
3. Is it the last word?
4. What is the prefix of the word?
5. What is the suffix of the word?
6. Is the complete word capitalised?
7. What is the previous word?
8. What is the next word?
9. Is it numeric?
10. Is it alphanumeric?
11. Is there a hyphen in the word?

In [None]:
def features(sentence,index):
    """Build the feature dictionary describing one word of a sentence.

    Parameters
    ----------
    sentence : list of str
        The untagged sentence, of the form [w1, w2, w3, ...].
    index : int
        Position in ``sentence`` of the word to describe.

    Returns
    -------
    dict
        Maps feature name to value: 0/1 integer flags (capitalisation,
        position in the sentence, numeric, alphanumeric, hyphen), the
        neighbouring words ('' at a sentence boundary) and the prefixes and
        suffixes of length 1 to 4.

    Raises
    ------
    IndexError
        If ``index`` is not a valid position in ``sentence``.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        # Slicing (instead of word[0]) keeps an empty token from raising.
        'is_first_capital': int(word[:1].isupper()),
        'is_first_word': int(index == 0),
        'is_last_word': int(index == last_index),
        # NOTE: this is also 1 for tokens without cased characters
        # (digits, punctuation), since upper() leaves them unchanged.
        'is_complete_capital': int(word.upper() == word),
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'is_numeric': int(word.isdigit()),
        # At least one digit AND at least one ASCII letter, anywhere in the
        # word. Neither lookahead is anchored with '$', so the digit does not
        # have to be the final character.
        'is_alphanumeric': int(bool(re.match('^(?=.*[0-9])(?=.*[a-zA-Z])', word))),
        'prefix_1': word[:1],
        'prefix_2': word[:2],
        'prefix_3': word[:3],
        'prefix_4': word[:4],
        'suffix_1': word[-1:],
        'suffix_2': word[-2:],
        'suffix_3': word[-3:],
        'suffix_4': word[-4:],
        'word_has_hyphen': 1 if '-' in word else 0,
    }

#### We need to separate the labels from the sentences in both the training and test data

In [None]:
def untag(sentence):
    """Return only the words of a tagged sentence, in their original order.

    ``sentence`` is an iterable of (word, tag) pairs; the tags are discarded.
    """
    words = []
    for word, _tag in sentence:
        words.append(word)
    return words


def prepareData(tagged_sentences):
    """Split tagged sentences into per-token feature dicts and gold tags.

    Parameters
    ----------
    tagged_sentences : iterable
        Sentences, each a sequence of (word, tag) pairs.

    Returns
    -------
    tuple of (list, list)
        ``X`` holds, for every sentence, the list of ``features`` dicts of its
        words; ``y`` holds, for every sentence, the list of its tags. The two
        lists are aligned sentence by sentence and token by token.
    """
    X, y = [], []
    for tagged_sent in tagged_sentences:
        # Strip the tags once per sentence rather than once per token.
        words = untag(tagged_sent)
        X.append([features(words, index) for index in range(len(words))])
        y.append([tag for _word, tag in tagged_sent])
    return X, y

In [None]:
# Convert both splits into aligned feature sequences (X) and tag sequences (y).
X_train,y_train=prepareData(train_set)
X_test,y_test=prepareData(test_set)


In [None]:
# Sanity check: the feature dicts generated for the first training sentence.
X_train[0]

In [None]:
# Sanity check: the tags of the first training sentence, aligned with X_train[0].
y_train[0]

#### Let us fit a baseline CRF model with an initial (untuned) set of hyperparameters

In [None]:
# Training configuration for the baseline model. Per the sklearn-crfsuite
# documentation, c1 and c2 are the L1 and L2 regularisation coefficients.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = CRF(**crf_params)
crf.fit(X_train, y_train)

In [None]:
# Predict a tag sequence for every held-out sentence.
y_pred=crf.predict(X_test)

In [None]:
# Weighted-average F1 on the test set, computed over the labels in crf.classes_.
metrics.flat_f1_score(y_test, y_pred,average='weighted',labels=crf.classes_)

In [None]:
# Same metric on the training set, for comparison against the test-set score.
y_pred_train=crf.predict(X_train)
metrics.flat_f1_score(y_train, y_pred_train,average='weighted',labels=crf.classes_)

#### The CRF model had an F1 score of 0.97 on the test data and 0.996 on the training data. This gap suggests some overfitting, so the model should be tuned.
But before we tune the model, let us look at where the CRF failed and which features are most important for identifying the different POS tags.

In [None]:
# Accuracy of the predicted tags on the test set.
metrics.flat_accuracy_score(y_test,y_pred)

In [None]:
# Accuracy of the predicted tags on the training set.
metrics.flat_accuracy_score(y_train,y_pred_train)

#### Let us look at class wise scores

In [None]:
# Per-tag report on the test set, printed with three decimal places.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=crf.classes_, digits=3
))

Adjectives have a low precision, recall and F1 score

### Let us look at the most likely transition features


In [None]:
# Print the label, then show the size of crf.transition_features_ as the
# cell's output value.
print("Number of Transition Features ")
len(crf.transition_features_)

In [None]:
# The 20 entries of crf.transition_features_ with the highest values.
Counter(crf.transition_features_).most_common(20)

 If there is an adjective, it is more likely to be followed by a NOUN

In [None]:
# The 20 entries of crf.transition_features_ with the lowest values
# (the tail of the descending ordering).
Counter(crf.transition_features_).most_common()[-20:]

It's unlikely that a sentence in this corpus begins with a DET or PRT. Unknown is not followed by a NOUN.



### What are the most likely state features

In [None]:
# Print the size of crf.state_features_.
print("Number of State Features ",len(crf.state_features_))

In [None]:
# The 20 entries of crf.state_features_ with the highest values.
Counter(crf.state_features_).most_common(20)

If the previous word is "will", "would" or "to", then the word is a verb, and if the first letter of the word is capitalised, it is a NOUN. Words ending with "ed" are verbs.

In [None]:
# The 20 entries of crf.state_features_ with the lowest values
# (the tail of the descending ordering).
Counter(crf.state_features_).most_common()[-20:]

If a word has a hyphen, then it is least likely to be a verb; "his" is less likely to be followed by a verb. If a word ends with "less", it is most likely not a noun.