# Importing the necessary libraries

In [1]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from collections import Counter

In [2]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.1MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [3]:
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

# Downloading the Dataset
The dataset we will use in the PennTree Bank Corpus, with the universal Tag Set.

We will use the NLTK Treebank dataset with the Universal Tagset. The Universal tagset of NLTK comprises of 12 tag classes: Verb, Noun, Pronouns, Adjectives, Adverbs, Adpositions, Conjunctions, Determiners, Cardinal Numbers, Particles, Other/ Foreign words, Punctuations. This dataset has 3,914 tagged sentences and a vocabulary of 12,408 words.

In [4]:
import nltk
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [5]:
import nltk
nltk.download('treebank')
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [6]:
print("Number of Tagged Sentences ",len(tagged_sentence))
tagged_words=[tup for sent in tagged_sentence for tup in sent]
print("Total Number of Tagged words", len(tagged_words))
vocab=set([word for word,tag in tagged_words])
print("Vocabulary of the Corpus",len(vocab))
tags=set([tag for word,tag in tagged_words])
print("Number of Tags in the Corpus ",len(tags))

Number of Tagged Sentences  3914
Total Number of Tagged words 100676
Vocabulary of the Corpus 12408
Number of Tags in the Corpus  12


# Splitting the dataset into training and test

Next, we will split the data into Training and Test data in a 70:30 ratio 

In [7]:
train_set, test_set = train_test_split(tagged_sentence,test_size=0.3,random_state=123)
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  2739
Number of Sentences in Testing Data  1175


2739 sentences in the training set and 1175 sentences in the test set

# Creating the Feature Function

For identifying POS tags, we will create a function which returns a dictionary with the following features for each word in a sentence:

Is the first letter of the word capitalised (Generally Proper Nouns have the first letter capitalised)?

Is it the first word of the sentence?

Is it the last word of the sentence

Does the word contain both numbers and alphabets?

Does it have a hyphen (generally, adjectives have hyphens - for example, words like fast-growing, slow-moving)

Is the complete word capitalised?

Is it a number?

What are the first four suffixes and prefixes?(words ending with “ed” are generally verbs, words ending with “ous” like disastrous are adjectives)

In [9]:
def features(sentence,index):
    ### sentence is of the form [w1,w2,w3,..], index is the position of the word in the sentence
    return {
        'is_first_capital':int(sentence[index][0].isupper()),
        'is_first_word': int(index==0),
        'is_last_word':int(index==len(sentence)-1),
        'is_complete_capital': int(sentence[index].upper()==sentence[index]),
        'prev_word':'' if index==0 else sentence[index-1],
        'next_word':'' if index==len(sentence)-1 else sentence[index+1],
        'is_numeric':int(sentence[index].isdigit()),
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
        'prefix_1':sentence[index][0],
        'prefix_2': sentence[index][:2],
        'prefix_3':sentence[index][:3],
        'prefix_4':sentence[index][:4],
        'suffix_1':sentence[index][-1],
        'suffix_2':sentence[index][-2:],
        'suffix_3':sentence[index][-3:],
        'suffix_4':sentence[index][-4:],
        'word_has_hyphen': 1 if '-' in sentence[index] else 0}
        

In [10]:
def untag(sentence):
    return [word for word,tag in sentence]


def prepareData(tagged_sentences):
    X,y=[],[]
    for sentences in tagged_sentences:
        X.append([features(untag(sentences), index) for index in range(len(sentences))])
        y.append([tag for word,tag in sentences])
    return X,y

In [11]:
X_train,y_train=prepareData(train_set)
X_test,y_test=prepareData(test_set)

In [12]:
X_train[0]

[{'is_alphanumeric': 0,
  'is_complete_capital': 0,
  'is_first_capital': 1,
  'is_first_word': 1,
  'is_last_word': 0,
  'is_numeric': 0,
  'next_word': 'Fairlawn',
  'prefix_1': 'T',
  'prefix_2': 'Th',
  'prefix_3': 'The',
  'prefix_4': 'The',
  'prev_word': '',
  'suffix_1': 'e',
  'suffix_2': 'he',
  'suffix_3': 'The',
  'suffix_4': 'The',
  'word_has_hyphen': 0},
 {'is_alphanumeric': 0,
  'is_complete_capital': 0,
  'is_first_capital': 1,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_numeric': 0,
  'next_word': ',',
  'prefix_1': 'F',
  'prefix_2': 'Fa',
  'prefix_3': 'Fai',
  'prefix_4': 'Fair',
  'prev_word': 'The',
  'suffix_1': 'n',
  'suffix_2': 'wn',
  'suffix_3': 'awn',
  'suffix_4': 'lawn',
  'word_has_hyphen': 0},
 {'is_alphanumeric': 0,
  'is_complete_capital': 1,
  'is_first_capital': 0,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_numeric': 0,
  'next_word': 'Ohio-based',
  'prefix_1': ',',
  'prefix_2': ',',
  'prefix_3': ',',
  'prefix_4': ',',
  'prev_word':

In [13]:
y_train[0]

['DET',
 'NOUN',
 '.',
 'ADJ',
 'NOUN',
 'ADV',
 'VERB',
 'DET',
 'ADJ',
 'NOUN',
 'ADP',
 'VERB',
 'NOUN',
 'VERB',
 'VERB',
 'ADV',
 'ADP',
 'ADJ',
 'NOUN',
 'PRT',
 '.',
 'NUM',
 'NUM',
 'X',
 '.']

# Fitting the CRF model

In [15]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.01, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [16]:
y_pred=crf.predict(X_test)

In [17]:
metrics.flat_f1_score(y_test, y_pred,average='weighted',labels=crf.classes_)

0.9715055868300791

In [18]:

y_pred_train=crf.predict(X_train)
metrics.flat_f1_score(y_train, y_pred_train,average='weighted',labels=crf.classes_)

0.9965169675831768

THE CRF Model had an F1 score of 0.97 on the test data and 0.996 on the train data. There is overfitting and we have to tune this model.

In [19]:
metrics.flat_accuracy_score(y_test,y_pred)

0.9716345505996496

In [20]:
metrics.flat_accuracy_score(y_train,y_pred_train)

0.9965207347306739

In [21]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels=crf.classes_, digits=3
))

              precision    recall  f1-score   support

         DET      0.992     0.991     0.992      2541
        NOUN      0.963     0.975     0.969      8612
           .      1.000     0.999     1.000      3529
         ADJ      0.909     0.864     0.886      1929
         ADV      0.902     0.908     0.905       903
        VERB      0.963     0.957     0.960      3950
         ADP      0.976     0.982     0.979      2810
         PRT      0.980     0.983     0.982       961
         NUM      0.994     0.987     0.991      1023
           X      1.000     0.996     0.998      1912
        CONJ      0.994     0.996     0.995       713
        PRON      0.998     0.998     0.998       801

    accuracy                          0.972     29684
   macro avg      0.973     0.970     0.971     29684
weighted avg      0.971     0.972     0.972     29684



Adjectives have a low precision, recall and F1 score

In [22]:
print("Number of Transition Features ")
len(crf.transition_features_)

Number of Transition Features 


144

In [24]:
Counter(crf.transition_features_).most_common(10)

[(('ADJ', 'NOUN'), 4.060954),
 (('NOUN', 'NOUN'), 2.838974),
 (('NOUN', 'VERB'), 2.714596),
 (('VERB', 'PRT'), 2.663168),
 (('NOUN', 'PRT'), 2.273041),
 (('ADP', 'NOUN'), 2.235203),
 (('DET', 'NOUN'), 2.067339),
 (('PRON', 'VERB'), 2.052993),
 (('NUM', 'NOUN'), 1.982179),
 (('X', 'VERB'), 1.905777)]

An adjective is more likely to be followed by a noun

In [26]:
Counter(crf.transition_features_).most_common()[-10:]

[(('PRT', 'NUM'), -1.592485),
 (('PRT', 'PRT'), -1.601084),
 (('PRON', 'DET'), -1.619706),
 (('DET', 'ADP'), -1.894368),
 (('PRON', 'PRT'), -1.951926),
 (('X', 'PRT'), -1.964785),
 (('CONJ', 'X'), -3.2181),
 (('ADP', 'X'), -3.283691),
 (('.', 'PRT'), -3.395523),
 (('DET', 'PRT'), -4.193892)]

#What are the most likely state features

In [27]:
print("Number of State Features ",len(crf.state_features_))

Number of State Features  30440


In [29]:
Counter(crf.state_features_).most_common(10)

[(('prev_word:will', 'VERB'), 6.36682),
 (('prefix_1:*', 'X'), 5.787324),
 (('prev_word:would', 'VERB'), 5.767208),
 (('suffix_2:ly', 'ADV'), 5.344567),
 (('prev_word:could', 'VERB'), 5.225837),
 (('is_first_capital', 'NOUN'), 5.159361),
 (('prev_word:can', 'VERB'), 4.818747),
 (('suffix_4:rest', 'NOUN'), 4.777643),
 (('next_word:1929', 'NOUN'), 4.756875),
 (('suffix_4:ment', 'NOUN'), 4.614442)]

A word will is more likely to be a verb, if a word has first word as a capital it is more likely to be a noun


In [30]:
Counter(crf.state_features_).most_common()[-10:]

[(('prev_word:--', 'DET'), -2.687418),
 (('prev_word:*U*', 'VERB'), -2.946117),
 (('prev_word:--', 'CONJ'), -3.122758),
 (('prev_word:moderate', 'NOUN'), -3.171468),
 (('suffix_4:rter', 'ADJ'), -3.181243),
 (('next_word:of', 'PRT'), -3.218117),
 (('suffix_4:less', 'NOUN'), -3.25398),
 (('next_word:swap', 'ADJ'), -3.457023),
 (('prev_word:his', 'VERB'), -3.518914),
 (('word_has_hyphen', 'VERB'), -3.978556)]

It is less likely that a word with a hyphen is a verb and it is also likely that his is followed by a verb