In [None]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn_crfsuite-0.3.6


In [None]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package bcp47 to /root/nltk_data...
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [None]:
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')

In [None]:
tagged_sentence

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], ...]

In [None]:
print("Number of Tagged Sentences ",len(tagged_sentence))
tagged_words=[tup for sent in tagged_sentence for tup in sent]
print("Total Number of Tagged words", len(tagged_words))
vocab=set([word for word,tag in tagged_words])
print("Vocabulary of the Corpus",len(vocab))
tags=set([tag for word,tag in tagged_words])
print("Number of Tags in the Corpus ",len(tags))

Number of Tagged Sentences  3914
Total Number of Tagged words 100676
Vocabulary of the Corpus 12408
Number of Tags in the Corpus  12


In [None]:
print(tags)

{'X', 'ADV', 'DET', 'VERB', '.', 'NUM', 'CONJ', 'PRON', 'NOUN', 'ADP', 'ADJ', 'PRT'}


In [None]:
train_set, test_set = train_test_split(tagged_sentence,test_size=0.2,random_state=1234)
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  3131
Number of Sentences in Testing Data  783


In [None]:
def features(sentence,index):
    ### sentence is of the form [w1,w2,w3,..], index is the position of the word in the sentence
    return {
        'is_first_capital':int(sentence[index][0].isupper()),
        'is_first_word': int(index==0),
        'is_last_word':int(index==len(sentence)-1),
        'is_complete_capital': int(sentence[index].upper()==sentence[index]),
        'prev_word':'' if index==0 else sentence[index-1],
        'next_word':'' if index==len(sentence)-1 else sentence[index+1],
        'is_numeric':int(sentence[index].isdigit()),
        'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
        'prefix_1':sentence[index][0],
        'prefix_2': sentence[index][:2],
        'prefix_3':sentence[index][:3],
        'prefix_4':sentence[index][:4],
        'suffix_1':sentence[index][-1],
        'suffix_2':sentence[index][-2:],
        'suffix_3':sentence[index][-3:],
        'suffix_4':sentence[index][-4:],
        'word_has_hyphen': 1 if '-' in sentence[index] else 0


    }

In [None]:
def untag(sentence):
    return [word for word,tag in sentence]


def prepareData(tagged_sentences):
    X,y=[],[]
    for sentences in tagged_sentences:
        X.append([features(untag(sentences), index) for index in range(len(sentences))])
        y.append([tag for word,tag in sentences])
    return X,y

In [None]:

X_train,y_train=prepareData(train_set)
X_test,y_test=prepareData(test_set)

In [None]:
X_train[0]

[{'is_first_capital': 1,
  'is_first_word': 1,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': '',
  'next_word': 'Wall',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'O',
  'prefix_2': 'On',
  'prefix_3': 'On',
  'prefix_4': 'On',
  'suffix_1': 'n',
  'suffix_2': 'On',
  'suffix_3': 'On',
  'suffix_4': 'On',
  'word_has_hyphen': 0},
 {'is_first_capital': 1,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': 'On',
  'next_word': 'Street',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'W',
  'prefix_2': 'Wa',
  'prefix_3': 'Wal',
  'prefix_4': 'Wall',
  'suffix_1': 'l',
  'suffix_2': 'll',
  'suffix_3': 'all',
  'suffix_4': 'Wall',
  'word_has_hyphen': 0},
 {'is_first_capital': 1,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': 'Wall',
  'next_word': 'men',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'S',
  'prefix_2': 'St',
  'prefix_3': 'Str',
  'prefix_4': 'Str

In [None]:
y_train[0]

['ADP',
 'NOUN',
 'NOUN',
 'NOUN',
 'CONJ',
 'NOUN',
 'VERB',
 'ADP',
 'ADJ',
 'NOUN',
 '.',
 'X',
 'VERB',
 'NUM',
 'DET',
 'ADV',
 'ADV',
 'PRON',
 'VERB',
 'ADP',
 'NOUN',
 'X',
 '.']

In [None]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

AttributeError: ignored

AttributeError: ignored

AttributeError: ignored

In [None]:
!pip install scikit-learn==0.22.2 --user

[31mERROR: Could not find a version that satisfies the requirement scikit-learn==0.22.2 (from versions: 0.9, 0.10, 0.11, 0.12, 0.12.1, 0.13, 0.13.1, 0.14, 0.14.1, 0.15.0, 0.15.1, 0.15.2, 0.16.0, 0.16.1, 0.17, 0.17.1, 0.18, 0.18.1, 0.18.2, 0.19.0, 0.19.1, 0.19.2, 0.20.0, 0.20.1, 0.20.2, 0.20.3, 0.20.4, 0.21.1, 0.21.2, 0.21.3, 0.22, 0.22.1, 0.22.2.post1, 0.23.0, 0.23.1, 0.23.2, 0.24.0, 0.24.1, 0.24.2, 1.0, 1.0.1, 1.0.2, 1.1.0, 1.1.1, 1.1.2, 1.1.3, 1.2.0rc1, 1.2.0, 1.2.1, 1.2.2, 1.3.0rc1, 1.3.0)[0m[31m
[0m[31mERROR: No matching distribution found for scikit-learn==0.22.2[0m[31m
[0m

In [None]:
y_pred=crf.predict(X_test)
y_pred

[['NOUN', 'NOUN', 'NOUN', 'NOUN', '.', 'NOUN', '.'],
 ['ADP',
  'ADJ',
  'NOUN',
  'NOUN',
  'PRON',
  'NOUN',
  'X',
  'VERB',
  'ADP',
  'DET',
  'NOUN',
  'NOUN',
  '.',
  'NOUN',
  'ADP',
  'NOUN',
  'NOUN',
  '.',
  'VERB',
  'X',
  'ADP',
  'NOUN',
  '.',
  'VERB',
  'PRON',
  'NOUN',
  'VERB',
  'NUM',
  'NUM',
  'PRT',
  'NUM',
  'NUM',
  '.'],
 ['NOUN',
  'NOUN',
  'PRON',
  'X',
  'VERB',
  'DET',
  'NOUN',
  'ADV',
  'VERB',
  'ADJ',
  'ADP',
  'DET',
  'VERB',
  'ADP',
  'NOUN',
  'NOUN',
  'CONJ',
  'DET',
  'NOUN',
  'NOUN',
  'ADP',
  'NOUN',
  '.'],
 ['NOUN', 'NOUN', 'NOUN', 'NOUN', '.', 'NOUN', '.'],
 ['DET',
  'NOUN',
  'CONJ',
  'NOUN',
  'NOUN',
  'VERB',
  'VERB',
  'X',
  'PRT',
  'VERB',
  'X',
  'VERB',
  'ADV',
  'ADJ',
  'NOUN',
  '.',
  'CONJ',
  'DET',
  'NOUN',
  'CONJ',
  'NOUN',
  'NOUN',
  'ADJ',
  'ADJ',
  'NOUN',
  '.'],
 ['ADP',
  'NOUN',
  '.',
  'DET',
  'NOUN',
  'VERB',
  'ADP',
  'ADP',
  '.',
  'NUM',
  'NUM',
  'CONJ',
  '.',
  'NUM',
  'NUM',


In [None]:
metrics.flat_f1_score(y_test, y_pred,average='weighted',labels=crf.classes_)

0.9738471726864286

In [None]:
y_pred_train=crf.predict(X_train)
metrics.flat_f1_score(y_train, y_pred_train,average='weighted',labels=crf.classes_)

0.9963402924209424

In [None]:
print(metrics.flat_classification_report(
    y_test, y_pred#, labels=crf.classes_, digits=3
))

TypeError: ignored

In [None]:
print("Number of Transition Features ")
len(crf.transition_features_)

Number of Transition Features 


144

In [None]:
Counter(crf.transition_features_).most_common(20)

[(('ADJ', 'NOUN'), 4.114996),
 (('NOUN', 'NOUN'), 2.935448),
 (('NOUN', 'VERB'), 2.891987),
 (('VERB', 'PRT'), 2.519179),
 (('X', 'VERB'), 2.271558),
 (('ADP', 'NOUN'), 2.265833),
 (('NOUN', 'PRT'), 2.172849),
 (('PRON', 'VERB'), 2.117186),
 (('NUM', 'NOUN'), 2.059221),
 (('DET', 'NOUN'), 2.053832),
 (('ADV', 'VERB'), 1.994419),
 (('ADV', 'ADJ'), 1.957063),
 (('NOUN', 'ADP'), 1.838684),
 (('VERB', 'NOUN'), 1.763319),
 (('ADJ', 'ADJ'), 1.660578),
 (('NOUN', 'CONJ'), 1.591359),
 (('PRT', 'NOUN'), 1.398473),
 (('NOUN', '.'), 1.381863),
 (('NOUN', 'ADV'), 1.380086),
 (('ADV', 'ADV'), 1.301282)]

In [None]:

Counter(crf.transition_features_).most_common()[-20:]

[(('X', 'NOUN'), -1.136906),
 (('CONJ', 'PRT'), -1.140622),
 (('ADJ', 'DET'), -1.146271),
 (('.', 'DET'), -1.255028),
 (('ADJ', 'PRON'), -1.266624),
 (('PRON', 'DET'), -1.330807),
 (('DET', '.'), -1.336752),
 (('CONJ', '.'), -1.368327),
 (('ADP', 'PRT'), -1.392629),
 (('X', 'NUM'), -1.484666),
 (('DET', 'DET'), -1.509759),
 (('PRT', 'PRT'), -1.522135),
 (('PRT', 'NUM'), -1.562026),
 (('DET', 'ADP'), -1.969625),
 (('X', 'PRT'), -2.096541),
 (('CONJ', 'X'), -2.157477),
 (('PRON', 'PRT'), -2.158365),
 (('ADP', 'X'), -3.107295),
 (('.', 'PRT'), -3.193167),
 (('DET', 'PRT'), -4.377446)]

In [None]:
print("Number of State Features ",len(crf.state_features_))

Number of State Features  32413


In [None]:
Counter(crf.state_features_).most_common(20)

[(('prev_word:will', 'VERB'), 6.751359),
 (('prev_word:would', 'VERB'), 5.940819),
 (('prefix_1:*', 'X'), 5.830558),
 (('suffix_4:rest', 'NOUN'), 5.644523),
 (('suffix_2:ly', 'ADV'), 5.260228),
 (('is_first_capital', 'NOUN'), 5.043121),
 (('prev_word:could', 'VERB'), 5.018842),
 (('suffix_3:ous', 'ADJ'), 4.870949),
 (('prev_word:to', 'VERB'), 4.849822),
 (('suffix_4:will', 'VERB'), 4.677684),
 (('next_word:appeal', 'ADJ'), 4.386434),
 (('prev_word:how', 'PRT'), 4.35094),
 (('suffix_4:pany', 'NOUN'), 4.329975),
 (('prefix_4:many', 'ADJ'), 4.205028),
 (('prev_word:lock', 'PRT'), 4.153643),
 (('word_has_hyphen', 'ADJ'), 4.151036),
 (('prev_word:tune', 'PRT'), 4.147576),
 (('next_word:Express', 'NOUN'), 4.137127),
 (('suffix_4:food', 'NOUN'), 4.116688),
 (('suffix_2:ed', 'VERB'), 4.070659)]

In [None]:

Counter(crf.state_features_).most_common()[-20:]

[(('suffix_4:less', 'NOUN'), -2.430638),
 (('prev_word:*', 'DET'), -2.435687),
 (('prev_word:moderate', 'NOUN'), -2.517772),
 (('prev_word:paid', 'ADP'), -2.533975),
 (('suffix_4:ment', 'ADJ'), -2.572212),
 (('prev_word:was', 'NOUN'), -2.586244),
 (('prev_word:--', 'CONJ'), -2.58728),
 (('next_word:what', 'CONJ'), -2.621051),
 (('prev_word:--', 'DET'), -2.692732),
 (('prev_word:Media', 'VERB'), -2.6973),
 (('prefix_4:shor', 'NOUN'), -2.698477),
 (('prev_word:their', 'VERB'), -2.714216),
 (('next_word:currency', 'NOUN'), -2.732162),
 (('suffix_4:good', 'NOUN'), -2.809532),
 (('suffix_4:rter', 'ADJ'), -3.174431),
 (('prev_word:*U*', 'VERB'), -3.205405),
 (('next_word:of', 'PRT'), -3.22855),
 (('next_word:swap', 'ADJ'), -3.474744),
 (('prev_word:his', 'VERB'), -3.683731),
 (('word_has_hyphen', 'VERB'), -4.63526)]