In [1]:
import sys
print("Python:", sys.version)

import numpy as np
print("NumPy:", np.__version__)

import pandas as pd
print("Pandas:", pd.__version__)

import sklearn
print("Scikit-learn:", sklearn.__version__)

import spacy
print("SpaCy:", spacy.__version__)

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

Python: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
NumPy: 1.16.4
Pandas: 0.25.3
Scikit-learn: 0.23.2
SpaCy: 2.3.4


In [2]:
import nltk
#nltk.download('brown')
#nltk.download('universal_tagset')
from nltk.corpus import brown

### Load data

In [3]:
brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [4]:
nlp = spacy.load("en")

In [5]:
brown.tagged_words(tagset='universal')[:3]

[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN')]

### Split data into train, test set

In [6]:
data = brown.tagged_sents(tagset='universal')
data_size = len(data)
train_size = int(data_size * 0.8)
train, test = data[:train_size], data[train_size:]
print(f"Train size: {len(train)}")
print(f"Test size: {len(test)}")

Train size: 45872
Test size: 11468


### Define feature transform functions

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2pos(sent):
    return [postag for token, postag in sent]

def sent2tokens(sent):
    return [token for token, postag in sent]

In [8]:
sent2features(data[0])[0]

{'bias': 1.0,
 'word.lower()': 'the',
 'word[-3:]': 'The',
 'word[-2:]': 'he',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'DET',
 'postag[:2]': 'DE',
 'BOS': True,
 '+1:word.lower()': 'fulton',
 '+1:word.istitle()': True,
 '+1:word.isupper()': False,
 '+1:postag': 'NOUN',
 '+1:postag[:2]': 'NO'}

### Get features

In [9]:
X_train = [sent2features(s) for s in train]
y_train = [sent2pos(s) for s in train]

In [10]:
X_test = [sent2features(s) for s in test]
y_test = [sent2pos(s) for s in test]

### Training

In [11]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [12]:
crf.classes_

['DET',
 'NOUN',
 'ADJ',
 'VERB',
 'ADP',
 '.',
 'ADV',
 'CONJ',
 'PRT',
 'PRON',
 'NUM',
 'X']

### Testing

In [13]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted')

1.0

In [14]:
X_test[0]

[{'bias': 1.0,
  'word.lower()': 'the',
  'word[-3:]': 'The',
  'word[-2:]': 'he',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'DET',
  'postag[:2]': 'DE',
  'BOS': True,
  '+1:word.lower()': 'quarrel',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'NOUN',
  '+1:postag[:2]': 'NO'},
 {'bias': 1.0,
  'word.lower()': 'quarrel',
  'word[-3:]': 'rel',
  'word[-2:]': 'el',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'NOUN',
  'postag[:2]': 'NO',
  '-1:word.lower()': 'the',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:postag': 'DET',
  '-1:postag[:2]': 'DE',
  '+1:word.lower()': 'ended',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'VERB',
  '+1:postag[:2]': 'VE'},
 {'bias': 1.0,
  'word.lower()': 'ended',
  'word[-3:]': 'ded',
  'word[-2:]': 'ed',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()'

### Running on custom text

In [15]:
def transform(text):
    if isinstance(text, spacy.tokens.doc.Doc):
        # already tokenized
        return [token for token in text]
    return [word for word in text.split(' ')]

def predict(text):
    pre = transform(text)
    post = sent2features(pre)
    return crf.predict(post)

In [16]:
predict("Mom pick me up I'm scared")

[['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
 ['X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X'],
 ['X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X'],
 ['X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X'],
 ['X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X',
  'X'],
 ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]