# Importing the libraries that the program needs

In [21]:
import sys
print("Python:", sys.version)

import numpy as np
print("NumPy:", np.__version__)

import pandas as pd
print("Pandas:", pd.__version__)

import sklearn
print("Scikit-learn:", sklearn.__version__)

import spacy
from spacy.tokens import Doc
print("SpaCy:", spacy.__version__)

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

Python: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
NumPy: 1.16.4
Pandas: 0.25.3
Scikit-learn: 0.23.2
SpaCy: 2.3.4


# Downloading the brown corpus

In [22]:
import nltk
#nltk.download('brown')
#nltk.download('universal_tagset')
from nltk.corpus import brown

### Load the data with the universal tagset, so that every word comes with its POS tag; we will need these tags as labels for training

In [23]:
# Peek at the corpus: a sequence of (word, universal POS tag) pairs.
brown.tagged_words(tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

### The Brown corpus contains English sentences, so we need to load the English model of spaCy

In [24]:
# English spaCy pipeline: its vocab backs the Docs built from the corpus, and it
# tokenizes/tags custom input text later in the notebook.
# NOTE(review): "en" is a shortcut link name; newer spaCy releases reportedly require
# the full package name (e.g. "en_core_web_sm") -- confirm before upgrading spaCy.
nlp = spacy.load("en")

In [25]:
# Show only the first three (word, tag) pairs as a concrete example.
brown.tagged_words(tagset='universal')[:3]

[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN')]

## Split data into train, test set

### First let's define a function that will do the preprocessing of the brown corpus. This will convert the (str, POS) pairs of the brown corpus into a list of (Doc, POS) pairs. <br> <br>We need the Doc so we can use some special feature extractions like is_stop.

In [26]:
def process_corpus(data):
    """Turn tagged sentences into (Doc, POS tags) pairs.

    Parameters
    ----------
    data : iterable of tagged sentences
        Each sentence is a non-empty iterable of (word, tag) pairs.

    Returns
    -------
    list of (Doc, list of str)
        One entry per sentence: a spaCy ``Doc`` built from the sentence's
        words on the shared ``nlp.vocab`` and the matching list of tags.
    """
    def to_doc_and_tags(tagged_sentence):
        # Unzip [(word, tag), ...] into parallel word and tag lists.
        words, tags = map(list, zip(*tagged_sentence))
        return Doc(nlp.vocab, words), tags

    return [to_doc_and_tags(tagged_sentence) for tagged_sentence in data]

### We go through the corpus and apply our function, after that we split the data into train and test sets using an 80%-20% split. <br><br> We need separate data for training and testing so that we can measure how well the model generalizes to sentences it has not seen during training, and so detect overfitting.

In [27]:
# Convert every tagged Brown sentence into a (Doc, POS tags) pair.
data = process_corpus(brown.tagged_sents(tagset='universal'))

# The first 80% of the sentences (in corpus order, no shuffling) become the
# training set; the remaining sentences become the test set.
data_size = len(data)
train_size = int(data_size * 0.8)
train = data[:train_size]
test = data[train_size:]

print(f"Train size: {len(train)}")
print(f"Test size: {len(test)}")

Train size: 45872
Test size: 11468


### Define feature extraction functions. We use features from the spaCy tokens and from their text. To use string methods like ".lower()" we need each token as a str, which we get from the ".text" attribute of the token. <br><br> We add a "BOS" feature if this is the first word of the sentence and an "EOS" feature if this is the last word of the sentence

In [28]:
def token2features(tokens):
    """Build one CRF feature dict per token of a sentence.

    Parameters
    ----------
    tokens : sized, indexable sequence of token-like objects
        In this notebook a spaCy ``Doc``. Every item must expose ``.text``
        (str), ``.is_stop`` and ``.is_alpha``.

    Returns
    -------
    list of dict
        One dict per token, in sentence order. Each dict holds features of the
        token itself, of the previous token (or ``'BOS'`` for the first token)
        and of the next token (or ``'EOS'`` for the last token). An empty
        sequence yields an empty list.
    """
    def single_token2feature(tokens, i):
        """Return the feature dict for the token at position ``i``."""
        token = tokens[i]
        word = token.text

        features = {
            #'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
            'is_stop': token.is_stop,
            'is_alpha': token.is_alpha,
        }
        if i > 0:
            # Text of the previous *token*. Indexing ``tokens.text`` instead
            # would pick a single character out of the sentence string.
            word1 = tokens[i - 1].text
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
            })
        else:
            # First token: mark the beginning of the sentence.
            features['BOS'] = True

        if i < len(tokens) - 1:
            # Text of the next *token* (same reasoning as for the previous one).
            word1 = tokens[i + 1].text
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
            })
        else:
            # Last token: mark the end of the sentence.
            features['EOS'] = True

        return features
    return [single_token2feature(tokens, i) for i in range(len(tokens))]

### Get features
### We transform our processed training and testing data into features <br><br> X_train and X_test contain the features, while y_train and y_test contain only the POS tags

In [29]:
# Per-sentence feature dict lists (model input) and tag lists (labels) for training.
X_train = [token2features(doc) for doc, _ in train]
y_train = [tags for _, tags in train]

In [30]:
# Same transformation for the held-out test sentences.
X_test = [token2features(doc) for doc, _ in test]
y_test = [tags for _, tags in test]

# Training
### We use the sklearn_crfsuite.CRF for training, the parameters supplied to the function (algorithm, c1, c2, max_iterations and  all_possible_transitions) are hyperparameters which we can use for optimization

In [None]:
# Configure the CRF sequence model. The values below are the hyperparameters
# one would tune.
# NOTE(review): per the sklearn-crfsuite docs, c1/c2 are the L1/L2 regularization
# coefficients and all_possible_transitions also generates transition features
# for tag pairs not seen in the training data -- confirm against the installed version.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Train on the per-sentence feature lists and their tag sequences.
crf.fit(X_train, y_train)

## Display the classes of the model
## Here we should see the POS tags that our model can predict

In [None]:
# Labels known to the fitted model (displayed as the cell output).
crf.classes_

# Testing
### We used X_train, y_train for training the model, now we will use X_test, y_test for testing the model. To do the testing first we predict the POS tags of X_test with our model, then we compare the predicted POS tags to the ground truth POS tags of y_test using the f1 score.

In [None]:
# Predict one tag sequence per test sentence.
y_pred = crf.predict(X_test)
# Weighted-average F1 of predicted vs. gold tags; as the last expression of the
# cell, its value is displayed as the cell output.
metrics.flat_f1_score(y_test, y_pred, average='weighted')

# Running on custom text
## Here we can try our model on some custom text
## First let's define some helper functions that will help us in the transformation of the input text into a format that our model can understand

In [None]:
def pre_process(custom):
    """Return the tokens of ``custom`` together with their spaCy POS tags.

    Parameters
    ----------
    custom : str or iterable of tokens
        A raw string is run through ``nlp``; anything else is used as-is and
        must yield items exposing ``.pos_``.

    Returns
    -------
    tuple
        ``(tokens, postags)`` where ``postags`` is the list of ``token.pos_``
        values, in order.

    NOTE(review): these tags come from spaCy, whose tag inventory may differ
    from the Brown universal tagset the CRF is trained on (e.g. proper nouns,
    punctuation) -- confirm before treating them as ground truth.
    """
    tokens = nlp(custom) if isinstance(custom, str) else custom
    return tokens, [token.pos_ for token in tokens]

def pos_tagger(custom):
    """Tag ``custom`` with the trained CRF.

    Parameters
    ----------
    custom : str or iterable of tokens
        Forwarded unchanged to ``pre_process``.

    Returns
    -------
    tuple
        ``(predictions, postags)``: the result of ``crf.predict`` on a
        one-sentence batch, and the spaCy tags from ``pre_process``.
    """
    tokens, postags = pre_process(custom)
    # The CRF expects a batch of sentences, hence the single-element list.
    single_sentence_batch = [token2features(tokens)]
    return crf.predict(single_sentence_batch), postags

In [None]:
def evaluate_custom(custom):
    """Print the CRF tags for ``custom`` next to spaCy's tags and their agreement.

    Parameters
    ----------
    custom : str or iterable of tokens
        Forwarded unchanged to ``pos_tagger``.

    Prints the predicted tags, the reference tags and the number of positions
    where both agree out of the number of predicted tags. Returns ``None``.
    """
    batch, true = pos_tagger(custom)
    # Only one sentence was submitted, so take its tag sequence out of the batch.
    predict = batch[0]

    # Count the positions where prediction and reference agree.
    correct = sum(1 for idx in range(len(predict)) if predict[idx] == true[idx])

    print(f"Predicted: {predict}")
    print(f"True: {true}")
    print(f"Accuracy: {correct}/{len(predict)}")

# We can use either a string or an already processed spaCy Doc as input

## Example with string

In [None]:
# A raw string is tokenized and tagged by spaCy inside pre_process.
evaluate_custom("Hello this is a test")

## Example with tokens

In [None]:
# Run the spaCy pipeline ourselves and hand over the resulting Doc.
tokens = nlp("Hello this is a test")
evaluate_custom(tokens)

## Try something (type your text here)

In [None]:
# Put the sentence to tag between the quotes.
Text = ""

In [None]:
# Tag the text entered in the previous cell.
evaluate_custom(Text)