In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

In [None]:
# Toy corpus of four short sentences. It is small enough to check by eye and is
# used to compare sklearn's CountVectorizer with the hand-written vectoriser.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

## Sklearn vectoriser

In [None]:
# Learn the vocabulary from the toy corpus. fit() returns the fitted
# estimator, so construction and fitting are chained.
vectoriser = CountVectorizer().fit(corpus)
print(vectoriser.vocabulary_)

# Encode the same corpus as a sparse document-term count matrix and display it.
X_train = vectoriser.transform(corpus)
X_train

In [None]:
# Dense copy of the sklearn counts, kept so it can be compared with the hand-written vectoriser.
embedding = vectoriser.transform(corpus).toarray()

## My simple vectoriser

In [None]:
class MyCountVectoriser(object):
    """Simple word-level count vectoriser ignoring uppercases.
    Just for demonstration purposes.

    Documents are lower-cased, stripped of the characters '.', ',' and '?',
    and split on whitespace. ``fit`` stores the sorted list of distinct words
    in ``self.vocab``; ``transform`` counts how often each vocabulary word
    occurs in each document.
    """

    # Translation table that deletes the punctuation characters we ignore.
    _PUNCT_TABLE = str.maketrans('', '', '.,?')

    def __init__(self) -> None:
        pass

    def _tokenise(self, doc: str) -> list[str]:
        """Return the lower-cased, punctuation-free words of ``doc``."""
        # split() without an argument collapses runs of whitespace, so repeated
        # spaces or an empty document never produce an empty-string token.
        return doc.translate(self._PUNCT_TABLE).lower().split()

    def fit(self, corpus: list[str]):
        """Learn the vocabulary of ``corpus``.

        Args:
            corpus: documents to scan for words.

        Returns:
            The alphabetically sorted list of distinct words, which is also
            stored as ``self.vocab``.
        """
        # A set gives O(1) duplicate detection instead of scanning a list.
        distinct_words = {word for doc in corpus for word in self._tokenise(doc)}
        self.vocab = sorted(distinct_words)

        return self.vocab

    def transform(self, corpus: list[str]):
        """Encode ``corpus`` as a document-term count matrix.

        Args:
            corpus: documents to encode. Words absent from the vocabulary are
                ignored.

        Returns:
            A scipy CSR matrix of dtype uint32 and shape
            ``(len(corpus), len(self.vocab))``; entry ``[i, j]`` is the number
            of times ``self.vocab[j]`` occurs in ``corpus[i]``.

        Raises:
            AttributeError: if ``fit`` has not been called yet.
        """
        if not hasattr(self, 'vocab'):
            # Fail immediately with a clear message instead of printing and
            # then crashing later on an undefined local.
            raise AttributeError('Vocabulary has not been defined. Call .fit method first.')

        # word -> column lookup, so each word costs O(1) instead of list.index.
        word2ind = {word: ind for ind, word in enumerate(self.vocab)}

        # LIL supports cheap incremental writes; convert to CSR at the end.
        counts = sparse.lil_matrix((len(corpus), len(self.vocab)), dtype=np.uint32)

        for row, doc in enumerate(corpus):
            for word in self._tokenise(doc):
                col = word2ind.get(word)
                if col is not None:
                    counts[row, col] += 1

        return counts.tocsr()

In [None]:
my_vectoriser = MyCountVectoriser()
print(my_vectoriser.fit(corpus=corpus))  # fit returns the learned vocabulary
# Sanity check with a document whose only word was never seen during fit.
my_vectoriser.transform(['asdasd'])

In [None]:
# Dense counts from the hand-written vectoriser, followed by a display of its vocabulary.
my_embedding = my_vectoriser.transform(corpus).toarray()
my_vectoriser.vocab

In [None]:
# Compare the hand-written counts with sklearn's. np.array_equal also checks
# that the shapes match, so differing vocabularies simply report False instead
# of depending on how an elementwise `==` behaves for mismatched shapes.
print(f'Embeddings agree?: {np.array_equal(my_embedding, embedding)}')

# Application to the BBC dataset

In [None]:
# Data needed further down: wordnet for the lemmatiser, punkt for
# word_tokenize and the averaged perceptron tagger for nltk.pos_tag.
# NLTK 3.9+ looks up the 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# variants instead, so both generations are requested. An id unknown to the
# installed NLTK should just be reported by the downloader and skipped.
for resource in (
    'wordnet',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

In [None]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# Fetch the CSV into the working directory; -nc (no-clobber) skips the download if the file already exists.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

In [None]:
# Load the downloaded CSV and preview its first rows.
df = pd.read_csv('bbc_text_cls.csv')
df.head()

In [None]:
# The 'text' column is the model input; the 'labels' column is the classification target.
inputs = df['text']
labels = df['labels']

In [None]:
# do a histogram of the labels to determine if we have imbalanced classes
# (strong imbalance would make the plain scores printed further down harder to interpret)
labels.hist(figsize=(10, 5))

In [None]:
# Hold out a test set. The fixed random_state makes the split reproducible across runs;
# no test_size is given, so train_test_split's default proportion applies.
inputs_train, inputs_test, labels_train, labels_test = train_test_split(inputs, labels, random_state=123)

## Default settings

In [None]:
# Learn the vocabulary from the training texts only, then reuse it to encode the test texts.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)

In [None]:
print(np.prod(X_train.shape))  # total number of cells: rows x columns
print(X_train.size)  # presumably X_train is a scipy sparse matrix here, where .size is the number of stored values

In [None]:
# Count the non-zero cells once and reuse the result for both messages.
n_nonzero = (X_train != 0).sum()
n_cells = np.prod(X_train.shape)
print(f'there are {n_nonzero} non-zero entries in the count-vectoriser matrix')
# Scaled by 100 so the printed value really is the percentage its label promises.
print(f'percentage of non-zero entries: {100 * n_nonzero / n_cells:.3f}%')

In [None]:
# Fit multinomial naive Bayes on the count features (fit() returns the fitted
# estimator) and report its score on both splits.
model = MultinomialNB().fit(X_train, labels_train)
print(
    f'Training score: {model.score(X_train, labels_train)}',
    f'Test score: {model.score(X_test, labels_test)}',
    sep='\n',
)

## With stop-word removal

In [None]:
# Same pipeline as the default run, except that sklearn's built-in English
# stop-word list is dropped from the vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)

# fit() returns the fitted estimator, so it can be chained onto the constructor.
model = MultinomialNB().fit(X_train, labels_train)
print(
    f'Training score: {model.score(X_train, labels_train)}',
    f'Test score: {model.score(X_test, labels_test)}',
    sep='\n',
)

## Lemmatisation

To achieve good lemmatisation, we need to specify the part of speech (POS) of each word. Of course, we don't want to do this manually; NLTK includes a POS tagger that does it for us. A small practical difficulty is that the tags produced by the tagger (Penn Treebank tags such as `NN` or `VBD`) do not correspond to the WordNet POS constants that the lemmatiser expects. Hence, we need to create a mapping between them:

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a tagger tag to the WordNet part-of-speech constant.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
class LemmaTokeniser(object):
    """Callable tokeniser that lemmatises each token according to its part of speech."""

    def __init__(self) -> None:
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc) -> list[str]:
        """Takes in a document and returns a list of lemmatised tokens."""
        # Tokenise first, then tag, so that every token carries a POS tag.
        tagged_tokens = nltk.pos_tag(word_tokenize(doc))

        lemmas = []
        for token, treebank_tag in tagged_tokens:
            # The lemmatiser needs a WordNet POS, not the tagger's own tag.
            wordnet_pos = get_wordnet_pos(treebank_tag)
            lemmas.append(self.wnl.lemmatize(token, pos=wordnet_pos))
        return lemmas

In [None]:
# CountVectorizer accepts any callable as tokenizer. token_pattern is not used
# once a tokenizer is supplied, so it is set to None explicitly; otherwise
# sklearn warns that the parameter will be ignored.
vectorizer = CountVectorizer(
    stop_words='english',
    tokenizer=LemmaTokeniser(),
    token_pattern=None,
)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)

# Train on the lemmatised counts and report the score on both splits.
model = MultinomialNB()
model.fit(X_train, labels_train)
print(f'Training score: {model.score(X_train, labels_train)}')
print(f'Test score: {model.score(X_test, labels_test)}')

## Stemmer

In [None]:
class Stemmer(object):
    """Callable tokeniser that reduces each token to its Porter stem."""

    def __init__(self) -> None:
        self.stemmer = PorterStemmer()

    def __call__(self, doc) -> list[str]:
        """Takes in a document and returns a list of stemmed tokens."""
        stems = []
        for token in word_tokenize(doc):
            stems.append(self.stemmer.stem(token))
        return stems

In [None]:
# CountVectorizer accepts any callable as tokenizer. token_pattern is not used
# once a tokenizer is supplied, so it is set to None explicitly; otherwise
# sklearn warns that the parameter will be ignored.
vectorizer = CountVectorizer(
    stop_words='english',
    tokenizer=Stemmer(),
    token_pattern=None,
)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)

# Train on the stemmed counts and report the score on both splits.
model = MultinomialNB()
model.fit(X_train, labels_train)
print(f'Training score: {model.score(X_train, labels_train)}')
print(f'Test score: {model.score(X_test, labels_test)}')

## Plain split with no preprocessing

In [None]:
def splitter(doc):
    """Tokenise a document by cutting it at runs of whitespace.

    No other processing is applied, so punctuation stays attached to the
    neighbouring word. Returns the resulting list of tokens.
    """
    tokens = doc.split()
    return tokens

In [None]:
# CountVectorizer accepts any callable as tokenizer. token_pattern is not used
# once a tokenizer is supplied, so it is set to None explicitly; otherwise
# sklearn warns that the parameter will be ignored.
# Note that CountVectorizer still lower-cases the text (its default) and drops
# English stop words here; only the tokenisation is a bare whitespace split.
vectorizer = CountVectorizer(
    stop_words='english',
    tokenizer=splitter,
    token_pattern=None,
)
X_train = vectorizer.fit_transform(inputs_train)
X_test = vectorizer.transform(inputs_test)

# Train on the whitespace-split counts and report the score on both splits.
model = MultinomialNB()
model.fit(X_train, labels_train)
print(f'Training score: {model.score(X_train, labels_train)}')
print(f'Test score: {model.score(X_test, labels_test)}')