In [None]:
import re

import numpy as np
import scipy.sparse as sparse
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus: four short sentences that share most of their words and differ
# in case and trailing punctuation. The same list is fed to both vectorisers
# below so that their outputs can be compared.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

## Sklearn vectoriser

In [None]:
# Learn the vocabulary from the corpus; `fit` hands back the fitted estimator,
# so construction and fitting can be chained.
vectoriser = CountVectorizer().fit(corpus)
print(vectoriser.vocabulary_)  # mapping of token -> column index

# Document-term count matrix of the training corpus (sparse); leaving it as
# the cell's last expression displays its summary repr.
X_train = vectoriser.transform(corpus)
X_train

In [None]:
# Dense (documents x vocabulary) count array from the sklearn vectoriser; kept
# as the reference result that the hand-written vectoriser is compared against.
embedding = vectoriser.transform(corpus).toarray()

## My simple vectoriser

In [None]:
class MyCountVectoriser(object):
    """Simple word-level count vectoriser that ignores letter case.

    Just for demonstration purposes. Each document is lower-cased and split
    into tokens with a regular expression. The default pattern is the one
    sklearn's ``CountVectorizer`` uses by default (runs of two or more word
    characters), so punctuation and repeated whitespace never end up in the
    vocabulary.

    Args:
        token_pattern: regular expression describing one token. It should not
            contain capturing groups (use ``(?:...)``), because tokens are
            extracted with ``findall``.

    Attributes:
        token_pattern: the pattern string passed to the constructor.
        vocab: sorted list of unique tokens; only exists after ``fit``.
    """

    def __init__(self, token_pattern: str = r'(?u)\b\w\w+\b') -> None:
        self.token_pattern = token_pattern
        # Compiled once so fit/transform do not re-parse the pattern per call.
        self._token_re = re.compile(token_pattern)

    def _tokenise(self, doc: str) -> list[str]:
        """Lower-case ``doc`` and return its tokens in order of appearance."""
        return self._token_re.findall(doc.lower())

    def fit(self, corpus: list[str]) -> list[str]:
        """Build the vocabulary from ``corpus``.

        Args:
            corpus: the documents to learn the vocabulary from.

        Returns:
            The alphabetically sorted list of unique tokens. The same list
            object is stored on the instance as ``self.vocab``.
        """
        unique_tokens = set()
        for doc in corpus:
            unique_tokens.update(self._tokenise(doc))

        self.vocab = sorted(unique_tokens)
        return self.vocab

    def transform(self, corpus: list[str]) -> sparse.csr_matrix:
        """Count vocabulary tokens in every document of ``corpus``.

        Args:
            corpus: the documents to vectorise.

        Returns:
            A CSR matrix of dtype ``uint32`` with one row per document and one
            column per vocabulary entry (in ``self.vocab`` order). Tokens that
            are not in the vocabulary are ignored.

        Raises:
            AttributeError: if ``fit`` has not been called yet.
        """
        vocab = getattr(self, 'vocab', None)
        if vocab is None:
            # Fail loudly here instead of printing and then crashing later on
            # an undefined matrix width.
            raise AttributeError(
                'Vocabulary has not been defined. Call .fit method first.'
            )

        # token -> column lookup; setdefault keeps the first position of a
        # token, which is what list.index would return.
        column_of = {}
        for col, token in enumerate(vocab):
            column_of.setdefault(token, col)

        # LIL format supports cheap element-wise assignment while filling.
        counts = sparse.lil_matrix((len(corpus), len(vocab)), dtype=np.uint32)

        for row, doc in enumerate(corpus):
            # Tally per document first so each matrix cell is written once.
            doc_counts = {}
            for token in self._tokenise(doc):
                col = column_of.get(token)
                if col is None:
                    continue  # out-of-vocabulary token: ignored on purpose
                doc_counts[col] = doc_counts.get(col, 0) + 1
            for col, count in doc_counts.items():
                counts[row, col] = count

        return counts.tocsr()

In [None]:
# Fit the hand-written vectoriser on the toy corpus; `fit` returns the sorted
# vocabulary list, which is printed for inspection.
my_vectoriser = MyCountVectoriser()
print(my_vectoriser.fit(corpus=corpus))
# Vectorise a document whose only word does not occur anywhere in the corpus.
my_vectoriser.transform(['asdasd'])

In [None]:
# Dense count array produced by the hand-written vectoriser for the corpus.
my_embedding = my_vectoriser.transform(corpus).toarray()
# Show the learned vocabulary (the cell's last expression is displayed).
my_vectoriser.vocab

In [None]:
# np.array_equal checks shapes as well as values, so two vocabularies of
# different size are reported as a disagreement (False) instead of relying on
# an element-wise `==` that cannot broadcast.
print(f'Embeddings agree?: {np.array_equal(my_embedding, embedding)}')