In [None]:
import numpy as np
import pandas as pd
from scipy import sparse

from nltk import word_tokenize

In [None]:
# Read the dataset from a CSV file located in the current working directory
# (presumably the BBC text-classification data, judging by the file name).
df = pd.read_csv('bbc_text_cls.csv')
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Keep only the 'text' column; slices of it are passed to the vectoriser further down.
# NOTE(review): assumes each row holds one document as a plain string - confirm.
corpus = df['text']

This [post](https://stats.stackexchange.com/questions/154660/tfidfvectorizer-should-it-be-used-on-train-only-or-traintest)
suggests computing the idf part on the training set only and then applying it, unchanged, to the test set.

In [None]:
class MyTFIDF(object):
    """TF-IDF vectoriser implemented from scratch, for demonstration purposes.

    The vocabulary and the inverse document frequencies are learned in
    ``fit`` and then re-used, unchanged, by ``transform``.  Documents are
    lower-cased before tokenisation in *both* steps, so matching is
    case-insensitive.

    The weights are ``tfidf[d, t] = count(t in d) * idf[t]`` with
    ``idf[t] = log(N / (N_t + 1) + 1)``, where ``N`` is the number of fitted
    documents and ``N_t`` the number of fitted documents containing ``t``.

    Args:
        tokenizer: optional callable mapping a string to a list of tokens.
            When ``None`` (the default) NLTK's ``word_tokenize`` is used.
    """

    # Attributes set by ``fit``:
    #   vocab    - list of tokens, in order of first appearance
    #   word2idx - dict mapping token -> column index in the output matrix
    #   idf      - 1-D float array, idf weight of each vocabulary token

    def __init__(self, tokenizer=None) -> None:
        self.tokenizer = tokenizer

    def _tokenise(self, doc: str) -> list[str]:
        """Lower-case ``doc`` and split it into tokens."""
        tokenise = word_tokenize if self.tokenizer is None else self.tokenizer
        return tokenise(doc.lower())

    def fit(self, corpus: list[str]):
        """Learn the vocabulary and idf weights from ``corpus``.

        Args:
            corpus: iterable of document strings (a list or a pandas Series).

        Returns:
            The learned vocabulary as a list of tokens, ordered by first
            appearance in the corpus.
        """
        docs = list(corpus)  # positional access, independent of any pandas index
        N = len(docs)

        word2idx = {}
        doc_freq = []  # doc_freq[i] = number of documents containing token i

        for doc in docs:
            tokens = self._tokenise(doc)
            for tok in tokens:
                if tok not in word2idx:
                    word2idx[tok] = len(word2idx)
                    doc_freq.append(0)
            # set(): a token counts once per document, however often it occurs
            for tok in set(tokens):
                doc_freq[word2idx[tok]] += 1

        self.vocab = list(word2idx.keys())
        self.word2idx = word2idx
        D = len(self.vocab)
        print(f'Vocab length: {D}')

        N_t = np.asarray(doc_freq, dtype=np.float64)
        # +1 in the denominator avoids division by zero, +1 inside the log
        # keeps every weight strictly positive.
        self.idf = np.log(N / (N_t + 1) + 1)

        return self.vocab

    def transform(self, corpus: list[str]):
        """Encode ``corpus`` as a sparse TF-IDF matrix using the fitted state.

        Tokens that are not part of the fitted vocabulary are ignored.

        Args:
            corpus: iterable of document strings (a list or a pandas Series).

        Returns:
            A ``scipy.sparse.csr_matrix`` of shape ``(n_documents, vocab_size)``
            and dtype ``float64``.

        Raises:
            AttributeError: if ``fit`` has not been called yet.
        """
        if not all(hasattr(self, attr) for attr in ('vocab', 'word2idx', 'idf')):
            raise AttributeError(
                'Vocabulary has not been defined. Call .fit method first.'
            )

        docs = list(corpus)  # positional access, independent of any pandas index
        N = len(docs)
        D = len(self.vocab)

        # Collect one (row, col) pair per in-vocabulary token occurrence.
        rows = []
        cols = []
        for row, doc in enumerate(docs):
            for tok in self._tokenise(doc):
                col = self.word2idx.get(tok)  # O(1) lookup instead of a list scan
                if col is not None:
                    rows.append(row)
                    cols.append(col)

        # Building from COO triplets sums duplicate (row, col) entries, which
        # turns the per-occurrence ones into term counts.
        ones = np.ones(len(rows), dtype=np.float64)
        tfidf = sparse.csr_matrix(
            (ones, (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64))),
            shape=(N, D),
            dtype=np.float64,
        )

        # Broadcasting is awkward for scipy.sparse, so scale the stored values
        # directly: ``indices`` holds the column of every stored entry.
        tfidf.data *= np.take(self.idf, tfidf.indices)
        return tfidf

    def fit_transform(self, corpus):
        """Fit on ``corpus`` and return its TF-IDF matrix (see ``transform``)."""
        docs = list(corpus)  # materialise once so one-shot iterables work too
        self.fit(docs)
        return self.transform(docs)

In [None]:
# Fit the vocabulary and idf weights on the first 10 documents only,
# then encode those same documents as a sparse TF-IDF matrix.
tfidf_vectoriser = MyTFIDF()
_ = tfidf_vectoriser.fit(corpus[:10])  # fit returns the vocabulary list, which is not needed here
X_train = tfidf_vectoriser.transform(corpus[:10])

In [None]:
# Dense view of the training matrix: one row per document, one column per vocabulary token.
X_train.toarray()

# Encode an unseen sentence with the already-fitted vectoriser;
# tokens that are not in the fitted vocabulary are skipped by transform.
X_test = tfidf_vectoriser.transform(['I am a dog'])
X_test.toarray()