### Task 0. 
Using https://www.nltk.org/howto/corpus.html#overview, implement TF-IDF vectorizer for e.g. Treebank corpus

In [64]:
import nltk
import math
from collections import Counter
import string
from nltk.corpus import treebank, stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [113]:
def preprocess(text):
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower())
        for word in nltk.word_tokenize(text)
    ]

def compute_tf(text):
    words = Counter(text)
    total_words = len(text)
    return {word: count / total_words for word, count in words.items()}


def compute_idf(corpus):
    vocab = {}
    inverse_vocab = {}
    word_idx = 0
    idf = {}
    
    preprocessed_corpus = [set(preprocess(text)) for text in corpus]
    unique_words = set(word for text in preprocessed_corpus for word in text)
    
    for word in unique_words:
        vocab[word] = word_idx
        inverse_vocab[word_idx] = word
        word_idx += 1
        df = sum(1 for text in preprocessed_corpus if word in text)
        idf[word] = math.log(len(corpus) / (1 + df)) + 1 
        
    return idf, vocab, inverse_vocab


def compute_tfidf(corpus):
    idf, vocab, inverse_vocab = compute_idf(corpus)
    tfidf_vectors = []
    
    for text in corpus:
        vector = [0] * len(vocab)
        processed_text = preprocess(text)
        tf = compute_tf(processed_text)
        
        for word in processed_text:
            if word in vocab: 
                vector[vocab[word]] = tf[word] * idf[word]
                
        tfidf_vectors.append(vector)
        
    return np.array(tfidf_vectors)

sentences = [" ".join(sent) for sent in treebank.sents()[:100]]
np_tfidf = compute_tfidf(sentences)

In [115]:
np_tfidf[5]

array([0.        , 0.        , 0.1637341 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.1637341 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     