In [2]:
import numpy as np

# import stuff for creating class

from abc import ABC, abstractmethod


In [3]:
from sklearn.preprocessing import normalize
class TfIdfBase(ABC):
    """Skeleton of a TF-IDF pipeline.

    Subclasses supply the three abstract steps; this class only stores the
    inputs and combines ``self.tf_matrix`` with ``self.idf_matrix`` in
    :meth:`compute_tf_idf`.
    """

    def __init__(self, corpus, docids, save_folder):
        """Store the corpus, its document ids and the output folder unchanged."""
        self.corpus, self.docids, self.save_folder = corpus, docids, save_folder

    @abstractmethod
    def compute_idf(self):
        """Subclass hook; ``compute_tf_idf`` reads ``self.idf_matrix`` afterwards."""

    @abstractmethod
    def compute_tf(self):
        """Subclass hook; ``compute_tf_idf`` reads ``self.tf_matrix`` afterwards."""

    @abstractmethod
    def preprocess(self):
        """Subclass hook for preparing the corpus; no contract is enforced here."""

    def compute_tf_idf(self, do_normalize=True):
        """Set ``self.tf_idf_matrix`` to ``tf_matrix * idf_matrix``.

        Args:
            do_normalize: when true, the product is additionally passed through
                ``normalize`` (row-wise L2 norm).
        """
        self.tf_idf_matrix = self.tf_matrix * self.idf_matrix
        if not do_normalize:
            return
        # row-wise L2 norm
        self.tf_idf_matrix = normalize(self.tf_idf_matrix)

In [4]:
import stopwordsiso as stopwords
import re
from loguru import logger
from tqdm.auto import tqdm
import time
import pickle
import scipy.sparse
import os
import collections

class TfIdfFrench(TfIdfBase):
    """TF-IDF retriever over French documents with on-disk caching.

    Intermediate results (token lists, TF matrix, IDF matrix) are cached under
    ``<save_folder>/fr``. A cached file is reused unless ``overwrite`` is true.

    Typical call order: ``preprocess`` -> ``compute_tf`` -> ``compute_idf`` ->
    ``compute_tf_idf`` -> ``process_queries``.
    """

    # Sub-directory of ``save_folder`` holding this language's cache files;
    # also the language code passed to the stop-word list.
    LANG = "fr"
    # Matches every character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, corpus, docids, save_folder, overwrite=False):
        """Store the inputs, load the French stop words and create the cache folder.

        Args:
            corpus: iterable of document strings.
            docids: document ids, parallel to ``corpus``.
            save_folder: root folder for cache files.
            overwrite: when true, existing cache files are ignored and rewritten.
        """
        super().__init__(corpus, docids, save_folder)
        self.stopwords = stopwords.stopwords(self.LANG)
        self.overwrite = overwrite

        os.makedirs(f'{self.save_folder}/{self.LANG}', exist_ok=True)

    # ------------------------------------------------------------------ helpers

    def _cache_path(self, filename):
        """Return the cache location of ``filename`` for this language."""
        return f'{self.save_folder}/{self.LANG}/{filename}'

    def _cache_available(self, path):
        """True when ``path`` exists and the caller did not ask to overwrite it."""
        return not self.overwrite and os.path.exists(path)

    def _tokenize(self, text):
        """Lower-case ``text``, strip punctuation, split on whitespace, drop stop words."""
        cleaned = self._PUNCTUATION.sub('', text.lower())
        return [word for word in cleaned.split() if word not in self.stopwords]

    def _build_count_matrix(self, token_lists, n_rows, skip_unknown=False):
        """Build an ``n_rows x vocabulary`` CSR matrix of raw term counts.

        Args:
            token_lists: iterable with one token list per row.
            n_rows: number of rows of the resulting matrix.
            skip_unknown: when true, tokens missing from ``self.word_to_index``
                are ignored instead of raising ``KeyError``.
        """
        rows, cols, counts = [], [], []
        for i, tokens in enumerate(token_lists):
            if skip_unknown:
                tokens = (tok for tok in tokens if tok in self.word_to_index)
            for word, count in collections.Counter(tokens).items():
                rows.append(i)
                cols.append(self.word_to_index[word])
                counts.append(count)

        # Explicit integer index arrays keep the constructor happy even when
        # nothing was counted (empty lists would otherwise become float arrays).
        coords = (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64))
        return scipy.sparse.csr_matrix(
            (counts, coords),
            shape=(n_rows, len(self.unique_words)),
            dtype=np.int32,
        )

    @staticmethod
    def _load_sparse(path):
        """Load a matrix written with ``np.save`` and return the sparse matrix itself.

        ``np.save`` stores a scipy sparse matrix as a pickled 0-d object array,
        so ``np.load`` hands back that wrapper rather than the matrix. Unwrapping
        it here gives the same type on the cached path as on the freshly
        computed path.

        NOTE: ``allow_pickle=True`` executes pickled code; only load cache files
        that this notebook produced.
        """
        loaded = np.load(path, allow_pickle=True)
        if isinstance(loaded, np.ndarray) and loaded.dtype == object and loaded.ndim == 0:
            return loaded.item()
        return loaded

    # ----------------------------------------------------------------- pipeline

    def preprocess(self):
        """Tokenise the corpus and build the sorted vocabulary (or load both from cache).

        Sets ``self.word_list`` (one token list per document), ``self.unique_words``
        (sorted vocabulary) and ``self.word_to_index`` (word -> column index).
        """
        cache_file = self._cache_path('word_list.pkl')
        if self._cache_available(cache_file):
            logger.info("Loading unique words and word list")
            # NOTE: pickle.load can execute arbitrary code; the cache must be trusted.
            with open(cache_file, 'rb') as f:
                self.unique_words, self.word_list, self.word_to_index = pickle.load(f)
            return

        logger.info("Preprocessing corpus")
        start = time.perf_counter()
        self.word_list = [self._tokenize(doc) for doc in tqdm(self.corpus)]
        # The logged duration covers tokenisation only, not vocabulary building.
        end = time.perf_counter()

        vocabulary = set()
        for doc in tqdm(self.word_list):
            vocabulary.update(doc)

        self.unique_words = sorted(vocabulary)
        self.word_to_index = {word: i for i, word in enumerate(self.unique_words)}

        logger.info(f"Preprocessing took {end-start:.2f} seconds")
        logger.info(f"Number of unique words: {len(self.unique_words)}")

        logger.info(f"Saving unique words and word list to {self.save_folder}/{self.LANG}")
        with open(cache_file, 'wb') as f:
            pickle.dump((self.unique_words, self.word_list, self.word_to_index), f)

    def compute_tf(self):
        """Populate ``self.tf_matrix`` with sparse raw term counts (see ``_compute_sparse_tf``)."""
        self._compute_sparse_tf()

    def _compute_sparse_tf(self):
        """Load or compute the documents x vocabulary count matrix and return it.

        Requires ``preprocess`` to have run when the cache is not used.
        """
        cache_file = self._cache_path('sparse_tf_matrix.npy')
        if self._cache_available(cache_file):
            logger.info("Loading tf matrix")
            self.tf_matrix = self._load_sparse(cache_file)
            return self.tf_matrix

        start = time.perf_counter()
        self.tf_matrix = self._build_count_matrix(tqdm(self.word_list), n_rows=len(self.docids))
        end = time.perf_counter()
        logger.info(f"Computing term frequency took {end-start:.2f} seconds")

        logger.info(f"Saving tf matrix to {self.save_folder}/{self.LANG}")
        np.save(cache_file, self.tf_matrix)
        return self.tf_matrix

    def _compute_sparse_idf(self):
        """Load or compute the diagonal IDF matrix and return it.

        The weight of a term is ``log((1 + N) / (1 + df)) + 1`` where ``N`` is
        ``len(self.docids)`` and ``df`` the number of documents with a non-zero
        count for the term. Requires ``self.tf_matrix`` when the cache is not used.
        """
        cache_file = self._cache_path('sparse_idf_matrix.npy')
        if self._cache_available(cache_file):
            logger.info("Loading idf matrix")
            self.idf_matrix = self._load_sparse(cache_file)
            return self.idf_matrix

        start = time.perf_counter()
        # ravel (not squeeze) so a one-word vocabulary still yields a 1-d array.
        doc_freq = np.asarray((self.tf_matrix > 0).sum(axis=0)).ravel()
        idf = np.log((1 + len(self.docids)) / (1 + doc_freq)) + 1
        self.idf_matrix = scipy.sparse.diags(idf.astype(np.float32))
        end = time.perf_counter()
        logger.info(f"Computing inverse document frequency took {end-start:.2f} seconds")

        logger.info(f"Saving idf matrix to {self.save_folder}/{self.LANG}")
        np.save(cache_file, self.idf_matrix)
        return self.idf_matrix

    def compute_idf(self):
        """Populate ``self.idf_matrix`` (see ``_compute_sparse_idf``)."""
        self._compute_sparse_idf()

    # ------------------------------------------------------------------ queries

    def _get_sparse_query_tfidf(self, queries, use_idf, do_normalize):
        """Vectorise ``queries`` in the corpus vocabulary.

        Query words outside the vocabulary are ignored.

        Args:
            queries: list of query strings.
            use_idf: multiply the raw counts by ``self.idf_matrix``.
            do_normalize: pass the result through ``normalize``.

        Returns:
            Sparse matrix with one row per query.
        """
        query_word_list = [self._tokenize(query) for query in queries]
        query_tf_idf_matrix = self._build_count_matrix(
            query_word_list, n_rows=len(queries), skip_unknown=True
        )

        if use_idf:
            # Sparse matrix product with the diagonal IDF matrix scales each column.
            query_tf_idf_matrix = query_tf_idf_matrix.dot(self.idf_matrix)

        if do_normalize:
            query_tf_idf_matrix = normalize(query_tf_idf_matrix)

        return query_tf_idf_matrix

    def process_queries(self, queries, topk=5, use_idf=False, do_normalize=True):
        """Return the ids of the ``topk`` best-scoring documents for a single query.

        Scores are dot products between the rows of ``self.tf_idf_matrix`` and
        the query vector; they are cosine similarities only when both sides
        were L2-normalised.

        Args:
            queries: list containing exactly one query string.
            topk: number of document ids to return.
            use_idf: weight the query counts by IDF.
            do_normalize: normalise the query vector.

        Returns:
            List of document ids ordered from highest to lowest score.
        """
        assert len(queries) == 1, "process_queries supports exactly one query per call"
        query_tf_idf_matrix = self._get_sparse_query_tfidf(queries, use_idf, do_normalize)

        cosine_similarities = self.tf_idf_matrix.dot(query_tf_idf_matrix.T).toarray().flatten()
        # Indices of the topk highest scores, best first.
        top_k_indices = np.argsort(cosine_similarities)[::-1][:topk]

        return [self.docids[i] for i in top_k_indices]



In [20]:
# english_corpus = [
#     "this is the first document psosp",
#     "this document is the second document ssdfa",
#     "and this is the third one",
#     "is this the first document"
# ]
# english_docids = list(range(len(english_corpus)))


In [5]:
import json

# Read the corpus explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can garble accented French text on non-UTF-8 systems.
with open('./corpus.json/corpus.json', encoding='utf-8') as f:
    corpus = json.load(f)

In [6]:
# Select the French documents (lang == 'fr') and their ids as two parallel lists.
# NOTE(review): the `english_` prefix is misleading -- these hold French data.
english_corpus = [doc['text'] for doc in corpus if doc['lang'] == 'fr']
english_docids = [doc['docid'] for doc in corpus if doc['lang'] == 'fr']

In [7]:
# French TF-IDF model caching under ./tfidf/fr; overwrite=False reuses existing cache files.
TFV = TfIdfFrench(english_corpus, english_docids, 'tfidf', overwrite=False)

In [8]:
# Tokenise the corpus and build the vocabulary, or load both from the pickle cache if present.
TFV.preprocess()

[32m2024-10-14 17:01:12.007[0m | [1mINFO    [0m | [36m__main__[0m:[36mpreprocess[0m:[36m22[0m - [1mLoading unique words and word list[0m


In [9]:
# Build the sparse term-count matrix, or load it from the cache if present.
TFV.compute_tf()

[32m2024-10-14 17:01:36.395[0m | [1mINFO    [0m | [36m__main__[0m:[36m_compute_sparse_tf[0m:[36m82[0m - [1mLoading tf matrix[0m


In [10]:
# Build the diagonal IDF matrix from TFV.tf_matrix, or load it from the cache if present.
TFV.compute_idf()

[32m2024-10-14 17:01:39.169[0m | [1mINFO    [0m | [36m__main__[0m:[36m_compute_sparse_idf[0m:[36m109[0m - [1mLoading idf matrix[0m


In [11]:
# Combine TF and IDF into TFV.tf_idf_matrix; do_normalize defaults to True (row-wise L2 norm).
TFV.compute_tf_idf()

In [13]:
import pandas as pd
# Load the dev split and keep only the rows whose `lang` column is 'fr'.
dev_set = pd.read_csv('./dev.csv')
french_dev_set = dev_set[dev_set['lang'] == 'fr']
# Last expression of the cell: displays (rows, columns) of the French subset.
french_dev_set.shape

(200, 5)

In [14]:
# Parallel lists: one query string and one `positive_docs` entry per French dev row.
# NOTE(review): presumably each `positive_docs` entry is a single docid, since
# recall_at_10 tests it with `in` against the retrieved ids -- confirm against dev.csv.
french_dev_set_queries = french_dev_set['query'].tolist()
french_dev_set_positive_docs = french_dev_set['positive_docs'].tolist()

In [15]:
def recall_at_10(positive_docs, top_10_ids):
    """Print the fraction of queries whose positive document was retrieved.

    Args:
        positive_docs: one expected document id per query.
        top_10_ids: one collection of retrieved ids per query, in the same order.

    The score is printed, not returned.
    """
    hits = [expected in retrieved for expected, retrieved in zip(positive_docs, top_10_ids)]
    print(np.mean(hits))

In [16]:
# Top-10 retrieval per dev query: IDF-weighted, L2-normalised query vectors.
top_10_ids = []
for dev_query in tqdm(french_dev_set_queries):
    top_10_ids.append(
        TFV.process_queries([dev_query], topk=10, use_idf=True, do_normalize=True)
    )

    

  0%|          | 0/200 [00:00<?, ?it/s]

In [17]:
# Prints the share of dev queries whose positive document is among the retrieved ids.
recall_at_10(french_dev_set_positive_docs, top_10_ids)

0.815


In [18]:
# Top-10 retrieval per dev query: raw query counts (no IDF), L2-normalised.
# NOTE(review): no recall cell follows this run; the next cell overwrites top_10_ids.
top_10_ids = []
for dev_query in tqdm(french_dev_set_queries):
    top_10_ids.append(
        TFV.process_queries([dev_query], topk=10, use_idf=False, do_normalize=True)
    )

    

  0%|          | 0/200 [00:00<?, ?it/s]

In [19]:
# Top-10 retrieval per dev query: IDF-weighted query vectors without normalisation.
top_10_ids = []
for dev_query in tqdm(french_dev_set_queries):
    top_10_ids.append(
        TFV.process_queries([dev_query], topk=10, use_idf=True, do_normalize=False)
    )

    

  0%|          | 0/200 [00:00<?, ?it/s]

In [20]:
# Prints recall for the most recently computed top_10_ids (the un-normalised query run).
recall_at_10(french_dev_set_positive_docs, top_10_ids)

0.815


In [21]:
# svd trial
from sklearn.decomposition import TruncatedSVD

In [22]:
# Reduce TF-IDF vectors to 512 dimensions. The default solver is randomized,
# so fix random_state to make the decomposition reproducible across runs.
svd = TruncatedSVD(n_components=512, random_state=42)

In [23]:
# Fit the SVD on the document TF-IDF matrix and project the documents onto its components.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so no result was produced.
reduced_tfidf_matrix = svd.fit_transform(TFV.tf_idf_matrix)

KeyboardInterrupt: 