<a id="home"></a>

[1- Tokenizer](#1)

[2- Preprocessing](#2)

[3- Stemming](#3)

[4- Files Reader](#4)

[5- Files PreProcessor](#5)


In [21]:
import nltk
import os
from collections import Counter
from math import log10
import pandas as pd

In [22]:
# nltk.download()

<a id="1"></a>

## Tokenizers

[Home](#home)

In [23]:
def split_tokenizer(txt):
    """Tokenize ``txt`` on runs of whitespace and return the tokens as a list."""
    return list(txt.split())

In [24]:
def regex_tokenizer(txt,regex=r'\w+'):
    """Tokenize ``txt`` with NLTK's ``RegexpTokenizer``.

    Args:
        txt: text to tokenize.
        regex: pattern handed to ``nltk.RegexpTokenizer``. The default is a
            raw string so that ``\\w`` reaches the regex engine untouched and
            Python does not warn about an invalid escape sequence.

    Returns:
        The list of tokens produced by the tokenizer.
    """
    tokenizer = nltk.RegexpTokenizer(regex)
    return tokenizer.tokenize(txt)

<a id="2"></a>
## Preprocessing

[Home](#home)

### Stop-word removal

In [25]:
def stop_remove(tokens):
    """Lower-case the tokens and drop the English stop words.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list of lower-cased tokens that are not in NLTK's English stop-word
        list, in their original order.
    """
    # A set gives constant-time membership tests; the NLTK list would
    # otherwise be scanned from the start for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Lower-case each token once and reuse the result for test and output.
    lowered = (term.lower() for term in tokens)
    return [term for term in lowered if term not in stop_words]

<a id="3"></a>
## Stemming

[Home](#home)

In [26]:
def porter_stem(tokens):
    """Return the Porter stem of every token, preserving input order."""
    stemmer = nltk.PorterStemmer()
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

In [27]:
def lancester_stem(tokens):
    """Return the Lancaster stem of every token, preserving input order."""
    stemmer = nltk.LancasterStemmer()
    return list(map(stemmer.stem, tokens))

<a id="4"></a>
## Files Reader

[Home](#home)

In [28]:
def read_docs(folder_path):
    """Read every ``.txt`` file of a folder into a dictionary.

    Args:
        folder_path: path of the folder holding the collection.

    Returns:
        dict mapping each file name without its ``.txt`` suffix to the file's
        UTF-8 decoded content. Entries are inserted in sorted file-name order.
        An empty dict is returned when ``folder_path`` is not a directory.
    """
    suffix = '.txt'
    docs_content = dict()

    # A missing or non-directory path yields an empty collection, not an error.
    if not os.path.isdir(folder_path):
        return docs_content

    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting dictionary order reproducible from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(suffix):
            continue

        file_path = os.path.join(folder_path, file_name)
        # Skip sub-directories that merely happen to be named "*.txt".
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            text_content = file.read()

        # Strip only the trailing suffix: str.replace would also remove any
        # ".txt" occurring in the middle of the name.
        docs_content[file_name[:-len(suffix)]] = text_content

    return docs_content

In [29]:
# Load every .txt file of the "Collection" folder into a {name: text} dict and display it.
documents = read_docs("Collection")
documents

{'D1': 'Researchers have successfully applied large language models (LLMs) such as ChatGPT to reranking in an information retrieval context, but to date, such work has mostly been built on proprietary models hidden behind opaque API endpoints. This approach yields experimental results that are not reproducible and non-deterministic, threatening the veracity of outcomes that build on such shaky foundations. To address this significant shortcoming, we present RankVicuna, the first fully open-source LLM capable of performing high-quality listwise reranking in a zero-shot setting. Experimental results on the TREC 2019 and 2020 Deep Learning Tracks show that we can achieve effectiveness comparable to zero-shot reranking with GPT-3.5 with a much smaller 7B parameter model, although our effectiveness remains slightly behind reranking with GPT-4. We hope our work provides the foundation for future research on reranking with modern LLMs.',
 'D2': 'With the advent of transformer-based architectu

<a id="5"></a>
## Files PreProcessor

[Home](#home)

In [30]:
def files_preprocessor(documents):
    """Build term-frequency tables for every tokenizer/stemmer combination.

    Each document is tokenized twice (whitespace split and regular
    expression), stop words are removed, and the remaining tokens are stemmed
    with both the Porter and the Lancaster stemmer before being counted.

    Args:
        documents: dict mapping a document name to its raw text.

    Returns:
        dict with the keys "split_port", "split_lancester", "regex_port" and
        "regex_lancester" (in that order). Each value maps a document name to
        a ``{stemmed term: frequency}`` dict.
    """
    # Raw string so the backslashes reach the regex engine untouched and
    # Python does not warn about invalid escape sequences.
    token_pattern = r"\w+(?:\/\w+)*(?:-\w+)*(?:\,\w+)?(?:\.\w+)?(?:\@\w+)?%?"

    # Insertion order of these two tables fixes the order of the result keys.
    tokenizers = {
        "split": split_tokenizer,
        "regex": lambda text: regex_tokenizer(text, token_pattern),
    }
    stemmers = {
        "port": porter_stem,
        "lancester": lancester_stem,
    }

    descripteur = {
        f"{tokenizer_name}_{stemmer_name}": dict()
        for tokenizer_name in tokenizers
        for stemmer_name in stemmers
    }

    for name, content in documents.items():
        for tokenizer_name, tokenize in tokenizers.items():
            # Stop words are removed once per tokenizer; both stemmers then
            # work on the same filtered token list.
            tokens = stop_remove(tokenize(content))
            for stemmer_name, stem in stemmers.items():
                frequencies = dict(Counter(stem(tokens)))
                descripteur[f"{tokenizer_name}_{stemmer_name}"][name] = frequencies

    return descripteur
    

In [31]:
# Build the four term-frequency descriptors, then display the
# whitespace-split + Porter frequencies of document D1.
documents_preprocessed = files_preprocessor(documents)
# documents_preprocessed.keys()
documents_preprocessed["split_port"]["D1"]


{'research': 2,
 'success': 1,
 'appli': 1,
 'larg': 1,
 'languag': 1,
 'model': 2,
 '(llms)': 1,
 'chatgpt': 1,
 'rerank': 5,
 'inform': 1,
 'retriev': 1,
 'context,': 1,
 'date,': 1,
 'work': 2,
 'mostli': 1,
 'built': 1,
 'proprietari': 1,
 'hidden': 1,
 'behind': 2,
 'opaqu': 1,
 'api': 1,
 'endpoints.': 1,
 'approach': 1,
 'yield': 1,
 'experiment': 2,
 'result': 2,
 'reproduc': 1,
 'non-deterministic,': 1,
 'threaten': 1,
 'verac': 1,
 'outcom': 1,
 'build': 1,
 'shaki': 1,
 'foundations.': 1,
 'address': 1,
 'signific': 1,
 'shortcoming,': 1,
 'present': 1,
 'rankvicuna,': 1,
 'first': 1,
 'fulli': 1,
 'open-sourc': 1,
 'llm': 1,
 'capabl': 1,
 'perform': 1,
 'high-qual': 1,
 'listwis': 1,
 'zero-shot': 2,
 'setting.': 1,
 'trec': 1,
 '2019': 1,
 '2020': 1,
 'deep': 1,
 'learn': 1,
 'track': 1,
 'show': 1,
 'achiev': 1,
 'effect': 2,
 'compar': 1,
 'gpt-3.5': 1,
 'much': 1,
 'smaller': 1,
 '7b': 1,
 'paramet': 1,
 'model,': 1,
 'although': 1,
 'remain': 1,
 'slightli': 1,
 'gpt-

In [32]:
def unique_occurences(descripteurs):
    """Count, per descriptor, in how many documents each term appears.

    Args:
        descripteurs: dict mapping a descriptor name to a
            ``{document name: {term: frequency}}`` dict.

    Returns:
        dict mapping each descriptor name to a ``{term: number of documents
        containing the term}`` dict.
    """
    document_counts = dict()

    for descriptor_name, frequencies_by_doc in descripteurs.items():
        tally = Counter()
        # A term is a key at most once per document, so counting the keys of
        # every document yields the number of documents holding the term.
        for term_frequencies in frequencies_by_doc.values():
            tally.update(term_frequencies.keys())
        document_counts[descriptor_name] = dict(tally)

    return document_counts

In [33]:
# Number of documents containing each term, per descriptor; show the split + Porter table.
term_per_doc = unique_occurences(documents_preprocessed)
term_per_doc["split_port"]

{'research': 2,
 'success': 1,
 'appli': 1,
 'larg': 4,
 'languag': 3,
 'model': 6,
 '(llms)': 3,
 'chatgpt': 1,
 'rerank': 1,
 'inform': 2,
 'retriev': 5,
 'context,': 1,
 'date,': 1,
 'work': 1,
 'mostli': 1,
 'built': 1,
 'proprietari': 1,
 'hidden': 1,
 'behind': 1,
 'opaqu': 1,
 'api': 1,
 'endpoints.': 1,
 'approach': 3,
 'yield': 2,
 'experiment': 2,
 'result': 5,
 'reproduc': 1,
 'non-deterministic,': 1,
 'threaten': 1,
 'verac': 1,
 'outcom': 1,
 'build': 1,
 'shaki': 1,
 'foundations.': 1,
 'address': 4,
 'signific': 2,
 'shortcoming,': 1,
 'present': 1,
 'rankvicuna,': 1,
 'first': 2,
 'fulli': 1,
 'open-sourc': 1,
 'llm': 3,
 'capabl': 2,
 'perform': 4,
 'high-qual': 1,
 'listwis': 2,
 'zero-shot': 2,
 'setting.': 1,
 'trec': 4,
 '2019': 1,
 '2020': 1,
 'deep': 3,
 'learn': 4,
 'track': 1,
 'show': 2,
 'achiev': 1,
 'effect': 5,
 'compar': 4,
 'gpt-3.5': 1,
 'much': 1,
 'smaller': 1,
 '7b': 1,
 'paramet': 1,
 'model,': 1,
 'although': 1,
 'remain': 1,
 'slightli': 1,
 'gpt-

In [34]:
def weight(freq,max_freq,n,N=6):
    """Weight of a term in a document.

    Computes ``(freq / max_freq) * log10(N / n + 1)``.

    Args:
        freq: frequency of the term in the document.
        max_freq: highest term frequency found in that document.
        n: number of documents containing the term.
        N: number of documents in the collection (defaults to 6).

    Returns:
        The weight as a float.
    """
    normalized_frequency = freq / max_freq
    inverse_document_factor = log10((N / n) + 1)
    return normalized_frequency * inverse_document_factor

In [35]:
def descripteurs_writer(descripteurs,unique_freq,header='doc term frequency weight\n'):
    """Write one space-separated descriptor file per descriptor.

    For every descriptor a file ``descripteurs/descripteur_<name>.txt`` is
    created, holding ``header`` followed by one ``doc term frequency weight``
    line per (document, term) pair.

    Args:
        descripteurs: dict mapping a descriptor name to a
            ``{document name: {term: frequency}}`` dict.
        unique_freq: dict mapping a descriptor name to a ``{term: number of
            documents containing the term}`` dict.
        header: first line written to every file.
    """
    output_dir = "descripteurs"
    # Create the target folder up front so a fresh checkout does not fail
    # with FileNotFoundError on the first open().
    os.makedirs(output_dir, exist_ok=True)

    for descripteur , docs_freq_dict in descripteurs.items():

        # Use the real collection size instead of relying on weight()'s
        # hard-coded default N, which is only right for a 6-document corpus.
        collection_size = len(docs_freq_dict)

        # The source documents are read as UTF-8, so write them back as UTF-8
        # rather than with the platform default encoding.
        with open(f"{output_dir}/descripteur_{descripteur}.txt",mode="w",encoding="utf-8") as file:

            file.write(header)
            for doc_name , doc_freq in docs_freq_dict.items():

                # A document left without terms has no lines to write, and
                # max() would raise ValueError on its empty frequency table.
                if not doc_freq:
                    continue

                max_freq = max(doc_freq.values())

                file.writelines(
                    f"{doc_name} {term} {freq} {weight(freq,max_freq,unique_freq[descripteur][term],N=collection_size):.4f}\n"
                    for term , freq in doc_freq.items()
                )
                
                
            
        
    
    
    

In [36]:
# Write one descripteur_<name>.txt file per descriptor into the "descripteurs" folder.
descripteurs_writer(documents_preprocessed,term_per_doc)

In [37]:
def read_descripteurs(folder_path):
    """Load every space-separated ``.txt`` file of a folder as a DataFrame.

    Args:
        folder_path: folder holding the descriptor (or inverted) files.

    Returns:
        dict mapping each file name without its ``.txt`` suffix to the
        ``pandas.DataFrame`` parsed from it. Entries are inserted in sorted
        file-name order. An empty dict is returned when ``folder_path`` is
        not a directory.
    """
    suffix = '.txt'
    docs_content = dict()

    # A missing or non-directory path yields an empty result, not an error.
    if not os.path.isdir(folder_path):
        return docs_content

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(suffix):
            continue

        file_path = os.path.join(folder_path, file_name)
        # keep_default_na=False: terms such as "null", "nan" or "NA" are
        # ordinary vocabulary here and must stay strings instead of being
        # converted to NaN by pandas' default missing-value markers.
        content = pd.read_csv(file_path, sep=" ", keep_default_na=False)

        # Strip only the trailing suffix: str.replace would also remove any
        # ".txt" occurring in the middle of the name.
        docs_content[file_name[:-len(suffix)]] = content

    return docs_content

In [38]:
# Load the inverted files and check that each entry is a pandas DataFrame.
# NOTE(review): this reads the "inverse" folder, which the `inverse` function
# defined in a later cell writes to — on a fresh kernel this cell presumably
# needs those files to exist already; confirm the intended cell order.
# descripteurs = read_descripteurs("descripteurs")
descripteurs = read_descripteurs("inverse")
# descripteurs.keys()
type(descripteurs["inverse_regex_lancester"])
# type(descripteurs["descripteur_regex_lancester"])
# descripteurs

pandas.core.frame.DataFrame

In [39]:
def inverse(path, out_dir="inverse"):
    """Write the inverted (term-first) version of every descriptor file.

    Each table read from ``path`` is rewritten with its columns ordered as
    ``term doc frequency weight`` into ``out_dir``, replacing "descripteur"
    with "inverse" in the file name.

    Args:
        path: folder holding the descriptor files to read.
        out_dir: folder receiving the inverted files (defaults to "inverse").
    """
    descripteurs = read_descripteurs(path)

    # Create the target folder so the first to_csv() call cannot fail on a
    # missing directory.
    os.makedirs(out_dir, exist_ok=True)

    for desc , content in descripteurs.items():
        term_first = content[["term","doc","frequency","weight"]]

        term_first.to_csv(f"{out_dir}/{desc.replace('descripteur','inverse')}.txt",index = False,sep=" ")
            
    
    
    

In [40]:
# Generate the inverted files from the descriptor files stored in "descripteurs".
inverse("descripteurs")


In [41]:
from back import load_descripteurs_and_inverse

In [44]:
# NOTE(review): this rebinds `documents`, which earlier held the raw texts
# returned by read_docs; from here on it holds whatever
# load_descripteurs_and_inverse() returns (indexed by table name below).
documents =load_descripteurs_and_inverse()
inverse_doc = documents["inverse_regex_port"] 

In [45]:
def doc_size(doc_name,inverse_doc):
    """Return the number of rows of ``inverse_doc`` whose "doc" column equals ``doc_name``."""
    belongs_to_doc = inverse_doc["doc"] == doc_name
    return inverse_doc.loc[belongs_to_doc].shape[0]

def docs_mean(inverse_doc):
    """Average number of rows per document in ``inverse_doc``.

    Args:
        inverse_doc: DataFrame with a "doc" column.

    Returns:
        Total number of rows carrying a document label divided by the number
        of distinct values in the "doc" column, as a float. ``0.0`` is
        returned when the frame holds no document at all.
    """
    doc_column = inverse_doc["doc"]
    doc_count = len(doc_column.unique())

    # No documents: avoid dividing by zero.
    if doc_count == 0:
        return 0.0

    # value_counts() sizes every document in a single pass instead of
    # re-filtering the whole frame once per document.
    total_rows = int(doc_column.value_counts().sum())
    return total_rows / doc_count
    

In [47]:
# Average number of rows per document in the regex + Porter inverted table.
docs_mean(inverse_doc)

86.0

In [1]:
import re

In [33]:

def is_valid_query(query):
    """Check that ``query`` is a well-formed boolean query.

    A valid query is a sequence of whitespace-separated tokens following
    ``[NOT ...] term ((AND | OR) [NOT ...] term)*``. The operators are the
    upper-case words "AND", "OR" and "NOT"; any other token is a term.

    The check is an explicit state machine rather than an ``eval`` of the
    query rewritten as Python source: evaluating user text lets a crafted
    token run arbitrary code or raise unexpected errors, and Python's implicit
    string concatenation makes two adjacent terms ("a b") look valid.

    Args:
        query: the query string.

    Returns:
        True when the query is well formed, False otherwise (including for an
        empty query).
    """
    binary_operators = ("AND", "OR")

    # True while the next token must be a term (optionally preceded by NOT).
    expecting_operand = True

    for token in query.split():
        if expecting_operand:
            if token in binary_operators:
                return False
            if token != "NOT":
                # A term was read; a binary operator must come next.
                expecting_operand = False
        elif token in binary_operators:
            expecting_operand = True
        else:
            # A term or NOT directly after a term: missing binary operator.
            return False

    # The query must end on a term, not on a dangling operator.
    return not expecting_operand

In [35]:
# "NOT" directly followed by "OR" is not a valid operator sequence, so this query is rejected.
is_valid_query("Terme AND NOT OR d")
# is_valid_query("Terme AND NOT d")

False

In [8]:
# Push every query token on the front of the list, so that the list ends up
# holding the tokens in reverse query order.
pile = []
for token in "Terme AND NOT OR d".split():
    pile = [token] + pile

In [9]:
# Display the stack: the tokens appear in reverse query order.
pile

['d', 'OR', 'NOT', 'AND', 'Terme']

In [39]:
# This compares the type object `str` with the string "str", so it is always
# False; an actual type check would be isinstance("ff", str).
type("ff")=="str"

False