<a id="home"></a>

[1- Tokenizer](#1)

[2- Preprocessing](#2)

[3- Stemming](#3)

[4- Files Reader](#4)

[5- Files PreProcessor](#5)


In [1]:
import csv
import os
from collections import Counter
from math import log10

import nltk
import pandas as pd

In [2]:
# nltk.download()

<a id="1"></a>

## Tokenizers

[Home](#home)

In [3]:
def split_tokenizer(txt):
    """Tokenize ``txt`` on runs of whitespace and return the tokens as a list."""
    return txt.split()

In [4]:
def regex_tokenizer(txt,regex=r'\w+'):
    """Tokenize ``txt`` with an ``nltk.RegexpTokenizer`` built from ``regex``.

    Args:
        txt: the text to tokenize.
        regex: the token pattern. The default is written as a raw string so
            that ``\\w`` is not treated as a (invalid) string escape sequence.

    Returns:
        The list of tokens produced by ``RegexpTokenizer.tokenize``.
    """
    return nltk.RegexpTokenizer(regex).tokenize(txt)

<a id="2"></a>
## Preprocessing

[Home](#home)

### stop words removal

In [5]:
def stop_remove(tokens):
    """Lowercase ``tokens`` and drop the English stop words.

    Args:
        tokens: an iterable of string tokens.

    Returns:
        A list with the lowercased tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # A frozenset gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every token.
    stop = frozenset(nltk.corpus.stopwords.words('english'))
    # Lowercase each token once and reuse it for both the test and the result.
    lowered = (term.lower() for term in tokens)
    return [term for term in lowered if term not in stop]

<a id="3"></a>
## Stemming

[Home](#home)

In [6]:
def porter_stem(tokens):
    """Return the Porter stem of every token, in input order."""
    stemmer = nltk.PorterStemmer()
    return list(map(stemmer.stem, tokens))

In [7]:
def lancester_stem(tokens):
    """Return the Lancaster stem of every token, in input order."""
    stemmer = nltk.LancasterStemmer()
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

Concatenate all the document files into a single corpus file.

In [8]:
def concatenate_files(folder_path, output_file):
    """Concatenate every regular file in ``folder_path`` into ``output_file``.

    Files are appended in sorted (lexicographic) file-name order so that the
    result is reproducible; ``os.listdir`` alone returns names in arbitrary
    order. Sub-directories are ignored. No separator is inserted between files.

    Args:
        folder_path: directory holding the files to merge.
        output_file: path of the file to (over)write with the merged content.
    """
    output_abspath = os.path.abspath(output_file)

    # Collect the inputs first and leave out the output file itself, so that
    # re-running with an output located inside ``folder_path`` does not feed
    # the (freshly truncated) output back into itself.
    source_paths = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        if os.path.abspath(file_path) == output_abspath:
            continue
        source_paths.append(file_path)

    with open(output_file, 'w') as outfile:
        for file_path in source_paths:
            with open(file_path, 'r') as infile:
                outfile.write(infile.read())

In [9]:
# Merge every file under docs/ into concatenated.txt, the corpus file parsed by the reader cells below.
concatenate_files("docs","concatenated.txt")

<a id="4"></a>
## Files Reader

[Home](#home)

In [10]:
def read_documents(file_path):
    """Return the whole text content of the file at ``file_path``."""
    with open(file_path, 'r') as source:
        return source.read()

In [11]:
def create_document_dictionary(document_data,
                               separator="********************************************\n"):
    """Split the raw corpus text into a ``{doc_id: content}`` dictionary.

    Each chunk between two separators is one document: the last
    whitespace-delimited word of its first line is used as the document id and
    the remaining lines, joined with single spaces, form the content.

    Args:
        document_data: the full corpus text.
        separator: the string that delimits documents in ``document_data``.

    Returns:
        dict: {"1": str with the content of doc 1, ...}. Chunks that contain
        only whitespace (e.g. after a trailing separator, or an empty corpus)
        are skipped instead of raising ``IndexError``.
    """
    document_dict = {}

    for raw_chunk in document_data.split(separator):
        # Surrounding whitespace never contributes to the id or the content,
        # and stripping it guarantees the first line holds at least one word.
        chunk = raw_chunk.strip()
        if not chunk:
            continue

        lines = chunk.split('\n')
        document_id = lines[0].split()[-1]
        document_dict[document_id] = ' '.join(lines[1:]).strip()

    return document_dict

In [13]:
# Parse the concatenated corpus into a {doc_id: content} dictionary.
document_dict = create_document_dictionary(read_documents("concatenated.txt"))

In [14]:
# Spot-check the parsed content of one document.
document_dict["2001"]

'MICROFILM IN THE ELECTRONIC OFFICE.       THE RESULTS OF A SURVEY OF THE MICROFILM INDUSTRY CONCERNING THE ROLE AND FUTURE OF MICROFILM IN THE ELECTRONICS ORIENTATED OFFICE OF THE FUTURE ARE REPORTED.'

In [15]:
# Number of parsed documents; used as the corpus size N when weighting terms.
nbr_docs = len(document_dict)
nbr_docs

5999

<a id="5"></a>
## Files PreProcessor

[Home](#home)

In [18]:
def files_preprocessor(documents):
    """Tokenize, remove stop words, stem and count terms for every document.

    Two tokenizers (whitespace split and regex) are each combined with two
    stemmers (Porter and Lancaster), giving four term-frequency tables.

    Args:
        documents: dictionary mapping a document name/id to its text content.

    Returns:
        dict: one entry per variant, under the keys ``"split_port"``,
        ``"split_lancester"``, ``"regex_port"`` and ``"regex_lancester"``.
        Each value maps a document name to a ``{term: frequency}`` dictionary.
    """
    # Raw string so the backslashes reach the regex engine untouched.
    # Alternatives, in order: dotted abbreviations, letters followed by -/@ and
    # a number, digits followed by letters, numbers (optionally decimal / %),
    # and words optionally joined by '-' or '/'.
    token_pattern = r"(?:[A-Za-z]\.)+|[A-Za-z]+[\-@]\d+(?:\.\d+)?|\d+[A-Za-z]+|\d+(?:[\.\,]\d+)?%?|\w+(?:[\-/]\w+)*"

    # {variant: {document name: {term: frequency}}}
    descripteur = {
        "split_port": {},
        "split_lancester": {},
        "regex_port": {},
        "regex_lancester": {},
    }

    for name, content in documents.items():
        # Tokenize, then lowercase and drop stop words.
        splitted_tokens = stop_remove(split_tokenizer(content))
        regex_tokens = stop_remove(regex_tokenizer(content, token_pattern))

        # Stem with each stemmer and count the resulting terms.
        descripteur["split_port"][name] = dict(Counter(porter_stem(splitted_tokens)))
        descripteur["split_lancester"][name] = dict(Counter(lancester_stem(splitted_tokens)))
        descripteur["regex_port"][name] = dict(Counter(porter_stem(regex_tokens)))
        descripteur["regex_lancester"][name] = dict(Counter(lancester_stem(regex_tokens)))

    return descripteur
    

In [19]:
# Build the per-document term-frequency tables for all four tokenizer/stemmer variants.
documents_preprocessed = files_preprocessor(document_dict)
# Quick trial on the first five documents only:
# documents_preprocessed = files_preprocessor(dict(list(document_dict.items())[:5]))


In [20]:
# Spot-check the term frequencies of one document (regex tokenizer + Porter stemmer).
documents_preprocessed["regex_port"]["6000"]

{'perspect': 1,
 'cognit': 3,
 'human': 2,
 'inform': 3,
 'process': 2,
 '4': 1,
 'paper': 3,
 'review': 1,
 'state': 1,
 'art': 1,
 'research': 1,
 'appli': 1,
 'scienc': 2,
 'handl': 1,
 'relationship': 1,
 'explor': 1,
 'includ': 1,
 'artifici': 1,
 'intellig': 1,
 'comput': 1,
 'linguist': 1,
 'introduct': 1,
 'b.c.': 1,
 'griffith': 1,
 'abstract': 1,
 'note': 1,
 'final': 1,
 'seri': 1,
 'publish': 1,
 'j.': 1,
 'soc': 1,
 'inf': 1,
 'sci': 1,
 '32': 1,
 '6': 1,
 'nov': 1,
 '81': 1,
 '405': 1,
 '411': 1,
 'individu': 1,
 'see': 1,
 'follow': 1,
 'serial': 1,
 'number': 1}

### Prepare data to generate descripteurs and inverse docs

In [21]:
def unique_occurences(descripteurs):
    """Count, for every variant, in how many documents each term appears.

    Args:
        descripteurs: ``{variant: {document name: {term: frequency}}}``.

    Returns:
        dict: ``{variant: {term: number of documents containing the term}}``.
    """
    document_counts = {}

    for variant, per_document in descripteurs.items():
        tally = Counter()
        for term_frequencies in per_document.values():
            # Each document contributes one occurrence per distinct term,
            # regardless of the term's frequency inside that document.
            tally.update(term_frequencies.keys())
        document_counts[variant] = dict(tally)

    return document_counts

In [22]:
# Document frequency of every term, per tokenizer/stemmer variant.
term_per_doc = unique_occurences(documents_preprocessed)
term_per_doc["regex_port"]

{'indian': 57,
 'council': 150,
 'librari': 3547,
 'inform': 2301,
 'servic': 1647,
 'research': 890,
 'train': 421,
 'propos': 248,
 'consider': 171,
 'critic': 105,
 'examin': 467,
 'scene': 29,
 'manag': 395,
 'oper': 415,
 'ineffici': 8,
 'ineffect': 5,
 'result': 626,
 'grave': 3,
 'misalloc': 1,
 'underutilis': 4,
 'resourc': 444,
 'littl': 103,
 'major': 300,
 'carri': 249,
 'univers': 847,
 'librarianship': 347,
 'cours': 238,
 'much': 155,
 'help': 227,
 'effici': 156,
 'nation': 838,
 'institut': 506,
 'seriou': 31,
 'limit': 146,
 'improv': 415,
 'situat': 225,
 'establish': 446,
 'would': 212,
 'set': 315,
 'govern': 397,
 'well-defin': 3,
 'role': 488,
 'function': 333,
 'cover': 396,
 'aspect': 275,
 'linger': 1,
 'fragranc': 1,
 'proceed': 96,
 'xxiv': 1,
 'india': 63,
 'confer': 414,
 'bangalor': 6,
 'paper': 625,
 'summari': 111,
 'n.d.': 2,
 'bagari': 1,
 'transcript': 14,
 'inaugur': 10,
 'speech': 32,
 'introductori': 20,
 'main': 310,
 'present': 1279,
 'held': 248

In [23]:
def weight(freq,max_freq,n,N=None):
    """Compute the weight of a term in a document.

    The weight is ``(freq / max_freq) * log10(N / n + 1)``.

    Args:
        freq: frequency of the term in the document.
        max_freq: highest term frequency in the same document.
        n: number of documents that contain the term.
        N: total number of documents. When omitted, the module-level
            ``nbr_docs`` is read at call time, so the value cannot go stale
            if the corpus is reloaded after this function was defined.

    Returns:
        float: the weight of the term.
    """
    if N is None:
        N = nbr_docs

    return (freq/max_freq)*log10((N/n)+1)

In [24]:
def descripteurs_writer(descripteurs,unique_freq,header='doc term frequency weight\n'):
    """Write one space-separated descriptor file per variant.

    Each file is ``descripteurs/descripteur_<variant>.txt`` and holds the
    header followed by one ``doc term frequency weight`` line per
    (document, term) pair, the weight being formatted with four decimals.

    Args:
        descripteurs: ``{variant: {document name: {term: frequency}}}``.
        unique_freq: ``{variant: {term: number of documents containing it}}``.
        header: first line written to every file.
    """
    # Create the output folder on first use so a fresh checkout can run this.
    os.makedirs("descripteurs", exist_ok=True)

    for descripteur , docs_freq_dict in descripteurs.items():

        with open(f"descripteurs/descripteur_{descripteur}.txt",mode="w") as file:

            file.write(header)
            for doc_name , doc_freq in docs_freq_dict.items():

                # A document left without terms has nothing to write, and
                # max() would raise ValueError on its empty frequency table.
                if not doc_freq:
                    continue

                max_freq = max(doc_freq.values())
                term_doc_counts = unique_freq[descripteur]

                lines = [
                    f"{doc_name} {term} {freq} {weight(freq,max_freq,term_doc_counts[term],nbr_docs):.4f}\n"
                    for term , freq in doc_freq.items()
                ]

                file.writelines(lines)

In [25]:
# List the document ids present in the split tokenizer + Porter stemmer table.
documents_preprocessed["split_port"].keys()

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157'

Generate the descriptor files.

In [26]:
# Write one descriptor file per tokenizer/stemmer variant.
descripteurs_writer(documents_preprocessed,term_per_doc)

In [27]:
def read_descripteurs(folder_path):
    """Load every ``.txt`` file of ``folder_path`` as a DataFrame.

    The files are expected to be space-separated with a header line, as
    produced by ``descripteurs_writer``.

    Args:
        folder_path: directory that holds the files.

    Returns:
        dict: ``{file name without ".txt": DataFrame}``. Empty when
        ``folder_path`` is not a directory.
    """
    docs_content = dict()

    if not os.path.isdir(folder_path):
        return docs_content

    suffix = '.txt'
    # Sorted so the resulting dictionary order does not depend on the OS.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(suffix):
            continue

        file_path = os.path.join(folder_path, file_name)
        content = pd.read_csv(
            file_path,
            sep=" ",
            # Terms are text even when they look numeric ("42").
            dtype={"term": str},
            # Keep terms such as "null" or "nan" instead of turning them into NaN.
            keep_default_na=False,
            # The files are written unquoted, so a '"' inside a token is a
            # literal character and must not open a quoted field.
            quoting=csv.QUOTE_NONE,
        )
        # Strip only the trailing extension, not every ".txt" in the name.
        docs_content[file_name[:-len(suffix)]] = content

    return docs_content

In [28]:
# Load the descriptor files back as DataFrames, keyed by file name without ".txt".
descripteurs = read_descripteurs("descripteurs")
# descripteurs = read_descripteurs("inverse")

In [29]:
# Inspect the table of the regex tokenizer + Lancaster stemmer variant.
descripteurs["descripteur_regex_lancester"]

Unnamed: 0,doc,term,frequency,weight
0,1,ind,4,0.6490
1,1,council,3,0.6037
2,1,libr,8,0.4157
3,1,inform,2,0.1392
4,1,serv,3,0.2419
...,...,...,...,...
238715,6004,knowledg,1,0.3957
238716,6004,langu,1,0.3486
238717,6004,improv,1,0.2973
238718,6004,understand,1,0.4352


In [30]:
def inverse(path):
    """Write the term-first ("inverse") version of every descriptor file.

    Every table loaded from ``path`` is rewritten with its columns ordered as
    ``term doc frequency weight`` to ``inverse/<name>.txt``, where ``<name>``
    is the source name with ``descripteur`` replaced by ``inverse``.

    Args:
        path: directory that holds the descriptor files.
    """
    descripteurs = read_descripteurs(path)

    # Create the output folder on first use so a fresh checkout can run this.
    os.makedirs("inverse", exist_ok=True)

    for desc , content in descripteurs.items():
        reordered = content[["term","doc","frequency","weight"]]

        reordered.to_csv(f"inverse/{desc.replace('descripteur','inverse')}.txt",index = False,sep=" ")
    

Generate the inverse files.

In [31]:
# Rewrite every descriptor file in term-first column order under inverse/.
inverse("descripteurs")
