In [1]:
import requests
import lxml
from bs4 import BeautifulSoup
import ast
import pandas as  pd
import codecs
from tqdm.notebook import tqdm
import numpy as np
import glob
import re
import pickle

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Rebinds the name `stopwords` from the imported NLTK corpus reader to a plain
# set of English stopwords; code below uses it for membership tests.
stopwords = set(stopwords.words('english'))

## Reading all the XML files

In [2]:
# Load the pickled collection of file paths (presumably XML paths, going by
# the variable name -- confirm against whatever produced path_list.pkl).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('path_list.pkl', 'rb') as path_list_file:
    xml_files = pickle.load(path_list_file)

In [3]:
# Number of entries loaded from path_list.pkl
len(xml_files)

1942302

In [38]:
def file_reader(file):
    """Parse one TEI XML file into a single-record list.

    Parameters
    ----------
    file : str
        Path to a UTF-8 encoded TEI XML file.

    Returns
    -------
    list of dict
        A one-element list holding a dict with the keys ``'id'``, ``'title'``,
        ``'text'`` and ``'abstract'``. All text fields are lower-cased with
        newlines replaced by spaces. ``'title'`` and ``'abstract'`` are empty
        strings when the corresponding header element is missing.
    """
    def _clean_text(node):
        # Beautiful Soup returns None for a tag that is not present; treat it
        # as empty text so one malformed file does not abort a corpus-wide run.
        if node is None:
            return ''
        return node.get_text().lower().replace('\n', ' ')

    with codecs.open(file, 'r', "utf-8") as tei:
        soup = BeautifulSoup(tei, 'lxml')

    header = soup.teiheader
    abstract = _clean_text(header.abstract if header is not None else None)
    title = _clean_text(header.title if header is not None else None)
    full_text = _clean_text(soup)

    # NOTE(review): the fixed slice assumes a 28-character directory prefix and
    # an 8-character suffix on every path -- confirm against the path list.
    doc_id = file[28:-8]

    return [{'id': doc_id, 'title': title, 'text': full_text, 'abstract': abstract}]
    

In [5]:
# Load the previously pickled document records.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('text_title_abstract.pkl', 'rb') as records_file:
    df_dic_all_files = pickle.load(records_file)

In [6]:
# Build a DataFrame from the loaded records and display it
df = pd.DataFrame(df_dic_all_files)
df

Unnamed: 0,id,title,text,abstract
0,1807.11091,"structadmm: a systematic, high-efficiency fram...","structadmm: a systematic, high-efficiency ...",weight pruning methods of dnns have been demo...
1,hep-ph0003173,brane cosmologies without orbifolds,brane cosmologies without orbifolds ma...,we study the dynamics of branes in configurat...
2,2002.06870,consistency of the plfit estimator for power-l...,consistency of the plfit estimator for pow...,we prove the consistency of the power-law fit...
3,1204.6600,positive operators and maximal operators in a ...,positive operators and maximal operators i...,"in a filtered measure space, a characterizati..."
4,1404.3331,priors for random count matrices derived from ...,priors for random count matrices derived f...,we define a family of probability distributio...
...,...,...,...,...
1940905,2011.02495,orbital foregrounds for ultra-short duration t...,orbital foregrounds for ultra-short durati...,reflections from objects in earth orbit can p...
1940906,2101.06890,cooperative and competitive biases for multi-a...,cooperative and competitive biases for mul...,training a multi-agent reinforcement learning...
1940907,2112.01060,strong optical coupling in metallo-dielectric ...,strong optical coupling in metallo-dielect...,metasurfaces consisting of hybrid metal/diele...
1940908,2106.06977,floquet engineering of magnetism in topologica...,floquet engineering of magnetism in topolo...,


In [None]:
def stopremoveal(t):
    """Clean ``t['text']`` in place.

    The text is split on whitespace, then tokens are dropped when they are a
    single character, an (optionally negative) integer, or, case-insensitively,
    a member of the module-level ``stopwords`` collection. In every surviving
    token each character outside ``A-Za-z0-9`` is replaced by a space, and the
    tokens are re-joined with single spaces.

    Parameters
    ----------
    t : dict
        Record with a ``'text'`` string entry; the entry is overwritten.

    Returns
    -------
    None
    """
    def _is_integer(token):
        # Callers only pass tokens of length >= 2, so token[0] is always valid.
        digits = token[1:] if token[0] == '-' else token
        return digits.isdigit()

    kept = []
    for token in t['text'].split():
        if len(token) <= 1 or _is_integer(token):
            continue
        if token.lower() in stopwords:
            continue
        # Punctuation is blanked after the stopword check, so a token such as
        # "the," is not recognised as a stopword.
        kept.append(re.sub('[^A-Za-z0-9]', ' ', token))

    t['text'] = " ".join(kept)
    

In [None]:
# Clean the text of every loaded record in place.
# Better to use multiprocessing for this task.
for doc in tqdm(df_dic_all_files):
    stopremoveal(doc)

## Creating the inverted index

In [4]:
import os
import string
import multiprocessing

def process_file(filename):
    """Count word occurrences in a single document.

    Parameters
    ----------
    filename : dict
        Despite the name, this is a document record rather than a path: its
        whitespace-separated text is read from ``'text'`` and its identifier
        from ``'id'``.

    Returns
    -------
    list of list
        One ``[word, frequency, doc_id]`` entry per distinct word longer than
        one character, in order of first appearance in the text.
    """
    # A dict gives constant-time lookups per word; dicts keep insertion order,
    # so the result is still ordered by first appearance.
    counts = {}
    for word in filename['text'].split():
        if len(word) > 1:
            counts[word] = counts.get(word, 0) + 1

    # 'id' is only read when there is at least one counted word.
    return [[word, freq, filename['id']] for word, freq in counts.items()]




In [6]:
if __name__ == '__main__':
    # Number of worker processes used to count words in parallel
    num_processes = 20

    # Records to index; each one is passed to process_file
    filenames = df_dic_all_files

    # Batch several records per task to limit inter-process overhead, while
    # keeping batches small enough for the progress bar to advance smoothly.
    chunksize = max(1, min(1000, len(filenames) // (num_processes * 4)))

    # The context manager shuts the pool down once all results are collected.
    # imap yields results in input order as they complete, so tqdm can report
    # real progress instead of a single update after a blocking map().
    with multiprocessing.Pool(num_processes) as pool:
        results = list(
            tqdm(
                pool.imap(process_file, filenames, chunksize=chunksize),
                total=len(filenames),
            )
        )

    # Inverted index: word -> {'freq': total occurrences, 'docs': [doc ids]}
    word_freq_doc_id = {}

    # Merge the per-document [word, freq, doc_id] lists into the index
    for file_result in tqdm(results):
        for word, freq, doc_id in file_result:
            entry = word_freq_doc_id.get(word)
            if entry is None:
                # First time this word is seen: start from this document's
                # frequency and a document list holding only this document.
                word_freq_doc_id[word] = {'freq': freq, 'docs': [doc_id]}
            else:
                # Known word: accumulate the frequency and record the document
                entry['freq'] += freq
                entry['docs'].append(doc_id)

  0%|          | 0/440910 [00:00<?, ?it/s]

  0%|          | 0/440910 [00:00<?, ?it/s]

In [8]:
# Persist the merged word -> frequency/documents mapping to disk
with open('word_freq_doc_id.pkl', 'wb') as index_file:
    pickle.dump(word_freq_doc_id, index_file)