In [1]:
import os
import pandas
import string
import nltk
import numpy as np

from tqdm.notebook import tqdm_notebook
from glob import glob
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')
NLTK_STOP_WORDS = [word for word in stopwords.words('english')]

chars_to_remove = list(string.punctuation)
chars_to_remove += ["*", "-", "/", "+"]
chars_to_remove = set(chars_to_remove)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\m7irt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
paths = glob(os.path.join('Original_Text_Files', '*.txt'))
print(f"Text Files Found: {len(paths)}")

Text Files Found: 22379


In [4]:
def remove_stop_words(input_file_path):
    output_file_path = "./Stopwords_Removed/" + os.path.basename(input_file_path)
    
    with open(input_file_path, "r") as input_file, open(output_file_path, "w+") as output_file:
        lines = input_file.readlines()
        for line in lines:
            words = line.split()
            for word in words:
                l_word = word.lower()
                if l_word in NLTK_STOP_WORDS:
                    words.remove(word)

            output_file.write(" ".join(words))
            output_file.write(" ")
    
    return output_file_path

In [5]:
def remove_punctuation(input_file_path):
    output_file_path = "./Punctuation_Removed/" + os.path.basename(input_file_path)
    
    with open(input_file_path, "r") as input_file, open(output_file_path, "w") as output_file:
        data = input_file.read()
        for char in chars_to_remove:
            data = data.replace(str(char), "")
        
        output_file.write(data)
    
    return output_file_path

In [6]:
def to_lowercase(input_file_path):
    output_file_path = "./Lowercased_Documents/" + os.path.basename(input_file_path)
    
    with open(input_file_path, "r") as input_file, open(output_file_path, "w+") as output_file:
        lines = input_file.readlines()
        for line in lines:
            words = line.split()
            lowercased_words = []
            for word in words:
                l_word = word.lower()
                lowercased_words.append(l_word)

            output_file.write(" ".join(lowercased_words))
            output_file.write(" ")

    return output_file_path

In [7]:
for input_path in tqdm_notebook(paths, desc = "Progress", ncols = 700, unit = " Files"):
    input_path = remove_stop_words(input_path)
    input_path = remove_punctuation(input_path)
    input_path = to_lowercase(input_path)

Progress:   0%|                                                                                               …

In [8]:
def get_document_content():
    docs_paths = glob(os.path.join('Lowercased_Documents', '*.txt'))
    docs_with_content = {}
    
    for input_path in tqdm_notebook(docs_paths, desc = "Progress", ncols = 700, unit = " Files"):
        doc_words = []
        with open(input_path, "r") as input_file:
            lines = input_file.readlines()
            for line in lines:
                words = line.split()
                for word in words:
                    doc_words.append(word)
        
        docs_with_content[os.path.basename(input_path)] = doc_words

    return docs_with_content

In [9]:
documents_content = get_document_content()

Progress:   0%|                                                                                               …

In [39]:
def create_vocabulary(documents):
    vocabulary = {}
    for doc_words in tqdm_notebook(documents.values(), desc = "Progress", ncols = 700, unit = " Documents"):
        for word in doc_words: 
            if word not in vocabulary:
                vocabulary[word] = True
    
    return vocabulary

In [40]:
vocabulary = create_vocabulary(documents_content)
print(f"Words in Vocabulary: {len(vocabulary)}")

Progress:   0%|                                                                                               …

Words in Vocabulary: 1808540


In [42]:
def get_documnet_term_frequency(documents_content):
    docs_paths = glob(os.path.join('Lowercased_Documents', '*.txt'))
    doc_term_frequency = {}
    
    for input_path in docs_paths:
        doc_name = os.path.basename(input_path)
        doc_term_frequency[doc_name] = {}
        
    for doc in tqdm_notebook(documents_content.keys(), desc = "Progress", ncols = 700, unit = " Documents"):
        current_doc_words = documents_content[doc]
        words_count = {}
        
        for word in current_doc_words:
            words_count[word] = 0
        
        for word in current_doc_words:
            words_count[word] += 1
        
        doc_term_frequency[doc] = words_count
    
    return doc_term_frequency

In [43]:
document_term_frequency = get_documnet_term_frequency(documents_content)

Progress:   0%|                                                                                               …

In [50]:
def get_word_document_frequency(vocabulary, document_tf):
    word_df = {}
    for vocab_word in tqdm_notebook(vocabulary.keys(), desc = "Progress", ncols = 700, unit = " Words"):
        word_doc_count = 0
        for tf in document_tf.values():
            if vocab_word in tf:
                word_doc_count += 1
                
        word_df[vocab_word] = word_doc_count
    return word_df

In [51]:
word_document_frequency = get_word_document_frequency(vocabulary, document_term_frequency)

Progress:   0%|                                                                                               …

In [54]:
def get_idf(vocab, m, word_df):
    inverse_df_vocab = {}
    for vocab_word in tqdm_notebook(vocab.keys(), desc = "Progress", ncols = 700, unit = " Words"):
        inverse_df_vocab[vocab_word] = np.log2((m + 1) / word_df[vocab_word])
    
    return inverse_df_vocab

In [55]:
idf_vocabulary = get_idf(vocabulary, len(paths), word_document_frequency)

Progress:   0%|                                                                                               …

In [61]:
def get_tf_idf(idf_vocab, doc_tf):
    tf_idf_docs = {}
    for doc_name in doc_tf.keys():
        tf_idf_docs[doc_name] = {}
        
    for (doc_name, words_freq) in tqdm_notebook(doc_tf.items(), desc = "Progress", ncols = 700, unit = " Documents"):
        tf_idf_words = {}
        for (doc_word, word_freq) in words_freq.items():
            tf_idf_words[doc_word] = word_freq * idf_vocab[doc_word]
            
        tf_idf_docs[doc_name] = tf_idf_words
        
    return tf_idf_docs

In [62]:
tf_idf_documents = get_tf_idf(idf_vocabulary, document_term_frequency)

Progress:   0%|                                                                                               …

In [75]:
queries = {
    1 : "LDA",
    2 : "Topic modelling",
    3 : "Generative models",
    4 : "Semantic relationships between terms",
    5 : "Natural Language Prrocessing",
    6 : "Text Mining",
    7 : "Translation model",
    8 : "Learning model",
    9 : "Semantic evaluations",
    10 : "System results and combination"
}

In [97]:
def vector_space_model(query, tf_idf_docs):
    query_word_count = {}
    relevance_scores = {}
    
    for word in query:
        query_word_count[word] = 0
        
    for word in query:
        query_word_count[word] += 1
    
    for doc_name in tf_idf_docs.keys():
        rel_score = 0
        for query_word in query:
            x = query_word_count[query_word]
            y_temp = tf_idf_docs.get(doc_name, {}).get(query_word)
            y = y_temp if y_temp else 0
            rel_score += x * y 
        
        relevance_scores[doc_name] = rel_score
    
    return relevance_scores

In [109]:
for i in range(1, 11):
    query = queries[i]
    print(f"\nQuery # {i}: {query}\nResults:")
    
    query = query.lower()
    query = query.split()
    relevance_scores = vector_space_model(query, tf_idf_documents)
    
    sorted_scores = sorted(relevance_scores, key = relevance_scores.get, reverse = True)
    
    i = 0
    for doc in sorted_scores:
        print(doc, relevance_scores[doc])
        i += 1
        if i == 5: break


Query # 1: LDA
Results:
J14-2003.pdf.txt 382.13569917748
D09-1026.pdf.txt 352.7406453945969
D11-1050.pdf.txt 342.94229413363587
N10-1070.pdf.txt 313.5472403507528
P10-1117.pdf.txt 298.84971345931126

Query # 2: Topic modelling
Results:
J14-2003.pdf.txt 379.4902612898605
P12-1079.pdf.txt 338.76447715143644
Q15-1004.pdf.txt 303.5922090318884
N15-1074.pdf.txt 301.74103702559637
W10-4104.pdf.txt 281.37814495638435

Query # 3: Generative models
Results:
W06-1668.pdf.txt 187.6090498930423
W11-0100.pdf.txt 176.02977221496937
J03-4003.pdf.txt 149.81290586100675
D09-1111.pdf.txt 132.98051273548964
D09-1058.pdf.txt 132.23411905337565

Query # 4: Semantic relationships between terms
Results:
W11-0100.pdf.txt 1043.6132419908004
J08-2004.pdf.txt 201.8914384556315
W15-3808.pdf.txt 185.08429317623128
J91-1003.pdf.txt 184.25659918760599
W09-2004.pdf.txt 178.67830016293192

Query # 5: Natural Language Prrocessing
Results:
W11-0100.pdf.txt 106.62161254506168
J14-1005.pdf.txt 88.83184861348217
J87-1020.