In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the data
# folders used below can be read like ordinary local paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Connecting to Data Folder

In [None]:
import os
# Folder on the mounted Drive that is listed below.
# NOTE(review): a later cell reassigns file_path to a different folder
# (".../CITS5553_Group 5/wamex_metadata/50reports_table_of_contents") -
# confirm which of the two locations is the current one.
file_path = "/content/drive/MyDrive/CITS5553_Group 5/50reports_table_of_contents"
# Names of every entry found directly inside that folder.
file_list = os.listdir(file_path)

The model below computes TF-IDF scores for the table-of-contents TXT files (extracted from the PDF reports) and uses the cosine similarity between those TF-IDF vectors to find the top n most similar documents for a given target document, which is identified by its TXT file name.

In [None]:
import json
import os
import re
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources used by word_tokenize and the stopword list.
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' tables in word_tokenize instead of
# the pickled 'punkt' models; without this download a fresh runtime raises
# LookupError on the first tokenization.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as sw

# Setting up the text processing function
def process_text(text):
    """Tokenize ``text`` and remove English stopwords.

    The text is split with NLTK's ``word_tokenize``. Any token whose
    lower-cased form appears in NLTK's English stopword list is discarded;
    the surviving tokens keep their original casing and are returned as a
    single space-separated string.
    """
    words = word_tokenize(text)
    english_stopwords = set(sw.words('english'))
    kept = []
    for word in words:
        if word.lower() in english_stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)

# Loading documents
file_path = "/content/drive/MyDrive/CITS5553_Group 5/wamex_metadata/50reports_table_of_contents"
# os.listdir() returns names in arbitrary order; sort them so that document
# indices (and therefore the rows of the similarity matrix) are the same on
# every run.
file_list = sorted(os.listdir(file_path))
documents = []      # pre-processed text, one entry per TXT file
txt_file_list = []  # file names, index-aligned with `documents`

for file_name in file_list:
    if file_name.endswith('.txt'):
        with open(os.path.join(file_path, file_name), 'r', encoding='utf-8') as file:
            content = file.read()
        # Only reading needs the handle, so process the text after it is closed.
        documents.append(process_text(content))
        txt_file_list.append(file_name)

# Fail early with a clear message instead of an obscure vectorizer error later.
if not documents:
    raise FileNotFoundError(f"No .txt files found in '{file_path}'.")

# TF-IDF Vectorization
# Fit the vectorizer on the whole corpus and transform it in one step; row i of
# tfidf_matrix corresponds to documents[i].
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Computing similarity matrix
# Pairwise cosine similarity between all rows of tfidf_matrix: entry [i, j]
# compares documents[i] with documents[j].
similarity_matrix = cosine_similarity(tfidf_matrix)

# Setting function to get top n similar documents
def get_top_n_similar_documents(index, similarity_matrix, n=5):
    """Return the indices of the ``n`` documents most similar to document ``index``.

    Parameters
    ----------
    index : int
        Row of ``similarity_matrix`` that belongs to the target document.
    similarity_matrix : array-like
        Matrix whose entry ``[i, j]`` is the similarity between documents
        ``i`` and ``j``; only row ``index`` is read.
    n : int, default 5
        Maximum number of neighbours to return. Values below 1 yield an
        empty result.

    Returns
    -------
    numpy.ndarray
        Document indices ordered from most to least similar. The target
        itself is never included, and the result is shorter than ``n`` when
        fewer other documents exist.
    """
    similarity_values = similarity_matrix[index]
    sorted_indices = np.argsort(similarity_values)[::-1]
    # Normalise a negative index to its real position so the comparison
    # below still recognises the target row.
    self_position = range(len(similarity_values))[index]
    # Remove the target explicitly rather than assuming it is ranked first:
    # with duplicate documents (ties at 1.0) or an all-zero row it can appear
    # anywhere in the ordering.
    sorted_indices = sorted_indices[sorted_indices != self_position]
    # Clamp n so a negative value cannot turn into a "from the end" slice.
    return sorted_indices[:max(n, 0)]

# Requesting for user input
# Strip surrounding whitespace so a pasted name with a stray space still matches.
target_txt = input("Please enter the name of the target TXT file: ").strip()
n_text = input("Please enter the number of top similar documents you want to retrieve: ").strip()

# Parse the count on its own so a bad number is reported as such instead of
# surfacing as a raw traceback; 0 is used as the "invalid" marker.
try:
    n = int(n_text)
except ValueError:
    n = 0

if target_txt not in txt_file_list:
    print(f"'{target_txt}' not found in txt_file_list.")
elif n < 1:
    print(f"'{n_text}' is not a valid number of documents; please enter a whole number of at least 1.")
else:
    document_index = txt_file_list.index(target_txt)
    top_n_indices = get_top_n_similar_documents(document_index, similarity_matrix, n)

    # Report the number actually returned, which is smaller than n when the
    # corpus holds fewer than n other documents.
    print(f"Top {len(top_n_indices)} similar documents to {target_txt} are:")
    for i in top_n_indices:
        print("\nTXT File Name:", txt_file_list[i])
        print("Content:", documents[i])

# Example target file name: a072118_e31_566_2005a_16472941.txt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Please enter the name of the target TXT file: a072118_e31_566_2005a_16472941.txt
Please enter the number of top similar documents you want to retrieve: 3
Top 3 similar documents to a072118_e31_566_2005a_16472941.txt are:

TXT File Name: a072180_e31_564_2005a_9936201.txt
Content: TENEMENT STATUS ............................................................................................................................... 6 \n 3.0 GEOLOGY ................................................................................................................................................. 7 \n 3.1 REGIONAL GEOLOGY ....................................................................................................................................... 7 \n 3.2 LOCAL GEOLOGY ............................................................................................................................................ 7 \n 3.3 MINERALISATION ...............................................................

In [None]:
# Display the pairwise document similarity matrix computed above.
similarity_matrix

array([[1.        , 0.03691552, 0.06380629, ..., 0.02037928, 0.03599627,
        0.03004774],
       [0.03691552, 1.        , 0.03849052, ..., 0.04478536, 0.1588699 ,
        0.0859678 ],
       [0.06380629, 0.03849052, 1.        , ..., 0.15752522, 0.15581619,
        0.08125206],
       ...,
       [0.02037928, 0.04478536, 0.15752522, ..., 1.        , 0.0689867 ,
        0.0380864 ],
       [0.03599627, 0.1588699 , 0.15581619, ..., 0.0689867 , 1.        ,
        0.09229298],
       [0.03004774, 0.0859678 , 0.08125206, ..., 0.0380864 , 0.09229298,
        1.        ]])