In [77]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import fitz
import re

In [78]:
# Directory holding the documents to process. Change this to the matching
# path on your own machine; main() lists this directory and opens each entry.
dir_path = "/Users/ilyasabdulrahman/Desktop/work_docs/some_docs"

In [79]:
def get_file_list(dir_path):
    '''
    Switches the working directory to dir_path and lists what it contains.
    Note that the change of working directory persists after the call, so any
    relative path used afterwards is resolved against dir_path.
    Parameters: dir_path: path of the directory holding the documents
    Returns: list with the names of the entries found in that directory
    '''
    os.chdir(dir_path)
    # after the chdir above, the current directory *is* dir_path
    entries = os.listdir(os.curdir)
    return entries

In [80]:
def rest_of_pages(text):
    '''
    Tokenizes a page and reduces it to the keyword set that is compared against
    the most updated change notice page.
    Parameters: text: the extracted text from the page being compared to the most updated change notice page.
    Returns: a tuple (filtered_other_pages, boolean_val) where
    filtered_other_pages: set of lower-cased words left after removing stop words
    boolean_val: True when the page contains both "PRIMARY" and "CONTACT", which marks
    it as an "old" template of a change notice page
    '''
    word_list = word_tokenize(text)
    # Both keywords must be present. The previous form
    # `"PRIMARY" and "CONTACT" in word_list` only ever tested for "CONTACT",
    # because a non-empty string literal is always truthy.
    boolean_val = "PRIMARY" in word_list and "CONTACT" in word_list
    if boolean_val:
        # skips first few words that would have decreased the similarity
        word_list = word_list[30:]
    # only uses first 40 words used to compare the pages
    word_list = word_list[:40]
    # build the stop-word set once instead of re-reading the list for every token
    english_stopwords = set(stopwords.words('english'))
    # the stop-word test is done on the token as it appears in the text (before
    # lower-casing), matching how the first page is filtered
    filtered_other_pages = {word.lower() for word in word_list if word not in english_stopwords}
    return filtered_other_pages, boolean_val

In [81]:
def calculation(filtered_first_pg, filtered_other_pages):
    '''
    Calculates the cosine similarity between the identified most updated change notice page and the other page.
    Each page is treated as a binary vector over the combined vocabulary, so the
    similarity is |A & B| / sqrt(|A| * |B|).
    Parameters: filtered_first_pg: keywords contained in the most updated change notice
    filtered_other_pages: keywords contained in the other page
    Returns: cosine similarity between the two pages as a float in [0, 1];
    0.0 when either page has no keywords
    '''
    first_words = set(filtered_first_pg)
    other_words = set(filtered_other_pages)
    # A page without any keywords (e.g. a blank or image-only page) has a zero
    # vector; report "no similarity" instead of dividing by zero.
    if not first_words or not other_words:
        return 0.0
    # dot product of two 0/1 vectors == number of shared keywords
    shared = len(first_words & other_words)
    return shared / float((len(first_words) * len(other_words)) ** 0.5)

In [82]:
def write_to_file(cosine, text, new_file, boolean_val, threshold=0.58):
    '''
    Writes the page text to the output file unless the page is judged to be a
    change notice page (similar to the reference page, or an "old template").
    Parameters: cosine: cosine similarity between the two pages being compared
    text: page being compared to the most updated change notice
    new_file: file to write to
    boolean_val: a boolean value used to determine if the page is an "old template" of a change notice page
    threshold: similarity at or above which the page is skipped (defaults to 0.58)
    Returns: 0 in every case
    '''
    too_similar = cosine >= threshold
    if too_similar or boolean_val:
        # page is skipped: nothing is written
        return 0
    new_file.write(text)
    return 0

In [83]:
def page_by_page(doc, first_pg, text, new_file):
    '''
    Iterates through each page in the PDF document, extracting text and ignoring any outdated
    change notice page.
    The first page that yields text becomes the reference page: it is written to
    new_file only if it contains "Change Notice", and its first 40 tokens (minus
    stop words) form the keyword set every later page is compared against.
    Parameters: doc: entire readable PDF document
    first_pg: reference page text; normally an empty string, in which case it is
    taken from the document itself
    text: unused placeholder kept for the callers' signature (overwritten per page)
    new_file: file to write to
    Returns: None
    '''
    # build the stop-word set once instead of re-reading the list for every token
    english_stopwords = set(stopwords.words('english'))

    def reference_keywords(page_text):
        # only uses first 40 words used to compare the pages, with stop words removed
        return {word.lower() for word in word_tokenize(page_text)[:40]
                if word not in english_stopwords}

    # When the caller already supplies reference text, use it directly rather than
    # failing on an undefined keyword set at the first comparison.
    filtered_first_pg = reference_keywords(first_pg) if first_pg != "" else None

    for page in doc:
        if first_pg == "":
            # still looking for the reference page (pages with no text leave
            # first_pg empty, so the search continues on the next page)
            first_pg += page.get_text()
            if "Change Notice" in first_pg:
                new_file.write(first_pg)
            filtered_first_pg = reference_keywords(first_pg)
            continue
        text = page.get_text()
        filtered_other_pages, boolean_val = rest_of_pages(text)
        cosine = calculation(filtered_first_pg, filtered_other_pages)
        write_to_file(cosine, text, new_file, boolean_val)

In [84]:
def main():
    '''
    Processes every entry of dir_path, writing the retained text of each
    document that can be opened to its own "output<N>.txt" file.
    Output files are created with relative names, so they land in the current
    working directory at the time of the call.
    Parameters: None
    Returns: None
    '''
    count = 1
    file_list = get_file_list(dir_path)
    for file_name in file_list:
        try:
            # `with` guarantees the output file is flushed and closed even when
            # the document cannot be opened or processing fails midway
            with open("output" + str(count) + ".txt", "w") as new_file:
                with fitz.open(os.path.join(dir_path, file_name)) as doc:
                    page_by_page(doc, "", "", new_file)
        except RuntimeError:
            # entry could not be handled as a document; the counter is not
            # advanced, so its output file name is reused by the next entry
            continue
        # increments counter to write to separate file for each pdf document
        count += 1


if __name__ == '__main__':
    main()