In [194]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import words
import fitz
import re
import time

In [195]:
# Directory that contains the PDFs to process. Edit the default below, or set the
# PDF_INPUT_DIR environment variable to point somewhere else without editing the notebook.
path = os.environ.get("PDF_INPUT_DIR", r"C:\Users\AbdulrahmanI\Downloads")
# Directory that will contain the output .txt files. Edit the default below, or set the
# PDF_OUTPUT_DIR environment variable to override it.
output_path = os.environ.get("PDF_OUTPUT_DIR", r"C:\Users\AbdulrahmanI\Downloads")

In [196]:
def get_files(directory):
    '''
    Lists the PDF files located directly inside a directory.
    Parameters: directory: path of the folder to scan (sub-folders are not searched)
    Returns: file_list: names (not full paths) of the regular files in the folder whose
    extension is ".pdf", compared case-insensitively; order follows os.listdir
    '''
    # Use the directory that was passed in rather than a module-level global, so the
    # function works for any folder the caller asks for.
    file_list = [
        file for file in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, file))
        and file.lower().endswith('.pdf')
    ]
    return file_list

In [197]:
def rest_of_pages(text):
    '''
    Tokenizes and filters words of the page being compared to the most updated change notice page.
    Parameters: text: the extracted text from the page being compared to the most updated change notice page.
    Returns: filtered_other_pages: set of lower-cased tokens of the page, with stop words filtered out
    boolean_val: True when the page contains the token "SUMMARY" or the token "TELEPHONE",
    False otherwise
    '''
    # removes all phone numbers from documents
    remove_phone = re.sub(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', "", text)
    word_list = word_tokenize(remove_phone)
    # checks if page is a change notice if it contains the keyword "SUMMARY"
    has_summary = "SUMMARY" in word_list
    if has_summary and "CONTRACT" in word_list:
        # removes top left text of older templates
        word_list = word_list[word_list.index("CONTRACT"):]
        # Only strip the agency details section when BOTH keywords are present.
        # (`"PRIMARY" and "CONTACT" in word_list` would only test "CONTACT".)
        # "AGENCY" must also be present, otherwise .index() would raise ValueError.
        if ("PRIMARY" in word_list and "CONTACT" in word_list
                and "AGENCY" in word_list):
            ind = word_list.index("AGENCY")
            # drop the tokens that follow "AGENCY" ...
            del word_list[ind + 1:ind + 20]
            # ... then "AGENCY" itself and the two tokens before it; clamp at 0 so a
            # negative start cannot turn into an index counted from the end of the list
            del word_list[max(ind - 2, 0):ind + 1]
    # pages without "SUMMARY" are still flagged when they contain "TELEPHONE"
    boolean_val = has_summary or "TELEPHONE" in word_list
    # Load the stop-word list once instead of once per token.
    # NOTE(review): tokens are matched in their original case and lower-cased afterwards,
    # so capitalised stop words (e.g. "THE") are kept. This is left unchanged so the result
    # stays comparable with the first-page word set built in page_by_page.
    stop_words = set(stopwords.words('english'))
    filtered_other_pages = {word.lower() for word in word_list if word not in stop_words}
    return filtered_other_pages, boolean_val

In [198]:
def calculation(filtered_first_pg, filtered_other_pages):
    '''
    Computes the cosine similarity of two keyword sets, treating each set as a 0/1
    vector over the combined vocabulary of both sets.
    Parameters: filtered_first_pg: keywords contained in the most updated change notice
    filtered_other_pages: keywords contained in the other page
    Returns: the cosine similarity as a float, or 0 when the denominator is zero
    (i.e. at least one of the sets is empty)
    '''
    # combined vocabulary: every keyword that appears on either page
    vocabulary = filtered_first_pg.union(filtered_other_pages)
    # membership indicator vectors, one entry per vocabulary word
    first_vector = [1 if term in filtered_first_pg else 0 for term in vocabulary]
    other_vector = [1 if term in filtered_other_pages else 0 for term in vocabulary]
    # dot product = number of keywords the two pages share
    dot_product = sum(a * b for a, b in zip(first_vector, other_vector))
    # the vectors are 0/1, so the sum of each one equals its squared norm
    norm_product = float((sum(first_vector) * sum(other_vector)) ** 0.5)
    try:
        return dot_product / norm_product
    except ZeroDivisionError:
        return 0

In [199]:
def write_to_file(cosine, text, new_file, boolean_val, threshold=0.38):
    '''
    Writes the page text to the output file unless the page is flagged and its
    similarity reaches the threshold.
    Parameters: cosine: cosine similarity between the two pages being compared
    text: page being compared to the most updated change notice
    new_file: file to write to (any object with a write() method)
    boolean_val: flag marking the page as a candidate for being skipped (set by
    rest_of_pages when the page contains "SUMMARY" or "TELEPHONE")
    threshold: similarity at or above which a flagged page is skipped; defaults to 0.38
    Returns: 0, always
    '''
    # A page is dropped only when it is flagged AND similar enough to the reference
    # page; every other page is copied to the output unchanged.
    skip_page = boolean_val and cosine >= threshold
    if not skip_page:
        new_file.write(text)
    return 0

In [200]:
def _page_keywords(page_text):
    '''Returns the set of lower-cased, stop-word-filtered tokens of page_text, phone numbers removed.'''
    # removes all phone numbers from documents
    remove_phone = re.sub(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', "", page_text)
    # Load the stop-word list once instead of once per token. Tokens are matched in
    # their original case and lower-cased afterwards, the same way rest_of_pages does it,
    # so both word sets stay comparable.
    stop_words = set(stopwords.words('english'))
    return {word.lower() for word in word_tokenize(remove_phone) if word not in stop_words}


def page_by_page(doc, first_pg, text, new_file):
    '''
    Iterates through each page in the PDF document, extracting text and ignoring any outdated
    change notice page.
    Pages are copied to the output (with newlines collapsed to spaces) until the first page
    containing "SUMMARY" is found; that page becomes the reference page. Every later page is
    compared to the reference page and written through write_to_file, which skips pages that
    are flagged and too similar to the reference.
    Parameters: doc: entire readable PDF document
    first_pg: text of the reference page; pass an empty string to let the function find it
    text: unused placeholder kept for interface compatibility (overwritten with each page's text)
    new_file: file to write to
    Returns: None
    '''
    # If the caller already supplied a reference page, build its keyword set up front so
    # the comparison branch never sees an unbound name.
    filtered_first_pg = _page_keywords(first_pg) if first_pg != "" else set()
    for page in doc:
        if first_pg != "":
            # reference page already known: compare this page against it
            text = page.get_text()
            filtered_other_pages, boolean_val = rest_of_pages(text)
            cosine = calculation(filtered_first_pg, filtered_other_pages)
            write_to_file(cosine, text, new_file, boolean_val)
            continue
        # still looking for the reference page: flatten the page text onto one line
        # (drops empty lines, joins the rest with single spaces)
        tokens = [line for line in page.get_text().split('\n') if line]
        first_pg = ' '.join(tokens) + ' '
        # the page is written to the output whether or not it is the reference page
        new_file.write(first_pg)
        if "SUMMARY" in first_pg:
            filtered_first_pg = _page_keywords(first_pg)
        else:
            # not a change notice page; keep searching on the next page
            first_pg = ""

In [201]:
%%time
def main():
    '''
    Converts every PDF found in `path` into a text file of the same base name in `output_path`,
    using page_by_page to decide which pages are written.
    Parameters: None (reads the module-level `path` and `output_path`)
    Returns: None
    '''
    for file_name in get_files(path):
        pdf_path = os.path.join(path, file_name)
        txt_path = os.path.join(output_path, os.path.splitext(file_name)[0] + ".txt")
        try:
            # Open the PDF first so an unreadable PDF does not leave an empty .txt behind,
            # and manage both handles with `with` so they are always closed.
            with fitz.open(pdf_path) as doc, \
                    open(txt_path, "w", encoding='utf8') as new_file:
                page_by_page(doc, "", "", new_file)
        except RuntimeError as err:
            # Best effort: report the file that could not be processed and move on.
            print(f"Skipping {file_name}: {err}")
            continue
if __name__ == '__main__':
    main()

STATE OF MICHIGAN CENTRAL PROCUREMENT SERVICES Department of Technology, Management, and Budget 525 W. ALLEGAN ST., LANSING, MICHIGAN 48913 P.O. BOX 30026 LANSING, MICHIGAN 48909 CONTRACT CHANGE NOTICE Change Notice Number 12 to 071B3200101 Contract Number  TekinelK@michigan.gov Kemal Tekinel Jarrod Barron ernie.sanders@perspecta.com Ernie Sanders Chantilly, VA 20151 PERSPECTA STATE & LOCAL INC. CV0017629 214-734-3093 15052 Conference Center Dr (517) 249-0406 STATE Program  Manager Contract  Administrator CONTRACTOR 517-284-4512 DTMB barronj1@michigan.gov $1,943,927.27 June 1, 2013 March 31, 2022 DISASTER RECOVERY SERVICES May 31, 2016 INITIAL AVAILABLE OPTIONS EXPIRATION DATE  BEFORE  4 - 1 Year PAYMENT TERMS DELIVERY TIMEFRAME NET 45 DAYS ALTERNATE PAYMENT OPTIONS EXTENDED PURCHASING ☐ P-Card ☐ PRC ☐ Other ☐ Yes ☒ No MINIMUM DELIVERY REQUIREMENTS DESCRIPTION OF CHANGE NOTICE OPTION LENGTH OF OPTION EXTENSION  LENGTH OF EXTENSION  REVISED EXP. DATE ☐ ☒ 3 months June 30, 2022 CURRENT V