In [45]:
import PyPDF2 as pdf
import os
import nltk
import numpy as np

FILE_MATCHES = 5
SENTENCE_MATCHES = 10


def main():
    """
    Interactive entry point of the question answering program
    Asks for the directory of the text files, loads and tokenizes them,
    then answers queries in a loop until the query is the single word "quit"
    For every query the top files and top sentences are written to a txt file
    """
    dirName = input(
        "Enter the directory name which papers with txt file extension are located: ")
    print("Loading Data...")
    files = load_files(dirName)

    # Tokenize every document once; the result is reused for every query
    files_words = {}
    for name, text in files.items():
        files_words[name] = tokenize(text)

    file_idfs = compute_idfs(files_words)
    print("Data loaded!")

    while True:
        raw_query = input("Enter query:     (enter 'quit' to exit the program)")
        query = set(tokenize(raw_query))

        # A query that tokenizes to exactly the word "quit" ends the program
        if query == {"quit"}:
            print("You quitted program")
            break

        print("Please wait procesing...")
        filenames = top_files(query, files_words, file_idfs, n=FILE_MATCHES)

        # Collect the sentences of the best matching files only
        sentences = {}
        for filename in filenames:
            for passage in files[filename].split("\n"):
                for sentence in nltk.sent_tokenize(passage):
                    tokens = tokenize(sentence)
                    if not tokens:
                        continue
                    sentences[sentence] = tokens

        idfs = compute_idfs(sentences)
        matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES)

        print(write_query_result_2txt(filenames, matches))


In [None]:
def convert_pdfs_to_txt(directory):
    """
    Extract texts from pdf files and write their content into new txt files
    You only want to do this process once
    Inputs a directory name, for every pdf file in the directory
    this function creates a new .txt file in the "papersTXT" directory
    (created if it does not exist) and writes the content of the pdf into it
    Entries of the directory that do not end with ".pdf" are skipped
    Outputs True when all pdf files are processed
    """
    outputDir = "papersTXT"
    os.makedirs(outputDir, exist_ok=True)

    for file in os.listdir(directory):
        stem, extension = os.path.splitext(file)
        if extension.lower() != ".pdf":
            # Anything that is not a pdf cannot be parsed by the pdf reader
            continue

        # The pdf must stay open while pages are read, and is closed right after
        # NOTE(review): PdfFileReader/numPages/getPage/extractText look like the
        # pre-3.0 PyPDF2 names - confirm against the installed PyPDF2 version
        with open(os.path.join(directory, file), "rb") as pdfFile:
            pdfReader = pdf.PdfFileReader(pdfFile)
            paperTxt = [pdfReader.getPage(pageNum).extractText()
                        for pageNum in range(pdfReader.numPages)]

        # "paper.pdf" becomes "paper.txt"
        filetxtName = stem + ".txt"
        with open(os.path.join(outputDir, filetxtName), "w", encoding="utf-8") as f:
            f.writelines(paperTxt)
    return True


# Runs the pdf to txt conversion on the "papersPDF" directory every time
# this cell is executed
convert_pdfs_to_txt("papersPDF")


In [None]:
def load_files(directory):
    """
    Extract text from txt files
    Inputs a directory name,
    Outputs a dictionary whose keys are names of the files and values are texts of the files
    Files are read as utf-8; entries that are not regular files
    (for example sub-directories) are skipped
    """
    filesDict = dict()
    for file in os.listdir(directory):
        path = os.path.join(directory, file)
        if not os.path.isfile(path):
            # open() would fail on a directory, so leave it out
            continue
        with open(path, "r", encoding="utf-8") as f:
            filesDict[file] = f.read()
    return filesDict


In [None]:
def tokenize(text):
    """
    Split a text into lowercase words
    The text is lowercased and tokenized with nltk.word_tokenize, then only
    the tokens that consist entirely of alphabetical characters are kept
    Inputs a string
    Outputs a list of words
    """
    return [token for token in nltk.word_tokenize(text.lower())
            if token.isalpha()]


In [None]:
def compute_idfs(fileDict):
    """
    computes idf values for tokenized words
    idf formula:
    idf= log(total number of documents/
            number of document that contains word)

    Inputs a dictionary whose keys are name of the files
    and values are tokenized words
    Outputs a dictionary whose keys are words and values are idf values
    An empty input gives an empty dictionary
    """
    # Count in how many documents each word occurs; taking set(content)
    # makes a word count once per document however often it is repeated
    countDict = dict()
    for content in fileDict.values():
        for word in set(content):
            countDict[word] = countDict.get(word, 0) + 1

    numFiles = len(fileDict)
    return {word: np.log(numFiles/count) for word, count in countDict.items()}


In [None]:
def top_files(query, files_words, file_idfs, n):
    """
    Ranks files by the sum of tf-idf values of the query words they contain
    Inputs a query (a set of words), files_words (a dictionary mapping
    names of files to a list of their words), file_idfs (a dictionary mapping
    words to their idf values), n (the number of files to return)
    Outputs a list of at most "n" file names, best match first
    Files that contain none of the query words are left out
    """
    scores = {}
    for term in query:
        for name, tokens in files_words.items():
            occurrences = tokens.count(term)
            if not occurrences:
                continue
            weight = occurrences * file_idfs[term]
            if name in scores:
                scores[name] += weight
            else:
                scores[name] = weight

    # Stable sort: files with equal scores keep the order they were first scored in
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:n]


In [None]:
def top_sentences(query, sentences, idfs, n):
    """
    Ranks sentences according to the sum of idf values of the query
    words they contain. If there are ties, preference is given to
    sentences that have a higher query term density, which is the
    fraction of the words of the sentence that are query words.
    Inputs a query (a set of words), sentences (a dictionary mapping
    sentences to a list of their words), idfs (a dictionary mapping words
    to their idf values), n (the number of sentences to return)
    Outputs a list of at most "n" sentences, best match first
    Sentences whose score is zero are left out
    """
    queryWords = set(query)
    topSen = dict()
    for sentence, words in sentences.items():
        score = 0
        for word in queryWords:
            if word in words:
                score += idfs[word]
        if score:
            # Density counts every query word of the sentence, not only
            # the query word that happened to be looked at last
            matched = sum(1 for word in words if word in queryWords)
            density = matched/len(words)
            topSen[sentence] = (score, density)

    # Tuples compare by score first and by density second
    sortedSentences = sorted(
        topSen, key=lambda sentence: topSen[sentence], reverse=True)
    return sortedSentences[:n]


In [None]:
def write_query_result_2txt(filenames, matches):
    """
    Print results of the query to the "Query_result.txt" file
    The file is overwritten on every call and contains, in order:
    the two summary lines, the names of the files, the sentences,
    and pairs of pdf name and sentence
    Inputs filenames (a list of file names) and matches (a list of sentences)
    Outputs the summary text that can be shown to the user
    """
    summary = ("top "+str(len(matches)) +
               " relevant sentences found in papers\n" +
               "top "+str(len(filenames)) +
               " relevant papers found in collection\n")

    with open("Query_result.txt", "w", encoding="utf-8") as f:
        f.write(summary+"\n")

        # f.write top FILE_MATCHES relevant filenames
        for filename in filenames:
            f.write(filename+"\n")

        f.write("\n")

        # f.write top SENTENCE_MATCHES relevant sentences
        for match in matches:
            f.write(match+"\n")

        f.write("\n")

        # f.write pairs of file name and sentences
        # NOTE(review): zip pairs the i-th file with the i-th sentence by
        # position only and stops at the shorter list; nothing here links a
        # sentence to the file it was taken from - confirm this is intended
        for filename, match in zip(filenames, matches):
            # Drop the extension of the text file so that "paper.txt"
            # is shown as "paper.pdf" and not as "paper.txt.pdf"
            paperName = os.path.splitext(filename)[0]
            f.write(f"{paperName}.pdf:   {match}\n")
    return summary + "Result of query is printed to Query_result.txt"


In [46]:
# Start the interactive query loop only when this is run as the main
# program, not when it is imported
if __name__ == "__main__":
    main()


Loading Data...
Data loaded!
Please wait procesing...
top 10 relevant sentences found in papers
top 5 relevant papers found in collection
Result of query is printed to Query_result.txt
Please wait procesing...
top 3 relevant sentences found in papers
top 2 relevant papers found in collection
Result of query is printed to Query_result.txt
