In [3]:
import PyPDF2 as pdf
import os
import nltk
import numpy as np


def main():
    """
    Build the idf table for a directory of text files

    Asks on standard input for a directory name, loads every file in it
    with load_files, tokenizes each text with tokenize and feeds the
    result to compute_idfs.

    Outputs the dictionary produced by compute_idfs
    (keys are words, values are idf values)
    """
    dirName = input(
        "Enter the directory name which papers with txt file extension are located: ")
    files = load_files(dirName)
    files_words = {filename: tokenize(text) for filename, text in files.items()}
    # Return the table instead of dropping it, so a caller (or a notebook
    # cell) can actually use the computed values.
    return compute_idfs(files_words)


In [None]:
def convert_pdfs_to_txt(directory, output_dir="papersTXT"):
    """
    Extract texts from pdf files and write their content into new txt files
    You only want to do this process once

    Inputs a directory name; for every regular file in it whose name ends
    with ".pdf" (case-insensitive) a "<name>.txt" file holding the text of
    all pages is written into output_dir. Other directory entries are
    skipped.
    output_dir is created when missing and defaults to "papersTXT".
    Outputs True once every pdf in the directory has been processed
    """
    # Make sure the destination exists so the first write cannot fail with
    # FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    for file in os.listdir(directory):
        sourcePath = os.path.join(directory, file)
        baseName, extension = os.path.splitext(file)
        # Only real pdf files are handed to the pdf reader.
        if extension.lower() != ".pdf" or not os.path.isfile(sourcePath):
            continue

        # The with-block closes the pdf handle even if extraction raises.
        with open(sourcePath, "rb") as pdfFile:
            pdfReader = pdf.PdfFileReader(pdfFile)
            paperTxt = [pdfReader.getPage(pageNum).extractText()
                        for pageNum in range(pdfReader.numPages)]

        # Swap the extension properly ("paper.pdf" -> "paper.txt") rather
        # than slicing off three characters, which left "paper.".
        targetPath = os.path.join(output_dir, baseName + ".txt")
        with open(targetPath, "w", encoding="utf-8") as f:
            f.writelines(paperTxt)
    return True


# Run the conversion on the "papersPDF" directory. This call sits at the top
# level of the cell, so it executes every time the cell is run.
convert_pdfs_to_txt("papersPDF")


In [None]:
def load_files(directory):
    """
    Read every file found in a directory as UTF-8 text
    Inputs a directory name
    Outputs a dictionary mapping each file name to the full text of that file
    """
    def _read_text(path):
        # Open, read everything and close again before returning.
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()

    return {
        name: _read_text(os.path.join(directory, name))
        for name in os.listdir(directory)
    }


In [None]:
def tokenize(text):
    """
    Turn a text into a list of lowercase, purely alphabetical words
    The text is lowercased, split with nltk.word_tokenize, and every token
    that is not made up only of alphabetical characters is dropped
    Inputs a string
    Outputs a list of words
    """
    lowered = text.lower()
    return list(filter(str.isalpha, nltk.word_tokenize(lowered)))


In [None]:
def compute_idfs(fileDict):
    """
    computes idf values for tokenized words
    idf formula:
    idf= log(total number of documents/
            number of document that contains word)

    Inputs a dictionary whose keys are name of the files
    and values are tokenized words
    Outputs a dictionary whose keys are words and values are idf values
    (an empty input yields an empty dictionary)
    """
    # Document frequency: in how many files does each word occur?
    # Going through set(words) counts a word once per file and avoids the
    # repeated list scans of a "word in list" test for every unique word.
    countDict = dict()
    for words in fileDict.values():
        for word in set(words):
            countDict[word] = countDict.get(word, 0) + 1

    numFiles = len(fileDict)
    # count is always >= 1 here, so the division is safe.
    return {word: np.log(numFiles / count) for word, count in countDict.items()}


In [None]:
# Start the pipeline only when this code is the top-level program
# (__name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
