In [5]:
import string
import csv
import collections
import nltk
from nltk.corpus import stopwords
import operator
from collections import OrderedDict
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
import os
import math
from textblob import TextBlob as tb
import textrank

# Module-level stopword list consulted by clean_tokens(); starts out empty.
english_stops = []


def clean_tokens(tokens):
    """Return the lowercased tokens that survive filtering.

    A token is dropped when it is two characters or shorter, when it consists
    solely of punctuation characters, or when its lowercase form appears in
    the module-level ``english_stops`` list.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A list of the remaining tokens, lowercased, in their original order.
    """
    cleaned = []
    for token in tokens:
        if len(token) <= 2:
            continue
        # Check every character: a plain `token in string.punctuation` is a
        # substring test and lets tokens such as "..." or "---" through.
        if all(ch in string.punctuation for ch in token):
            continue
        lowered = token.lower()
        if lowered in english_stops:
            continue
        cleaned.append(lowered)
    return cleaned

def get_stopwords():
    """Load stopwords from ``stopword_file.csv`` into the module-level list.

    The first column of every non-empty row is taken as one stopword. Entries
    are stripped and lowercased because clean_tokens() compares them against
    lowercased tokens.

    Returns:
        The list of stopwords that was stored in ``english_stops``.
    """
    # Without this declaration the assignment below would only create a
    # local variable and the module-level list would stay empty.
    global english_stops
    with open('stopword_file.csv', 'r', encoding='utf-8') as f:
        # `if row` skips blank lines, which csv.reader yields as empty lists.
        english_stops = [row[0].strip().lower() for row in csv.reader(f) if row]
    return english_stops
    
def get_cleanTokens(directory):
    """Extract, tokenize and clean the text of every file in ``directory``.

    Each directory entry is passed to get_text(), tokenized with
    ``nltk.word_tokenize`` and filtered through clean_tokens().

    Args:
        directory: Path of the directory holding the documents.

    Returns:
        A single list with the cleaned tokens of all files, concatenated in
        the order ``os.listdir`` reports the files.
    """
    cleanTokens = []
    for filename in os.listdir(directory):
        # Build the path from the argument rather than a hard-coded folder so
        # the function works for any directory it is given.
        text = get_text(os.path.join(directory, filename))
        tokens = nltk.word_tokenize(text)
        cleanTokens.extend(clean_tokens(tokens))
    return cleanTokens
     
def create_freqList(cleanTokens):
    """Count token frequencies and hand the ranked result to write_csv().

    Args:
        cleanTokens: Iterable of tokens to count.

    Returns:
        An OrderedDict mapping token -> count, ordered from most to least
        frequent (ties keep first-seen order).
    """
    # A local Counter replaces the previously undefined `wordcount` name,
    # which raised NameError on the first token.
    wordcount = collections.Counter(cleanTokens)
    sorted_wordCount = OrderedDict(wordcount.most_common())
    write_csv(sorted_wordCount)
    return sorted_wordCount

def write_csv(sorted_wordCount):
    """Write each key of ``sorted_wordCount`` to ``frequency.csv``.

    Every key goes on its own line followed by a comma. The file is created
    when it does not exist yet and appended to otherwise.
    """
    target = 'frequency.csv'
    mode = 'a+' if os.path.isfile(target) else 'w'
    lines = ["%s,\n" % key for key in sorted_wordCount.keys()]
    with open(target, mode, encoding='utf-8') as out:
        out.writelines(lines)


def get_text(filename):
    """Extract the text of a PDF file using pdfminer.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The text of all pages as one string; an empty string when the
        document yields no pages.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the file even if a page fails to parse.
        with open(filename, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read the buffer once after the loop; this also covers documents
        # with zero pages, where a per-page read left the result unbound.
        return retstr.getvalue()
    finally:
        device.close()



def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    Args:
        word: The term to look up.
        blob: Object exposing a ``words`` sequence with a ``count`` method.

    Returns:
        Occurrences of ``word`` divided by the number of words, as a float;
        0.0 when the blob has no words (avoids a ZeroDivisionError).
    """
    words = blob.words
    if not words:
        return 0.0
    return float(words.count(word)) / float(len(words))

def n_containing(word, bloblist):
    """Count the entries of ``bloblist`` for which ``word in entry`` holds."""
    hits = 0
    for entry in bloblist:
        if word in entry:
            hits += 1
    return hits

def idf(word, bloblist):
    """Return log(len(bloblist) / (1 + number of entries containing word))."""
    total_entries = len(bloblist)
    smoothed_hits = float(1 + n_containing(word, bloblist))
    return math.log(total_entries / smoothed_hits)

def tfidf(word, blob, bloblist):
    """Return the product of tf(word, blob) and idf(word, bloblist)."""
    term_frequency = tf(word, blob)
    inverse_doc_frequency = idf(word, bloblist)
    return term_frequency * inverse_doc_frequency


def create_tfIdfList(document):
    """Score the words of ``document`` by TF-IDF and append them to a CSV.

    The document string is cut into roughly 400 equally sized character
    slices that act as the corpus. For every slice, each of its words is
    scored with tfidf() against all slices, and the words are appended to
    ``frequency.csv`` from highest to lowest score under a "Word, Score"
    header line.

    Args:
        document: The full text as a single string. An empty string is a
            no-op.
    """
    if not document:
        return
    totalLength = len(document)
    # Keep the slice length at 1 or more: for documents shorter than 400
    # characters the integer division yields 0, an invalid range() step.
    splitLength = max(1, totalLength // 400)
    # NOTE: slicing is by character index, so a slice boundary may fall in
    # the middle of a word.
    bloblist = [document[i:i + splitLength] for i in range(0, totalLength, splitLength)]
    # Open the output once instead of reopening it for every slice.
    with open('frequency.csv', 'a+', encoding='utf-8') as f:
        for chunk in bloblist:
            blob = tb(chunk)
            scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            f.write("Word, Score\n")
            for word, score in sorted_words:
                f.write("{},{}\n".format(word, round(score, 5)))

def create_textrankList(sample_text):
    """Append the TextRank key phrases of ``sample_text`` to ``textrank.csv``.

    Each phrase is written followed by ", ".

    Args:
        sample_text: Text handed to ``textrank.extractKeyphrases``.
    """
    # Assumes extractKeyphrases returns an iterable of strings; TODO confirm
    # against the textrank module in use.
    textrank_results = textrank.extractKeyphrases(sample_text)
    with open('textrank.csv', 'a+', encoding='utf-8') as f:
        # Iterate the phrases directly: wrapping them in enumerate() yielded
        # (index, phrase) tuples and `index + ', '` raised TypeError.
        for phrase in textrank_results:
            f.write(phrase + ', ')


if __name__ == '__main__':
    # Pipeline: load stopwords, tokenize the documents in KeywordDocs, then
    # write the frequency, TF-IDF and TextRank outputs.
    get_stopwords()
    # Use a distinct name so the clean_tokens() function is not shadowed.
    tokens = get_cleanTokens("KeywordDocs")
    clean_doc = ' '.join(tokens)
    create_freqList(tokens)
    create_tfIdfList(clean_doc)
    # Split the document into about ten parts for TextRank. The part length
    # is kept at 1 or more so range() never receives a zero step when the
    # document is shorter than ten characters.
    part_len = max(1, len(clean_doc) // 10)
    for start in range(0, len(clean_doc), part_len):
        create_textrankList(clean_doc[start:start + part_len])

In [24]:
! pip install textblob

Collecting textblob
  Downloading https://files.pythonhosted.org/packages/7c/7d/ad09a26b63d4ad3f9395840c72c95f2fc9fa2b192094ef14e9e720be56f9/textblob-0.15.2-py2.py3-none-any.whl (636kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.2
