In [1]:
import os
import sys
import copy
import collections

import nltk
import nltk.tokenize

import pagerank

In [2]:
def __preprocessDocument(document, relevantPosTags):
    '''
    Turn a raw document string into the list of
    lowercased words that should take part in ranking.

    The document is tokenized, every token is tagged
    with its part of speech, and a token is kept only
    when it is not one of the punctuation marks
    recognised by __isPunctuation and its tag is a
    member of relevantPosTags. Kept words preserve
    their original order, and repeated words are kept
    as many times as they occur.
    '''

    tokens = __tokenizeWords(document)
    # Tagging runs on the original-case tokens; lowercasing
    # happens only afterwards, when a token is examined.
    tags = __tagPartsOfSpeech(tokens)

    keptWords = []
    for token, tag in zip(tokens, tags):
        lowered = token.lower()
        # Skip punctuation first so the tag lookup is only
        # evaluated for real words.
        if __isPunctuation(lowered) or tag not in relevantPosTags:
            continue
        keptWords.append(lowered)

    return keptWords

In [3]:
def textrank(document, windowSize=2, rsp=0.15, relevantPosTags=["NN", "ADJ"]):
    '''
    This function accepts a string representation
    of a document and three hyperparameters as input.
    It returns the result of pagerank.powerIteration
    (described by the original author as a Pandas
    object that can be treated as a dictionary) that
    maps words in the document to their associated
    TextRank significance scores, sorted from highest
    to lowest score. Note that only words that are
    classified as having relevant POS tags are present
    in the map.

    Parameters:
        document:        the text to analyse, as a string.
        windowSize:      how many positions to either side
                         of a word are treated as
                         co-occurring with it.
        rsp:             passed through unchanged as the
                         rsp argument of
                         pagerank.powerIteration.
        relevantPosTags: collection of POS tags; a word is
                         kept only if its tag is in this
                         collection. It is only read, never
                         modified, so the shared default
                         list is safe.

    NOTE(review): the tags come from nltk.pos_tag (see
    __tagPartsOfSpeech), whose default tagset appears to
    be Penn Treebank, where adjectives are "JJ" rather
    than "ADJ" -- confirm that the default tag list
    selects what is intended.
    '''

    # Tokenize document:
    words = __preprocessDocument(document, relevantPosTags)

    # Build a weighted graph where nodes are words and
    # edge weights are the number of times words cooccur
    # within a window of predetermined size. In doing so
    # we double count each cooccurrence, but that will not
    # alter relative weights which ultimately determine
    # TextRank scores.
    edgeWeights = collections.defaultdict(collections.Counter)
    for index, word in enumerate(words):
        for otherIndex in range(index - windowSize, index + windowSize + 1):
            # Ignore positions outside the document and the word itself.
            if 0 <= otherIndex < len(words) and otherIndex != index:
                otherWord = words[otherIndex]
                edgeWeights[word][otherWord] += 1.0

    # Apply PageRank to the weighted graph:
    wordProbabilities = pagerank.powerIteration(edgeWeights, rsp=rsp)

    # Sort by descending score. The legacy in-place sort() is kept
    # wherever the result type still offers it, so behaviour there is
    # unchanged; newer pandas releases removed it, in which case
    # sort_values() is used instead. The check is made on the type,
    # not the instance, because attribute lookup on a pandas object
    # may also resolve index labels (and "sort" is a plausible word).
    resultType = type(wordProbabilities)
    if hasattr(resultType, "sort"):
        wordProbabilities.sort(ascending=False)
    else:
        wordProbabilities = wordProbabilities.sort_values(ascending=False)

    return wordProbabilities

In [4]:
def __asciiOnly(string):
    '''
    Return a copy of the input with every character
    whose code point is 128 or above removed.
    '''
    return "".join(char for char in string if ord(char) < 128)

def __isPunctuation(word):
    '''
    Report whether the token is exactly one of the
    single-character punctuation marks listed below.
    '''
    punctuationMarks = (".", "?", "!", ",", "\"", ":", ";", "'", "-")
    return word in punctuationMarks

def __tagPartsOfSpeech(words):
    '''
    Run nltk.pos_tag over the given tokens and return
    only the second element (the tag) of each pair it
    yields, in the same order.
    '''
    tags = []
    for taggedWord in nltk.pos_tag(words):
        tags.append(taggedWord[1])
    return tags

def __tokenizeWords(sentence):
    '''
    Split the given text into word tokens by delegating
    to nltk.tokenize.word_tokenize.
    '''
    tokens = nltk.tokenize.word_tokenize(sentence)
    return tokens


In [7]:
def applyTextRank(fileName, title="a document"):
    '''
    Read a text file, run TextRank over its contents
    and print the resulting keyword scores.

    Parameters:
        fileName: path of the file to read. It is opened
                  as given, so a relative path resolves
                  against the current working directory.
        title:    human-readable name used in the progress
                  messages and in the report header.

    Nothing is returned; all results go to standard output.
    Errors raised by open() or by textrank() propagate to
    the caller.
    '''
    # Every print below takes a single parenthesised argument, which
    # prints the same text under the Python 2 print statement and the
    # Python 3 print function. print("") stands in for a bare `print`.
    print("")
    print("Reading \"%s\" ..." % title)
    # The path is used verbatim: a notebook has no __file__ to resolve
    # it against, so no directory is prepended.
    filePath = fileName
    # Use a context manager so the file handle is closed promptly,
    # even if reading fails.
    with open(filePath) as handle:
        document = handle.read()
    document = __asciiOnly(document)

    print("Applying TextRank to \"%s\" ..." % title)
    keywordScores = textrank(document)

    print("")
    header = "Keyword Significance Scores for \"%s\":" % title
    print(header)
    # Underline the header with dashes of matching width.
    print("-" * len(header))
    print(keywordScores)
    print("")


In [8]:
# Demo run: rank the keywords of the text file in the working directory.
# NOTE: the recorded output below shows this cell failing with a
# LookupError because the NLTK 'tokenizers/punkt' resource was not
# installed; the message suggests fetching it with nltk.download().
applyTextRank(fileName="Cinderalla.txt", title="Cinderalla")


Reading "Cinderalla" ...
Applying TextRank to "Cinderalla" ...


LookupError: 
**********************************************************************
  Resource u'tokenizers/punkt/english.pickle' not found.  Please
  use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/Users/jason.xie/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - u''
**********************************************************************