In [1]:
import json
from collections import Counter
import pandas as pd
from nltk.corpus import names
from nltk.metrics.distance import edit_distance as editDistance
import nltk
import re 
import os
import difflib 
import logging
import itertools
import numpy as np
from nltk.util import ngrams 
from difflib import SequenceMatcher
from string import punctuation
from termcolor import colored
from IPython.display import clear_output
%matplotlib inline

In [2]:
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [3]:
with open('txt/e1a.json') as f: 
    rawData = f.read()

df = pd.read_json(rawData)

In [4]:
test1 = df.loc[0]['ocr']

In [5]:
tests = df['ocr']

In [6]:
class Text: 
    def __init__(self, raw_text, label, removeStopwords=True): 
        if type(raw_text) == list: 
            # JSTOR critical works come in lists, where each item represents a page. 
            self.text = ' \n '.join(raw_text)
        else: 
            self.text = raw_text
        self.label = label
        self.tokens = self.getTokens(removeStopwords)
        self.trigrams = self.ngrams(3)
        
    def getTokens(self, removeStopwords=True): 
        """ Tokenizes the text, breaking it up into words, removing punctuation. """
        tokenizer = nltk.RegexpTokenizer('[a-zA-Z]\w+\'?\w*') # A custom regex tokenizer. 
        #tokenizer = nltk.RegexpTokenizer('\w+|\$[\d\.]+|\S+') # A custom regex tokenizer. 
        spans = list(tokenizer.span_tokenize(self.text))
        # Take note of how many spans there are in the text
        #print(spans)
        self.length = spans[-1][-1] 
        tokens = tokenizer.tokenize(self.text)
        tokens = [ token.lower() for token in tokens ] # make them lowercase
        if not removeStopwords: 
            self.spans = spans
            return tokens
        tokenSpans = list(zip(tokens, spans)) # zip it up
        stopwords = nltk.corpus.stopwords.words('english') # get stopwords
        tokenSpans = [ token for token in tokenSpans if token[0] not in stopwords ] # remove stopwords from zip
        self.spans = [ x[1] for x in tokenSpans ] # unzip; get spans
        return [ x[0] for x in tokenSpans ] # unzip; get tokens
    
    def ngrams(self, n): 
        """ Returns ngrams for the text."""
        return list(ngrams(self.tokens, n))

class Matcher: 
    def __init__(self, textObjA, textObjB, threshold=5, ngramSize=3, removeStopwords=True):
        """
        Takes as input two Text() objects, and matches between them.
        """
        self.threshold = threshold
        self.ngramSize = ngramSize
        
        #self.textA, self.textB = Text(fileA, removeStopwords=removeStopwords), \
        #        Text(fileB, removeStopwords=removeStopwords)
        self.textA = textObjA
        self.textB = textObjB 
        
        self.textAgrams = self.textA.ngrams(ngramSize)
        self.textBgrams = self.textB.ngrams(ngramSize)

        self.locationsA = []
        self.locationsB = []

    def getContext(self, text, start, length, context): 
        match = self.getTokensText(text, start, length)
        before = self.getTokensText(text, start-context, context)
        after = self.getTokensText(text, start+length, context)
        match = colored(match, 'red')
        out = " ".join([before, match, after])
        out = out.replace('\n', ' ') # Replace newlines with spaces. 
        out = re.sub('\s+', ' ', out)
        return out

    def getTokensText(self, text, start, length):  
        """ Looks up the passage in the original text, using its spans. """
        matchTokens = text.tokens[start:start+length]
        spans = text.spans[start:start+length]
        if len(spans) == 0: 
            # Don't try to get text or context beyond the end of a text. 
            passage = ""
        else: 
            passage = text.text[spans[0][0]:spans[-1][-1]]
        return passage 

    def getLocations(self, text, start, length, asPercentages=False): 
        """ Gets the numeric locations of the match. """
        spans = text.spans[start:start+length]
        if asPercentages: 
            locations = (spans[0][0]/text.length, spans[-1][-1]/text.length)
        else: 
            locations = (spans[0][0], spans[-1][-1])
        return locations

    def getMatch(self, match, textA, textB, context): 
        length = match.size + self.ngramSize - 1 # offset according to nGram size 
        wordsA = self.getContext(textA, match.a, length, context)
        wordsB = self.getContext(textB, match.b, length, context)
        spansA = self.getLocations(textA, match.a, length)
        spansB = self.getLocations(textB, match.b, length)
        self.locationsA.append(spansA)
        self.locationsB.append(spansB)
        line1 = ('%s: %s %s' % (colored(textA.label, 'green'), spansA, wordsA) )
        line2 = ('%s: %s %s' % (colored(textB.label, 'green'), spansB, wordsB) )
        return line1 + '\n' + line2

    def match(self): 
        """
        This does the main work of finding matching n-gram sequences between
        the texts.
        """
        sequence = SequenceMatcher(None,self.textAgrams,self.textBgrams)
        matchingBlocks = sequence.get_matching_blocks()

        # Only return the matching sequences that are higher than the 
        # threshold given by the user. 
        highMatchingBlocks = [match for match in matchingBlocks if match.size > self.threshold]
    
        numBlocks = len(highMatchingBlocks)
        self.numMatches = numBlocks
        
        if numBlocks > 0: 
            print('%s total matches found.' % numBlocks, flush=True)

        for num, match in enumerate(highMatchingBlocks): 
            print('match: ', match)
            out = self.getMatch(match, self.textA, self.textB, 5)
            print('\n')
            print('match %s:' % (num+1), flush=True)
            print(out, flush=True)

        return self.numMatches, self.locationsA, self.locationsB



In [7]:
test1Text = Text(test1, 'test1')

In [8]:
mm = Text(open('middlemarch.txt').read(), 'Middlemarch')

In [95]:
class Match(): 
    def __init__(self, a, b, size): 
        self.a = a
        self.b = b
        self.size = size
    
    def __str__(self):
        return str(self.__dict__)
    
    def __repr__(self): 
        return self.__str__()

    def __eq__(self, other): 
        return self.__dict__ == other.__dict__

class Matcher(): 
    """ Replacement for SequenceMatcher that does fuzzy text matching."""
    def __init__(self, textAgrams, textBgrams): 
        self.textAgrams = textAgrams
        self.textBgrams = textBgrams
        self.countA = Counter(textAgrams)
        self.countB = Counter(textBgrams)
        # Find the intersection of the two sets. 
        self.initialMatches = list(set(textAgrams) & set(textBgrams))
    
    @property
    def matchPairs(self): 
        pairs = []
        for match in self.initialMatches: 
            # Handle multiple matches. 
            if self.countA[match] > 1: 
                aLocs = [i for i, x in enumerate(self.textAgrams) if x==match]
                bLoc = self.textBgrams.index(match)
                for loc in aLocs: 
                    pairs.append(Match(a=loc, b=bLoc, size=1))
            if self.countB[match] > 1: 
                bLocs = [i for i, x in enumerate(self.textBgrams) if x==match]
                aLoc = self.textAgrams.index(match)
                for loc in bLocs: 
                    pairs.append(Match(a=aLoc, b=loc, size=1))
            else: 
                pairs.append(Match(a=self.textAgrams.index(match),
                                   b=self.textBgrams.index(match), size=1))
        return pairs
    
    @property 
    def extendedMatches(self): 
        out = []
        for match in self.matchPairs: 
            for forward in [True, False]: 
                go = True
                while go: 
#                     print('match: ', match)
                    # Extend it as far as possible.
                    extended = self.extendMatch(match, forward=forward)
#                     print('extended: ', extended)
                    if match == extended: 
                        go = False
                    match = extended
                out.append(match)
        return out
        
    def extendMatch(self, match, forward):  
        if forward: 
            offset, wordIndex, correction = match.size, -1, 0
        else: 
            offset, wordIndex, correction = -1, 0, 1
        try: 
            a = self.textAgrams[match.a+offset]
            b = self.textBgrams[match.b+offset]
        except IndexError:
            # The extension is out of range for the document. 
            return match
        wordA, wordB = a[wordIndex], b[wordIndex]
#         print('a: ', a)
#         print('b: ', b)
        if a==b or editDistance(wordA, wordB)<2 or self.editRatio(wordA,wordB)>.9: 
            return Match(a=match.a-correction, b=match.b-correction, size=match.size+1)
        else: 
            return match
    
    def editRatio(self, a, b): 
        lensum = len(a) + len(b)
        distance = editDistance(a, b)
        return (lensum-distance)/lensum

In [96]:
class fuzzySeqMatcher(SequenceMatcher): 
    def __init__(self, a, b): 
        SequenceMatcher.__init__(self, a=a, b=b)
        self.matches = self.get_matching_blocks()

In [97]:
m = Matcher(test1Text.trigrams, mm.trigrams)

In [98]:
m.extendedMatches

[{'a': 1208, 'b': 7221, 'size': 6},
 {'a': 1205, 'b': 7218, 'size': 9},
 {'a': 3076, 'b': 16781, 'size': 21},
 {'a': 3056, 'b': 16761, 'size': 41},
 {'a': 968, 'b': 767, 'size': 8},
 {'a': 965, 'b': 764, 'size': 11},
 {'a': 3687, 'b': 38443, 'size': 21},
 {'a': 3668, 'b': 38424, 'size': 40},
 {'a': 4124, 'b': 71570, 'size': 6},
 {'a': 4112, 'b': 71558, 'size': 18},
 {'a': 2565, 'b': 10117, 'size': 30},
 {'a': 2554, 'b': 10106, 'size': 41},
 {'a': 1687, 'b': 14731, 'size': 15},
 {'a': 1668, 'b': 14712, 'size': 34},
 {'a': 1145, 'b': 10421, 'size': 1},
 {'a': 1144, 'b': 10420, 'size': 2},
 {'a': 4292, 'b': 152476, 'size': 10},
 {'a': 4280, 'b': 152464, 'size': 22},
 {'a': 3072, 'b': 16777, 'size': 25},
 {'a': 3056, 'b': 16761, 'size': 41},
 {'a': 3693, 'b': 38449, 'size': 15},
 {'a': 3668, 'b': 38424, 'size': 40},
 {'a': 4122, 'b': 71568, 'size': 8},
 {'a': 4112, 'b': 71558, 'size': 18},
 {'a': 3707, 'b': 38463, 'size': 1},
 {'a': 3668, 'b': 38424, 'size': 40},
 {'a': 3944, 'b': 140390, 

In [82]:
edit_distance('orange', 'apple')

NameError: name 'edit_distance' is not defined