# Cat queries

In [1]:
import regex as re
# Read the whole file named 'cat' into the string `cat`; the query cells
# further down pass this string to david_finder as the text to search.
with open('cat', 'r') as f:
    cat = f.read()

In [2]:
def get_matches_starting_at(data, pattern, index, starting_index, resulting_matches):
    """Collect every way ``pattern`` can be matched with its first word at ``index``.

    ``pattern`` alternates literal words and ``(min_gap, max_gap)`` pairs, e.g.
    ``['cat', (0, 10), 'are']``. A pair bounds (inclusively) the number of
    characters allowed between the end of one word and the start of the next.

    Args:
        data: text being searched.
        pattern: remaining part of the pattern; ``pattern[0]`` is the word
            that must appear at ``index``.
        index: position in ``data`` where ``pattern[0]`` must start.
        starting_index: position where the very first word of the full
            pattern started; recorded as the start of each match.
        resulting_matches: list that completed matches are appended to as
            ``(starting_index, end)`` tuples (``end`` is exclusive).

    Returns:
        ``resulting_matches`` when ``pattern[0]`` is found at ``index``
        (it may still be empty if later words never match), otherwise a
        new empty list.
    """
    word = pattern[0]
    word_end = index + len(word)

    # Guard clause: the current word is not at this position.
    if data[index:word_end] != word:
        return []

    # Last word of the pattern: a complete match has been found.
    if len(pattern) == 1:
        resulting_matches.append((starting_index, word_end))
        return resulting_matches

    # Try the next word at every allowed distance after the current word.
    # Results of the recursive calls are accumulated in resulting_matches,
    # so their return values are not needed here.
    min_gap = pattern[1][0]
    max_gap = pattern[1][1]
    for gap in range(min_gap, max_gap + 1):
        get_matches_starting_at(data, pattern[2:], word_end + gap, starting_index, resulting_matches)

    return resulting_matches

def get_all_matches(data, pattern):
    """Find every occurrence of ``pattern`` anywhere in ``data``.

    ``pattern`` alternates literal words and ``(min_gap, max_gap)`` pairs,
    e.g. ``['cat', (0, 10), 'are']``.

    Candidate start positions are the occurrences of the first word. They are
    located with ``str.find`` instead of a regular-expression search so that:

    * the first word is always treated as literal text, consistent with the
      literal slice comparison performed by ``get_matches_starting_at``
      (a word such as ``'a.c'`` or ``'('`` is not interpreted as a regex);
    * overlapping occurrences of the first word are all considered.

    Args:
        data: text to search.
        pattern: list of words and gap pairs; an empty pattern matches nothing.

    Returns:
        A set of ``(start, end)`` tuples (``end`` exclusive) indexing into
        ``data``.
    """
    if not pattern:
        return set()

    first_word = pattern[0]
    matches_found = set()

    position = data.find(first_word)
    while position != -1:
        matches_found.update(
            get_matches_starting_at(data, pattern, position, position, [])
        )
        # Step by one character, not len(first_word), so that overlapping
        # occurrences of the first word are not skipped.
        position = data.find(first_word, position + 1)

    return matches_found

In [3]:
def david_finder(data,pattern):
    """Return the matched text for every occurrence of ``pattern`` in ``data``.

    The spans come from ``get_all_matches`` as a set of ``(start, end)``
    tuples, so the returned list follows the set's iteration order rather
    than the order of appearance in ``data``.

    Args:
        data: text to search.
        pattern: alternating words and ``(min_gap, max_gap)`` pairs.

    Returns:
        A list with one substring of ``data`` per distinct match span.
    """
    spans = get_all_matches(data, pattern)
    return [data[start:end] for start, end in spans]
    
# Sanity check on the cat text: 'cat', a gap of 0-10 characters, 'are',
# a gap of 0-10 characters, then 'to' must yield exactly 10 matches.
matches = david_finder(cat, ['cat', (0, 10), 'are', (0, 10), 'to'])
assert(len(matches) == 10)

In [5]:
# Query: 'cat', a gap of 0-10 characters, 'are', a gap of 0-10 characters, 'to'.
pattern = ['cat', (0, 10), 'are', (0, 10), 'to']
matches = david_finder(cat, pattern)
print(len(matches))
# Benchmark the same query with IPython's %timeit magic.
%timeit david_finder(cat, pattern)

10
100 loops, best of 3: 6.94 ms per loop


In [6]:
# Query: 'cat' followed by 'anatomy' after a gap of 0-100 characters.
pattern = ['cat', (0, 100), 'anatomy']
matches = david_finder(cat, pattern)
print(len(matches))
# Benchmark the same query with IPython's %timeit magic.
%timeit david_finder(cat, pattern)

11
10 loops, best of 3: 47.1 ms per loop


In [798]:
# Query with a non-zero minimum gap: 'china', then 30-150 characters, then 'washington'.
pattern = ['china', (30, 150), 'washington']
matches = david_finder(cat, pattern)
print(len(matches))
# Benchmark the same query with IPython's %timeit magic.
%timeit david_finder(cat, pattern)

1
1000 loops, best of 3: 400 µs per loop


In [799]:
# Query: 'english' followed by 'cat' after a gap of 0-200 characters.
pattern = ['english', (0, 200), 'cat']
matches = david_finder(cat, pattern)
print(len(matches))
# Benchmark the same query with IPython's %timeit magic.
%timeit david_finder(cat, pattern)

34
100 loops, best of 3: 2.28 ms per loop


In [800]:
# Four-word query: 'kitten', 15-85 characters, 'cat', 0-100 characters,
# 'sire', 0-200 characters, 'oxford'.
pattern = ['kitten', (15, 85), 'cat', (0, 100), 'sire', (0, 200), 'oxford']
matches = david_finder(cat, pattern)
print(len(matches))
# Benchmark the same query with IPython's %timeit magic.
%timeit david_finder(cat, pattern)

2
1000 loops, best of 3: 1.65 ms per loop


# Wikipedia title Aa

In [4]:
import glob, multiprocessing

def process(line, pattern):
    """Search one line of article text for ``pattern``.

    Thin wrapper around ``david_finder``; returns its list of matched
    substrings unchanged.
    """
    return david_finder(line, pattern)

def worker(file, pattern):
    """Scan one file line by line and gather every match of ``pattern``.

    Args:
        file: path of the text file to read.
        pattern: alternating words and ``(min_gap, max_gap)`` pairs.

    Returns:
        A list of the matched substrings found in all lines of the file.
    """
    collected = []
    with open(file, 'r') as handle:
        for line in handle:
            # Cheap prefilter: the full search only runs on lines that
            # contain every word of the pattern (the even-indexed items).
            if not all(word in line for word in pattern[::2]):
                continue
            collected.extend(process(line, pattern))

    return collected


def fast(pattern, path, processes=4):
    """Search every file matching ``path`` for ``pattern`` using a process pool.

    One ``worker`` job is submitted per file; the per-file match lists are
    concatenated in the order the files were returned by ``glob.glob``.

    Args:
        pattern: alternating words and ``(min_gap, max_gap)`` pairs.
        path: glob expression selecting the files to search.
        processes: number of worker processes in the pool (default 4, the
            value that was previously hard-coded).

    Returns:
        A list with the matched substrings from all files.

    Raises:
        Any exception raised inside a worker is re-raised here when its
        result is collected.
    """
    pool = multiprocessing.Pool(processes)
    try:
        pending = [
            pool.apply_async(worker, [ifile, pattern])
            for ifile in glob.glob(path)
        ]

        # No more jobs will be submitted; wait for the submitted ones.
        pool.close()
        pool.join()
    finally:
        # Make sure worker processes are not left behind if submitting or
        # joining fails; harmless after a normal close()/join().
        pool.terminate()

    # Combine the results of all files into one list of matches.
    matches = []
    for job in pending:
        matches += job.get()
    return matches

In [5]:
# Search all files matching "Anwiki_p*" for 'arnold', a gap of 0-10 characters,
# 'schwarzenegger', a gap of 0-10 characters, then 'is'.
pattern = ['arnold', (0, 10), 'schwarzenegger', (0, 10), 'is']
matches = fast(pattern, "Anwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 2 loops per run, 2 runs.
%timeit -n2 -r2 fast(pattern, "Anwiki_p*")

14
2 loops, best of 2: 2.42 s per loop


In [21]:
# Search all files matching "Anwiki_p*" for 'apache' followed by 'software'
# after a gap of 0-100 characters.
pattern = ['apache', (0, 100), 'software']
matches = fast(pattern, "Anwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 2 loops per run, 2 runs.
%timeit -n2 -r2 fast(pattern, "Anwiki_p*")

1441
2 loops, best of 2: 2.62 s per loop


In [22]:
# Search all files matching "Anwiki_p*" for 'aarhus', then 30-150 characters,
# then 'denmark' (non-zero minimum gap).
pattern = ['aarhus', (30, 150), 'denmark']
matches = fast(pattern, "Anwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 2 loops per run, 2 runs.
%timeit -n2 -r2 fast(pattern, "Anwiki_p*")

510
2 loops, best of 2: 2.67 s per loop


In [769]:
# Search all files matching "Anwiki_p*" for 'english' followed by 'alphabet'
# after a gap of 0-100 characters.
pattern = ['english', (0, 100), 'alphabet']
matches = fast(pattern, "Anwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 2 loops per run, 2 runs.
%timeit -n2 -r2 fast(pattern, "Anwiki_p*")

181
2 loops, best of 2: 3.56 s per loop


In [770]:
# Four-word query over the files matching "Anwiki_p*": 'first', 0-85 characters,
# 'letter', 0-100 characters, 'alphabet', 0-200 characters, 'consonant'.
pattern = ['first', (0, 85), 'letter', (0, 100), 'alphabet', (0, 200), 'consonant']
matches = fast(pattern, "Anwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 2 loops per run, 2 runs.
%timeit -n2 -r2 fast(pattern, "Anwiki_p*")

2
2 loops, best of 2: 3.61 s per loop


# Entire English Wikipedia

In [6]:
# Search all files matching "enwiki_p*" for 'elephants', a gap of 0-20 characters,
# 'are', a gap of 0-20 characters, then 'to'.
pattern = ['elephants', (0, 20), 'are', (0, 20), 'to']
matches = fast(pattern, "enwiki_p*")
print(len(matches))
# Benchmark the full parallel search: a single loop, a single run.
%timeit -n1 -r1 fast(pattern, "enwiki_p*")

1 loop, best of 1: 28.1 s per loop


In [24]:
# Search all files matching "enwiki_p*" for 'technical', a gap of 0-20 characters,
# 'university', a gap of 0-20 characters, then 'denmark'.
pattern = ['technical', (0, 20), 'university', (0, 20), 'denmark']
matches = fast(pattern, "enwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 1 loop per run, 2 runs.
%timeit -n1 -r2 fast(pattern, "enwiki_p*")

616
1 loop, best of 2: 31.7 s per loop


In [26]:
# Six-word query over the files matching "enwiki_p*", with a separate
# (min_gap, max_gap) bound between each pair of consecutive words.
pattern = ['testing', (0, 20), 'with', (0, 20), 'a', (0, 30), 'lot', (0, 4), 'of', (0, 5), 'words']
matches = fast(pattern, "enwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 1 loop per run, 2 runs.
%timeit -n1 -r2 fast(pattern, "enwiki_p*")

0
1 loop, best of 2: 33.2 s per loop


In [12]:
# Search all files matching "enwiki_p*" for 'stress' followed by 'test'
# after a gap of 0-250 characters.
pattern = ['stress', (0, 250), 'test']
matches = fast(pattern, "enwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 1 loop per run, 2 runs.
%timeit -n1 -r2 fast(pattern, "enwiki_p*")

7499
1 loop, best of 2: 32.7 s per loop


In [27]:
# Eight-word query over the files matching "enwiki_p*", mixing zero and
# non-zero minimum gaps between consecutive words.
pattern = ['object', (10, 200), 'application', (0, 100), 'python', (10, 200), 'system', (0, 100), 'computer', (0, 10), 'science', (0, 150), 'linux', (0, 200), 'ruby']
matches = fast(pattern, "enwiki_p*")
print(len(matches))
# Benchmark the full parallel search: 1 loop per run, 2 runs.
%timeit -n1 -r2 fast(pattern, "enwiki_p*")

1
1 loop, best of 2: 32.5 s per loop
