In [1]:
from collections import deque

import data
import os
import parse
import requests
import time

In [2]:
seeds = deque(["http://stackexchange.com/"])
searches = data.searches

In [3]:
import similarity

In [4]:
def bfsCrawl(frontier, iterations):
    frontier = deque(frontier)
    lowLim, uppLim = 3/5, 4/5
    i, numVisited, urlVisited = 0, 0, set()
    while frontier and i < iterations:
        url = frontier.popleft()
        urlVisited.add(url)
        try:
            resp = requests.get(url)
        except:
            continue
        host = parse.getDomain(url)
        topics = parse.parseKeywords(resp)
        for topic in topics:
            for search in searches:
                try:
                    score = similarity.getSim(search, topic)
                    if lowLim < score < highLim:
                        print(str((score, search, topic, url)))
                except:
                    continue
        frontier.extend(deque(parse.parseLinks(resp) - urlVisited))
        numVisited += 1
        i += 1
    return (len(frontier), numVisited)

In [5]:
def dfsCrawl(stack, iterations):
    stack = deque(stack)
    lowLim, uppLim = 3/5, 4/5
    depthMax, lastHost = 3, ""
    depthLeft = depthMax
    i, numVisited, urlVisited = 0, 0, set()
    while stack and i < iterations:
        url = stack.pop()
        urlVisited.add(url)
        try:
            resp = requests.get(url)
        except:
            continue
        topics = parse.parseKeywords(resp)
        for topic in topics:
            for search in searches:
                try:
                    score = similarity.getSim(search, topic)
                    if lowLim < score < highLim:
                        print(str((score, search, topic, url))) # why not printed?
                except:
                    continue
        host = parse.getDomain(url)
        if host == lastHost:
            if depthLeft:
                depthLeft -= 1
                stack.extend(deque(parse.parseLinks(resp)))
            else:
                depthLeft = depthMax
        else:
            depthLeft = depthMax
            stack.extend(deque(parse.parseLinks(resp) - urlVisited))
        numVisited += 1
        i += 1
    return (len(stack), numVisited)

In [6]:
def test():
    iterList = [# 5, 10,
                50, 100, 500]
    for iterations in iterList:
        print("Iterations: %d" % iterations)
        print()
        '''
        start = time.time()
        try:
            queueSize, bfsnumVisited = bfsCrawl(seeds, iterations)
            print("Size of Queue: %d" % queueSize)
            print("Number of Visited: %d" % bfsnumVisited)
            print("Time Taken: %f" % (time.time() - start))
            print()
        except Exception as inst:
            print("BFS: %s" % str(inst))
            print()
        '''
        start = time.time()
        try:
            stackSize, dfsnumVisited = dfsCrawl(seeds, iterations)
            print("Size of Stack: %d" % stackSize)
            print("Number of Visited: %d" % dfsnumVisited)
            print("Time Taken: %f" % (time.time() - start))
            print()
        except Exception as inst:
            print("DFS: %s" % str(inst))
            print()

In [7]:
test()

Iterations: 50

Size of Stack: 6096
Number of Visited: 50
Time Taken: 411.113517

Iterations: 100

Size of Stack: 10787
Number of Visited: 100
Time Taken: 819.840771

Iterations: 500

Size of Stack: 26864
Number of Visited: 500
Time Taken: 1881.018762



In [None]:
'''
Iterations: 5

Size of Queue: 1205
Number of Visited: 5
Time Taken: 65.640972

Size of Stack: 923
Number of Visited: 5
Time Taken: 47.394564

Iterations: 10

Size of Queue: 2310
Number of Visited: 10
Time Taken: 134.098501

Size of Stack: 1399
Number of Visited: 10
Time Taken: 64.906949

Iterations: 50

Size of Queue: 9522
Number of Visited: 50
Time Taken: 535.173888

Size of Stack: 6096
Number of Visited: 50
Time Taken: 411.113517

Iterations: 100

Size of Queue: 19770
Number of Visited: 100
Time Taken: 1071.777788

Size of Stack: 10787
Number of Visited: 100
Time Taken: 819.840771

Iterations: 500

Size of Queue: 68615
Number of Visited: 500
Time Taken: 4459.348122

Size of Stack: 26864
Number of Visited: 500
Time Taken: 1881.018762
'''

In [None]:
def bfsCrawlWrite(frontier, iterations):
    frontier = deque(frontier)
    relevant, threshold = [], 2/3
    indent1 = 0, ' ' * 2
    indent2, indent3 = indent1 * 2, indent1 * 3
    indent4 = indent2 * 2
    i = 0
    with open('similarity.json', 'w') as f:
        f.write('[\n%s[\n%s""' % (indent1, indent2))
        while frontier and i < iterations:
            f.write(',\n%s{\n' % (indent2))
            url = frontier.popleft()
            resp = requests.get(url)
            topics = parse.parseKeywords(resp)
            f.write('%s"": ""' % (indent3))
            for topic in topics:
                relevant = False
                stringBuilder = []
                stringBuilder.append(',\n%s"%s": [\n' % (indent3, topic))
                for search in searches:
                    try:
                        score = similarity.getSim(search, topic)
                        if score >= threshold:
                            relevant = True
                            stringBuilder.append('%s"(%s, %s)",\n' % (indent4, search, score))
                    except:
                        continue
                if relevant:
                    stringBuilder[-1] = stringBuilder[-1].rstrip()[:-1]
                    stringBuilder.append('\n')
                    f.write(''.join(stringBuilder))
                    f.write('%s]' % (indent3))
            frontier.extend(deque(parse.parseLinks(resp)))
            f.write('\n%s}\n%s]' % (indent2, indent1))
            if frontier and i < iterations:
                f.write(',')
            else:
                break
            i += 1
        f.write('\n]')
    f.close()
    return len(frontier)