# Import Libraries

In [32]:
import json
import socket
import requests
import urllib.request
import collections
import re
import pickle

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer('english')

from elasticsearch import Elasticsearch, helpers 
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin,urlunparse,  urlsplit, urlunsplit
import urllib.robotparser as robotparser
import time

# Canonicalizer

In [2]:
def canonicalizer(url):
    """Return a canonical form of *url* so that equivalent links compare equal.

    Rules applied:
      1. Lower-case the scheme and the host only. Path, params and query are
         case-sensitive and are left untouched.
      2. Drop the scheme's default port (``:80`` for http, ``:443`` for
         https) wherever it appears in the authority, not only when the URL
         happens to end with it.
      4. Drop the fragment (everything from ``#``).
      5. Collapse runs of ``/`` in the path, then resolve ``.`` and ``..``
         segments through ``resolve_url``.

    Input without a ``scheme://`` prefix is treated as scheme-less and gets
    ``http://`` prepended.

    :param url: absolute or scheme-less URL string.
    :return: the canonicalized URL string.
    """
    if not re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url):
        # urljoin("http://", "example.com/x") produces "http:///example.com/x"
        # (empty host), so the scheme is prepended by hand instead.
        url = "http://" + url.lstrip("/")

    parsed = urlparse(url)
    scheme = parsed.scheme.lower()

    # Only host[:port] is case-insensitive; any "user:password@" prefix is kept
    # exactly as given.
    userinfo, at, hostport = parsed.netloc.rpartition("@")
    hostport = hostport.lower()

    default_port = {"http": ":80", "https": ":443"}.get(scheme)
    if default_port and hostport.endswith(default_port):
        hostport = hostport[:-len(default_port)]

    path = re.sub("/{2,}", "/", parsed.path)

    # Passing an empty fragment drops everything after '#'.
    cleaned = urlunparse(
        (scheme, userinfo + at + hostport, path, parsed.params, parsed.query, "")
    )
    return resolve_url(cleaned)

def resolve_url(url):
    """Remove ``.`` and ``..`` segments from the path of *url*.

    The path is split on ``/``; every piece except the last keeps its
    trailing slash so the pieces can be re-joined verbatim. A ``..`` piece
    removes the previously kept piece, but never the first one (for an
    absolute path that first piece is the leading ``/``). Scheme, host,
    query and fragment are passed through unchanged.

    :param url: URL string.
    :return: URL string with dot segments resolved.
    """
    parts = urlsplit(url)
    *directories, leaf = parts.path.split('/')
    pieces = [name + '/' for name in directories]
    pieces.append(leaf)

    kept = []
    for piece in pieces:
        if piece == '../' or piece == '..':
            # Never pop the first kept piece.
            if len(kept) > 1:
                kept.pop()
            continue
        if piece == './' or piece == '.':
            continue
        kept.append(piece)

    return urlunsplit(parts._replace(path=''.join(kept)))

# Politeness Check

In [3]:
# Cache of robots.txt parsers, keyed by host (the URL's netloc).
robotcheckers = {}


def polite(robotcheckers, url):
    """Tell whether robots.txt allows user agent ``*`` to fetch *url*.

    The parser for the URL's host is looked up in *robotcheckers*; when the
    host is not cached yet, ``http://<host>/robots.txt`` is downloaded,
    parsed and stored in *robotcheckers* for later calls.

    :param robotcheckers: dict mapping host -> ``RobotFileParser`` (mutated).
    :param url: URL to check.
    :return: result of ``RobotFileParser.can_fetch('*', url)``.
    """
    host = urlparse(url).netloc
    if host not in robotcheckers:
        checker = robotparser.RobotFileParser()
        checker.set_url('http://' + host + '/robots.txt')
        checker.read()
        robotcheckers[host] = checker
    return robotcheckers[host].can_fetch('*', url)

polite(robotcheckers, 'https://en.wikipedia.org/wiki/john_ruskin')

True

# Crawl Helper Method

In [4]:
def is_new_link(link):
    """Report whether *link* has not been seen yet.

    A link counts as seen when it is in the module-level ``explored`` list or
    equals element ``[1]`` of any entry in the module-level ``queue``.

    :param link: URL string to look up.
    :return: ``True`` when the link is in neither collection.
    """
    return link not in explored and link not in [entry[1] for entry in queue]

In [34]:
def title_tag(url):
    """Download *url* and return the string of its ``<title>`` element.

    :param url: page URL to request.
    :return: the title string, or ``None`` when the page has no ``<title>``
        element (this case used to raise ``AttributeError``). The value of
        ``title.string`` is returned as-is, so it may also be ``None``.
    """
    raw_data = requests.get(url)
    soup = BeautifulSoup(raw_data.text, 'html.parser')
    title = soup.title
    if title is None:
        return None
    return title.string

In [6]:
def keywords_related(text):
    """Decide whether *text* mentions any of the crawl's topic keywords.

    The text is lower-cased, tokenized with ``word_tokenize`` and each token
    is stemmed with the module-level Snowball stemmer; the keyword list is
    compared against those stems (hence entries such as ``'architectur'``).

    :param text: page text to inspect.
    :return: ``True`` if at least one keyword equals a stemmed token.
    """
    keywords = ['art', 'artist', 'painter', 'paint', 'modern', 'antoni', 'gaud', 'gaudi', 'build', 'architect', 'architectur']
    stems = [snowball.stem(token) for token in word_tokenize(text.lower())]
    return any(keyword in stems for keyword in keywords)

In [7]:
def es_docs_count(index):
    """Return the number of documents in *index* as an int.

    The index is refreshed first, then the count is read from the first row
    of the ``cat.count`` response requested in JSON format.

    :param index: index name.
    :return: document count.
    """
    es.indices.refresh(index)
    rows = es.cat.count(index, params={"format": "json"})
    first_row = rows[0]
    return int(first_row['count'])

In [8]:
def update_inlinks(curr_url, outlinks):
    """Record *curr_url* as an in-link of every URL in *outlinks*.

    Mutates the module-level ``inlinks`` dict (target URL -> list of source
    URLs). A source is appended at most once per target.

    :param curr_url: URL of the page that contains the links.
    :param outlinks: iterable of URLs the page links to.
    """
    for target in outlinks:
        sources = inlinks.setdefault(target, [])
        if curr_url not in sources:
            sources.append(curr_url)

# Init before Crawling

In [9]:
# In-link map filled by update_inlinks(): target URL -> list of URLs of the
# pages that link to it.
inlinks = {}

In [10]:
# Pages the crawl starts from.
seed_urls = [
    'https://en.wikipedia.org/wiki/List_of_modern_artists',
    'https://en.wikipedia.org/wiki/Antoni_Gaud%C3%AD',
    'http://en.wikipedia.org/wiki/List_of_Gaud%C3%AD_buildings',
]
queue = []      # candidate links waiting to be ranked into the frontier
explored = []   # URLs that have been crawled and indexed

# Frontier of (wave, url) tuples; seeds are wave 0. Entries are pushed on the
# left, so the first seed ends up right-most.
dq = collections.deque()
dq.extendleft((0, seed_url) for seed_url in seed_urls)

In [11]:
dq

deque([(0, 'http://en.wikipedia.org/wiki/List_of_Gaud%C3%AD_buildings'),
       (0, 'https://en.wikipedia.org/wiki/Antoni_Gaud%C3%AD'),
       (0, 'https://en.wikipedia.org/wiki/List_of_modern_artists')])

In [12]:
# Module-level state shared with web_crawl().
MAX_CRAWL = 40000       # web_crawl stops once the index holds more documents than this
robotcheckers = {}      # host -> robots.txt parser cache used by polite()
outlinks = []           # out-links of the page currently being crawled (cleared per page)
wiki = "https://en.wikipedia.org"
# NOTE(review): `text` and `count` are not read by the code visible in this
# file (web_crawl uses its own local `text`) — confirm before removing.
text = ''
count = 0

# Web Crawler

In [13]:
def web_crawl():
    """Crawl the frontier in ``dq`` and index every relevant page.

    Reads and mutates module-level state: ``dq`` (frontier of ``(wave, url)``
    tuples, consumed from the right), ``queue``, ``explored``, ``outlinks``,
    ``robotcheckers`` and, through ``update_inlinks``, ``inlinks``.

    For each URL the page is checked against robots.txt, fetched, required to
    have an HTML content type and to be topically relevant
    (``keywords_related`` on its ``<p>`` text); its canonical ``/wiki``
    out-links are collected and the page is indexed into ``crawled_data``.

    The loop ends when the frontier is exhausted or when the index holds more
    than ``MAX_CRAWL`` documents. It sleeps one second between URLs.
    """
    while dq:
        entry = dq.pop()
        wave, url = entry[0], entry[1]
        print(entry, end=' ')

        indexed = _crawl_page(url)

        # Refill on every path: previously a skipped page that happened to be
        # the last frontier entry ended the crawl without ranking ``queue``.
        if not dq:
            _refill_frontier(wave)

        # The document count only changes when a page was indexed.
        if indexed and es_docs_count('crawled_data') > MAX_CRAWL:
            break

        time.sleep(1)


def _crawl_page(url):
    """Fetch, filter and index one page; return True when it was indexed."""
    outlinks.clear()

    # Ask robots.txt *before* requesting the page so that disallowed URLs are
    # never fetched (the check used to run after the download).
    if not polite(robotcheckers, url):
        print('- neither polite nor html type!', end='\n')
        return False

    raw_data = requests.get(url)
    # A response without a Content-Type header is treated as non-HTML instead
    # of raising KeyError.
    if 'html' not in raw_data.headers.get('content-type', ''):
        print('- neither polite nor html type!', end='\n')
        return False

    soup = BeautifulSoup(raw_data.text, 'html.parser')
    text = ''.join(tag.text for tag in soup.find_all('p'))
    if not keywords_related(text):
        print(' - irrelevant page, skipping!', end='\n')
        return False

    # A page without the 'mw-content-text' container contributes no out-links
    # (calling find_all on None used to raise AttributeError).
    content = soup.find('div', {'id': 'mw-content-text'})
    if content is not None:
        anchors = content.find_all('a', {'href': re.compile("^/wiki")})
    else:
        anchors = []

    seen = set()  # O(1) duplicate check; ``outlinks`` keeps first-seen order
    for anchor in anchors:
        canonicalized_url = canonicalizer(urljoin(url, anchor.get('href')))
        if canonicalized_url not in seen:
            seen.add(canonicalized_url)
            outlinks.append(canonicalized_url)

    # NOTE(review): the original code that pushed new out-links onto ``queue``
    # (guarded by len(queue) < 1000000) was commented out and is still not
    # restored here, so ``queue`` is only populated if something else fills
    # it — confirm whether enqueueing should be re-enabled.

    update_inlinks(url, outlinks)
    explored.append(url)
    print(' outlinks', len(outlinks), 'deque', len(dq), 'queue', len(queue), 'explored', len(explored))

    # The document id is the page title; fall back to the URL when the page
    # has no usable title instead of failing on ``soup.title.string``.
    title = soup.title.string if soup.title is not None else None
    es.index(index="crawled_data", id=title or url, body={
        'url': url,
        'content': text,
        'header': json.dumps(dict(raw_data.headers)),
        'outlinks': outlinks
        # 'raw_html': raw_data.text
        })
    return True


def _refill_frontier(wave):
    """Move the links in ``queue`` into ``dq`` as wave ``wave + 1``, most in-links first."""
    print('\n Every elements in deque flushed. Sorting and filling new elements again \n')
    already_crawled = set(explored)
    # Deduplicate on the link itself (the old check compared a link string
    # against (count, link) tuples and never matched) and drop pages that
    # were already crawled.
    candidates = {link for link in queue if link not in already_crawled}
    ranked = sorted(
        ((len(inlinks.get(link, ())), link) for link in candidates),
        reverse=True,
    )
    # appendleft + pop-from-right means the best-ranked link is crawled first.
    for _, link in ranked:
        dq.appendleft((wave + 1, link))
    queue.clear()

In [2846]:
web_crawl()

83724 queue 1000510 explored 31014
(2, 'https://en.wikipedia.org/wiki/Storge') - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Storage_of_wine')  - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stora_Hammars_stones') - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stora_Alvaret') - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stopping_down')  - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stoping_(geology)') - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stop_motion') outlinks 847 deque 83717 queue 1000510 explored 31015
(2, 'https://en.wikipedia.org/wiki/Stop_light_party') - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stop_bath')  - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stop_TB_Partnership') - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stool_test') - irrelevant page, skipping!
(2, 'https://en.wikipedia.org/wiki/Stony_Point,_New

# I/O Operation

In [14]:
def index_content(index, id):
    """Fetch one document and return its ``(url, content, outlinks)`` fields.

    :param index: index name.
    :param id: document id.
    :return: 3-tuple read from the document's ``_source``.
    """
    source = es.get(index=index, id=id)['_source']
    return tuple(source[field] for field in ('url', 'content', 'outlinks'))

In [21]:
def index_url_outlinks(index, id):
    """Fetch one document and return its ``(url, outlinks)`` fields.

    :param index: index name.
    :param id: document id.
    :return: 2-tuple read from the document's ``_source``.
    """
    source = es.get(index=index, id=id)['_source']
    page_url = source['url']
    page_outlinks = source['outlinks']
    return page_url, page_outlinks

In [20]:
def es_ids(index):
    """Return the ``_id`` of every document in *index* as a list.

    Uses ``helpers.scan`` with a ``match_all`` query and ``scroll='1m'``.
    """
    hits = helpers.scan(
        es,
        query={"query": {"match_all": {}}},
        scroll='1m',
        index=index,
    )
    ids = []
    for hit in hits:
        ids.append(hit['_id'])
    return ids

In [47]:
def io_operation():
    """Dump every document of ``crawled_data`` into numbered text files.

    Documents are appended to ``./indexes/index<N>.txt`` (N starts at 1), 500
    per file, in this layout::

        <DOC>
        <ID> url </ID>
        <BODY> content </BODY>
        <INLINKS> str(list) </INLINKS>     (only when the URL has in-links)
        <OUTLINKS> str(list) </OUTLINKS>
        </DOC>

    In-links come from the module-level ``inlinks`` dict. The finished file's
    number is printed each time a file fills up.

    NOTE(review): ``merge_operation`` in this file reads from ``./indices/``
    while this writes to ``./indexes/`` — confirm which directory is intended.
    NOTE(review): files are written with the platform default encoding but
    read back as ISO-8859-1 — confirm that is intended.
    """
    index_count = 1
    counter = 0
    f = open('./indexes/index' + str(index_count) + '.txt', 'a')
    try:
        for es_id in es_ids('crawled_data'):
            url, content, ol = index_content('crawled_data', es_id)
            id_section = '<ID>\n' + str(url) + '\n</ID>'
            body_section = '\n<BODY>\n' + str(content) + '\n</BODY>'
            if url in inlinks:
                inlink_section = '\n<INLINKS>\n' + str(inlinks[url]) + '\n</INLINKS>\n'
            else:
                inlink_section = ''
            outlink_section = '\n<OUTLINKS>\n' + str(ol) + '\n</OUTLINKS>'
            f.write('\n<DOC>\n' + id_section + body_section + inlink_section + outlink_section + '\n</DOC>\n')

            counter += 1
            if counter == 500:
                print(index_count, end='  ')
                counter = 0
                index_count += 1
                # Close the finished part before opening the next one; the
                # old handle used to be dropped without being closed.
                f.close()
                f = open('./indexes/index' + str(index_count) + '.txt', 'a')
    finally:
        # Also runs when Elasticsearch or a write fails part-way through.
        f.close()

In [48]:
io_operation()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80


# Store all inlinks from ElasticSearch

In [40]:
# Reset the in-link map (target URL -> list of source URLs) so it can be
# rebuilt from the documents stored in Elasticsearch.
inlinks = {}

In [41]:
import tqdm
# Rebuild the in-link map from the stored documents: each document's URL is
# recorded as an in-link of every URL in its 'outlinks' field.
# NOTE: this loop rebinds the module-level names `url` and `outlinks`.
for es_id in es_ids('crawled_data'):
    url, outlinks = index_url_outlinks('crawled_data', es_id)
    update_inlinks(url, outlinks)

In [42]:
len(inlinks)

2656978

# ElasticSearch

In [23]:
# Load the stop-word list, one entry per line with surrounding whitespace
# stripped. The file is opened in a `with` block so the handle is closed
# (it used to be left open).
with open('../HW1/reference/stoplist.txt') as stoplist:
    stop_arr = [line.strip() for line in stoplist]

# Extra words appended to the stop list.
stop_arr = stop_arr + ['document', 'discuss', 'report', 'include', 'describe', 'identify', 'cite', 'predict', 'new', 'two', 'state']

In [24]:
# Create the 'crawled_data' index with a custom "stopped" analyzer:
# standard tokenizer -> lowercase -> stop-word removal (stop_arr) -> English stemmer.
# ignore=400 tells the client not to raise on an HTTP 400 response —
# presumably to tolerate "index already exists" on re-runs; confirm.
# NOTE(review): the mapping below configures a field named "text", but
# web_crawl indexes the page text under 'content', so this analyzer does not
# appear to apply to the crawled text — confirm which field name is intended.
es.indices.create(index = 'crawled_data', ignore=400, body= {
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "max_result_window" : 30000,
        "analysis": {
            "filter": {
                # Stop-word filter fed from the list loaded into stop_arr.
                "english_stop": {
                    "type": "stop",
                    "stopwords": stop_arr
                },
                "my_stemmer": {
                    "type": "stemmer",
                    "name": "english"
                }
            },
            "analyzer": {
                # Filters run in the order listed.
                "stopped": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop",
                        "my_stemmer"
                    ]
                }
            }
      }
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text",
                "fielddata": True,
                "analyzer": "stopped",
                "index_options": "positions",
                "term_vector": "yes"
            }
        }
    }
})

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'crawled_data'}

# Merge to Elastic Cloud

In [1]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch.helpers import bulk
from tqdm import tqdm
import os

import pickle
# configure elastic search
# The cluster URL embeds the basic-auth user and password, so it must not be
# stored in the notebook. It is read from the ELASTIC_CLOUD_URL environment
# variable instead (format: https://<user>:<password>@<cluster-host>:<port>).
# NOTE(review): the credentials that were previously committed here should be
# rotated.
host = os.environ.get('ELASTIC_CLOUD_URL')
if not host:
    raise RuntimeError(
        'Set the ELASTIC_CLOUD_URL environment variable to the Elastic Cloud '
        'endpoint, e.g. https://<user>:<password>@<cluster-host>:9243'
    )
# awsauth = AWS4Auth(YOUR_ACCESS_KEY, YOUR_SECRET_KEY, REGION, 'es')

es = Elasticsearch([host])
print(es.ping())

True


In [2]:
def es_ids(index):
    """Collect the ``_id`` of every document in *index* into a list.

    Runs a ``match_all`` query through ``helpers.scan`` with ``scroll='1m'``.
    """
    scan_options = {
        'query': {"query": {"match_all": {}}},
        'scroll': '1m',
        'index': index,
    }
    return [hit['_id'] for hit in helpers.scan(es, **scan_options)]

In [3]:
def retrieve_links(index, id):
    """Fetch one document and return its ``(inlinks, outlinks)`` fields.

    :param index: index name.
    :param id: document id.
    :return: 2-tuple read from the document's ``_source``.
    """
    source = es.get(index=index, id=id)['_source']
    stored_inlinks = source['inlinks']
    stored_outlinks = source['outlinks']
    return stored_inlinks, stored_outlinks

In [4]:
import ast

from elasticsearch.helpers import bulk

def bulk_merge(index, docs, existing_ids):
    """Yield bulk index actions for the ``<DOC>`` records contained in *docs*.

    Each record is expected in the layout written by ``io_operation``:
    ``<ID>``, ``<BODY>``, optional ``<INLINKS>`` and ``<OUTLINKS>`` sections
    inside ``<DOC>...</DOC>``, with the link sections holding the ``str()`` of
    a Python list.

    * A record whose id is in *existing_ids* is not yielded. Its in-links and
      out-links are merged into the stored document (fetched with
      ``retrieve_links``) and written back with ``es.update``; ``!`` is
      printed.
    * Any other record is yielded as an action dict for
      ``elasticsearch.helpers.bulk``; ``.`` is printed.

    Records without a complete ``<ID>...</ID>`` section are skipped.

    :param index: target index name.
    :param docs: full text of one dump file.
    :param existing_ids: iterable of ids already present in *index*.
    :return: generator of action dicts.
    """
    known_ids = set(existing_ids)  # O(1) membership instead of scanning a list
    cursor = 0
    while True:
        doc_start = docs.find('<DOC>', cursor)
        if doc_start == -1:
            break
        doc_end = docs.find('</DOC>', doc_start)
        if doc_end == -1:
            # Unterminated final record: use whatever is left.
            doc_end = len(docs)
        chunk = docs[doc_start:doc_end]
        # Advance a cursor rather than re-slicing the whole file per record.
        cursor = doc_end + len('</DOC>')

        id_open = chunk.find('<ID>')
        id_close = chunk.find('</ID>')
        if id_open == -1 or id_close == -1:
            continue
        docno = chunk[id_open + len('<ID>'):id_close].strip()

        # Sections are consumed in file order; each call continues after the
        # last closing tag of the previous section.
        text, rest = _consume_sections(chunk, 'BODY')
        inlinks_str, rest = _consume_sections(rest, 'INLINKS')
        outlinks_str, rest = _consume_sections(rest, 'OUTLINKS')
        inlinks = _parse_link_list(inlinks_str)
        outlinks = _parse_link_list(outlinks_str)

        if docno in known_ids:
            print('!', end=' ')
            init_inlinks, init_outlinks = retrieve_links(index, docno)
            for inlink in inlinks:
                if inlink not in init_inlinks:
                    init_inlinks.append(inlink)
            for outlink in outlinks:
                if outlink not in init_outlinks:
                    init_outlinks.append(outlink)
            es.update(index=index, id=docno,
                      body={'doc': {'inlinks': init_inlinks, 'outlinks': init_outlinks}})
        else:
            print('.', end=' ')
            yield {
                '_index': index,
                '_id': docno,
                'text': text,
                'inlinks': inlinks,
                'outlinks': outlinks,
                'author': 'sunho'
            }


def _consume_sections(chunk, tag):
    """Return (payloads of every <tag>...</tag> in *chunk*, each stripped and newline-terminated; text after the last one)."""
    open_tag = '<' + tag + '>'
    close_tag = '</' + tag + '>'
    collected = ''
    while open_tag in chunk:
        start = chunk.find(open_tag) + len(open_tag)
        end = chunk.find(close_tag, start)
        if end == -1:
            # Missing closing tag: take the remainder as the payload.
            collected += chunk[start:].strip() + '\n'
            chunk = ''
            break
        collected += chunk[start:end].strip() + '\n'
        chunk = chunk[end + len(close_tag):]
    return collected, chunk


def _parse_link_list(raw):
    """Evaluate the ``str()`` of a Python literal; return [] when *raw* is empty or malformed."""
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

In [9]:
# Ids already present in 'merged_index', fetched once up front; merge_operation
# passes this list to bulk_merge to tell new documents from existing ones.
existing_ids = es_ids('merged_index')
print(len(existing_ids))

def merge_operation():
    """Merge the dump files ``./indices/index1.txt`` .. ``index81.txt`` into ``merged_index``.

    Each file is read whole (decoded as ISO-8859-1), its path is printed, and
    the actions produced by ``bulk_merge`` are sent with
    ``elasticsearch.helpers.bulk`` in chunks of 100. Uses the module-level
    ``existing_ids`` list.

    NOTE(review): ``io_operation`` in this file writes to ``./indexes/`` while
    this reads from ``./indices/`` — confirm which directory is intended.
    """
    for file_no in range(1, 82):
        filepath = './indices/index' + str(file_no) + '.txt'
        with open(filepath, "r", encoding="ISO-8859-1") as dump_file:
            print(filepath)
            docs = dump_file.read()
            actions = bulk_merge('merged_index', docs, existing_ids)
            bulk(es, actions, chunk_size=100)

44626


In [10]:
merge_operation()

. . . . . ../indices/index61.txt
. . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . ! . . . . . . . . . . . . . . . . !. . . . . . . . . . ! . . . . . . . . . .. . . . . . . . ! . . . . . . . . . !. . . ! ! . . . . . . . . . . ! ! . .. . . . . . . ! . . . . . . . . . . . . . .. . . . . . . . . . . . . . . . . . ! . .. . . . . ! . . . . ! . . . . . . . . . . . ! . . . . . . . . . . . . . . . . . . . . . . . . ! . . . . . . . . . . . !. . ! . .. ! . . . . . . . ! . !. . ! . . . . . . . . . . !! . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . . . . . . . . . . . . . ! . . . !! !! ! . . . . . ! . . ! . . . !. . . . . . . . . . ! . . . . . . . . . . . . . .. . . ! . ! ! . . . . . . ! . ! . . . . . . . . . . . . . . ! . . . . . . . . . . . . . !. . . . . . . . . . . . . . . . ! . . . . . . . . . . . . . . !. ! ! . ! !! . . . ! . . . . . . . . . . . . . . . . . . . . . . . . . . .. . . . . . . . . ! ! . . . . . !! . ! . !. . . . . ! . ! . . !. . ! . . . . . . . . ! . 