In [None]:
# https://stackoverflow.com/questions/34621093/persist-elastic-search-data-in-docker-container
# TODOs: New KB upload with smaller text window
# TODOs: Common Crawl index upload
# TODOs: Good Logging
# TODOs: Better Keyphrase Extraction - Fast KeyBert

In [4]:
### CRAWL LOCAL FILE-SYSTEM ###
import os
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("KNOWLEDGE_BASE")

knowledge_path = "/Users/joshua.sheppard/wiki_extract_II"

def iter_filesys(path):
    if os.path.isfile(path):
        yield path

    elif os.path.isdir(path):
        for dir, _, filenames in os.walk(path):
                for f in filenames:
                    if not f.endswith('.DS_Store'):
                        yield os.path.join(dir, f)

    else:
        raise RuntimeError("Invalid path %s" % path)

knowledge = iter_filesys(knowledge_path)

wiki_files = []
for i in knowledge:
    wiki_files.append(i)

print(len(wiki_files))

17039


In [5]:
# Normalise Text Helper Function
def normalise_text(passage):
    passage = str(passage)
    re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", passage)
    passage.encode("unicode_escape")
    passage.replace('"', '"')
    passage = re.sub("\n", "", passage)
    passage = re.sub('"', "'", passage)
    passage.strip()

    return passage

In [6]:
from multiprocessing.pool import ThreadPool as Pool
from tqdm.notebook import tqdm
import json
import re
import more_itertools
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
nlp = spacy.load("en_core_web_sm")

### EXTRACT CONTENTS ###
def get_contents_(filename):
    """Parse the contents of a file. Each line is a JSON encoded document."""
    documents = []
    with open(filename) as f:
        for line in f:
            doc = json.loads(line)

            if doc["text"] == "": continue
            if not doc: continue

            documents.append((doc['id'], doc["title"], doc["text"]))

    return documents

def generate_wiki_data(files):
    for file in files:
            docs = get_contents_(file)
            for doc in docs:
                yield(doc)

### KEYWORD EXTRACTION ###
from summa import keywords

### EXTRACT PASSAGES ###
def sentence_window(article, window=3, step=2):
    """ Generates a list of sentences of sliding size = window """
    sents = sent_tokenize(article)

    if len(sents) == window:
        yield str(sents)

    for window in more_itertools.windowed(sents, n=window, step=2):
        yield window

### ITERATE PASSAGES ###
import multiprocessing
def passages(files, generator, idx, source, len_):
    with tqdm(total=(len_)) as pbar:
       #with multiprocessing.Pool(8) as pool:
            for i in generator(files):
                id, title, article = i

                for window in sentence_window(article):
                    passage = " ".join(normalise_text(passage) for passage in window)

                    if len(passage) < 50: continue
                    else:
                        # Yake
                        #keywords = kw_extractor.extract_keywords(passage)

                        # Rake
                        #kw_extractor.extract_keywords_from_text(passage)
                        #keywords = kw_extractor.get_ranked_phrases()

                        # Summa
                        # keywords_ = keywords.keywords(passage)

                        yield {
                            "_index": idx,
                            "document": {
                                "id": id,
                                "source": source,
                                "title": title,
                                "text": passage,
                                "keyphrases": keywords.keywords(passage, split=True, ratio=0.8)[0:6]
                                #"keyphrase_yake": [i[0] for i in keywords][0:k],
                                #"keyphrase_rake": set([normalise_text(i) for i in keywords[0:5]]),
                                }
                            }

                    pbar.update()

In [8]:
### TEST KNOWLEDGE LOADER ###
import time
test = []
trial = 1000
count = 0
wiki_files_ = wiki_files[0:trial]

tic = time.time()
for i in passages(files=wiki_files_, generator=generate_wiki_data, idx="testing", source="wikipedia", len_=trial):
    count += 1
    if count > trial:
        break

    test.append(i)
toc = time.time()

duration = toc - tic
print(duration)

  0%|          | 0/1000 [00:00<?, ?it/s]

1.3984639644622803


In [6]:
test

[{'_index': 'testing',
  'document': {'id': '61837831',
   'source': 'wikipedia',
   'title': 'Hedda Lundh',
   'text': 'Hedda Lundh (1921–2012) was a Danish journalist and schoolteacher who, under the German occupation of Denmark in World War II, was a Danish resistance fighter. Based at the time in Aarhus, she is remembered as a railway saboteur, explosives expert and courier in the resistance movement. Early life.',
   'keyphrases': ['hedda',
    'lundh',
    'resistance',
    'early',
    'movement',
    'fighter']}},
 {'_index': 'testing',
  'document': {'id': '61837831',
   'source': 'wikipedia',
   'title': 'Hedda Lundh',
   'text': "Early life. Born on 29 September 1921 in Korsør, Hedda Lundh was the daughter of the newspaper editor Theodor Lundh-Jensen (1884–1952) and Alpha Tusnelda Emilie Winckler (1887–1973). The youngest of three sisters, she was brought up in a middle-class home where her father called her his 'boy' as she climbed trees, joined the scouts and cut her hair 

In [29]:
# from src.utils_.elastic_db import ElasticDB
#
# ## DB CONFIG ###
# PORT = "http://localhost:9200"
# INDEX_WIKI = "wiki"
# SOURCE_WIKI = "wikipedia"
#
# errors_before_interrupt = 5
# refresh_index_after_insert = False
# max_insert_retries = 3
# yield_ok = False
#
# wiki_kb = ElasticDB(PORT)
# print(wiki_kb, "INDEX", INDEX_WIKI, "SOURCE", SOURCE_WIKI)

INFO:src.utils_.elastic_db:Connecting to http://localhost:9200 
INFO:src.utils_.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


<src.utils_.elastic_db.ElasticDB object at 0x2a7289760> INDEX wiki SOURCE wikipedia


In [30]:
# wiki_kb.add_index(INDEX_WIKI)

INFO:elastic_transport.transport:PUT http://localhost:9200/wiki [status:200 duration:0.452s]
INFO:src.utils_.elastic_db:Connected to wiki 


In [31]:
# wiki_kb.set_index(INDEX_WIKI)

INFO:src.utils_.elastic_db:Set Index to to wiki 


In [32]:
# # Runtime 26 hours
# wiki_kb.bulk_add(
#     files=wiki_files,
#     index_name=INDEX_WIKI,
#     generator=generate_wiki_data,
#     source=SOURCE_WIKI,
#     iterator=passages,
#     chunk_size=10000,
#     len_= len(wiki_files)
# )

  0%|          | 0/17039 [00:00<?, ?it/s]

  0%|          | 0/17039 [00:00<?, ?it/s]

INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.949s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.027s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:7.534s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:8.178s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.985s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.942s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.011s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.816s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.951s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.897s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_

In [9]:
### CONSTRUCT NEWS DATASET: COMMON CRAWL I ###
from datasets import load_dataset
cc_news = load_dataset("cc_news")
cc_news = cc_news["train"]



  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
### NEWS DATA ###

# TODOs: Make Generic Class
def generate_news_content(file):
    for doc in file:
        document = (doc['url'], doc["title"], doc["text"])

        yield document

In [11]:
### CONSTRUCT COMMON-CRAWL NEWS SET ###
sample = []
trial = 10000

count = 0
for i in passages(files=cc_news, generator=generate_news_content, idx="testing_news", source="cc_news", len_=trial):
    count += 1
    sample.append(i)

    if count > trial:
        break

  0%|          | 0/10000 [00:00<?, ?it/s]

In [10]:
sample

[{'_index': 'testing_news',
  'document': {'id': 'http://www.pointemagazine.com/mother-daughter-duo-dancing-2516681965.html',
   'source': 'cc_news',
   'title': 'Daughter Duo is Dancing in The Same Company',
   'text': "There's a surprising twist to Regina Willoughby's last season with Columbia City Ballet: It's also her 18-year-old daughter Melina's first season with the company. Regina, 40, will retire from the stage in March, just as her daughter starts her own career as a trainee. But for this one season, they're sharing the stage together.",
   'keyphrases': ['surprising twist',
    'regina',
    'city',
    'willoughby',
    'company',
    'daughter']}},
 {'_index': 'testing_news',
  'document': {'id': 'http://www.pointemagazine.com/mother-daughter-duo-dancing-2516681965.html',
   'source': 'cc_news',
   'title': 'Daughter Duo is Dancing in The Same Company',
   'text': "But for this one season, they're sharing the stage together. Performing Side-By-Side In The NutcrackerRegina 

In [18]:
# ### LOAD USING DB OBJECT ###
# from src.utils_ import elastic_db
#
# ### DB CONFIG ###
# INDEX_CC = "cc_news"
# SOURCE_CC = "common_crawl_news"
# PORT = "http://localhost:9200"
#
# cc_kb = elastic_db.ElasticDB(PORT)
#
# print(cc_kb, "INDEX", INDEX_CC, "SOURCE", SOURCE_CC)

INFO:src.utils_.elastic_db:Connecting to http://localhost:9200 
INFO:src.utils_.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


<src.utils_.elastic_db.ElasticDB object at 0x3f942fbb0> INDEX cc_news SOURCE common_crawl_news


In [None]:
# cc_kb.add_index(INDEX_CC)

In [None]:
# cc_kb.set_index(INDEX_CC)

In [20]:
# cc_kb.bulk_add(
#     files=cc_news,
#     index_name=INDEX_CC,
#     generator=generate_news_content,
#     source=SOURCE_CC,
#     iterator=passages,
#     chunk_size=10000,
#     len_= len(cc_news)
# )

  0%|          | 0/708241 [00:00<?, ?it/s]

  0%|          | 0/708241 [00:00<?, ?it/s]

INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:4.649s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:6.614s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.852s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.712s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.492s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.595s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.219s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.049s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.301s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.304s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_

In [13]:
### CONSTRUCT NEWSROOM SET ###
# TODOs: Use Huggingface Data Library for Newsroom

sample = []
trial = 10000
count = 0

# TODOs: Test
def generate_newsroom_content(file):
    """Parse the contents of a file. Each line is a JSON encoded document."""

    with open(file) as f:
        for line in f:
            doc = json.loads(line)

            if doc["text"] == "": continue
            if not doc: continue

            document = (doc["url"], doc["title"], doc["text"])

            yield document

newsroom_files = "/Users/joshua.sheppard/newsroom/train.jsonl"

In [14]:
newsroom_sample = []

count = 0
for i in passages(files=newsroom_files, generator=generate_newsroom_content, idx="testing", source="newsroom", len_=trial):
    count += 1
    newsroom_sample.append(i)

    if count > trial:
        break

  0%|          | 0/10000 [00:00<?, ?it/s]

In [15]:
newsroom_sample

[{'_index': 'testing',
  'document': {'id': 'http://www.nytimes.com/2006/06/04/sports/soccer/04racism.html',
   'source': 'newsroom',
   'title': 'Surge in Racist Mood Raises Concerns on Eve of World Cup',
   'text': 'HAMBURG, Germany, June 3 \x97 As he left the soccer field after a club match in the eastern German city of Halle on March 25, the Nigerian forward Adebowale Ogungbure was spit upon, jeered with racial remarks and mocked with monkey noises. In rebuke, he placed two fingers under his nose to simulate a Hitler mustache and thrust his arm in a Nazi salute. Marc Zoro, right, an Ivory Coast native, was a target of racial slurs from the home fans in Messina, Italy.',
   'keyphrases': ['germany',
    'hamburg',
    'june',
    'adebowale',
    'nigerian forward',
    'coast']}},
 {'_index': 'testing',
  'document': {'id': 'http://www.nytimes.com/2006/06/04/sports/soccer/04racism.html',
   'source': 'newsroom',
   'title': 'Surge in Racist Mood Raises Concerns on Eve of World Cup'

In [16]:
# ### LOAD USING DB OBJECT ###
from src.utils_ import elastic_db

### DB CONFIG ###
INDEX_NEWS = "newsroom"
SOURCE_NEWS = "newsroom_news"
PORT = "http://localhost:9200"

newsroom_kb = elastic_db.ElasticDB(PORT)

print(newsroom_kb, "INDEX", INDEX_NEWS, "SOURCE", SOURCE_NEWS)

INFO:src.utils_.elastic_db:Connecting to http://localhost:9200 
INFO:src.utils_.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


<src.utils_.elastic_db.ElasticDB object at 0x2ad3d5b80> INDEX newsroom SOURCE newsroom_news


In [17]:
newsroom_kb.add_index(INDEX_NEWS)

INFO:elastic_transport.transport:PUT http://localhost:9200/newsroom [status:200 duration:0.836s]
INFO:src.utils_.elastic_db:Connected to newsroom 


In [None]:
# newsroom_kb.set_index(INDEX_CC)

In [18]:
newsroom_kb.bulk_add(
    files=newsroom_files,
    index_name=INDEX_NEWS,
    generator=generate_newsroom_content,
    source=SOURCE_NEWS,
    iterator=passages,
    chunk_size=10000,
    len_= len(wiki_files)
)

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/17039 [00:00<?, ?it/s]

INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.797s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.851s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:4.344s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.303s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:9.229s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.269s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.566s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.268s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.095s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.167s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_

In [None]:
# ### CONSTRUCT COMMONCRAWL II ###

# #knowledge_path = "/Users/joshua.sheppard/"
# import os
# news_cc_2 = "/Users/joshua.sheppard/PycharmProjects/cc_download_articles"
# news_cc_1 = "/Users/joshua.sheppard/PycharmProjects/news-please/cc_download_articles"
#
# cc_1 = [i for i in iter_filesys(news_cc_1)]
# cc_2 = [i for i in iter_filesys(news_cc_2)]
#
# cc_news = cc_1 + cc_2
# len(cc_news)