In [20]:
# https://stackoverflow.com/questions/34621093/persist-elastic-search-data-in-docker-container
# TODOs: New KB upload with smaller text window
# TODOs: Common Crawl index upload
# TODOs: Good Logging
# TODOs: Better Keyphrase Extraction - Fast KeyBert

In [21]:
### CRAWL LOCAL FILE-SYSTEM ###
import os
import logging

# Emit INFO-level records; the captured outputs further down show INFO lines
# from the Elasticsearch transport and the project's elastic_db module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("KNOWLEDGE_BASE")

# Path handed to iter_filesys below to build the list of wiki files.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
knowledge_path = "/Users/joshua.sheppard/wiki_extract_II"

def iter_filesys(path):
    """Lazily yield the file paths found at ``path``.

    A regular file yields just itself. A directory is walked recursively and
    every file is yielded as ``<directory>/<name>``, except names ending in
    '.DS_Store'. Anything else raises ``RuntimeError`` (on first iteration,
    since this is a generator).
    """
    if os.path.isfile(path):
        yield path
        return

    if not os.path.isdir(path):
        raise RuntimeError("Invalid path %s" % path)

    for root, _subdirs, names in os.walk(path):
        yield from (
            os.path.join(root, name)
            for name in names
            if not name.endswith('.DS_Store')
        )

# Drain the lazy crawl into a list so it can be counted here and sliced later.
knowledge = iter_filesys(knowledge_path)
wiki_files = list(knowledge)

print(len(wiki_files))

17039


In [22]:
# Normalise Text Helper Function
def normalise_text(passage):
    """Return ``passage`` as a single-line, whitespace-trimmed string.

    The input is coerced with ``str()``, newline characters are removed,
    double quotes are replaced by single quotes, and leading/trailing
    whitespace is stripped.

    The previous version also called ``re.sub(...)``, ``.encode(...)``,
    ``.replace('"', '"')`` and ``.strip()`` without using their results, so
    those statements had no effect (strings are immutable). Only ``strip`` is
    made effective here. The discarded regex is deliberately NOT activated:
    it would delete every character outside ``[0-9A-Za-z \\t]``, i.e. all
    punctuation and non-ASCII text, which would change every indexed passage.
    Using ``str.replace`` also removes the dependency on ``re``, which was
    only imported in a later cell.
    """
    text = str(passage)
    text = text.replace("\n", "").replace('"', "'")
    return text.strip()

In [26]:
from multiprocessing.pool import ThreadPool as Pool
from tqdm.notebook import tqdm
import json
import re
import more_itertools
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
nlp = spacy.load("en_core_web_sm")

### EXTRACT CONTENTS ###
def get_contents_(filename):
    """Parse a JSON-lines file into a list of ``(id, title, text)`` tuples.

    Each non-blank line is decoded as one JSON document. Documents that are
    empty/falsy or have a missing or empty ``"text"`` field are skipped.

    Changes from the previous version:
    * the emptiness check now runs before ``doc["text"]`` is read (it used to
      come after, so it could never trigger);
    * blank lines no longer crash ``json.loads``;
    * the file is read as UTF-8 explicitly rather than with the platform
      default encoding.
    """
    documents = []
    with open(filename, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue

            doc = json.loads(line)
            if not doc or not doc.get("text"):
                continue

            documents.append((doc["id"], doc["title"], doc["text"]))

    return documents

def generate_wiki_data(files):
    """Yield every document returned by ``get_contents_`` for each path in ``files``, in order."""
    for path in files:
        yield from get_contents_(path)

### KEYWORD EXTRACTION ###
# TODOs: Modularise KW Extraction

# NOTE: YAKE Knowledgebase I
# from yake import KeywordExtractor
# kw_extractor = KeywordExtractor(lan="en", n=4, top=5)
# import yake

# language = "en"
# max_ngram_size = 3
# deduplication_thresold = 0.9
# deduplication_algo = 'seqm'
# k=5

# kw_extractor = yake.KeywordExtractor(lan=language, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo)

# from rake_nltk import Rake
# kw_extractor = Rake()

from summa import keywords

### EXTRACT PASSAGES ###
def sentence_window(article, window=3, step=2):
    """Yield tuples of up to ``window`` consecutive sentences from ``article``.

    Sentences come from ``sent_tokenize``; consecutive windows start ``step``
    sentences apart.

    Changes from the previous version:
    * ``step`` is now honoured (it was hard-coded to 2 in the ``windowed`` call);
    * an article with exactly ``window`` sentences no longer yields
      ``str(sents)`` first. That string was iterated character by character by
      the caller's ``" ".join(...)``, and the same window was then yielded again;
    * the ``None`` padding that ``more_itertools.windowed`` adds to a short
      final window is dropped, so it no longer turns into the text "None".
    """
    sents = sent_tokenize(article)

    for chunk in more_itertools.windowed(sents, n=window, step=step):
        sentences = tuple(s for s in chunk if s is not None)
        if sentences:
            yield sentences

### ITERATE PASSAGES ###
import multiprocessing
def passages(files, generator, idx, source, len_=None):
    """Yield one Elasticsearch action dict per sentence window.

    Args:
        files: input collection handed to ``generator``; its length is the
            default progress-bar total.
        generator: callable invoked as ``generator(files)`` that yields
            ``(id, title, article)`` triples.
        idx: value stored under ``"_index"`` in every action.
        source: value stored under ``document["source"]``.
        len_: optional progress-bar total. Other cells in this notebook call
            ``passages(..., len_=...)``, which the previous signature rejected
            with a ``TypeError``; defaults to ``len(files)``.

    Yields:
        ``{"_index": idx, "document": {...}}`` with the document id, source,
        title, the normalised passage text and the keyphrases extracted by
        ``summa``. Passages shorter than 50 characters are not emitted.
    """
    total = len(files) if len_ is None else len_

    with tqdm(total=total) as pbar:
        # ``doc_id`` rather than ``id`` so the builtin is not shadowed.
        for doc_id, title, article in generator(files):
            for window in sentence_window(article):
                passage = " ".join(normalise_text(sentence) for sentence in window)

                if len(passage) >= 50:
                    yield {
                        "_index": idx,
                        "document": {
                            "id": doc_id,
                            "source": source,
                            "title": title,
                            "text": passage,
                            "keyphrases": keywords.keywords(passage, split=True, ratio=0.8)
                            }
                        }

                # Tick for every window, including the short ones that are
                # skipped (previously ``continue`` bypassed the update).
                # NOTE(review): the bar ticks per window while ``total`` counts
                # input items, so it can overshoot — confirm the intended unit.
                pbar.update()

In [27]:
### TEST KNOWLEDGE LOADER ###
import time

# `trial` caps both the number of input files and the number of passages kept.
trial = 1000
wiki_files_ = wiki_files[:trial]

test = []
count = 0

tic = time.time()
sample_passages = passages(
    files=wiki_files_,
    generator=generate_wiki_data,
    idx="testing",
    source="wikipedia",
)
for i in sample_passages:
    count += 1
    if count > trial:
        break

    test.append(i)
toc = time.time()

# Wall-clock seconds spent producing the sample.
duration = toc - tic
print(duration)

  0%|          | 0/1000 [00:00<?, ?it/s]

1.453415870666504


In [28]:
test

[{'_index': 'testing',
  'document': {'id': '61837831',
   'source': 'wikipedia',
   'title': 'Hedda Lundh',
   'text': 'Hedda Lundh (1921–2012) was a Danish journalist and schoolteacher who, under the German occupation of Denmark in World War II, was a Danish resistance fighter. Based at the time in Aarhus, she is remembered as a railway saboteur, explosives expert and courier in the resistance movement. Early life.',
   'keyphrases': ['hedda',
    'lundh',
    'resistance',
    'early',
    'movement',
    'fighter',
    'danish journalist',
    'german occupation',
    'life',
    'based',
    'saboteur explosives',
    'war']}},
 {'_index': 'testing',
  'document': {'id': '61837831',
   'source': 'wikipedia',
   'title': 'Hedda Lundh',
   'text': "Early life. Born on 29 September 1921 in Korsør, Hedda Lundh was the daughter of the newspaper editor Theodor Lundh-Jensen (1884–1952) and Alpha Tusnelda Emilie Winckler (1887–1973). The youngest of three sisters, she was brought up in a 

In [29]:
from src.utils_.elastic_db import ElasticDB

## DB CONFIG ###
# Despite its name, PORT holds the full base URL of the Elasticsearch node.
PORT = "http://localhost:9200"
INDEX_WIKI = "wiki"
SOURCE_WIKI = "wikipedia"

# The four settings below are not read by any active cell in the visible
# notebook; only commented-out bulk-loading cells further down refer to them.
errors_before_interrupt = 5
refresh_index_after_insert = False
max_insert_retries = 3
yield_ok = False

# Project wrapper around the Elasticsearch client (the log output shows it
# connecting to PORT when constructed).
wiki_kb = ElasticDB(PORT)
print(wiki_kb, "INDEX", INDEX_WIKI, "SOURCE", SOURCE_WIKI)

INFO:src.utils_.elastic_db:Connecting to http://localhost:9200 
INFO:src.utils_.elastic_db:Connected to <Elasticsearch(['http://localhost:9200'])> 


<src.utils_.elastic_db.ElasticDB object at 0x2a7289760> INDEX wiki SOURCE wikipedia


In [30]:
# Create the INDEX_WIKI index (the captured log shows a PUT to /wiki).
wiki_kb.add_index(INDEX_WIKI)

INFO:elastic_transport.transport:PUT http://localhost:9200/wiki [status:200 duration:0.452s]
INFO:src.utils_.elastic_db:Connected to wiki 


In [31]:
# Select INDEX_WIKI on the wrapper; presumably the index used by later calls — confirm in ElasticDB.
wiki_kb.set_index(INDEX_WIKI)

INFO:src.utils_.elastic_db:Set Index to to wiki 


In [None]:
# Full Wikipedia upload: hands the file list, the document generator and the
# `passages` iterator to ElasticDB.bulk_add. The captured log shows repeated
# _bulk requests, so this cell is long-running.
# NOTE(review): how bulk_add combines `iterator`, `generator` and `len_` is
# defined in src.utils_.elastic_db and is not visible here — confirm there.
wiki_kb.bulk_add(
    files=wiki_files,
    index_name=INDEX_WIKI,
    generator=generate_wiki_data,
    source=SOURCE_WIKI,
    iterator=passages,
    chunk_size=10000,
    len_= len(wiki_files)
)

  0%|          | 0/17039 [00:00<?, ?it/s]

  0%|          | 0/17039 [00:00<?, ?it/s]

INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.949s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.027s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:7.534s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:8.178s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.985s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.942s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.011s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.816s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.951s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:1.897s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_

In [None]:
### CONSTRUCT NEWS DATASET: COMMON CRAWL I ###
from datasets import load_dataset

# Keep only the "train" split of the cc_news dataset.
cc_news = load_dataset("cc_news")["train"]

In [None]:
### NEWS DATA ###

# TODOs: Make Generic Class
def generate_news_content(file):
    """Yield every element of ``file`` unchanged, in iteration order."""
    yield from file

In [None]:
### CONSTRUCT COMMON-CRAWL NEWS SET ###
test = []
trial = 10000
count = 0

# Fixes relative to the previous call:
#   * the keyword is `files`, not `file` (see the `passages` signature);
#   * `len_` is dropped because `passages` as defined above does not accept it;
#   * the loop now stops after exactly `trial` passages (it used to keep trial + 1).
# NOTE(review): `passages` unpacks each generated item as (id, title, article);
# `generate_news_content` yields the dataset rows unchanged — confirm they have
# that shape or adapt the generator.
for i in passages(files=cc_news, generator=generate_news_content, idx="testing_news", source="news"):
    count += 1
    test.append(i)

    if count >= trial:
        break

In [None]:
test

In [None]:
### LOAD USING DB OBJECT ###
from src.utils_ import elastic_db

### DB CONFIG ###
INDEX_CC = "cc_news"
SOURCE_CC = "common_crawl"
# Same node URL as the wiki cell; redefined so this cell does not depend on it.
PORT = "http://localhost:9200"

# Separate wrapper instance for the Common Crawl news index.
cc_kb = elastic_db.ElasticDB(PORT)

print(cc_kb, "INDEX", INDEX_CC, "SOURCE", SOURCE_CC)

In [None]:
# Create the INDEX_CC index (same wrapper call as used for the wiki index).
cc_kb.add_index(INDEX_CC)

In [None]:
# Select INDEX_CC on the wrapper (same wrapper call as used for the wiki index).
cc_kb.set_index(INDEX_CC)

In [None]:
# Bulk-upload the Common Crawl news data through the same `passages` iterator
# used for Wikipedia.
# NOTE(review): `passages` unpacks every generated item as (id, title, article),
# while `generate_news_content` yields the elements of `cc_news` unchanged —
# confirm the items have that shape before running.
cc_kb.bulk_add(
    files=cc_news,
    index_name=INDEX_CC,
    generator=generate_news_content,
    source=SOURCE_CC,
    iterator=passages,
    chunk_size=10000,
    len_= len(cc_news)
)

In [None]:
### CONSTRUCT NEWSROOM SET ###

# Sampling state: `trial` and `count` are read by the newsroom sampling loop
# in the next cell.
test = []
trial = 10000
count = 0

def newsroom_content(filename):
    """Parse a JSON-lines file and return the ``"text"`` field of each document.

    Each non-blank line is decoded as one JSON document. Documents that are
    empty/falsy or have a missing or empty ``"text"`` field are skipped.

    Changes from the previous version:
    * the emptiness check now runs before ``doc["text"]`` is read (it used to
      come after, so it could never trigger);
    * blank lines no longer crash ``json.loads``;
    * the file is read as UTF-8 explicitly rather than with the platform
      default encoding.
    """
    documents = []

    with open(filename, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue

            doc = json.loads(line)
            if not doc or not doc.get("text"):
                continue

            documents.append(doc["text"])

    return documents

# TODOs: Make Generic Class
# def generate_news_content(files):
#     for i in files:
#         yield i

filename = "/Users/joshua.sheppard/newsroom/train.jsonl"
def generate_newsroom(files=None):
    """Yield newsroom article texts.

    With no argument the texts are read from ``filename`` via
    ``newsroom_content`` (the previous behaviour). When ``files`` is given,
    its items are yielded as-is. ``passages`` invokes its generator as
    ``generator(files)``, which the previous zero-argument signature rejected
    with a ``TypeError``.

    NOTE(review): ``passages`` unpacks every item as (id, title, article),
    but the items here are bare text strings — confirm and adapt before
    feeding this generator to ``passages``.
    """
    if files is None:
        files = newsroom_content(filename)
    yield from files

newsroom_files = list(generate_newsroom())

In [None]:
newsroom_test = []
# Reset here so re-running this cell does not continue from a stale counter.
count = 0

# `len_` is dropped because `passages` as defined above does not accept it,
# and the loop now stops after exactly `trial` passages (it used to keep
# trial + 1).
# NOTE(review): `passages` calls `generator(files)` and unpacks each item as
# (id, title, article) — confirm `generate_newsroom` satisfies both.
for i in passages(files=newsroom_files, generator=generate_newsroom, idx="testing", source="newsroom"):
    count += 1
    newsroom_test.append(i)

    if count >= trial:
        break

In [None]:
### DB CONFIG ###
INDEX_NEWS = "newsroom"
SOURCE_NEWS = "newsroom"
PORT = "http://localhost:9200"

from elasticsearch import Elasticsearch
from src.utils_.elastic_db import ElasticDB

# The following cells call add_index / set_index / bulk_add on this object.
# Elsewhere in this notebook those methods are only called on the project's
# ElasticDB wrapper (wiki_kb, cc_kb), so build the wrapper here too instead of
# a raw Elasticsearch client.
newsroom_kb = ElasticDB(PORT)
print(INDEX_NEWS, SOURCE_NEWS)

In [None]:
# Create the newsroom index (was INDEX_CC, copied from the Common Crawl cell).
newsroom_kb.add_index(INDEX_NEWS)

In [None]:
# Select the newsroom index (was INDEX_CC, copied from the Common Crawl cell).
newsroom_kb.set_index(INDEX_NEWS)

In [None]:
# Upload the newsroom data. The previous arguments were copied from the
# Wikipedia cell (wiki files, generator, index and source), which would have
# re-uploaded Wikipedia instead of newsroom.
# NOTE(review): `passages` calls `generator(files)` and unpacks each item as
# (id, title, article) — confirm `generate_newsroom` satisfies both first.
newsroom_kb.bulk_add(
    files=newsroom_files,
    index_name=INDEX_NEWS,
    generator=generate_newsroom,
    source=SOURCE_NEWS,
    iterator=passages,
    chunk_size=10000,
    len_= len(newsroom_files)
)

In [None]:
### CONSTRUCT COMMONCRAWL II ###

#knowledge_path = "/Users/joshua.sheppard/"
import os

# Two local download directories of crawled articles.
news_cc_1 = "/Users/joshua.sheppard/PycharmProjects/news-please/cc_download_articles"
news_cc_2 = "/Users/joshua.sheppard/PycharmProjects/cc_download_articles"

cc_1 = list(iter_filesys(news_cc_1))
cc_2 = list(iter_filesys(news_cc_2))

# Combined file list; rebinds `cc_news`, which an earlier cell used for the
# loaded dataset.
cc_news = cc_1 + cc_2
len(cc_news)

In [None]:
### REFRESH DB ###
# knowledge_base.delete_by_query(index=INDEX_CC, query={"match_all": {}})

In [None]:
# from elasticsearch.helpers import streaming_bulk, parallel_bulk
# ## WRITE TO DB ###
# errors_count = 0
# chunk_size = 25000
# counta = len(newsroom_data)
# successes = 0
# errors_before_interrupt = 5
#
# #newsroom_knowledge_base.bulk(index=INDEX_NEWS, operations=passages_(idx=INDEX_CC, source=SOURCE_CC, len_=counta))
#
# with tqdm(total=(counta)) as pbar:
#     for ok, result in parallel_bulk(newsroom_knowledge_base, newsroom_passages_(idx=INDEX_NEWS, source=SOURCE_NEWS, len_=counta), chunk_size=chunk_size, request_timeout=60*3):
#         if ok is not True:
#                 logging.error('Failed to import data')
#                 logging.error(str(result))
#                 errors_count += 1
#
#                 if errors_count == errors_before_interrupt:
#                     logging.fatal('Too many import errors, exiting with error code')
#                     exit(1)
#
#         successes += ok
#         pbar.update()

In [None]:
# import json
#
# import re
# def clean(clean):
#     clean = str(clean)
#     clean = re.sub(r"\n", "", clean)
#     clean = re.sub(r'(?<=[a-z])\'(?=[a-z])', '', clean)
#     clean = re.sub('([^a-zA-Z\s.!?])', "", clean)
#     clean = re.sub('\s+', ' ', clean)
#
#     clean = re.sub(r"www\S+", "", clean)
#     return clean.strip().lower()
#
# def get_contents_(filename):
#     """Parse the contents of a file. Each line is a JSON encoded document."""
#     documents = []
#     with open(filename) as f:
#         doc = json.load(f)
#         documents.append((clean(doc['description']), clean(doc["maintext"])))
#
#     return documents
#
# def generate_cc_data():
#     for file in cc_news:
#             docs = get_contents_(file)
#             for doc in docs:
#                 yield(doc)

In [None]:
### NLTK IMPORT, OVERRIDING SSL CERTIFICATES ###
# import nltk
# import ssl
#
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
#
# nltk.download()

In [None]:
# from multiprocessing.pool import ThreadPool as Pool
# from nltk.tokenize import sent_tokenize
#
# def sentence_window(article, window=3, step=2):
#     """ Generates a list of sentences of sliding size = window """
#     sents = sent_tokenize(article)
#
#     if len(sents) == window:
#         yield str(sents)
#
#     for window in more_itertools.windowed(sents, n=window, step=2):
#         yield window
#
# pool = Pool(8)
# from uuid import uuid4
#
# kw_extractor = KeywordExtractor(lan="en", n=3, top=5)
# def passages(idx, source, files):
#     count = 0
#     with tqdm(total=(len(files))) as pbar:
#         for i in pool.apply(generate_cc_data):
#             count += 1
#             title, article = i
#             id = uuid4()
#
#             for window in sentence_window(article):
#                 passage = " ".join(normalise_text(passage) for passage in window)
#
#                 if len(passage) < 50: continue
#                 else:
#
#                     yield {
#                         "_index": idx,
#                         "document": {
#                             "id": id,
#                             "source": source,
#                             "title": title,
#                             "text": passage,
#                             "keyphrase": [i for i in keywords.keywords(passage).split("\n")]
#                             }
#                         }
#
#                 pbar.update()

In [None]:
#from elasticsearch.helpers import streaming_bulk, parallel_bulk
# from tqdm.notebook import tqdm

# ## WRITE TO DB ###
# errors_count = 0
# chunk_size = 25000
# counta = len(files)/chunk_size
# len_ = len(kw_sample)
# successes = 0

# with tqdm(total=(counta)) as pbar:
#     for ok, result in parallel_bulk(wiki_kb, passages(idx=INDEX_WIKI, source=SOURCE, len_=len_), chunk_size=chunk_size, request_timeout=60*3):
#         if ok is not True:
#                 logging.error('Failed to import data')
#                 logging.error(str(result))
#                 errors_count += 1
#
#                 if errors_count == errors_before_interrupt:
#                     logging.fatal('Too many import errors, exiting with error code')
#                     exit(1)
#
#         successes += ok
#         pbar.update()

In [None]:
#cc_news_passages = [i for i in passages(idx=INDEX_CC, source=SOURCE_CC, files=cc_news)]
# sample = 1000
# cc_passages = []
#
# counta = 0
# for i in passages(idx=INDEX_CC, source=SOURCE_CC, files=cc_news):
#     counta += 1
#     cc_passages.append(i)
#
#     if counta > sample:
#         break

In [None]:
# from comcrawl import IndexClient
#
# client = IndexClient(["2019-11", "2020-50"])
# site = "reddit.com/r/MachineLearning/*"
# client.search(site, threads=2)

#client.download()
# first_page_html = client.results[0]["html"]

In [None]:
### TEST SEARCH ###
test_query = "government emails privacy"
#
# def search(query_, db, index, k=5):
#     results = db.search(
#         index = db.elastic_index,
#         query = {
#             "size": k,
#             "query": {
#                 "match": {
#                     "document.text": query_,
#         }}})
#
#     hits = results["hits"]["hits"]
#     doc_ids = [row['_source']["document"]["id"] for row in hits]
#
#     print(results)
#     return (hits, doc_ids)
#
# test = search(test_query, knowledge_base, INDEX_CC, k=2)[0][0]["_source"]["document"]["text"]
# test

In [None]:
# from multiprocessing.pool import ThreadPool as Pool
# from tqdm import tqdm
# import more_itertools
# import re
#
# def clean(passage):
#     passage = str(passage)
#     passage.encode("unicode_escape")
#     passage.replace('"', '"')
#     passage.strip()
#     passage = re.sub("\n", "", passage)
#     passage = re.sub('"', "'", passage)
#
#     return passage
#
# def sentence_window(article, window=5, step=2):
#     """ Generates a list of sentences of sliding size = window """
#     sents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', article)
#
#     if len(sents) == window:
#         yield str(sents)
#
#     for window in more_itertools.windowed(sents, n=window, step=2):
#         yield window
#
# pool = Pool(8)
# def passages(idx, source):
#     count = 0
#     # with tqdm(total=len(files)) as pbar:
#     for i in pool.apply(iter_data):
#         count += 1
#         article = i["text"]
#
#         for window in sentence_window(article):
#             passage = " ".join(clean(passage) for passage in window)
#
#             #if len(passage) < 50: continue
#
#             yield {
#                 "_index": idx,
#                 "document": {
#                     "source": source,
#                     "title": i["title"],
#                     "text": passage
#                     }
#                 }
#
#         # pbar.update()
#

In [None]:
# import re
# from elasticsearch import helpers
# from elasticsearch.helpers import streaming_bulk, parallel_bulk
# from tqdm.auto import tqdm
# import spacy

# es = wiki_ev
# errors_count = 0

# # TODOs: Tune chunk size
# chunk_size = 25000
# counta = len(files)//chunk_size
# successes = 0

# # with tqdm(total=counta) as pbar:
# for ok, result in parallel_bulk(es, passages(idx=INDEX_NAME, source=SOURCE), chunk_size=chunk_size, request_timeout=60*3):

# # for ok, result in tqdm(streaming_bulk(es, index=INDEX_NAME, actions=passages(idx=INDEX_NAME, source=SOURCE), 
# #                         chunk_size=chunk_size, request_timeout=60*3, max_retries=3)):
#     if ok is not True:
#             logging.error('Failed to import data')
#             logging.error(str(result))
#             errors_count += 1

#             if errors_count == errors_before_interrupt:
#                 logging.fatal('Too many import errors, exiting with error code')
#                 exit(1)
    
#     successes += ok

In [None]:
# import time
# from elasticsearch import helpers
# from elasticsearch.helpers import streaming_bulk, parallel_bulk
# from tqdm import tqdm

# "https://stackoverflow.com/questions/67522617/elasticsearch-bulk-insert-w-python-socket-timeout-error#:~:text=The%20connection%20to%20elasticsearch%20has,be%20handled%20as%20an%20error."

# "https://github.com/elastic/elasticsearch-py/issues/297"

# def load_data(docs, idx_):
#     for doc in docs:
#         idx, title, text = doc
#         doc_ = {"id": idx, "title": title, "text": text}

#         yield {
#             "_index": idx_,
#             "document": {
#                 "id": idx,
#                 "title": title,
#                 "text": text
#             }
#         }

# # TODOs: Utils, Duration Function as Decorator
# es = wiki_ev
# errors_count = 0

# # TODOs: Increase Chunk Size, with extended Timeout + handling
# # TODOs: Experiment-Check with Yield OK
# for ok, result in parallel_bulk(es, load_data(wiki_data, "wiki_evidence"), chunk_size=500, request_timeout=60*3):
#     if ok is not True:
#             logging.error('Failed to import data')
#             logging.error(str(result))
#             errors_count += 1

#             if errors_count == errors_before_interrupt:
#                 logging.fatal('Too many import errors, exiting with error code')
#                 exit(1)

In [None]:

# import spacy

# nlp = spacy.load("en_core_web_sm")

# def sentence_window(article, window=3, step=2): 
#     """ Generates a list of sentences of sliding size = window """
    
#     sents = list(nlp(article).sents)
    
#     if len(sents) == window:
#         yield sents

#     for i in range(0, len(sents)):
#         yield(sents[i:i + window])

# def load_data(docs, idx_):
#     """ Generates an evidence document to be inserted into ES Index """
#     for doc in docs:
#         idx, title, text = doc

#         for paragraph in sentence_window(text):
#             yield {
#                 "_index": idx_,
#                 "document": {
#                     "id": idx,
#                     "title": title,
#                     "text": paragraph
#                 }
#             }


In [None]:
# import time
# from elasticsearch import helpers
# from elasticsearch.helpers import streaming_bulk
# from tqdm import tqdm
#
# "https://stackoverflow.com/questions/67522617/elasticsearch-bulk-insert-w-python-socket-timeout-error#:~:text=The%20connection%20to%20elasticsearch%20has,be%20handled%20as%20an%20error."
#
# def load_data(docs, idx_):
#     for doc in docs:
#         idx, title, text = doc
#         doc_ = {"id": idx, "title": title, "text": text}
#
#         yield {
#             "_index": idx_,
#             "document": doc
#         }
#
# # TODOs: Utils, Duration Function as Decorator
# es = wiki_ev
# #helpers.bulk(es, load_data(wiki_data, "wiki_evidence"), raise_on_error=False, chunk_size=500)
# errors_count = 0
# # TODOs: Increase Chunk Size, with extended Timeout + handling
# for ok, result in streaming_bulk(es, load_data(wiki_data, "wiki_evidence"), chunk_size=500, request_timeout=60*3, yield_ok=yield_ok, refresh=refresh_index_after_insert):
#     if ok is not True:
#             logging.error('Failed to import data')
#             logging.error(str(result))
#             errors_count += 1
#
#             if errors_count == errors_before_interrupt:
#                 logging.fatal('Too many import errors, exiting with error code')
#                 exit(1)

In [None]:
# for article in wiki_data:
#     _id, title, text = article
#     doc = {"id": _id, "title": title, "text": text}
#
#     wiki_ev.add_doc(doc)

In [None]:
# https://github.com/elastic/elasticsearch-py/issues/297

In [None]:
### DOCUMENT IMPORT: FULL-TEXTS ###
# from multiprocessing import Pool
# # from utils import get_contents
# import utils
# from tqdm import tqdm
# import json

# # TODOs: USE A GENERATOR OBJECT
# import spacy
# nlp = spacy.load("en_core_web_sm")
#
# p = Pool(8)
# files = [f for f in kw_sample]
#
# count = 0
# test = []
# with tqdm(total=len(files)) as pbar:
#     for documents in p.map(utils.get_contents_2, files):
#             for doc in documents:
#                 _id, title, text = doc
#
#             # count += 1
#             # doc_ = {"id": _id, "title": title, "text": text}
#
#             #wiki_ev.add_doc(doc)
#             test.append(doc_)
#     pbar.update()

In [None]:
# from elasticsearch import Elasticsearch

# # INIT OBJECT

# # TODOs: Persist a Generator Object
# PORT = "http://localhost:9200"
# INDEX_NAME = "wiki_evidence"
# errors_before_interrupt = 5
# refresh_index_after_insert = False
# max_insert_retries = 3
# yield_ok = False

# wiki_ev = Elasticsearch(
#     PORT,
#     #http_auth=(es_api_user, es_api_password)
#     retry_on_timeout=True,  # should timeout trigger a retry on different node?
# )

# wiki_ev.elastic_index = INDEX_NAME

# wiki_ev

In [None]:
# ### DOCUMENT IMPORT: SEGMENTED-TEXTS ###

# from multiprocessing import Pool
# # from utils import get_contents
# from tqdm import tqdm

# # p = Pool(8)
# # files = [f for f in kw_sample]
# #
# # count = 0
# # test = []
# # with tqdm(total=len(files)) as pbar:
# #     for documents in p.map(get_contents, files):
# #         for doc in documents:
# #             _id, title, text = doc
# #
# #             count += 1
# #             doc_ = {"id": _id, "title": title, "text": text}
# #
# #             #wiki_ev.add_doc(doc)
# #             test.append(doc)
# #     pbar.update()

In [None]:
# ### QUERY DB ###
# import elastic_db
# # from elastic_db import ElasticDB
# #
# # # Params
# # PORT = "http://localhost:9200"
# # INDEX = "wiki_evidence"
# # DOC = "evidence"
# #
# # # Init Elasticsearch DB
# # wiki_ev_ = ElasticDB(elastic_port=PORT, elastic_index=INDEX, elastic_doc=DOC)
# #
# # results = wiki_ev_.search("exploitation a wider public debate indecency adult")
# # results

In [None]:
### SQLITE LOAD ###
# from multiprocessing import Pool
# import utils
# from tqdm import tqdm
# import sqlite3

# import spacy
# import uuid

# nlp = spacy.load("en_core_web_sm")

# def paragraphs(document):
#     start = 0
#     document = nlp(document)
#     passages = []
#     for token in document:
#         if token.is_space and token.text.count("\n") > 1:
#             yield document[start:token.i]
#             start = token.i
#     yield document[start:]


# def get_contents(filename):
#     """Parse the contents of a file. Each line is a JSON encoded document."""
#     documents = []

#     with open(filename) as f:
#         for line in f:
#             doc = json.loads(line)

#             if doc["text"] == "": continue
#             if not doc: continue

#             passages = [str(i) for i in paragraphs(doc["text"])][0].split("\n")

#             for passage in passages:
#                 if len(passage) < 50:
#                     continue

#                 documents.append((str(uuid.uuid4()).replace('-',''), doc['id'], doc["title"], passage))

#     return documents

# save_path = "../data/wiki_evidence.db"
#
# p = Pool(8)
# files = [f for f in kw_sample]
#
# conn = sqlite3.connect(save_path)
# c = conn.cursor()
#
# documents = "documents"
# c.execute(f"CREATE TABLE documents (id PRIMARY KEY, id_, title, text);")
#
# count = 0
# step = 100
# batches = [files[i:i + step] for i in range(0, len(files), step)]
#
# for i, batch in enumerate(batches):
#     logger.info(f"[.... Batch #{i} .....]")
#     with tqdm(total=len(batch)) as pbar:
#         for document in tqdm(p.imap_unordered(get_contents, files)):
#             count += 1
#             for content in document:
#                 # _id, title, passage = content
#                 c.executemany("INSERT INTO documents VALUES (?,?,?,?)", (content,))
#
#         pbar.update()
#         logger.info(f"[Uploaded {count} documents]")
#
# conn.commit()
# conn.close()

In [None]:
# seq = [0, 1, 2, 3, 4, 5]
# window_size = 3
# step = 2

# # steps = 0, 2, 4 

# for i in range(0, len(seq) - window_size + 1, step):
#     print(i)
#     # print(i + window_size)
#     if i + window_size > len(seq):
#         # print(window_size)
#         window_size = i + window_size - len(seq)
    
#     print(seq[i: i + window_size])