In [1]:
# https://stackoverflow.com/questions/34621093/persist-elastic-search-data-in-docker-container
# TODOs: New KB upload with smaller text window
# TODOs: Common Crawl index upload
# TODOs: Good Logging
# TODOs: Better Keyphrase Extraction - Fast KeyBert

In [2]:
### NLTK IMPORT, OVERRIDING SSL CERTIFICATES ###
# Swaps the ssl module's default HTTPS context factory for the unverified one
# before invoking the NLTK downloader. The swap is module-wide, so it is not
# limited to the NLTK download that follows.
import nltk
import ssl

try:
    # Private attribute of the ssl module; may be missing on some interpreters.
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Nothing to patch: the default context factory is left untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Called without arguments.
# NOTE(review): the tokenizer used later presumably needs only one resource
# (e.g. "punkt") -- confirm and consider requesting just that by name.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [6]:
### CRAWL LOCAL FILE-SYSTEM ###
import os
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("KNOWLEDGE_BASE")

knowledge_path = "/Users/joshua.sheppard/wiki_extract_II"

def iter_filesys(path):
    """Lazily yield the file paths reachable from ``path``.

    A regular file yields itself. A directory is walked recursively with
    ``os.walk`` and every file whose name does not end in ``.DS_Store`` is
    yielded as ``os.path.join(directory, name)``.

    Raises:
        RuntimeError: If ``path`` is neither a file nor a directory. Because
            this is a generator, the error surfaces on first iteration.
    """
    if os.path.isfile(path):
        yield path
        return

    if not os.path.isdir(path):
        raise RuntimeError("Invalid path %s" % path)

    for root, _subdirs, names in os.walk(path):
        for name in names:
            if name.endswith('.DS_Store'):
                continue
            yield os.path.join(root, name)

# Materialise the lazy directory walk so the paths can be counted and reused.
kw_files = iter_filesys(knowledge_path)
kw_sample = list(kw_files)

# print(kw_sample[:-3])
print(len(kw_sample))

17039


In [17]:
from multiprocessing.pool import ThreadPool as Pool
from tqdm import tqdm
from yake import KeywordExtractor
from summa import keywords

import json
import more_itertools
import re
import spacy

nlp = spacy.load("en_core_web_sm")
files = [f for f in kw_sample]

def get_contents_(filename):
    """Parse a JSON-lines file into ``(id, title, text)`` tuples.

    Every non-blank line of ``filename`` is decoded as one JSON document.
    Blank lines, empty documents and documents whose ``"text"`` is the empty
    string are skipped.

    Args:
        filename: Path of the JSON-lines file to read.

    Returns:
        A list of ``(doc["id"], doc["title"], doc["text"])`` tuples in file
        order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a non-empty document lacks "id", "title" or "text".
    """
    documents = []

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # A blank or trailing empty line is not a document.
            if not line.strip():
                continue

            doc = json.loads(line)

            # Check emptiness before indexing, otherwise `{}` raises KeyError.
            if not doc:
                continue
            if doc["text"] == "":
                continue

            documents.append((doc['id'], doc["title"], doc["text"]))

    return documents

def generate_data():
    """Yield every parsed document from each path in the module-level ``files`` list.

    Files are visited in list order and each one is parsed with
    ``get_contents_`` before its documents are yielded one at a time.
    """
    for path in files:
        yield from get_contents_(path)

# Custom regex sentencizer (faster to compute)
def sentensizer(doc):
    """Split ``doc`` into sentences using a single regular expression.

    The text is cut at a whitespace character that directly follows ``.`` or
    ``?``, except when the preceding characters look like a dotted
    abbreviation (``e.g.``) or a capitalised two-letter title (``Mr.``).

    Returns:
        The list of pieces produced by ``re.split``.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    return re.split(boundary, doc)

def normalise_text(passage):
    """Return ``passage`` as a single line with double quotes replaced.

    The input is coerced with ``str()``, newline characters are removed,
    every ``"`` becomes ``'`` and surrounding whitespace is stripped.

    Python strings are immutable, so calls such as ``passage.strip()`` only
    take effect when their result is kept. Earlier revisions discarded the
    results of a regex clean-up, an ``encode`` call, a quote ``replace`` and
    ``strip``; only ``strip`` is applied here. The regex clean-up is
    deliberately left out because its ``[^0-9A-Za-z \\t]`` alternative would
    delete all punctuation from the passage.

    Args:
        passage: Any object; it is converted to text with ``str()``.

    Returns:
        The normalised string.
    """
    passage = str(passage)
    passage = passage.replace("\n", "")
    passage = passage.replace('"', "'")
    return passage.strip()

from nltk.tokenize import sent_tokenize
def sentence_window(article, window=3, step=2):
    """Yield sliding groups of consecutive sentences taken from ``article``.

    The article is segmented with ``sent_tokenize`` and then walked with
    ``more_itertools.windowed``. With the defaults (``window=3``, ``step=2``)
    neighbouring groups share one sentence.

    Args:
        article: Text to segment.
        window: Maximum number of sentences per group.
        step: Number of sentences to advance between groups.

    Yields:
        Tuples of sentence strings. A group may be shorter than ``window``
        when the article runs out of sentences; no padding is emitted.
    """
    # sents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', article)
    # Segment text into sentences
    sents = sent_tokenize(article)

    for chunk in more_itertools.windowed(sents, n=window, step=step):
        # windowed() fills short or trailing windows with None; drop the
        # filler so callers never see it (str(None) would end up in the text).
        sentences = tuple(sentence for sentence in chunk if sentence is not None)
        if sentences:
            yield sentences

# from keybert import KeyBERT
# from keyphrase_vectorizers import KeyphraseCountVectorizer
# kb = KeyBERT()

# def extract_keyphrase(doc, n_gram=3, n_kp=3, use_mmr="False", use_maxsum="False"):
#     kp = kb.extract_keywords(doc, vectorizer=KeyphraseCountVectorizer(), stop_words="english", diversity=0.2, )
#     if len(kp) == 0:
#         return []
#
#     else: return [i[0] for i in kp[0:n_kp]]

pool = Pool(8)
kw_extractor = KeywordExtractor(lan="en", n=3, top=5)
def passages(idx, source):
    """Generate one bulk-index action per sentence window of every document.

    Documents come from ``generate_data``; each is cut into sentence windows
    with ``sentence_window``, every sentence is cleaned by ``normalise_text``
    and the window is joined with single spaces. Passages shorter than 50
    characters are skipped.

    Args:
        idx: Value stored under ``"_index"`` in every action.
        source: Value stored under ``document.source``.

    Yields:
        dict with keys ``"_index"`` and ``"document"``; the latter holds
        ``id``, ``source``, ``title``, ``text`` and ``keyphrase`` (a list of
        non-empty keyword strings, possibly empty).
    """
    # Progress counts emitted passages, so no total is given: the number of
    # passages is not known up front.
    with tqdm(unit="passage") as pbar:
        # generate_data is a generator function, so pool.apply only creates
        # the generator object; it is consumed here, in the calling thread.
        for doc_id, title, article in pool.apply(generate_data):
            for window in sentence_window(article):
                passage = " ".join(normalise_text(sentence) for sentence in window)

                if len(passage) < 50:
                    continue

                # An empty keyword result splits to [''], so filter blanks out.
                keyphrases = [kw for kw in keywords.keywords(passage).split("\n") if kw]

                yield {
                    "_index": idx,
                    "document": {
                        "id": doc_id,
                        "source": source,
                        "title": title,
                        "text": passage,
                        #"keyphrase": [i[0] for i in kw_extractor.extract_keywords(passage)],
                        "keyphrase": keyphrases,
                        }
                    }

                pbar.update()

In [18]:
### TEST KNOWLEDGE LOADER ###
test = []
trial = 1000
count = 0

# Keep the first `trial` actions; stop as soon as one more has been produced.
# with tqdm(total=trial) as pbar:
for count, i in enumerate(passages(idx="testing", source="wikipedia"), start=1):
    if count > trial:
        break

    test.append(i)

  6%|▌         | 1000/17039 [00:01<00:21, 756.52it/s]


In [19]:
test

[{'_index': 'testing',
  'document': {'id': '61837831',
   'source': 'wikipedia',
   'title': 'Hedda Lundh',
   'text': 'Hedda Lundh (1921–2012) was a Danish journalist and schoolteacher who, under the German occupation of Denmark in World War II, was a Danish resistance fighter. Based at the time in Aarhus, she is remembered as a railway saboteur, explosives expert and courier in the resistance movement. Early life.',
   'keyphrase_summa': ['hedda', 'lundh', 'resistance']}},
 {'_index': 'testing',
  'document': {'id': '61837831',
   'source': 'wikipedia',
   'title': 'Hedda Lundh',
   'text': "Early life. Born on 29 September 1921 in Korsør, Hedda Lundh was the daughter of the newspaper editor Theodor Lundh-Jensen (1884–1952) and Alpha Tusnelda Emilie Winckler (1887–1973). The youngest of three sisters, she was brought up in a middle-class home where her father called her his 'boy' as she climbed trees, joined the scouts and cut her hair short.",
   'keyphrase_summa': ['life', 'early'

In [20]:
len(test)

1000

In [21]:
from elasticsearch import Elasticsearch

### DB CONFIG ###
PORT = "http://localhost:9200"
INDEX_WIKI = "wiki_knowledge"
SOURCE = "wikipedia"

errors_before_interrupt = 5
refresh_index_after_insert = False
max_insert_retries = 3
yield_ok = False

### INIT DB OBJECT ###
knowledge_base = Elasticsearch(
    PORT,
    retry_on_timeout=True
)

knowledge_base

<Elasticsearch(['http://localhost:9200'])>

In [None]:
### REFRESH DB ###
#knowledge_base.delete_by_query(index=INDEX_NAME, query={"match_all": {}})

In [22]:
from elasticsearch.helpers import streaming_bulk, parallel_bulk
from tqdm.auto import tqdm

### WRITE TO DB ###
errors_count = 0
chunk_size = 25000
counta = len(files)/chunk_size
successes = 0

# raise_on_error=False makes the helper report failed actions as
# (False, info) pairs instead of raising on the first failing chunk, which is
# what lets the errors_before_interrupt threshold below take effect.
# with tqdm(total=(counta)) as pbar:
for ok, result in parallel_bulk(
    knowledge_base,
    passages(idx=INDEX_WIKI, source=SOURCE),
    chunk_size=chunk_size,
    request_timeout=60*3,
    raise_on_error=False,
):
    if ok:
        successes += 1
        # pbar.update()
        continue

    logging.error('Failed to import data')
    logging.error(str(result))
    errors_count += 1

    if errors_count >= errors_before_interrupt:
        logging.fatal('Too many import errors, exiting with error code')
        # Raise SystemExit directly rather than calling the interactive
        # exit() helper, which is not meant for use inside programs.
        raise SystemExit(1)

  0%|          | 0/17039 [00:00<?, ?it/s]

INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:10.326s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:13.986s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:25.683s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.280s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.773s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.637s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:4.166s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.143s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.198s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.576s]
INFO:elastic_transport.transport:PUT http://localhost:920

In [32]:
### CONSTRUCT NEWS DATASET ###

### LOAD ###
from datasets import load_dataset
cc_news = load_dataset("cc_news")

def iter_data():
    """Yield the records of the ``"train"`` split of the loaded ``cc_news`` dataset, in order."""
    yield from cc_news["train"]

cc_news_data = iter_data()

### WRITE TO DISK ###
# One JSON object per line. The context manager guarantees the file is
# flushed and closed, even if serialisation fails part-way through.
with open("../data/cc_news.jsonl", "w") as fout:
    for ln in cc_news_data:
        record = {
            "title": ln["title"],
            "text": ln["text"],
            "domain": ln["domain"],
            "date": ln["date"],
            "url": ln["url"],
        }
        fout.write(json.dumps(record))
        fout.write("\n")

Reusing dataset cc_news (/Users/joshua.sheppard/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6)


  0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
test = []
trial = 10000
count = 0

# The item is stored before the limit check, so up to trial + 1 actions are kept.
for count, i in enumerate(passages(idx="testing", source="news"), start=1):
    test.append(i)

    if count > trial:
        break

In [40]:
test[-5:]

[{'_index': 'testing',
  'document': {'title': 'AUTO RACING: Auto Racing Glance',
   'text': 'Track: Daytona International Speedway (oval, 2.5 miles). Race distance: 400 miles, 160 laps. Last year: Ricky Stenhouse Jr. won after starting sixth. Last race: Kyle Busch took the checkered flag at Chicagoland for the fifth time in 2018. Fast facts: Busch and Kevin Harvick each have five wins in their first 17 starts of the season.'}},
 {'_index': 'testing',
  'document': {'title': 'AUTO RACING: Auto Racing Glance',
   'text': "Last year: Ricky Stenhouse Jr. won after starting sixth. Last race: Kyle Busch took the checkered flag at Chicagoland for the fifth time in 2018. Fast facts: Busch and Kevin Harvick each have five wins in their first 17 starts of the season. That's only happened one other time since 1977, as Denny Hamlin and Jimmie Johnson had five victories by this point in the season in 2010. ...The last six races at Daytona have been determined by less than a half-second, and a 2007

In [44]:
### LOAD USING DB OBJECT ###

### DB CONFIG ###
INDEX_CC = "common_crawl_news"
SOURCE = "common-crawl"

INFO:elastic_db:Connecting to http://localhost:9200
INFO:elastic_db:Connecting to <Elasticsearch(['http://localhost:9200'])>


In [43]:
#self, index_name, source, iterator, chunk_size

# NOTE(review): no cell visible in this notebook defines `bulk_add` or rebinds
# `knowledge_base`; presumably it was replaced by a project wrapper (the
# `elastic_db` log lines point that way) in a cell that is no longer present.
# Confirm before relying on Restart & Run All.
knowledge_base.bulk_add(
    index_name=INDEX_CC,
    source=SOURCE,
    # The generator *function* is passed, not a generator: it is not called here.
    iterator=passages,
    chunk_size=10000
)

INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:4.573s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.934s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:7.892s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:9.578s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:9.720s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.766s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.874s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:4.443s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:2.980s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_bulk [status:200 duration:3.460s]
INFO:elastic_transport.transport:PUT http://localhost:9200/_

UnboundLocalError: local variable 'errors_count' referenced before assignment

In [None]:
### TEST SEARCH ###
test_query = "government emails privacy"

def search(query_, db, index, k=5):
    """Run a full-text ``match`` query on ``document.text`` and return the hits.

    Args:
        query_: Free-text query string.
        db: Client object exposing ``search(index=..., query=..., size=...)``.
        index: Name of the index to search.
        k: Maximum number of hits to request.

    Returns:
        ``(hits, doc_ids)`` where ``hits`` is the list under
        ``results["hits"]["hits"]`` and ``doc_ids`` holds each hit's
        ``_source.document.id`` (``None`` when a document has no ``id``).
    """
    # Search the index the caller asked for, and pass the result size as its
    # own argument instead of nesting it inside the query clause.
    results = db.search(
        index=index,
        query={
            "match": {
                "document.text": query_,
            }
        },
        size=k,
    )

    hits = results["hits"]["hits"]
    # Not every indexed document carries an "id", so look it up defensively.
    doc_ids = [row['_source']["document"].get("id") for row in hits]

    print(results)
    return (hits, doc_ids)

test = search(test_query, knowledge_base, INDEX_CC, k=2)[0][0]["_source"]["document"]["text"]
test

In [7]:
#knowledge_path = "/Users/joshua.sheppard/"
import os
news_knowledge_path = "/Users/joshua.sheppard/PycharmProjects/cc_download_articles"

def iter_filesys(path):
    """Generate file paths for ``path``.

    A file path is yielded unchanged; a directory is traversed with
    ``os.walk`` and each contained file is yielded joined to its directory,
    leaving out names that end in ``.DS_Store``.

    Raises:
        RuntimeError: On first iteration, when ``path`` is neither a file nor
            a directory.
    """
    if os.path.isfile(path):
        yield path
    elif os.path.isdir(path):
        for folder, _, entries in os.walk(path):
            yield from (
                os.path.join(folder, entry)
                for entry in entries
                if not entry.endswith('.DS_Store')
            )
    else:
        raise RuntimeError("Invalid path %s" % path)

In [10]:
test = [i for i in iter_filesys(news_knowledge_path)]

In [11]:
len(test)

37332

In [43]:
import json

def get_contents_(filename):
    """Load one JSON article file and return its description/body pair.

    The whole file is decoded as a single JSON object.

    Returns:
        A one-element list containing ``(doc['description'], doc["maintext"])``.
    """
    with open(filename) as handle:
        article = json.load(handle)

    return [(article['description'], article["maintext"])]


def generate_data():
    """Yield the parsed contents of every path in the module-level ``test`` list, in order."""
    for path in test:
        yield from get_contents_(path)

In [45]:
news_ = [i for i in generate_data()]

In [50]:
news_[20000]

('U.S. stocks extended gains on Wednesday after President Donald Trump said there were no American casualties in the overnight Iranian missile strikes and that Tehran appeared to be standing down, easing concerns of an all-out conflict in the Middle East.',
 'Jan 8 (Reuters) - U.S. stocks extended gains on Wednesday after President Donald Trump said there were no American casualties in the overnight Iranian missile strikes and that Tehran appeared to be standing down, easing concerns of an all-out conflict in the Middle East.\nAt 11:32 a.m. ET, the Dow Jones Industrial Average was up 69.53 points, or 0.24%, at 28,653.21, the S&P 500 was up 8.41 points, or 0.26%, at 3,245.59. The Nasdaq Composite was up 23.52 points, or 0.26%, at 9,092.10. (Reporting by Sruthi Shankar in Bengaluru)')

In [42]:
with open(test[0], "r") as f:
    print(json.load(f))

{'authors': ['Kevin Hickey', 'Hours Ago'], 'date_download': '2020-01-03 19:32:59+00:00', 'date_modify': None, 'date_publish': '2020-01-03 17:23:39', 'description': "Indianapolis Colts GM Chris Ballard isn't going to force a selection at the quarterback position in the 2020 NFL Draft.", 'filename': 'https%3A%2F%2Fcoltswire.usatoday.com%2F2020%2F01%2F03%2Fnfl-draft-colts-quarterback-jacoby-brissett%2F.json', 'image_url': 'https://usatcoltswire.files.wordpress.com/2020/01/usatsi_13867257.jpg?w=1024&h=576&crop=1', 'language': 'en', 'localpath': None, 'maintext': 'There are still a few months before the Indianapolis Colts are on the clock in the first round of the 2020 NFL Draft but many are still wondering if they will be selecting a quarterback when the time comes.\nWhile general manager Chris Ballard didn’t exactly commit fully to Jacoby Brissett to be the unquestioned starter under center, he also made it clear that the Colts aren’t going to force a quarterback selection in the draft si

In [41]:
# from multiprocessing.pool import ThreadPool as Pool
# from tqdm import tqdm
# import more_itertools
# import re
#
# def clean(passage):
#     passage = str(passage)
#     passage.encode("unicode_escape")
#     passage.replace('"', '"')
#     passage.strip()
#     passage = re.sub("\n", "", passage)
#     passage = re.sub('"', "'", passage)
#
#     return passage
#
# def sentence_window(article, window=5, step=2):
#     """ Generates a list of sentences of sliding size = window """
#     sents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', article)
#
#     if len(sents) == window:
#         yield str(sents)
#
#     for window in more_itertools.windowed(sents, n=window, step=2):
#         yield window
#
# pool = Pool(8)
# def passages(idx, source):
#     count = 0
#     # with tqdm(total=len(files)) as pbar:
#     for i in pool.apply(iter_data):
#         count += 1
#         article = i["text"]
#
#         for window in sentence_window(article):
#             passage = " ".join(clean(passage) for passage in window)
#
#             #if len(passage) < 50: continue
#
#             yield {
#                 "_index": idx,
#                 "document": {
#                     "source": source,
#                     "title": i["title"],
#                     "text": passage
#                     }
#                 }
#
#         # pbar.update()
#

In [None]:
# import re
# from elasticsearch import helpers
# from elasticsearch.helpers import streaming_bulk, parallel_bulk
# from tqdm.auto import tqdm
# import spacy

# es = wiki_ev
# errors_count = 0

# # TODOs: Tune chunk size
# chunk_size = 25000
# counta = len(files)//chunk_size
# successes = 0

# # with tqdm(total=counta) as pbar:
# for ok, result in parallel_bulk(es, passages(idx=INDEX_NAME, source=SOURCE), chunk_size=chunk_size, request_timeout=60*3):

# # for ok, result in tqdm(streaming_bulk(es, index=INDEX_NAME, actions=passages(idx=INDEX_NAME, source=SOURCE), 
# #                         chunk_size=chunk_size, request_timeout=60*3, max_retries=3)):
#     if ok is not True:
#             logging.error('Failed to import data')
#             logging.error(str(result))
#             errors_count += 1

#             if errors_count == errors_before_interrupt:
#                 logging.fatal('Too many import errors, exiting with error code')
#                 exit(1)
    
#     successes += ok

In [None]:
# import time
# from elasticsearch import helpers
# from elasticsearch.helpers import streaming_bulk, parallel_bulk
# from tqdm import tqdm

# "https://stackoverflow.com/questions/67522617/elasticsearch-bulk-insert-w-python-socket-timeout-error#:~:text=The%20connection%20to%20elasticsearch%20has,be%20handled%20as%20an%20error."

# "https://github.com/elastic/elasticsearch-py/issues/297"

# def load_data(docs, idx_):
#     for doc in docs:
#         idx, title, text = doc
#         doc_ = {"id": idx, "title": title, "text": text}

#         yield {
#             "_index": idx_,
#             "document": {
#                 "id": idx,
#                 "title": title,
#                 "text": text
#             }
#         }

# # TODOs: Utils, Duration Function as Decorator
# es = wiki_ev
# errors_count = 0

# # TODOs: Increase Chunk Size, with extended Timeout + handling
# # TODOs: Experiment-Check with Yield OK
# for ok, result in parallel_bulk(es, load_data(wiki_data, "wiki_evidence"), chunk_size=500, request_timeout=60*3):
#     if ok is not True:
#             logging.error('Failed to import data')
#             logging.error(str(result))
#             errors_count += 1

#             if errors_count == errors_before_interrupt:
#                 logging.fatal('Too many import errors, exiting with error code')
#                 exit(1)

In [None]:

# import spacy

# nlp = spacy.load("en_core_web_sm")

# def sentence_window(article, window=3, step=2): 
#     """ Generates a list of sentences of sliding size = window """
    
#     sents = list(nlp(article).sents)
    
#     if len(sents) == window:
#         yield sents

#     for i in range(0, len(sents)):
#         yield(sents[i:i + window])

# def load_data(docs, idx_):
#     """ Generates an evidence document to be inserted into ES Index """
#     for doc in docs:
#         idx, title, text = doc

#         for paragraph in sentence_window(text):
#             yield {
#                 "_index": idx_,
#                 "document": {
#                     "id": idx,
#                     "title": title,
#                     "text": paragraph
#                 }
#             }


In [None]:
# import time
# from elasticsearch import helpers
# from elasticsearch.helpers import streaming_bulk
# from tqdm import tqdm
#
# "https://stackoverflow.com/questions/67522617/elasticsearch-bulk-insert-w-python-socket-timeout-error#:~:text=The%20connection%20to%20elasticsearch%20has,be%20handled%20as%20an%20error."
#
# def load_data(docs, idx_):
#     for doc in docs:
#         idx, title, text = doc
#         doc_ = {"id": idx, "title": title, "text": text}
#
#         yield {
#             "_index": idx_,
#             "document": doc
#         }
#
# # TODOs: Utils, Duration Function as Decorator
# es = wiki_ev
# #helpers.bulk(es, load_data(wiki_data, "wiki_evidence"), raise_on_error=False, chunk_size=500)
# errors_count = 0
# # TODOs: Increase Chunk Size, with extended Timeout + handling
# for ok, result in streaming_bulk(es, load_data(wiki_data, "wiki_evidence"), chunk_size=500, request_timeout=60*3, yield_ok=yield_ok, refresh=refresh_index_after_insert):
#     if ok is not True:
#             logging.error('Failed to import data')
#             logging.error(str(result))
#             errors_count += 1
#
#             if errors_count == errors_before_interrupt:
#                 logging.fatal('Too many import errors, exiting with error code')
#                 exit(1)

In [None]:
# for article in wiki_data:
#     _id, title, text = article
#     doc = {"id": _id, "title": title, "text": text}
#
#     wiki_ev.add_doc(doc)

In [None]:
# https://github.com/elastic/elasticsearch-py/issues/297

In [None]:
### DOCUMENT IMPORT: FULL-TEXTS ###
# from multiprocessing import Pool
# # from utils import get_contents
# import utils
# from tqdm import tqdm
# import json

# # TODOs: USE A GENERATOR OBJECT
# import spacy
# nlp = spacy.load("en_core_web_sm")
#
# p = Pool(8)
# files = [f for f in kw_sample]
#
# count = 0
# test = []
# with tqdm(total=len(files)) as pbar:
#     for documents in p.map(utils.get_contents_2, files):
#             for doc in documents:
#                 _id, title, text = doc
#
#             # count += 1
#             # doc_ = {"id": _id, "title": title, "text": text}
#
#             #wiki_ev.add_doc(doc)
#             test.append(doc_)
#     pbar.update()

In [None]:
# from elasticsearch import Elasticsearch

# # INIT OBJECT

# # TODOs: Persist a Generator Object
# PORT = "http://localhost:9200"
# INDEX_NAME = "wiki_evidence"
# errors_before_interrupt = 5
# refresh_index_after_insert = False
# max_insert_retries = 3
# yield_ok = False

# wiki_ev = Elasticsearch(
#     PORT,
#     #http_auth=(es_api_user, es_api_password)
#     retry_on_timeout=True,  # should timeout trigger a retry on different node?
# )

# wiki_ev.elastic_index = INDEX_NAME

# wiki_ev

<Elasticsearch(['http://localhost:9200'])>

In [None]:
# ### DOCUMENT IMPORT: SEGMENTED-TEXTS ###

# from multiprocessing import Pool
# # from utils import get_contents
# from tqdm import tqdm

# # p = Pool(8)
# # files = [f for f in kw_sample]
# #
# # count = 0
# # test = []
# # with tqdm(total=len(files)) as pbar:
# #     for documents in p.map(get_contents, files):
# #         for doc in documents:
# #             _id, title, text = doc
# #
# #             count += 1
# #             doc_ = {"id": _id, "title": title, "text": text}
# #
# #             #wiki_ev.add_doc(doc)
# #             test.append(doc)
# #     pbar.update()

In [None]:
# ### QUERY DB ###
# import elastic_db
# # from elastic_db import ElasticDB
# #
# # # Params
# # PORT = "http://localhost:9200"
# # INDEX = "wiki_evidence"
# # DOC = "evidence"
# #
# # # Init Elasticsearch DB
# # wiki_ev_ = ElasticDB(elastic_port=PORT, elastic_index=INDEX, elastic_doc=DOC)
# #
# # results = wiki_ev_.search("exploitation a wider public debate indecency adult")
# # results

In [None]:
### SQLITE LOAD ###
# from multiprocessing import Pool
# import utils
# from tqdm import tqdm
# import sqlite3

# import spacy
# import uuid

# nlp = spacy.load("en_core_web_sm")

# def paragraphs(document):
#     start = 0
#     document = nlp(document)
#     passages = []
#     for token in document:
#         if token.is_space and token.text.count("\n") > 1:
#             yield document[start:token.i]
#             start = token.i
#     yield document[start:]


# def get_contents(filename):
#     """Parse the contents of a file. Each line is a JSON encoded document."""
#     documents = []

#     with open(filename) as f:
#         for line in f:
#             doc = json.loads(line)

#             if doc["text"] == "": continue
#             if not doc: continue

#             passages = [str(i) for i in paragraphs(doc["text"])][0].split("\n")

#             for passage in passages:
#                 if len(passage) < 50:
#                     continue

#                 documents.append((str(uuid.uuid4()).replace('-',''), doc['id'], doc["title"], passage))

#     return documents

# save_path = "../data/wiki_evidence.db"
#
# p = Pool(8)
# files = [f for f in kw_sample]
#
# conn = sqlite3.connect(save_path)
# c = conn.cursor()
#
# documents = "documents"
# c.execute(f"CREATE TABLE documents (id PRIMARY KEY, id_, title, text);")
#
# count = 0
# step = 100
# batches = [files[i:i + step] for i in range(0, len(files), step)]
#
# for i, batch in enumerate(batches):
#     logger.info(f"[.... Batch #{i} .....]")
#     with tqdm(total=len(batch)) as pbar:
#         for document in tqdm(p.imap_unordered(get_contents, files)):
#             count += 1
#             for content in document:
#                 # _id, title, passage = content
#                 c.executemany("INSERT INTO documents VALUES (?,?,?,?)", (content,))
#
#         pbar.update()
#         logger.info(f"[Uploaded {count} documents]")
#
# conn.commit()
# conn.close()

In [None]:
# seq = [0, 1, 2, 3, 4, 5]
# window_size = 3
# step = 2

# # steps = 0, 2, 4 

# for i in range(0, len(seq) - window_size + 1, step):
#     print(i)
#     # print(i + window_size)
#     if i + window_size > len(seq):
#         # print(window_size)
#         window_size = i + window_size - len(seq)
    
#     print(seq[i: i + window_size])