## Dataset Preparation


In [87]:
import os
import json

def crawl_directory(folder_path):
    """Recursively collect the path of every sub-directory below ``folder_path``.

    The walk is top-down: the children of a directory are listed before the
    walk descends into them. Sibling directories are visited in sorted
    (lexicographic) order so the result is reproducible across runs and file
    systems. ``os.walk`` alone yields whatever order the OS reports, which
    made ``load_dataset(path_ls, limit)`` pick an arbitrary subset.

    Args:
        folder_path: Root directory to scan. The root itself is not included.

    Returns:
        list[str]: ``os.path.join(parent, name)`` for every directory found;
        empty when ``folder_path`` has no sub-directories or does not exist.
    """
    res = []
    for root, dirnames, _files in os.walk(folder_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirnames.sort()
        res.extend(os.path.join(root, name) for name in dirnames)
    return res

# Gather every directory under the raw knowledge-base export and preview a few.
path_ls = crawl_directory("../../raw_domo_kb/article/")
path_ls[:5]

['../../raw_domo_kb/article/000005048',
 '../../raw_domo_kb/article/000005059',
 '../../raw_domo_kb/article/000005073',
 '../../raw_domo_kb/article/000005080',
 '../../raw_domo_kb/article/000005090']

In [39]:
import os
import json
import datetime
from dateutil import parser
from bs4 import BeautifulSoup


def trim_newline(serie):
    """Flatten ``serie`` onto one line and collapse runs of spaces.

    Newline characters and literal two-character "backslash + n" sequences are
    each replaced with a space; afterwards every run of consecutive spaces is
    reduced to a single space. (The previous version applied the double-space
    replacement exactly twice, which left doubled spaces behind for runs of
    five or more.)

    Leading and trailing spaces are not stripped; the callers in this file
    call ``.strip()`` on the result themselves.

    Args:
        serie: The string to clean.

    Returns:
        str: The cleaned, single-line string.
    """
    serie = serie.replace('\n', ' ').replace('\\n', ' ')
    # Repeat until stable so runs of any length collapse to one space.
    while '  ' in serie:
        serie = serie.replace('  ', ' ')
    return serie

# def trim_newline(string):
#     return string.replace("\n\n\n", '')

def extract_text_from(html):
    """Return the text of the page's ``<article class="content">`` element.

    The HTML is parsed with BeautifulSoup's ``html.parser``. The element's
    text is split into lines, each line is stripped of surrounding
    whitespace, blank lines are dropped, and the remaining lines are joined
    with newline characters.

    Args:
        html: HTML document as a string.

    Returns:
        str: Newline-separated, whitespace-trimmed text of the article element.

    Raises:
        AttributeError: If no matching ``article`` element exists, because
            ``find`` then returns ``None`` and ``get_text`` is called on it.
    """
    document = BeautifulSoup(html, features="html.parser")
    content_node = document.find('article', class_="content")
    stripped = [raw_line.strip() for raw_line in content_node.get_text().splitlines()]
    return '\n'.join(filter(None, stripped))

def load_dataset(path_ls, limit=None):
    """Build one record per article directory from its metadata and HTML.

    For every directory in ``path_ls`` (optionally only the first ``limit``),
    ``process.json`` supplies the title, article number and creation date,
    and ``index.html`` supplies the body text. Directories lacking either
    file are skipped, so the result may hold fewer than ``limit`` records.

    Args:
        path_ls: Article directory paths, e.g. from ``crawl_directory``.
        limit: Maximum number of leading paths to examine. ``None`` (or any
            other falsy value such as ``0``) means no limit.

    Returns:
        list[dict]: Records with keys ``title``, ``id``, ``url``,
        ``lastModifiedDate`` (parsed by ``dateutil``) and ``article`` (plain
        text produced by ``extract_text_from``).
    """
    dataset = []

    path_ls = path_ls[:limit] if limit else path_ls
    for file_path in path_ls:
        process_path = os.path.join(file_path, 'process.json')
        html_path = os.path.join(file_path, 'index.html')
        # Previously only process.json was checked, so a directory holding
        # metadata but no HTML aborted the whole load with FileNotFoundError.
        if not (os.path.exists(process_path) and os.path.exists(html_path)):
            continue

        with open(process_path, encoding="utf-8") as f:
            content = json.load(f)['content']

        with open(html_path, encoding="utf-8") as f:
            article_html = f.read()

        # Computed once; it is both the record id and the URL suffix.
        article_number = trim_newline(content['article_number']).strip()
        dataset.append({
            'title': trim_newline(content['title']).strip(),
            'id': article_number,
            'url': f"https://domo-support.domo.com/s/article/{article_number}",
            # NOTE(review): filled from the article's *created* date although
            # the key says "last modified" -- confirm which one is intended.
            'lastModifiedDate': parser.parse(content['article_created_date'].strip()),
            'article': extract_text_from(article_html),
        })
    return dataset

# Smoke-test the loader on the first ten crawled directories.
kb_ds = load_dataset(path_ls, limit=10)
print(len(kb_ds))
kb_ds[:5]


10


[{'title': 'Flex Map v2 User Guide',
  'id': '000005048',
  'url': 'https://domo-support.domo.com/s/article/000005048',
  'lastModifiedDate': datetime.datetime(2022, 11, 4, 23, 29),
  'article': "Information\nTitle\nFlex Map v2 User Guide\nArticle Body\nIntro\nFlex Map v2 is a premium app available in the Appstore. Its main purpose is to display your data in different ways on a map to gain geographical and relational insights from the data. The app can map out various sets of locations and related data for those locations, create and display geographical territories to show the boundaries inside which mapped locations lie, and display heat maps to correlate locations with important metrics or demographics.\nBefore you begin using the app, make sure you have completed all the required configurations. Learn about these in the\nFlex Map v2 Implementation Guide\n.\nImportant:\nFlex Map v2 is not the same as Flex Map v1. Learn about how to use that app in the\nFlex Map v1 User Guide\n.\nThi

In [89]:
import tiktoken
from langchain.text_splitter import MarkdownTextSplitter
from langchain.text_splitter import CharacterTextSplitter

# Maximum chunk length handed to both splitters below.
# NOTE(review): despite the name, no tokenizer is passed to either splitter, so
# langchain presumably measures this with its default length function
# (characters, not tokens) -- confirm, or build the splitters with
# `from_tiktoken_encoder` if a token budget is intended.
TOKEN_CHUNK_SIZE = 512

# Markdown-aware splitter; not referenced by the other cells shown here, which
# chunk with `text_splitter`.
markdown_splitter = MarkdownTextSplitter(
    chunk_size=TOKEN_CHUNK_SIZE,
    chunk_overlap=20,  # overlap between consecutive chunks, measured in the same unit as chunk_size
)

# Plain splitter that breaks the text on single newlines.
text_splitter = CharacterTextSplitter(chunk_size=TOKEN_CHUNK_SIZE, separator="\n")

def article_chunker(article, splitter):
    """Split one article record into chunk documents that carry its metadata.

    Args:
        article: Record with ``id``, ``title``, ``url``, ``article`` (the text
            to split) and ``lastModifiedDate`` (must support ``strftime``).
        splitter: Object exposing ``create_documents(texts, metadatas=...)``.

    Returns:
        The documents produced by ``splitter``. Each document's ``metadata``
        holds ``article_id``, ``title``, ``url``, ``lastModifiedDate``
        formatted as ``YYYY-MM-DD``, ``full_text`` (the whole article text)
        and ``id``, built as ``"<stripped article_id>-<chunk position>"``.
    """
    texts = [article['article']]
    shared_metadata = {
        'article_id': article['id'],
        'title': article['title'],
        'url': article['url'],
        'lastModifiedDate': article['lastModifiedDate'].strftime('%Y-%m-%d'),
        'full_text': article['article'],
    }
    doc_chunk = splitter.create_documents(texts, metadatas=[shared_metadata])

    position = 0
    for piece in doc_chunk:
        # The id is derived from the metadata the splitter attached to this chunk.
        piece.metadata['id'] = f"{piece.metadata['article_id'].strip()}-{position}"
        position += 1

    return doc_chunk

# Chunk the first article with the newline splitter and preview the leading chunks.
data = article_chunker(kb_ds[0], text_splitter)
data[:5]

[Document(page_content='Information\nTitle\nFlex Map v2 User Guide\nArticle Body\nIntro', metadata={'article_id': '000005048', 'title': 'Flex Map v2 User Guide', 'url': 'https://domo-support.domo.com/s/article/000005048', 'lastModifiedDate': '2022-11-04', 'full_text': "Information\nTitle\nFlex Map v2 User Guide\nArticle Body\nIntro\nFlex Map v2 is a premium app available in the Appstore. Its main purpose is to display your data in different ways on a map to gain geographical and relational insights from the data. The app can map out various sets of locations and related data for those locations, create and display geographical territories to show the boundaries inside which mapped locations lie, and display heat maps to correlate locations with important metrics or demographics.\nBefore you begin using the app, make sure you have completed all the required configurations. Learn about these in the\nFlex Map v2 Implementation Guide\n.\nImportant:\nFlex Map v2 is not the same as Flex Map 

In [27]:
# %pip install keybert
# %pip install keyphrase_vectorizers 

In [90]:
from transformers.pipelines import pipeline
from keybert import KeyBERT

# Checkpoint whose features back KeyBERT.
# 'all-MiniLM-L6-v2' used to be assigned here first and was overwritten on the
# very next line, so it never took effect; it is kept only as a noted alternative.
KEYWORD_MODEL = 'yanekyuk/bert-uncased-keyword-extractor'

# NOTE(review): the load log recorded for this cell reports unused classifier
# weights and a newly initialised pooler for this checkpoint -- confirm it is
# suitable for feature extraction.
hf_model = pipeline("feature-extraction", model=KEYWORD_MODEL)
kw_model = KeyBERT(model=hf_model)

Some weights of the model checkpoint at yanekyuk/bert-uncased-keyword-extractor were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at yanekyuk/bert-uncased-keyword-extractor and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
from keyphrase_vectorizers import KeyphraseCountVectorizer

def get_keywords(text, kw_model, keyphrase_ngram_range=(1, 3), stopwords='english', diversity=.3):
    """Extract key phrases from ``text`` with KeyBERT and a keyphrase vectorizer.

    Candidate phrases come from ``KeyphraseCountVectorizer`` and are ranked by
    ``kw_model`` with Maximal Marginal Relevance enabled.

    Args:
        text: Document to extract key phrases from.
        kw_model: Object exposing KeyBERT's ``extract_keywords`` method.
        keyphrase_ngram_range: Currently unused (the n-gram based extraction
            path is disabled); kept so existing callers keep working.
        stopwords: Stop-word setting. It is now also handed to the
            vectorizer, because KeyBERT ignores its own ``stop_words``
            argument once a custom vectorizer is supplied, which made this
            parameter a no-op. NOTE(review): older keyphrase_vectorizers
            releases may accept only a language name here -- confirm before
            passing a list or ``None``.
        diversity: MMR diversity forwarded to KeyBERT.

    Returns:
        list: The first element (the phrase) of every entry KeyBERT returns.
    """
    vectorizer = KeyphraseCountVectorizer(stop_words=stopwords)
    scored_keywords = kw_model.extract_keywords(text, vectorizer=vectorizer, stop_words=stopwords,
                                                use_mmr=True, diversity=diversity)

    return [kw[0] for kw in scored_keywords]

# Demo: every chunk carries the whole article text in `full_text`, so this
# extracts article-level keywords regardless of which chunk index is used.
get_keywords(data[5].metadata['full_text'], kw_model)

['map layers display',
 'save pin layers search',
 'settings grid displays',
 'tooltipinclude',
 'flex map v2 user guide']

In [110]:
# Use fake embeddings to test your pipeline
# Model name sent to the OpenAI embeddings endpoint when real embeddings are requested.
OPENAI_EMBEDDING_MODEL = 'text-embedding-ada-002'
# Length of the fake vectors; presumably chosen to match the model's output size -- confirm.
OPENAI_EMBEDDING_MODEL_OUTPUT_SIZE = 1536

from langchain.embeddings import FakeEmbeddings

def get_vector_embedding(text, model=OPENAI_EMBEDDING_MODEL, is_embeddings = False):
    """Embed ``text`` as a vector, either faked or through the OpenAI API.

    Newlines in ``text`` are replaced with spaces before embedding.

    Args:
        text: The string to embed.
        model: Embedding model name; only used when ``is_embeddings`` is true.
        is_embeddings: When false (the default) a langchain ``FakeEmbeddings``
            vector of length ``OPENAI_EMBEDDING_MODEL_OUTPUT_SIZE`` is
            returned, so the pipeline can be exercised without calling the
            API. When true, ``openai.Embedding.create`` is called.

    Returns:
        The embedding for ``text``.
    """
    text = text.replace("\n", " ")

    if not is_embeddings:
        embeddings = FakeEmbeddings(size=OPENAI_EMBEDDING_MODEL_OUTPUT_SIZE)
        return embeddings.embed_query(text)

    # `openai` was used here although no cell shown imports it, so enabling
    # real embeddings raised NameError. Importing lazily keeps the fake path
    # usable without the package installed.
    import openai

    return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

# Demo of the default (fake) path: the numbers shown for this cell come from
# FakeEmbeddings, not from the OpenAI model.
get_vector_embedding(data[5].page_content)

# df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))   

[1.1098957641594824,
 0.1591383061659513,
 0.08927482605124819,
 0.17398028594375525,
 1.9876320533340222,
 1.1078716849926777,
 -0.6797067140347816,
 0.5691935202043082,
 -0.4761759010566716,
 -0.5953237130516185,
 -0.29904707274772,
 -1.022269077130572,
 -0.8598431148601235,
 1.361710496292801,
 0.4539762168925008,
 2.412660130482365,
 0.7798989340682979,
 -0.6124335802556845,
 -0.19765737474042488,
 0.7865092846788679,
 -1.0239879744858904,
 -2.0147424532903395,
 0.47511189114303093,
 0.22522713479236808,
 0.25497372629303644,
 -0.12557546108860204,
 2.5710057502491503,
 -0.10406728900000464,
 1.024814686166941,
 0.2872299995410132,
 0.5927960664446876,
 1.7247902292674475,
 -1.116581552065743,
 -1.5180031133436795,
 0.6325035160453463,
 -0.7114028067216888,
 1.4761505121607927,
 1.660275040851517,
 -0.6286990665463503,
 1.0136385076274295,
 0.22837788677430762,
 0.8117249677326162,
 -0.4161976284479761,
 1.3772717975551076,
 1.5687969034750278,
 1.137386867484335,
 0.43310526225611

# Dump batch

In [123]:
import json

# NOTE(review): repeats the TOKEN_CHUNK_SIZE assignment from the chunking cell
# with the same value; nothing in this cell reads it.
TOKEN_CHUNK_SIZE = 512
# Per-article keywords, written by handle_keywords as a single JSON array
# (one line), not one object per line as the extension might suggest.
ARTICLE_KEYWORD_JSON = 'keyword.jsonl'

# Per-chunk vectors, written by handle_embeddings in the same single-array layout.
EMBED_JSON = 'embed.jsonl'

# Chunk documents (page content + metadata), written by handle_base.
BASE_JSON = 'train.jsonl'


def handle_keywords(kb_ds):
    """Extract keywords for every article and save them to ``ARTICLE_KEYWORD_JSON``.

    The file is overwritten with one line: a JSON array of objects holding
    the article ``id`` and its ``keywords`` list.

    Args:
        kb_ds: Article records providing ``id`` and ``article`` (the text).
    """
    article_keywords = []
    for article in kb_ds:
        article_keywords.append({
            'id': article['id'],
            'keywords': get_keywords(article['article'], kw_model),
        })

    # Keywords are computed before the file is opened, so a failure above
    # leaves any existing dump untouched.
    with open(ARTICLE_KEYWORD_JSON, 'w') as f:
        serialized = json.dumps(article_keywords)
        f.write(serialized + '\n')


def handle_embeddings(chunked_articles, is_embeddings=True):
    """Embed every chunk and save the vectors to ``EMBED_JSON``.

    The file is overwritten with one line: a JSON array of objects holding
    the chunk ``id`` and its ``vector``.

    Args:
        chunked_articles: Iterable of per-article chunk lists; each chunk has
            ``page_content`` and ``metadata['id']``.
        is_embeddings: Forwarded to ``get_vector_embedding``. Note that the
            default here is ``True`` while ``main`` defaults to ``False``.
    """
    vector_records = []
    for chunks in chunked_articles:
        for chunk in chunks:
            chunk_id = chunk.metadata['id']
            vector = get_vector_embedding(chunk.page_content, is_embeddings=is_embeddings)
            vector_records.append({'id': chunk_id, 'vector': vector})

    with open(EMBED_JSON, 'w') as f:
        serialized = json.dumps(vector_records)
        f.write(serialized + '\n')


def handle_base(chunked_articles):
    """Save every chunk's attribute dictionary to ``BASE_JSON``.

    The file is overwritten with one line: a JSON array containing the
    ``__dict__`` of each chunk, flattened across all articles in order.

    Args:
        chunked_articles: Iterable of per-article chunk lists.
    """
    with open(BASE_JSON, 'w') as f:
        flat_chunks = []
        for chunks in chunked_articles:
            flat_chunks.extend(chunk.__dict__ for chunk in chunks)
        f.write(json.dumps(flat_chunks) + '\n')


def read_docs(doc_name):
    """Read the file at ``doc_name`` and return its contents parsed as JSON.

    Args:
        doc_name: Path of a file holding a single JSON document.

    Returns:
        The decoded Python object.
    """
    with open(doc_name, 'r') as handle:
        raw = handle.read()
    return json.loads(raw)



def main(INPUT_FOLDER="../../raw_domo_kb/article/",
         PROCESSING_LIMIT=10,
         is_preprocess=False,
         is_embeddings=False):
    """Optionally rebuild the dump files, then pair each chunk with its article's keywords.

    Args:
        INPUT_FOLDER: Root folder of the raw articles; used only when
            ``is_preprocess`` is true.
        PROCESSING_LIMIT: Passed to ``load_dataset`` as its limit.
        is_preprocess: When true, crawl, load and chunk the articles and
            rewrite the keyword, base and embedding dump files first. When
            false, the files left by an earlier run are read as they are.
        is_embeddings: Forwarded to ``handle_embeddings``.

    Returns:
        list: For each chunk record in ``BASE_JSON``, in file order, the
        keyword list of the article the chunk belongs to.

    Raises:
        KeyError: If a chunk's ``article_id`` has no entry in the keyword
            file (the earlier linear scan surfaced this as a bare
            ``StopIteration``).
    """
    if is_preprocess:
        path_ls = crawl_directory(INPUT_FOLDER)
        kb_ds = load_dataset(path_ls, PROCESSING_LIMIT)

        handle_keywords(kb_ds)

        chunked_articles = [article_chunker(
            article, text_splitter) for article in kb_ds]

        handle_base(chunked_articles)

        handle_embeddings(chunked_articles, is_embeddings)

    keywords, embeds, records = [read_docs(doc_name) for doc_name in [ARTICLE_KEYWORD_JSON, EMBED_JSON, BASE_JSON]]

    # Not returned yet (see the commented-out return below); `embeds` is
    # likewise loaded but unused for now.
    ids = [x['metadata']['id'] for x in records]
    contexts = [x['page_content'] for x in records]

    # Index the keyword entries once instead of rescanning the whole list for
    # every chunk. setdefault keeps the first entry per id, matching what the
    # previous first-match scan returned when an id appeared more than once.
    keyword_entry_by_article = {}
    for entry in keywords:
        keyword_entry_by_article.setdefault(entry['id'], entry)

    keyword_vecs = [keyword_entry_by_article[x['metadata']['article_id']]['keywords'] for x in records]

    print('done')
    # return ids, contexts,
    return keyword_vecs


# Run with the defaults: is_preprocess=False, so the three dump files written
# by an earlier preprocessing run are read rather than regenerated.
main()


done


[['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude',
  'flex map v2 user guide'],
 ['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude',
  'flex map v2 user guide'],
 ['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude',
  'flex map v2 user guide'],
 ['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude',
  'flex map v2 user guide'],
 ['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude',
  'flex map v2 user guide'],
 ['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude',
  'flex map v2 user guide'],
 ['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude',
  'flex map v2 user guide'],
 ['map layers display',
  'save pin layers search',
  'settings grid displays',
  'tooltipinclude

In [61]:
import pinecone


def builder(records: list):
    """Turn chunk records into upsert dictionaries with dense and sparse parts.

    NOTE(review): this cell looks unfinished. It relies on ``tokenizer``,
    ``torch``, ``sparse_model`` and ``device``, none of which is defined or
    imported in the cells shown here; they must already exist in the kernel.

    Args:
        records: Chunk records, each with ``page_content`` and a ``metadata``
            dictionary containing ``id``.

    Returns:
        list[dict]: One dictionary per record with the keys ``id``,
        ``values``, ``sparse_values`` (``indices`` and ``values``) and
        ``metadata`` (``{'context': <page content>}``).
    """
    ids = [x['metadata']['id'] for x in records]

    # Fixed: the key was misspelled 'page_context', which raised KeyError; the
    # rest of the notebook reads 'page_content'.
    contexts = [x['page_content'] for x in records]
    # create dense vecs
    # Fixed: this list was bound to `keyword_vecs` while the loop below read
    # the undefined name `dense_vecs`.
    # NOTE(review): these are keyword strings, not numeric vectors -- confirm
    # whether `values` should instead come from get_vector_embedding.
    dense_vecs = [get_keywords(context, kw_model) for context in contexts]
    # create sparse vecs
    input_ids = tokenizer(
        contexts, return_tensors='pt',
        padding=True, truncation=True
    )
    with torch.no_grad():
        # NOTE(review): squeeze() presumably also drops the batch dimension
        # when only one record is passed, which would make the zip below
        # iterate over scalars -- verify with a single-record call.
        sparse_vecs = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    # convert to upsert format
    upserts = []
    for _id, dense_vec, sparse_vec, context in zip(ids, dense_vecs, sparse_vecs, contexts):
        # extract columns where there are non-zero weights
        indices = sparse_vec.nonzero().squeeze().cpu().tolist()  # positions
        values = sparse_vec[indices].cpu().tolist()  # weights/scores
        # build sparse values dictionary
        sparse_values = {
            "indices": indices,
            "values": values
        }
        # build metadata struct
        metadata = {'context': context}
        # append all to upserts list as pinecone.Vector (or GRPCVector)
        upserts.append({
            'id': _id,
            'values': dense_vec,
            'sparse_values': sparse_values,
            'metadata': metadata
        })
    return upserts

# NOTE(review): `documents` is not defined in the cells shown here, and the
# recorded run of this cell ended in KeyError: 'id'.
builder(documents)

KeyError: 'id'