In [11]:
#!pip install openai
#!pip install tiktoken
#!pip install langchain


# Build Index

In [12]:
# Value passed as `chunk_size` to the article text splitter.
TOKEN_CHUNK_SIZE = 4000
# Used as the end of a slice over the loaded articles, so it caps how many
# articles are indexed; None (Python's None, not "NONE") keeps all of them.
TEST_ARTICLE_LIMIT = 20 # or None

In [13]:
import json

def open_file(file_path):
    """Return the full content of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content as a single string.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist. This is a subclass
            of ``Exception``, so callers catching ``Exception`` keep working.
    """
    # Open directly instead of probing with os.path.exists first: this cell
    # does not import `os`, and a direct open also avoids the race between
    # the existence check and the open call.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"{file_path} - does not exist") from exc

def write_file(file_path, content, method = 'w'):
    """Serialize ``content`` as indented JSON and write it to ``file_path``.

    Args:
        file_path: Destination path.
        content: Any JSON-serializable object.
        method: Mode passed to ``open`` (default ``'w'``). Note that appending
            (``'a'``) to a file that already holds a JSON document produces a
            file that is no longer a single valid JSON document.

    Raises:
        TypeError: If ``content`` is not JSON-serializable.
    """
    # Serialize before opening: if `content` cannot be serialized the error is
    # raised while the destination is still untouched, instead of leaving a
    # truncated or half-written file behind.
    payload = json.dumps(content, indent=2)
    with open(file_path, method, encoding='utf-8') as f:
        f.write(payload)

def write_text(file_path, content, method = 'w'):
    """Write the string ``content`` to ``file_path`` using UTF-8 encoding.

    ``method`` is the mode handed to ``open``; the default ``'w'`` replaces
    any existing content.
    """
    handle = open(file_path, method, encoding='utf-8')
    with handle:
        handle.write(content)

In [14]:
import os


def crawl_directory(directory_path):
    """Recursively collect the paths of every file below ``directory_path``.

    Args:
        directory_path: Root directory to walk.

    Returns:
        A list of file paths (each joined with the directory it was found
        in). Directories are visited bottom-up, so files in nested
        directories come before files in their parents.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist. This is a
            subclass of ``Exception``, so existing handlers keep working.
    """
    if not os.path.exists(directory_path):
        raise FileNotFoundError(f"{directory_path} does not exist")

    file_ls = []
    # topdown=False is kept so the returned order is unchanged.
    for root, _dirs, files in os.walk(directory_path, topdown=False):
        file_ls.extend(os.path.join(root, name) for name in files)

    return file_ls

In [26]:
import re

# [text](url)
INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
# [text][1]
FOOTNOTE_LINK_TEXT_RE = re.compile(r'\[([^\]]+)\]\[(\d+)\]')
# [1]: url
FOOTNOTE_LINK_URL_RE = re.compile(r'\[(\d+)\]:\s+(\S+)')

def process_article(text):
    """Strip the leading front-matter block and inline links from an article.

    The text is split on the first two ``---`` markers and everything after
    the second one is kept. Inline markdown links (``[text](url)``) are then
    removed entirely, link text included.

    Args:
        text: Raw article content.

    Returns:
        The cleaned article body. An empty string is returned for empty or
        ``None`` input, and the whole text (links removed) is used when it
        does not contain two ``---`` markers. Previously both cases printed
        an ``IndexError``/``AttributeError`` and returned ``None``, which
        callers that split the result could not handle.
    """
    if not text:
        return ''

    parts = text.split('---', 2)
    # Fewer than three parts means there is no complete front-matter block.
    body = parts[2] if len(parts) == 3 else text

    return INLINE_LINK_RE.sub('', body)


# process_article(all_text[2].get('content'))


In [15]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

# NOTE(review): the return value of this call is discarded; only the
# `tokenizer` assigned below is referenced afterwards.
tiktoken.encoding_for_model('gpt-3.5-turbo')

# recommended for use with text-embedding-ada-002
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return how many tokens ``text`` encodes to with the module-level ``tokenizer``.

    ``disallowed_special=()`` is forwarded to ``tokenizer.encode``.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


# Splitter for article text. Lengths are measured with tiktoken_len, so
# chunk_size and chunk_overlap are token counts rather than character counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=TOKEN_CHUNK_SIZE,
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    # presumably tried in order, from paragraph breaks down to single
    # characters -- confirm against the langchain documentation
    separators=['\n\n', '\n', ' ', '']
)

# text_splitter.split_text(text)


In [25]:
import os
import openai

# Read the API key from the environment rather than hard-coding it; an unset
# OPENAPI_SECRET_KEY variable raises KeyError when this cell runs.
openai.api_key = os.environ['OPENAPI_SECRET_KEY']


def get_gpt3_embedding(content, engine="text-embedding-ada-002"):
    """Request an embedding for ``content`` and return it.

    Newlines in ``content`` are replaced by spaces before the request is
    sent through ``openai.Embedding.create`` with ``engine`` as the model.
    The value returned is the ``embedding`` field of the first entry in the
    response's ``data`` list.
    """
    print('accessing chatgpt')
    single_line = content.replace("\n", " ")
    reply = openai.Embedding.create(input=[single_line], model=engine)
    return reply['data'][0]['embedding']


## MAIN

In [28]:
# retrieve all index.qmd files in the article folder and embed their chunks
def build_index(article_dir='../../../raw_kb/article', index_path='index.json'):
    """Chunk every ``index.qmd`` article, embed each chunk and save the index.

    Args:
        article_dir: Directory that is crawled recursively for articles.
        index_path: JSON file the resulting list is written to.

    Each written entry holds ``file_path``, ``content`` (the chunk text),
    ``chunk_id`` (position of the chunk within its article) and ``vector``
    (the embedding). At most ``TEST_ARTICLE_LIMIT`` articles are processed;
    a limit of ``None`` processes all of them.
    """
    article_paths = [file_path for file_path in crawl_directory(article_dir)
                     if file_path.endswith('index.qmd')]

    # subset for testing or set TEST_ARTICLE_LIMIT to None.
    # The limit is applied to the paths so that articles beyond it are never
    # read from disk.
    article_paths = article_paths[:TEST_ARTICLE_LIMIT]

    all_text_chunks = list()
    for file_path in article_paths:
        content = open_file(file_path)
        # Skip empty articles *before* processing them; the previous
        # comprehension applied this filter only after the text had already
        # been processed and split.
        if not content:
            continue

        processed = process_article(content)
        if not processed:
            continue

        for index, chunk in enumerate(text_splitter.split_text(processed)):
            all_text_chunks.append({'file_path': file_path,
                                    'content': chunk,
                                    'chunk_id': index})

    result = list()

    for chunk in all_text_chunks:
        # Non-ASCII characters are dropped before the text is embedded.
        ascii_content = chunk['content'].encode(
            encoding='ASCII', errors='ignore').decode()

        chunk.update({'vector': get_gpt3_embedding(ascii_content)})
        # Progress line only: printing the chunk itself would dump the whole
        # embedding vector for every chunk.
        print(f"embedded {chunk['file_path']} chunk {chunk['chunk_id']}")

        result.append(chunk)

    write_file(index_path, result)

# build_index()


# Calculate Question Embedding

In [29]:
import numpy as np

def similarity(v1, v2):
    """Compute the dot product of ``v1`` and ``v2`` with ``np.dot``."""
    dot_product = np.dot(v1, v2)
    return dot_product
    
def search_index(text, data, count=5, cache_path='question_vectors.json'):
    """
    calculates the embeddings vector for the `text` question then retrieves the top N matches
    will not calculate embedding for the same question twice (retrieves from history)

    Args:
        text: The question to search for.
        data: Iterable of index entries (dicts) that each carry a 'vector'.
        count: Number of best-scoring entries to return.
        cache_path: JSON file holding previously embedded questions as a list
            of {'question', 'vector'} dicts.

    Returns:
        Up to `count` entries of `data`, each copied with an added 'score'
        (dot product with the question vector), highest score first.
    """
    ## to do extend this to include other sources

    # A missing history file (e.g. on the first run) means nothing is cached yet.
    if os.path.exists(cache_path):
        search_vectors = json.loads(open_file(file_path=cache_path))
    else:
        search_vectors = list()

    match_text = next((search_vector for search_vector in search_vectors
                       if search_vector.get('question') == text), None)

    if match_text:
        vector = match_text.get('vector')

    else:
        vector = get_gpt3_embedding(text)
        search_vectors.append({'question': text, 'vector': vector})
        # Rewrite the whole file. Appending ('a') would place a second JSON
        # array after the existing one and make the file unparseable on the
        # next call.
        write_file(cache_path, content=search_vectors, method='w')

    scores = [{**entry, 'score': similarity(vector, entry.get('vector'))}
              for entry in data]
    ordered = sorted(scores, key=lambda d: d.get('score'), reverse=True)

    return ordered[:count]


In [39]:
import datetime as dt


def gpt3_completion(prompt,
                    engine='text-davinci-003',
                    temp=0.6, top_p=1.0, tokens=2000, freq_pen=0.25, pres_pen=0.0,
                    stop=['<<END>>']):
    """Send ``prompt`` to the completion endpoint and return the answer text.

    Non-ASCII characters are dropped from the prompt, and every run of
    whitespace in the answer is collapsed to a single space. Prompt and
    answer are also written to a timestamped file below ``gpt3_logs/``.

    Args:
        prompt: Prompt text.
        engine: Engine name passed to ``openai.Completion.create``.
        temp, top_p, tokens, freq_pen, pres_pen, stop: Forwarded as
            ``temperature``, ``top_p``, ``max_tokens``, ``frequency_penalty``,
            ``presence_penalty`` and ``stop``. The ``stop`` default list is
            never mutated here.

    Returns:
        The cleaned completion text, or the string ``"GPT3 error <exception>"``
        when the request still fails after the retry.
    """
    # Function-scope import: `time` is needed for the retry delay and is not
    # imported by the visible notebook cells.
    import time

    max_retry = 1
    prompt = prompt.encode(encoding='ASCII', errors='ignore').decode()

    text = None
    # One initial attempt plus `max_retry` retries. The previous loop returned
    # the error on the very first failure and therefore never retried.
    for attempt in range(max_retry + 1):
        try:
            print('accessing chat gpt')
            response = openai.Completion.create(
                engine=engine,
                prompt=prompt,
                temperature=temp,
                max_tokens=tokens,
                top_p=top_p,
                frequency_penalty=freq_pen,
                presence_penalty=pres_pen,
                stop=stop)

            text = response.get('choices')[0].get('text').strip()
            # Raw string: '\s' in a normal string literal is an invalid escape.
            text = re.sub(r'\s+', ' ', text)
            break

        except Exception as e:
            if attempt >= max_retry:
                return f"GPT3 error {e}"

            print(f"GPT3 error {e} retrying")
            time.sleep(1)

    # Seconds and microseconds keep calls made within the same minute from
    # overwriting each other's log file.
    filename = f"{dt.datetime.now().strftime('%Y-%m-%d %H%M%S_%f')}_gpt3.txt"

    # Logging is best effort and kept outside the request loop: a failed log
    # write must neither discard a successful completion nor trigger a second
    # request.
    try:
        os.makedirs('gpt3_logs', exist_ok=True)
        write_text(f"gpt3_logs/{filename}",
                   content=f"PROMPT:\n\n{prompt} '\n\n============\n\n RESPONSE:\n\n{text}")
    except OSError as e:
        print(f"GPT3 log write failed: {e}")

    return text


In [41]:
# Second splitter with a larger chunk_size than the article splitter.
# Lengths are measured with tiktoken_len, i.e. in tokens.
# NOTE(review): confirm that a 10000-token chunk plus the prompt template
# still fits the context window of the model it is sent to.
final_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']
)


## MAIN

In [35]:
# Load the previously written index: a list of chunks with their vectors.
data = json.loads(open_file(file_path='index.json'))

# while True:
# query = input("Enter your question here: ")
query = "what is the Views Explorer"

matched_articles_chunks = search_index(query, data, count = 5)

# Level 1: answer the query separately from each matched article chunk.
# The template is read once instead of on every iteration.
answer_template = open_file('prompt_answer.txt')
answers = list()
for article_chunk in matched_articles_chunks:
    # The passage comes from the matched chunk (the original read it from
    # `answer`, which is not assigned yet on the first iteration).
    prompt = (answer_template
              .replace('<<PASSAGE>>', article_chunk.get('content'))
              .replace('<<QUERY>>', query))
    answer = gpt3_completion(prompt)
    print('\n\n', answer)
    answers.append(answer)

all_answers = "\n\n".join(answers)
chunks = final_text_splitter.split_text(all_answers)

# Level 2: summarize the collected answers chunk by chunk.
summary_template = open_file('prompt_summary.txt')
aggregate_answer = list()
for chunk in chunks:
    prompt = summary_template.replace('<<SUMMARY>>', chunk)
    summary = gpt3_completion(prompt)
    aggregate_answer.append(summary)

print('\n\n=========\n\n', '\n\n'.join(aggregate_answer))
