In [108]:
import openai
import os
import json
import gradio as gr
import langchain
import sys

from langchain.document_loaders import PyPDFLoader

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

openai.api_key  = os.getenv('OPENAI_API_KEY')

In [3]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

Test prompt

**Option 1 : the user select a brand**

In [4]:
#The use is asked to select a brand
selected_brand = "Samsung"
selected_model = "S23"
doc_base_path = "./examples/companies"

selected_doc_path = []

#We try to get the path 
brand_match = False
model_match = False

try:
    brand_match = os.path.exists(doc_base_path.lower())
except:
    print("No brand match")

if (brand_match):
    for current_file_path in os.listdir(os.path.join(doc_base_path, selected_brand.lower())):
        if (selected_model in current_file_path) & (".pdf" == str(os.path.splitext(current_file_path)[1])):
            model_match = True
            selected_doc_path.append(os.path.join(doc_base_path, selected_brand, current_file_path))

selected_doc_path

['./examples/companies/Samsung/Samsung_S23_Series_User_Guides_English.pdf']

In [5]:
def find_model_path(selected_brand, model, debug=False):
    try:
        #TO DO : Include examples with model number in 1 part and 3 part. E.g : C3, PSP

        selected_doc_path = []
        doc_base_path = "./examples/companies"

        ## Checking outputs
        if not selected_brand in os.listdir(doc_base_path): return []
        model_split = model.split("_")
        if ((len(model_split) != 2) | (model_split[0].strip() == "") | (model_split[1].strip() == "")): return []

        for current_file_path in os.listdir(os.path.join(doc_base_path, selected_brand.lower())):
            if (
                (model_split[0] in current_file_path.lower()) & 
                (model_split[1] in current_file_path.lower()) & 
                (".pdf" == str(os.path.splitext(current_file_path)[1]).lower())
            ):
                if (current_file_path.lower().index(model_split[0]) < current_file_path.lower().index(model_split[1])):
                    selected_doc_path.append(os.path.join(doc_base_path, selected_brand, current_file_path))
        return selected_doc_path

    except Exception as error:
        if debug: 
            print("An exception occurred:", type(error).__name__)
        return []

In [6]:
find_model_path("lg", "b2_c2",True)

An exception occurred: NameError


[]

**Option 2 : no brand selected : we will determine it in the chat**

In [7]:
user_prompt ="How to improve the battery life of my s23"

user_prompt_cropped=user_prompt[:300]

expected_examples = f"""
User: I want the documentation of the marcel galaxy S21
Response:
{{
    out_of_scope = False,
    brand_exists = True,
    model_exists = True,
    selected_brand = samsung,
    selected_model = galaxy_s21,
    request_language = en,
    type_of_object = smartphone
}}

User: What is the waranty of the Apple Ibode 10
Response:
{{
    out_of_scope = False,
    brand_exists = True,
    model_exists = True,
    selected_brand = apple,
    selected_model = iphone_10,
    request_language = en,
    type_of_object = smartphone
}}

User: What is the waranty of the Apple Ibode X
Response:
{{
    out_of_scope = False,
    brand_exists = True,
    model_exists = True,
    selected_brand = apple,
    selected_model = iphone_10,
    request_language = en,
    type_of_object = smartphone
}}

User: Comment appairer mon sony xm5
Response:
{{
    out_of_scope = False,
    brand_exists = True,
    model_exists = True,
    selected_brand = sony,
    selected_model = wh-1000xm5,
    request_language = fr,
    type_of_object = car
}}

User: Comment reparer le feu arrière de ma citroen C40
Response:
{{
    out_of_scope = False,
    brand_exists = True,
    model_exists = True,
    selected_brand = citroen,
    selected_model = c4,
    request_language = fr,
    type_of_object = car
}}

User: Qui est le 5eme president des etats unis ?
Response:
{{
    out_of_scope = True,
    brand_exists = False,
    model_exists = False,
    selected_brand = "",
    selected_model = "",
    request_language = "",
    type_of_object = ""
}}

User: How much is 8 times 19 ?
Response:
{{
    out_of_scope = True,
    brand_exists = False,
    model_exists = False,
    selected_brand = "",
    selected_model = "",
    request_language = "",
    type_of_object = "",
}}

User: Qui a créer la renault 4CV ?
Response:
{{
    out_of_scope = True,
    brand_exists = False,
    model_exists = False,
    selected_brand = "",
    selected_model = "",
    request_language = "",
    type_of_object = ""
}}


"""
chat_instruction_if_no_brand_no_model = f"""
An user will ask you for a specific question about a product brand and a model. \
Your task is to determine the brand and the model that the user is refering too. \
If you don't know the brand the user is refering too set the value of "brand_exists" key to False (using the same case), set the value of "model_exists" to False and all the other keys with a blank string and return the output without any more calculations. \
If you don't see any brand in the prompt try to find it \
It the model doesn't exists set the "model_exists" key to False (using the same case) but set the right value to brand. \
You will also identify the language of the request as the key "request_language". \
Also determine the type of object and put your result in the key "type_of_object". For example it could be smartphone, camera, car, etc.. \ 
If the value doesn't concern any information about a product set the value of out_of_scope to True.

I want your output formated as a json string with the following format: \
{{
    out_of_scope = [Boolean],
    brand_exists = [Boolean], 
    model_exists = [Boolean], 
    selected_brand = [string],
    selected_model = [string],
    request_language = [string],
    type_of_object = [string]
}}

The boolean values (out_of_scope, brand_exists and model_exists value) will be in formated as camelcase, to respect python convention to boolean values.
All other values should be in lowercase.

Here are some example of answers:
```{expected_examples}```

The user demand is delimited into tripple backtick
```{user_prompt_cropped}```
"""

response = get_completion(chat_instruction_if_no_brand_no_model)
print(response)

{
    "out_of_scope": false,
    "brand_exists": true,
    "model_exists": true,
    "selected_brand": "samsung",
    "selected_model": "galaxy_s23",
    "request_language": "en",
    "type_of_object": "smartphone"
}


In [8]:
# Convert JSON String to Python
api_answer_dic = json.loads(response)
print(api_answer_dic)

while (api_answer_dic["out_of_scope"]):
    print("Please retry with a request about a specific product and documentation")



{'out_of_scope': False, 'brand_exists': True, 'model_exists': True, 'selected_brand': 'samsung', 'selected_model': 'galaxy_s23', 'request_language': 'en', 'type_of_object': 'smartphone'}


In [9]:
# Import JSON module

# Define JSON string
jsonString = '{\n    "out_of_scope": true,\n    "brand_exists": false,\n    "model_exists": false,\n    "selected_brand": "",\n    "selected_model": "",\n    "request_language": "",\n    "type_of_object": ""\n}'

# Convert JSON String to Python
student_details = json.loads(jsonString)

# Print Dictionary
print(student_details)


{'out_of_scope': True, 'brand_exists': False, 'model_exists': False, 'selected_brand': '', 'selected_model': '', 'request_language': '', 'type_of_object': ''}


**Print the UI**

In [106]:
import random

my_theme = gr.Theme.from_hub("sudeepshouche/minimalist")

def random_response(message, history):
    return random.choice(["Yes", "No"])

chat_iface = gr.ChatInterface(
    random_response, 
    title="Chat with your product documentation 💬",
    textbox=gr.Textbox(placeholder="Enter a prompt here / Poser votre question ici", container=False, scale=7, limit=20),
    retry_btn=None,
    undo_btn=None,
    examples=["How long is the warranty for my galaxy S23 ?", "How to reset my Garmin Forerunner 265 watch ?"],
    cache_examples=True,
    theme=my_theme,
)

chat_iface.launch()

  textbox=gr.Textbox(placeholder="Enter a prompt here / Poser votre question ici", container=False, scale=7, limit=20),


Caching examples at: '/home/leo/project-job-2023/chat_doc_env/code/gradio_cached_examples/915'
Caching example 1/2
Caching example 2/2
Caching complete

Running on local URL:  http://127.0.0.1:7900

To create a public link, set `share=True` in `launch()`.




# Load a specific PDF

In [83]:
loader = PyPDFLoader("./examples/companies/garmin/fenix_7_Series_OM_EN-US.pdf")
pages = loader.load()
len(pages)


page_table_of_content = pages[2]

print(page_table_of_content.page_content[0:5000])

page_table_of_content.metadata

Table of Contents
Introduction ...................................... 1
Getting Started ........................................ 1
Device Overview ...................................... 1
Enabling and Disabling the 
Touchscreen ........................................ 2
Using the Watch ...................................... 2
Clocks .............................................. 2
Setting an Alarm ...................................... 2
Editing an Alarm .................................. 2
Starting the Countdown Timer ............... 3
Deleting a Timer .................................. 3
Using the Stopwatch ............................... 4
Adding Alternate Time Zones ................. 4
Editing an Alternate Time Zone .......... 5
Activities and Apps ........................... 5
Starting an Activity .................................. 5
Tips for Recording Activities .............. 5
Stopping an Activity ................................ 6
Evaluating an Activity .......................

{'source': './examples/companies/garmin/fenix_7_Series_OM_EN-US.pdf',
 'page': 2}

**1. Split using character**

In [84]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [85]:
docs = text_splitter.split_documents(pages)

In [86]:
len(docs)

438

In [87]:
len(pages)

152

**2. Split using PDF**

In [95]:
from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [96]:
docs = text_splitter.split_documents(pages)

In [102]:
len(docs)

7537