In [2]:
import numpy as np

# Webscraping functionality


In [3]:

import requests
request_headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',#'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Read data from url and write to local file
url_path = ('https://havewheelchairwilltravel.net/singapore-airlines-economy-class-review/')
html_text = requests.get(url_path, headers=request_headers).text

with open('my_html.html', 'w') as file:
    file.write(html_text)
    

In [4]:
from langchain.document_loaders import BSHTMLLoader
import os

# Load data from html file
loader = BSHTMLLoader('my_html.html')
data = loader.load()

In [5]:
data

[Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wheelchair Will Travel'}, page_content="\n\n\n\n\n\nSingapore Airlines economy class review - Have Wheelchair Will Travel\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\nTravel Without Limits Winter/Autumn 2024 Issue OUT NOW! Subscribe Here →\n \n\n\n\n\n\n \n \n\n\n \nMenu \n\n\n\n\nMenu \nBlog\nAbout us\n\nAbout this website\nAbout the author\nAbout our family\nWheely Good Chats With Braeden\n\n\nTips\n\nKnow Before You Go\nAirlines\nCar Hire\nTravel Insurance\nWheelchair tips\nAdditional Info\nRecreational activities\nTravel, Shopping & Money Saving Tips\n\n\nEquipment & Services\nTravel\n\nAustralia\nU.S.A\nEurope\n\n\nMedia\nBook Julie\nContact\n \n\n\n \n\n\n\n\n\nFebruary 8, 2023\nSingapore Airlines economy class review\n\n\n\n\n\n\n\n\n\nI fi

In [6]:
"""Create splitter"""
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap  = 10,
    length_function = len,
    is_separator_regex = True,
    separators=["\n+", "\t+", "\n\n", "\n", " ", ],
    keep_separator=False
)

# Split into documents
split_docs = text_splitter.split_documents(data)

In [7]:
"""Preview first 10 documents"""
split_docs[:10]

[Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wheelchair Will Travel'}, page_content='Singapore Airlines economy class review - Have Wheelchair Will Travel\n+ \n+Skip to content\n+Travel Without Limits Winter/Autumn 2024 Issue OUT NOW! Subscribe Here →\n+ \n+ \n+ \n+ \n+Menu \n+Menu \n+Blog\n+About us\n+About this website\n+About the author\n+About our family\n+Wheely Good Chats With Braeden\n+Tips'),
 Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wheelchair Will Travel'}, page_content='Tips\n+Know Before You Go\n+Airlines\n+Car Hire\n+Travel Insurance\n+Wheelchair tips\n+Additional Info\n+Recreational activities\n+Travel, Shopping & Money Saving Tips\n+Equipment & Services\n+Travel\n+Australia\n+U.S.A\n+Europe\n+Media\n+Book Julie\n+Contact\n+ \n+ \n+February 8, 2023'),
 Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wh

In [8]:
"""LLM utilities"""
from langchain_huggingface import HuggingFaceEndpoint
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI
from langchain_openai import OpenAI, ChatOpenAI
from langchain_chroma import Chroma



  from .autonotebook import tqdm as notebook_tqdm


In [9]:
"""Embeddings"""
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

# Load embeddings model (sentence-transformer from huggingface)
embeddings_model_name="all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=embeddings_model_name)

"""to use OpenAI Embeddings (taken from langchain docs):"""
'''
os.environ["OPENAI_API_KEY"] = "<your key>"

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    # With the `text-embedding-3` class
    # of models, you can specify the size
    # of the embeddings you want returned.
    # dimensions=1024
)
'''


  warn_deprecated(


'\nos.environ["OPENAI_API_KEY"] = "<your key>"\n\nfrom langchain_openai import OpenAIEmbeddings\n\nembeddings = OpenAIEmbeddings(\n    model="text-embedding-3-large",\n    # With the `text-embedding-3` class\n    # of models, you can specify the size\n    # of the embeddings you want returned.\n    # dimensions=1024\n)\n'

In [10]:

# Demonstrate embeddings and similarity

text1 = 'The legroom on the flight was terrible.'
text2 = 'The food was terrible'
text3 = 'The space for legs was reasonable but there is room for improvement.'

emb1 = embedding_function.embed_query(text1)
emb2 = embedding_function.embed_query(text2)
emb3 = embedding_function.embed_query(text3)

print('''
Distance 1 -> 2: %.2f
Distance 1 -> 3: %.2f
Distance 2 -> 3: %.2f
''' % (np.dot(emb1, emb2), np.dot(emb1, emb3), np.dot(emb2, emb3)))


Distance 1 -> 2: 0.57
Distance 1 -> 3: 0.49
Distance 2 -> 3: 0.15



In [12]:
"""Create vector db (delete existing directory)"""
from pathlib import Path

vecstore_dir = f"{os.getcwd()}/chroma_vecstores_singairlines"

# If existing vectorstore, use it (to update it delete all the contents of the directory and run this cell)
if Path(f'{vecstore_dir}/chroma.sqlite3').exists():
    print("using existing vector db")
    vec_db = Chroma(persist_directory=vecstore_dir, embedding_function=embedding_function)
else:
    print("no vector db found; creating new vector db")
    vec_db = Chroma.from_documents(persist_directory=vecstore_dir, embedding=embedding_function, documents=split_docs)

using existing vector db


In [13]:
"""Example similarity search of the database with a query, retrieving the top k=2 results"""
vec_db.similarity_search(query="What are customer's sentiemts on wait times?", k=2)

[Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wheelchair Will Travel'}, page_content='Just back from Asia . My outgoing connecting flight with Singapore airline was delayed by more than 4 hours. Flight out the food was good. First part of connecting flight back to UK was with Scoot airline like flying with Ryan Air. Flight back to the UK on Singapore airline the food not good. As it'),
 Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wheelchair Will Travel'}, page_content='Reply \n+ \n+Julie \n+ \n+\t\t\t\t\t\t\t\t\t\tFebruary 16, 2023 at 1:52 am\t\t\t\t\t\t\t\t\t\n+ \n+Hi Joe,\n+It’s so good that you’ve found the service and price has been right for you to keep flying with Singapore Airlines. \n+Happy and safe travels when you next fly. \n+Julie\n+Reply \n+ \n+Lynn Jack \n+')]

In [25]:
"""set up llm"""
HUGFACE_TOKEN = "<your huggingface key>"
GOOGLE_TOKEN = "<your google key>"
OPENAI_TOKEN = "<your key>"

# Some default LLM options; you can set up your own langchain LLM
default_llm_dict = {
    'mistral_7b':{
            'repo_id':"mistralai/Mistral-7B-Instruct-v0.2",
            'max_new_tokens':400,
            'min_new_tokens':2,
            'temperature':0.001,
            'huggingfacehub_api_token':HUGFACE_TOKEN,
            'seed':1
        },
        'gemini-pro':{
            "model":"gemini-pro",
            "google_api_key":GOOGLE_TOKEN,
            "temperature":0,
            'seed':1
        }
}

In [26]:
"""Query construction"""
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chains import RetrievalQAWithSourcesChain, RetrievalQA

llm = HuggingFaceEndpoint(**default_llm_dict['mistral_7b']) # Choosing mistral for demo
# llm = OpenAI(**llm_dict['openai']) 
# llm = GoogleGenerativeAI(**llm_dict['gemini-pro'])

qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    retriever=vec_db.as_retriever(search_kwargs={"k": 3})
)

                    min_new_tokens was transferred to model_kwargs.
                    Please make sure that min_new_tokens is what you intended.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/ishanshastri/.cache/huggingface/token
Login successful


In [27]:
"""Sample invokation with simple QA chain"""
qa_chain.invoke("Are the customers satisfied with the in-flight service?")

{'question': 'Are the customers satisfied with the in-flight service?',
 'answer': ' Yes, based on the testimonies of the customers, they were satisfied with the in-flight service of Singapore Airlines.\n',
 'sources': 'my\\_html.html'}

In [28]:
"""Using a prompt template for more customization"""
from langchain.prompts import PromptTemplate

# Define the prompt template, given variables 'context' and 'question'
template = """
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try 
to make up an answer. 
Keep the answer as concise as possible. Use 1 sentence to sum all points up.
______________
{context}
Question: {question}
Helpful Answer:"""

# Construct template
qa_with_context_prompt = PromptTemplate.from_template(template)

# Construct Retrieval chain, setting llm, retriever and prompt template
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vec_db.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_with_context_prompt},
)


In [29]:
# invoke with query
qa_chain.invoke({"query": "how satisfied is the customer with the wait times?"})

{'query': 'how satisfied iw the customer with the wait times?',
 'result': ' The customer mentioned that their outgoing connecting flight with Singapore Airlines was delayed by more than 4 hours. However, they were happy with the landing experience on their return flight. Overall, the wait times did not seem to significantly impact their satisfaction.',
 'source_documents': [Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wheelchair Will Travel'}, page_content='Reply \n+ \n+Julie \n+ \n+\t\t\t\t\t\t\t\t\t\tFebruary 16, 2023 at 1:52 am\t\t\t\t\t\t\t\t\t\n+ \n+Hi Joe,\n+It’s so good that you’ve found the service and price has been right for you to keep flying with Singapore Airlines. \n+Happy and safe travels when you next fly. \n+Julie\n+Reply \n+ \n+Lynn Jack \n+'),
  Document(metadata={'source': 'my_html.html', 'title': 'Singapore Airlines economy class review - Have Wheelchair Will Travel'}, page_content='Just back from Asia . My 

In [None]:
import torch
from torch import nn

class diffuser(nn.Module):
    def __init__(self):
        pass

    def position_embed(t, d_embed):
        sins = torch.arrange((2, t))
        coses = torch.arrange((2, t))

        t_embed = nn.functional.one_hot(np.tensor(t), num_classes=d_embed)

        embedded = torch.matmul()
