# LangChain application

## Import necessary libraries

### This LangChain application simulates a conversation between two bots (a user bot and an assistant bot)

1. Baseline - chatbot without knowledge
2. Baseline2 - chatbot with knowledge (unstructured)
3. RAG model - chatbot with knowledge (structured with a RAG graph, plus sentiment analysis and text classification)

In [None]:
import os
import json
import operator
import numpy as np 
import json
import shutil
import subprocess
import sys
import random
import requests
import gc
import re
import torch
import langchain_cohere
import langgraph
import chromadb


from datetime import datetime
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import JSONLoader
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage, BaseMessage
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, Sequence, List
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import Dataset


# SECURITY: never commit a real API key to source control. The key that used
# to be hard-coded here must be treated as leaked and revoked.
# Read the Cohere key from the environment; fall back to an interactive prompt
# so the notebook still works when the variable has not been exported.
api_key = os.environ.get("COHERE_API_KEY")
if not api_key:
    from getpass import getpass

    api_key = getpass("Enter your Cohere API key: ")

# The Cohere clients created later in this file are constructed without an
# explicit key, so they presumably read it from this environment variable.
os.environ["COHERE_API_KEY"] = api_key



## 1. Prepare Dataset (dstc11-track5)

### 1) Clone the data/task repository


In [64]:
def setup_repo(repo_url: str, repo_name: str, work_dir: str = "/kaggle/working"):
    """Clone a fresh copy of a repository and make its ``data`` folder the CWD.

    Args:
        repo_url: URL handed to ``git clone``.
        repo_name: Name of the directory, inside ``work_dir``, that holds the clone.
        work_dir: Existing directory in which the repository is cloned.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero status.
        OSError: If ``work_dir`` or the cloned ``data`` folder cannot be entered.
    """
    # Resolve to an absolute path BEFORE changing directory; otherwise a
    # relative work_dir would be re-joined against the new CWD and point at
    # work_dir/work_dir/... in the checks below.
    work_dir = os.path.abspath(work_dir)
    repo_path = os.path.join(work_dir, repo_name)

    os.chdir(work_dir)

    # Remove a stale copy so every run starts from a clean clone.
    if os.path.exists(repo_path):
        shutil.rmtree(repo_path)

    # Name the destination explicitly so the clone always lands in repo_name,
    # even when the URL's last path component differs from it.
    subprocess.run(["git", "clone", repo_url, repo_name], check=True)

    # Leave the process inside <repo>/data so later relative paths resolve there.
    os.chdir(os.path.join(repo_path, "data"))


# Base directory used when running locally (setup_repo's default work_dir is /kaggle/working).
local_path = "/home/song0409/Desktop/CAI"


# Clone the DSTC11 Track 5 repository under local_path. setup_repo finishes by
# changing the working directory to the clone's data folder.
setup_repo("https://github.com/lkra/dstc11-track5.git", "dstc11-track5", work_dir=local_path)

# Recursively list every file below the current working directory.
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Cloning into 'dstc11-track5'...


./output_schema.json
./README.md
./knowledge_aug_domain_reviews.json
./knowledge_aug_reviews.json
./knowledge.json
./val/labels.json
./val/logs.json
./test/labels.json
./test/logs.json
./train/logs_bkp.json
./train/labels.json
./train/logs.json
./train/bkp/labels.json
./train/bkp/logs.json


### 2) Loading dataset

In [66]:
# All paths are relative to the current working directory, which the setup
# cell leaves inside the repository's data folder.
# The encoding is given explicitly so reading does not depend on the platform
# default (JSON text is UTF-8).

# Training dialogues (consumed by reformat_dataset below).
with open('train/logs.json', 'r', encoding='utf-8') as f:
    train_ds = json.load(f)
print(len(train_ds))

# Per-dialogue labels; later cells read their "target", "knowledge" and
# "response" fields.
with open('train/labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

# Knowledge base that get_reviews() indexes to fetch review sentences.
with open('knowledge.json', 'r', encoding='utf-8') as f:
    knowledge_base = json.load(f)

32604


### 3) Format data

In [67]:
def format_dialogue(sample: List[dict]) -> List[dict]:
    """
    Convert a raw dialogue into a chat-style list of role/content messages.

    A fixed system prompt always comes first, followed by one message per turn
    in the original order. Speaker 'U' maps to the role 'user'; every other
    speaker value maps to the role 'system'.

    Args:
        sample (List[dict]): Dialogue turns, each holding a 'speaker' key
            ('U' for user, 'S' for system) and a 'text' key.

    Returns:
        List[dict]: Messages shaped as {"role": ..., "content": ...}.
    """
    opening = {"role": "system", "content": "You are an assistant."}
    turns = [
        {
            "role": "user" if turn["speaker"] == "U" else "system",
            "content": turn["text"],
        }
        for turn in sample
    ]
    return [opening] + turns

def get_reviews(knowledge: List[dict]) -> List[str]:
    """
    Resolve knowledge references into the sentences they point to.

    Each reference is looked up in the module-level ``knowledge_base`` as
    knowledge_base[domain][entity_id][doc_type + "s"][doc_id]["sentences"][sent_id],
    with the ids converted to strings first.

    Args:
        knowledge (List[dict]): References carrying the keys 'domain',
            'entity_id', 'doc_type', 'doc_id' and 'sent_id'.

    Returns:
        List[str]: The sentences that could be resolved, in input order.
        References that cannot be resolved are skipped.
    """
    sources = []
    for k in knowledge:
        try:
            domain = k["domain"]
            entity_id = str(k["entity_id"])
            doc_type = k["doc_type"]
            doc_id = str(k["doc_id"])
            sent_id = str(k["sent_id"])

            # Extract the review sentence from the knowledge base
            sentence = knowledge_base[domain][entity_id][f"{doc_type}s"][doc_id]["sentences"][sent_id]
        except (KeyError, IndexError, TypeError):
            # Best-effort lookup: a malformed or dangling reference is skipped.
            # Only lookup errors are caught, so interrupts and genuine bugs
            # are no longer silently swallowed.
            continue
        sources.append(sentence)

    return sources
    
def reformat_dataset(dataset, labels_dataset):
    """
    Build column-oriented lists of dialogues, knowledge and responses.

    Args:
        dataset: Sequence of raw dialogues; each one is passed to format_dialogue.
        labels_dataset: Sequence of label dicts, index-aligned with ``dataset``.
            The optional keys 'knowledge' (default []) and 'response'
            (default "") are read.

    Returns:
        dict: {"dialogue": [...], "knowledge": [...], "response": [...]} with
        one entry per sample that could be processed.

    NOTE(review): a sample that fails to convert is skipped, so the output can
    become shorter than, and misaligned with, ``labels_dataset``. Code that
    indexes both with the same ``i`` assumes nothing was skipped.
    """
    reformatted_dataset = {
        "dialogue": [],
        "knowledge": [],
        "response": [],
    }
    for sample_index, sample in enumerate(dataset):
        try:
            sample_dialogue = format_dialogue(sample)
            sample_labels = labels_dataset[sample_index]
            sample_knowledge = sample_labels.get("knowledge", [])
            sample_response = sample_labels.get("response", "")
        except (KeyError, IndexError, TypeError, AttributeError):
            # Malformed sample or missing label: skip it, but let anything
            # unexpected (interrupts, real bugs) propagate.
            continue

        # Append only after everything was computed so the three columns
        # always stay the same length.
        reformatted_dataset["dialogue"].append(sample_dialogue)
        reformatted_dataset["knowledge"].append(sample_knowledge)
        reformatted_dataset["response"].append(sample_response)

    return reformatted_dataset

# Convert the raw training logs and labels into column lists, then wrap them
# in a `datasets.Dataset` for indexed access in the cells below.
reformatted_dataset = reformat_dataset(train_ds, labels)
dataset = Dataset.from_dict(reformatted_dataset)

### 4) Check format

In [68]:
# Preview the first few reformatted samples: dialogue turns, the review
# sentences referenced by the labels, and the ground-truth response.
# The preview is capped by the dataset size so a short dataset cannot raise.
preview_count = min(10, len(dataset))
for i in range(preview_count):
    print(f"Dialogue {i + 1}:")
    print("Dialogue:")
    for turn in dataset[i]["dialogue"]:
        print(f"  {turn['role']}: {turn['content']}")

    print("\nKnowledge:")
    # NOTE(review): labels[i] and dataset[i] are assumed to describe the same
    # sample - confirm that reformat_dataset skipped nothing.
    if labels[i]["target"] != False:
        knowledges = get_reviews(labels[i]["knowledge"])
        # Number reviews from 1 without shadowing the builtin `id`.
        for review_no, knowledge in enumerate(knowledges, start=1):
            print(f" review {review_no}. {knowledge}")
    else:
        print("  No reviews")

    print("\nGround Truth Response:")
    if dataset[i]["response"] != '':
        print(f"  {dataset[i]['response']}")
    else:
        print("  No ground truth")

    print("-" * 80)

Dialogue 1:
Dialogue:
  system: You are an assistant.
  user: Can you help me find a place to stay that is moderately priced and includes free wifi?
  system: sure, i have 17 options for you
  user: Are any of them in the south? I'd like free parking too.
  system: Yes, two are in the south and both have free parking and internet. I recommend the Bridge Guesthouse. Would you like me to book a reservation?
  user: I have back issues. Does this place have comfortable beds?

Knowledge:
 review 1. The room was clean and comfortable and not expensive.
 review 2. It could ruin your stay if you mind that kind of thing.
 review 3. Sadly though, I found that the bed in the room wasn't very comfortable at all.
 review 4. I do have to say, though, the bed is extremely uncomfortable.
 review 5. and the interior of the room was very good and bed was also very much comfortable.

Ground Truth Response:
  The Bridge Guest House is known for having pretty uncomfortable beds according to most guests. On

## 2. Preprocessing for RAG model

### 1) Sentiment analysis

In [69]:
# model: BERT-base-sentiment from huggingface(https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) 
# Used to attach a star rating to each review; the RAG document cell keeps
# the first whitespace-separated token of the predicted label as the rating.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: the name `model` is re-bound to the ChatCohere chat model in the
# "Models" cell; the pipeline below keeps its own reference to this object.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Wrap model and tokenizer in a sentiment-analysis pipeline.
sentiment_pipe = pipeline(
    "sentiment-analysis", 
    model=model, 
    tokenizer=tokenizer
)

# Quick manual check:
# print(sentiment_pipe("The staffs of this hotel was not very friendly, but the breakfast was okey!"))

Device set to use cuda:0


### 2) Classification model

In [70]:
# model: NLI-based Zero Shot Text Classification from huggingface (https://huggingface.co/facebook/bart-large-mnli)

# Classification model used to determine which 'Feature' (topic) a review is about.

classifier_pipe = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli"
)

# Candidate topic labels for the hotel / restaurant domain. They are passed to
# the classifier in get_multi_features and also interpolated into the
# simulated user's system prompt.
POSSIBLE_LABELS = [
    "general",
    "cleanliness",
    "comfort", 
    "staff", 
    "price", 
    "location", 
    "food", 
    "wifi", 
    "parking", 
    "noise", 
    "bathroom",
    "view",
    "check-in",
    "breakfast",
    "dish"
]


def get_multi_features(text, threshold=0.4):
    """
    Return the capitalized topic labels the zero-shot classifier assigns to `text`.

    Every label in POSSIBLE_LABELS is scored independently (multi_label=True).
    Labels whose score is strictly greater than `threshold` are kept, in the
    order the classifier returns them. When no label clears the threshold, the
    first label returned by the classifier is used, so the result is never empty.
    """
    prediction = classifier_pipe(
        text,
        candidate_labels=POSSIBLE_LABELS,
        hypothesis_template="This review is about {}.",
        multi_label=True,
    )

    # 'labels' and 'scores' are parallel lists, ranked by score according to
    # the Hugging Face pipeline documentation.
    ranked_labels = prediction['labels']
    chosen = [
        name.capitalize()
        for name, confidence in zip(ranked_labels, prediction['scores'])
        if confidence > threshold
    ]

    # Fallback is the top-ranked label, which is not necessarily "General".
    return chosen or [ranked_labels[0].capitalize()]

Device set to use cuda:0


### 3) RAG representation - document

In [None]:
# Build one RAG document per review sentence referenced by the first `size`
# dialogues, tagging each with a place name, topic features and a rating.
rag_documents = []
# Number of dialogues to process; use len(dataset) to process every sample.
size = 10
# size = len(dataset)

# Heuristic place-name extractor: a trigger word, an optional "the ", then a
# run of Capitalized words, which is captured as group 1.
# NOTE(review): the trigger alternation has no word boundaries, so a trigger
# can also match at the end of a longer word (e.g. "is" inside "this").
hotel_name_pattern = re.compile(
    r"(?:recommend|stay at|about|found|choose|is|hotel|guesthouse|whether|food|dish) (?:the )?([A-Z][a-z]+(?: [A-Z][a-z]+)*)"
)
print("--- Processing Data ---")

for i in range(size):
    # Walk the dialogue from the latest turn backwards and take the first
    # pattern match as the place being discussed.
    current_hotel_name = "Unknown Hotel"
    for turn in reversed(dataset[i]["dialogue"]):
        match = hotel_name_pattern.search(turn['content'])
        if match:
            current_hotel_name = match.group(1)
            break
    
    # Only dialogues whose label "target" is not False get their knowledge resolved.
    if labels[i]["target"] != False:
        reviews_text_list = get_reviews(labels[i]["knowledge"])
        
        for review_text in reviews_text_list:

            # Topic labels from the zero-shot classifier.
            feature_list = get_multi_features(review_text)

            # Sentiment: keep the first token of the predicted label and
            # append ".0". The input is cut to its first 512 characters.
            rating = sentiment_pipe(review_text[:512])[0]['label'].split()[0] + ".0"

            # One JSON-serialisable record per review sentence.
            entry = {
                "category": "Review",
                "hotel_name": current_hotel_name,
                "content": review_text,
                "features": feature_list,
                "rating": rating,
                "source_dialogue_id": i 
            }
            rag_documents.append(entry)

print(f"--- Processed {len(rag_documents)} reviews ---")

# Written to the current working directory (the repository's data folder
# after the setup cell).
output_filename = "hotel_data.json"
with open(output_filename, "w") as f:
    json.dump(rag_documents, f, indent=2)

print(f"Data saved to {output_filename}")


--- Processing Data ---
--- Processed 28 reviews ---
Data saved to hotel_data.json


### 4) RAG representation - retriever

In [72]:
# JSON file produced by the RAG document cell (the literal matches its output_filename).
dataset_path = "hotel_data.json" 

# Metadata hook for JSONLoader: copies selected record fields into the document metadata.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Add the record's 'hotel_name' and 'rating' to `metadata` and return it.

    `metadata` is updated in place; a field missing from `record` is stored as None.
    """
    for field in ("hotel_name", "rating"):
        metadata[field] = record.get(field)
    return metadata

# Load every element of the top-level JSON array ('.[]') as one document:
# the "content" field becomes the document text and metadata_func supplies
# the extra metadata fields.
loader = JSONLoader(
    file_path=dataset_path,
    jq_schema='.[]',
    content_key="content", 
    metadata_func=metadata_func,
    text_content=False
)

docs = loader.load()

# Embed the documents with Cohere, index them in a Chroma vector store, and
# expose a retriever configured with k=3 results per query.
# NOTE(review): re-running this cell in the same kernel may add the documents
# again to the same default collection - confirm.
embeddings = CohereEmbeddings(model="embed-english-v3.0")
vectorstore = Chroma.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

### 5) Models

In [None]:
# Chat model shared by the simulated user and both chatbot variants.
# NOTE: this re-binds `model`, which earlier held the sentiment classifier.
model = ChatCohere(model="command-r-08-2024")

# File that log_interaction() rewrites after every generated message.
LOG_FILE = "live_chat_log.json"


# Places the simulated user may ask about (interpolated into its system prompt).
PLACE_LIST = [
    "Bridge Guesthouse",
    "Warkworth House",
    "Curry Garden",
    "Alexander Bed and Breakfast",
    "Gonville Hotel",
    "The Lensfield Hotel",
    "Saffron Brasserie",
    "Home from Home hotel",
    "Ask restaurant",
    "Yippee Noodle Bar",
    "Kymmoy",
    "Bangkok City",
    "The Ugly Duckling"
]

# Start every run with an empty log (an empty JSON array).
with open(LOG_FILE, "w") as f:
    json.dump([], f)

#logger
def log_interaction(role: str, content: str):
    """
    Append one message to the JSON chat log and write the log back immediately.

    Args:
        role: Speaker label stored with the message (e.g. "User", "Chatbot").
        content: Message text.

    The log at LOG_FILE is a JSON array of {"role", "content"} objects. A
    missing, unreadable or malformed log is replaced by a fresh one that
    contains only the new entry.
    """
    entry = {
        # "timestamp": datetime.now().strftime("%H:%M:%S"),
        "role": role,
        "content": content
    }

    try:
        with open(LOG_FILE, "r") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers invalid
        # JSON (json.JSONDecodeError is a subclass). Anything else propagates.
        data = []

    # A valid JSON document that is not an array cannot be appended to.
    if not isinstance(data, list):
        data = []

    data.append(entry)

    with open(LOG_FILE, "w") as f:
        json.dump(data, f, indent=2)

# history
def get_history_for_simulator(messages: list[BaseMessage]):
    """
    Return the conversation as seen from the simulated user's side.

    Roles are mirrored: what the user said (HumanMessage) becomes an AIMessage,
    because the simulator itself produced it, and what the chatbot said
    (AIMessage) becomes a HumanMessage. Messages of any other type are left out.
    """
    # (original type, mirrored type), checked in this order for each message.
    role_swaps = (
        (HumanMessage, AIMessage),
        (AIMessage, HumanMessage),
    )

    mirrored = []
    for message in messages:
        for original_type, mirrored_type in role_swaps:
            if isinstance(message, original_type):
                mirrored.append(mirrored_type(content=message.content))
                break
    return mirrored

# Shared state passed between the graph nodes of the two-bot simulation.
class SimState(TypedDict):
    # Conversation so far. Each node returns {"messages": [new_message]};
    # the operator.add annotation is presumably the reducer LangGraph uses to
    # concatenate that list onto the existing sequence - confirm in its docs.
    messages: Annotated[Sequence[BaseMessage], operator.add]

# user bot
def user_node(state):
    """
    Simulated traveler: generate the next user utterance.

    The conversation is mirrored with get_history_for_simulator, so the
    chatbot's replies arrive as human turns and the simulator's own lines as
    AI turns. It is then sent to the model behind a persona system prompt.
    The reply is logged and returned as a HumanMessage.

    Args:
        state: Graph state holding the "messages" sequence.

    Returns:
        dict: {"messages": [HumanMessage]} containing the new user utterance.
    """
    messages = state["messages"]

    # get chat history
    chat_history = get_history_for_simulator(messages)

    system_prompt = SystemMessage(content=f"""
    You are a traveler who trys to book a hotel, named Alex. 
    You are chatting with a hotel chatbot for booking guide,providing hotel information.
    Ask anythin regarding hotel booking or their facilitis.
    Be consistent with your previous sentences.
    Generate no more than 2 senctence at a time.
    You are currently insterest in one of the places : {PLACE_LIST}.
    You want to know about some of these topic : {POSSIBLE_LABELS}.
    First, you pick one out of the places, and ask questions.
    """)

    input_to_model = [system_prompt] + chat_history

    if not messages:
        # Initial chat: nothing to react to yet, so give an explicit kick-off.
        input_to_model.append(HumanMessage(content="Start the conversation. Ask about random topic regarding hotel service."))
    elif not isinstance(messages[-1], AIMessage):
        # The last turn is not a chatbot reply: pass it through unchanged so
        # the request still ends with that turn.
        input_to_model.append(messages[-1])
    # Otherwise the last chatbot reply is already the final (human) turn of
    # chat_history; appending it again would send it to the model twice.

    response = model.invoke(input_to_model)

    # log
    log_interaction("User", response.content)

    return {"messages": [HumanMessage(content=response.content)]}

def chatbot_node(state):
    """
    Baseline chatbot (no retrieval), kept for comparison with the RAG variant.

    Sends a fixed persona prompt followed by the whole conversation to the
    model, logs the reply under the role "Chatbot", and returns it as an AIMessage.
    """
    persona = SystemMessage(content="""
    You are a helpful Hotel agency chatbot that provides any information regarding partners hotel facility or booking information.
    You also know reviews about the hotel and can summeraize the sentiments in a third-perspective. 
    Answer the user's questions based on the chat history, and lead the user to book the hotel after answering the questions.
    Generate response no more than 4 sentence.
    """)

    conversation = state["messages"]
    reply = model.invoke([persona] + conversation)
    reply_text = reply.content

    # Written to the live log as soon as the reply is available.
    log_interaction("Chatbot", reply_text)

    return {"messages": [AIMessage(content=reply_text)]}

def chatbot_node_RAG(state):
    """
    RAG chatbot: answer using reviews retrieved from the vector store.

    1. READ the latest message (the user's query).
    2. RETRIEVE the reviews most relevant to it.
    3. ANSWER with those reviews injected into the system prompt as the
       knowledge base.

    The reply is logged under the role "Chatbot", as the baseline chatbot
    does, and returned as an AIMessage.

    Args:
        state: Graph state; "messages" must contain at least one message.

    Returns:
        dict: {"messages": [AIMessage]} containing the chatbot reply.
    """
    messages = state["messages"]
    last_user_message = messages[-1].content

    # retrieve data that relevant to the last user message
    retrieved_docs = retriever.invoke(last_user_message)

    # reformat the data to present
    if retrieved_docs:
        # The place name is stored under "hotel_name" by metadata_func;
        # reading "hotel" always produced "Hotel: None".
        context_text = "".join(
            f"- Hotel: {doc.metadata.get('hotel_name')}\n"
            f"  Review: {doc.page_content}\n"
            f"  Rating: {doc.metadata.get('rating')}/5.0\n\n"
            for doc in retrieved_docs
        )
    else:
        context_text = "No specific reviews found in database."

    # prompt
    rag_prompt = f"""
    You are a helpful Hotel agency chatbot that provides any information regarding partners hotel/restaurant facility or booking information.    
    Use the following KNOWLEDGE BASE to answer the user's question.
    If the answer is not in the knowledge base, say you don't know, and introduce the most interesting fact about the place or reviews.
    Answer only based on ground fact in concise statement with no sugar coating.
    --- KNOWLEDGE BASE ---
    {context_text}
    ----------------------
    """

    system_message = SystemMessage(content=rag_prompt)

    response = model.invoke([system_message] + messages)

    # Log like the baseline chatbot so the live log holds both sides of the chat.
    log_interaction("Chatbot", response.content)

    return {"messages": [AIMessage(content=response.content)]}



### 6) Execute simulation

In [82]:
# Select which chatbot implementation is wired into the graph as the "chatbot"
# node: chatbot_node (baseline) or chatbot_node_RAG (retrieval-augmented).
# CURRENT_CHATBOT_MODEL = chatbot_node
CURRENT_CHATBOT_MODEL = chatbot_node_RAG


# Router: decides which node speaks next, or whether the simulation stops.
def router(state):
    """Return "end" once more than 6 messages exist; otherwise "user" when the
    last message is an AIMessage and "chatbot" when it is not."""
    transcript = state["messages"]
    if len(transcript) > 6:
        return "end"
    return "user" if isinstance(transcript[-1], AIMessage) else "chatbot"



# Build the two-node graph: user -> chatbot -> (router) -> user | END.
workflow = StateGraph(SimState)
workflow.add_node("user", user_node)
workflow.add_node("chatbot", CURRENT_CHATBOT_MODEL)
workflow.set_entry_point("user")
workflow.add_edge("user", "chatbot")
# NOTE(review): router() can also return "chatbot", which has no entry in this
# mapping. It is only called after the chatbot node, whose last message is an
# AIMessage, so that branch should not be reached - confirm.
workflow.add_conditional_edges("chatbot", router, {"user": "user", "end": END})

app = workflow.compile()

print(f"--- Chat Started (Logging to {LOG_FILE}) ---")
# Kick off with an empty history; user_node then opens the conversation.
for event in app.stream({"messages": []}):
    # Each event maps a node name to the state update that node returned.
    for node_name, value in event.items():
        last_message = value["messages"][-1]
        content = last_message.content

        # Prefix the line with the speaker.
        if node_name == "user":
            print(f"User : {content}")
        else:
            print(f"system : {content}")
        print("_" * 200+"\n")
        
        # Flush so each turn shows up as soon as it is produced.
        sys.stdout.flush()
print("--- Chat Finished ---")

--- Chat Started (Logging to live_chat_log.json) ---
User : Hi there! I'm interested in booking a stay and would like to know more about the staff at The Ugly Duckling. How would you rate their friendliness and responsiveness?
________________________________________________________________________________________________________________________________________________________________________________________________________

system : The Ugly Duckling has received excellent reviews for its staff's friendliness and responsiveness. Multiple guests have praised the warm welcome and joy of interacting with the staff, rating it a perfect 5.0/5.0.
________________________________________________________________________________________________________________________________________________________________________________________________________

User : That's great to hear! Could you tell me more about the cleanliness and comfort of the rooms? I'd like to ensure a pleasant stay.
____________