In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [28]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [29]:
# Loading the hotel bookings dataset
file_path = 'hotel_bookings.csv'  # Update the path if needed
df = pd.read_csv(file_path)

In [30]:
rag_df = df[['hotel', 'is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'adr', 'country']].copy()


In [31]:
# Converting the dataframe into strings for embedding
rag_df['combined_text'] = rag_df.apply(
    lambda row: f"Hotel: {row['hotel']}, Canceled: {row['is_canceled']}, Lead Time: {row['lead_time']} days, "
                f"Date: {row['arrival_date_month']} {row['arrival_date_year']}, Price: ${row['adr']}, "
                f"Country: {row['country']}", axis=1
)


In [32]:
# Loading the sentence transformer model for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient and fast


In [33]:
# Generating embeddings
embeddings = model.encode(rag_df['combined_text'].tolist(), convert_to_tensor=False)


In [34]:
# FAISS index
# Convert embeddings to a NumPy array
embeddings_np = np.array(embeddings)  # Assign the embeddings to embeddings_np

# Now you can use embeddings_np
dimension = embeddings_np.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings_np)

In [35]:
# Using GPT-Neo as the LLM
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

In [36]:
# Load GPT-Neo pipeline
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-1.3B", max_new_tokens=200)  # Setting max_new_tokens
llm = HuggingFacePipeline(pipeline=generator)

Device set to use cuda:0


In [None]:
# Create a RAG prompt template
template = """
I am your booking assistant. I will answer the query based on the retrieved booking data.
Query: {query}
Booking Info: {context}
Answer:
"""
prompt = PromptTemplate(input_variables=["query", "context"], template=template)
chain = LLMChain(llm=llm, prompt=prompt)

  chain = LLMChain(llm=llm, prompt=prompt)


In [38]:
# 🔍 RAG FUNCTION
# ---------------------------
def rag_query(question, top_k=5):
    # Convert the question into an embedding
    question_embedding = model.encode([question])[0].reshape(1, -1)

    # Search for relevant data in FAISS
    _, indices = faiss_index.search(question_embedding, top_k)

    # Retrieve the matching booking info
    context = "\n".join(rag_df.iloc[idx]['combined_text'] for idx in indices[0])

    # Generate the answer using the LLM
    response = chain.run(query=question, context=context)

    return response

In [39]:
# ---------------------------
# 🚀 TEST THE SYSTEM
# ---------------------------
print("✅ RAG system is ready!")



✅ RAG system is ready!


In [40]:
# ---------------------------
# 🚀 INTERACTIVE RAG SYSTEM
# ---------------------------

print(" Ask your booking-related questions! (Type 'exit' to stop)\n")

while True:
    query = input(" Your Query: ")

    if query.lower() == 'exit':
        print(" Exiting the RAG system. See you later")
        break

    # Get the answer using RAG
    answer = rag_query(query)
    print(f"\n Answer: {answer}\n")


 Ask your booking-related questions! (Type 'exit' to stop)

 Your Query: What is the average price of a hotel booking?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Answer: 
I am your booking assistant. I will answer the query based on the retrieved booking data.
Query: What is the average price of a hotel booking?
Booking Info: Hotel: Resort Hotel, Canceled: 0, Lead Time: 0 days, Date: August 2017, Price: $116.61, Country: ESP
Hotel: Resort Hotel, Canceled: 0, Lead Time: 192 days, Date: July 2017, Price: $155.52, Country: BEL
Hotel: Resort Hotel, Canceled: 0, Lead Time: 11 days, Date: October 2015, Price: $116.71, Country: BEL
Hotel: Resort Hotel, Canceled: 0, Lead Time: 132 days, Date: June 2017, Price: $155.82, Country: AUT
Hotel: Resort Hotel, Canceled: 0, Lead Time: 222 days, Date: April 2017, Price: $116.8, Country: BEL
Answer:
$1,000

A:

Yes, that's how average price of hotel booking would look like. 



 Your Query: Which locations had the highest booking cancellations?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Answer: 
I am your booking assistant. I will answer the query based on the retrieved booking data.
Query: Which locations had the highest booking cancellations?
Booking Info: Hotel: City Hotel, Canceled: 1, Lead Time: 132 days, Date: August 2016, Price: $137.7, Country: ESP
Hotel: City Hotel, Canceled: 1, Lead Time: 141 days, Date: August 2016, Price: $137.7, Country: ESP
Hotel: City Hotel, Canceled: 1, Lead Time: 27 days, Date: June 2016, Price: $137.76, Country: ARE
Hotel: City Hotel, Canceled: 1, Lead Time: 191 days, Date: August 2016, Price: $137.7, Country: BEL
Hotel: City Hotel, Canceled: 1, Lead Time: 142 days, Date: August 2016, Price: $243.6, Country: MAR
Answer:
Hotel: City Hotel, Canceled: 1, Lead Time: 139 days, Date: August 2016, Price: $227.35, Country: AL
Hotel: City Hotel, Canceled: 2, Lead Time: 141 days, Date: August 2016, Price: $227.35, Country: AL
Hotel: City Hotel, Canceled: 1, Lead Time: 142 days, Date: August 2016, Price: $227.35, Country: AL
Hotel: City Hotel

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Answer: 
I am your booking assistant. I will answer the query based on the retrieved booking data.
Query: Show me total revenue for July 2017.
Booking Info: Hotel: City Hotel, Canceled: 1, Lead Time: 149 days, Date: June 2017, Price: $210.0, Country: IRL
Hotel: City Hotel, Canceled: 0, Lead Time: 157 days, Date: June 2017, Price: $148.5, Country: IRL
Hotel: Resort Hotel, Canceled: 0, Lead Time: 152 days, Date: July 2017, Price: $149.0, Country: IRL
Hotel: City Hotel, Canceled: 0, Lead Time: 80 days, Date: June 2017, Price: $185.0, Country: IRL
Hotel: City Hotel, Canceled: 0, Lead Time: 146 days, Date: July 2017, Price: $139.5, Country: NLD
Answer:

Hotel: City Hotel, Canceled: 0, Lead Time: 151 days, Date: June 2017, Price: $139.5, Country: IRL
Hotel: City Hotel, Canceled: 0, Lead Time: 146 days, Date: July 2017, Price: $139.5, Country: IRL
Hotel: Resort Hotel, Canceled: 0, Lead Time: 152 days, Date: July 2017, Price: $148.5, Country: IRL
Hotel: City Hotel, Canceled: 0, Lead Time: 84