In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [3]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [6]:
# Loading the hotel bookings dataset
file_path = 'hotel_bookings.csv'  # Update the path if needed
df = pd.read_csv(file_path)

In [7]:
rag_df = df[['hotel', 'is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'adr', 'country']].copy()


In [8]:
# Converting the dataframe into strings for embedding
rag_df['combined_text'] = rag_df.apply(
    lambda row: f"Hotel: {row['hotel']}, Canceled: {row['is_canceled']}, Lead Time: {row['lead_time']} days, "
                f"Date: {row['arrival_date_month']} {row['arrival_date_year']}, Price: ${row['adr']}, "
                f"Country: {row['country']}", axis=1
)


In [9]:
# Loading the sentence transformer model for embedding
model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient and fast


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Generating embeddings
embeddings = model.encode(rag_df['combined_text'].tolist(), convert_to_tensor=False)


In [11]:
# FAISS index
# Convert embeddings to a NumPy array
embeddings_np = np.array(embeddings)  # Assign the embeddings to embeddings_np

# Now you can use embeddings_np
dimension = embeddings_np.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings_np)

In [12]:
# Using GPT-Neo as the LLM
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

In [13]:
# Load GPT-Neo pipeline
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-1.3B", max_new_tokens=200)  # Setting max_new_tokens
llm = HuggingFacePipeline(pipeline=generator)

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=generator)


In [14]:
# Create a RAG prompt template
template = """
I am your booking assistant. I will answer the query based on the retrieved booking data.
Query: {query}
Booking Info: {context}
Answer:
"""
prompt = PromptTemplate(input_variables=["query", "context"], template=template)
chain = LLMChain(llm=llm, prompt=prompt)

  chain = LLMChain(llm=llm, prompt=prompt)


In [15]:
# 🔍 RAG FUNCTION
# ---------------------------
def rag_query(question, top_k=5):
    # Convert the question into an embedding
    question_embedding = model.encode([question])[0].reshape(1, -1)

    # Search for relevant data in FAISS
    _, indices = faiss_index.search(question_embedding, top_k)

    # Retrieve the matching booking info
    context = "\n".join(rag_df.iloc[idx]['combined_text'] for idx in indices[0])

    # Generate the answer using the LLM
    response = chain.run(query=question, context=context)

    return response

In [16]:
# ---------------------------
# 🚀 TEST THE SYSTEM
# ---------------------------
print("✅ RAG system is ready!")



✅ RAG system is ready!


In [17]:
# ---------------------------
# 🚀 INTERACTIVE RAG SYSTEM
# ---------------------------

print(" Ask your booking-related questions! (Type 'exit' to stop)\n")

while True:
    query = input(" Your Query: ")

    if query.lower() == 'exit':
        print(" Exiting the RAG system. See you later")
        break

    # Get the answer using RAG
    answer = rag_query(query)
    print(f"\n Answer: {answer}\n")


 Ask your booking-related questions! (Type 'exit' to stop)

 Your Query: what is the total revenue 


  response = chain.run(query=question, context=context)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Answer: 
I am your booking assistant. I will answer the query based on the retrieved booking data.
Query: what is the total revenue 
Booking Info: Hotel: Resort Hotel, Canceled: 0, Lead Time: 109 days, Date: August 2017, Price: $266.0, Country: GBR
Hotel: Resort Hotel, Canceled: 0, Lead Time: 7 days, Date: August 2015, Price: $166.05, Country: NLD
Hotel: Resort Hotel, Canceled: 1, Lead Time: 281 days, Date: July 2017, Price: $138.38, Country: IRL
Hotel: Resort Hotel, Canceled: 0, Lead Time: 109 days, Date: June 2017, Price: $318.82, Country: PRT
Hotel: Resort Hotel, Canceled: 1, Lead Time: 168 days, Date: August 2017, Price: $338.0, Country: ESP
Answer:
Booking Info: Hotel: Resort Hotel, Canceled: 0, Lead Time: 109 days, Date: August 2017, Price: $316.0, Country: IRL
Hotel: Resort Hotel, Canceled: 0, Lead Time: 7 days, Date: August 2015, Price: $164.05, Country: NLD
Hotel: Resort Hotel, Canceled: 1, Lead Time: 281 days, Date: July 2017, Price: $138.38, Country: PRT
Hotel: Resort Hote

In [18]:
import pickle
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Load Dataset
df = pd.read_csv("hotel_bookings.csv")

# Load Sentence Transformer Model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate Embeddings
embeddings = model.encode(df.astype(str).agg(' '.join, axis=1).tolist())

# Create FAISS Index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# ✅ Save FAISS Index
faiss.write_index(index, "faiss_index.bin")

# ✅ Save Sentence Transformer Model
model.save("sentence_transformer_model")

# ✅ Save Processed Dataset
with open("processed_data.pkl", "wb") as f:
    pickle.dump(df, f)

print("🎯 RAG Model Saved Successfully!")


🎯 RAG Model Saved Successfully!


In [19]:
import pickle
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# ✅ Load FAISS Index
index = faiss.read_index("faiss_index.bin")

# ✅ Load Sentence Transformer Model
model = SentenceTransformer("sentence_transformer_model")

# ✅ Load Processed Dataset
with open("processed_data.pkl", "rb") as f:
    df = pickle.load(f)

# 🔥 Define the Query Function
def rag_query(query):
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding), k=5)
    results = df.iloc[indices[0]].to_dict(orient="records")
    return results

print("🎯 RAG Model Loaded Successfully!")


🎯 RAG Model Loaded Successfully!


In [21]:
query = "which country has highest revenue?"
response = rag_query(query)
print("✅ Answer:", response)


✅ Answer: [{'hotel': 'Resort Hotel', 'is_canceled': 0, 'lead_time': 7, 'arrival_date_year': 2015, 'arrival_date_month': 'October', 'arrival_date_week_number': 42, 'arrival_date_day_of_month': 12, 'stays_in_weekend_nights': 1, 'stays_in_week_nights': 3, 'adults': 2, 'children': 0.0, 'babies': 0, 'meal': 'FB', 'country': 'PRT', 'market_segment': 'Corporate', 'distribution_channel': 'Corporate', 'is_repeated_guest': 0, 'previous_cancellations': 0, 'previous_bookings_not_canceled': 0, 'reserved_room_type': 'A', 'assigned_room_type': 'A', 'booking_changes': 0, 'deposit_type': 'Non Refund', 'agent': nan, 'company': 174.0, 'days_in_waiting_list': 0, 'customer_type': 'Transient-Party', 'adr': 109.0, 'required_car_parking_spaces': 0, 'total_of_special_requests': 0, 'reservation_status': 'Check-Out', 'reservation_status_date': '16-10-15'}, {'hotel': 'Resort Hotel', 'is_canceled': 0, 'lead_time': 7, 'arrival_date_year': 2015, 'arrival_date_month': 'October', 'arrival_date_week_number': 42, 'arriv

In [22]:
import pickle
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import os

# Set Save Directory
save_dir = r"C:\Users\nandh\Downloads"
os.makedirs(save_dir, exist_ok=True)  # Ensure the folder exists

# Load Dataset
df = pd.read_csv("hotel_bookings.csv")

# Load Sentence Transformer Model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate Embeddings
embeddings = model.encode(df.astype(str).agg(' '.join, axis=1).tolist())

# Create FAISS Index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# ✅ Save FAISS Index
faiss.write_index(index, os.path.join(save_dir, "faiss_index.bin"))

# ✅ Save Sentence Transformer Model
model.save(os.path.join(save_dir, "sentence_transformer_model"))

# ✅ Save Processed Dataset
with open(os.path.join(save_dir, "processed_data.pkl"), "wb") as f:
    pickle.dump(df, f)

print(f"🎯 RAG Model Saved Successfully at {save_dir}!")


🎯 RAG Model Saved Successfully at C:\Users\nandh\Downloads!
