In [10]:
# STEP 1: Install Required Libraries
!pip install -q langchain sentence-transformers faiss-cpu pandas

In [11]:
# STEP 2: Load and Prepare Dataset
import zipfile, os, pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Unzip the dataset archive into the local "data" directory
dataset_path = 'chatbot dataset.zip'
with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall('data')

# Load CSV
# os.listdir() returns entries in arbitrary order, so sort the candidates to
# pick the same file on every run, and fail with a clear message (instead of a
# bare IndexError) when the archive contains no CSV at the top level.
csv_candidates = sorted(f for f in os.listdir("data") if f.endswith(".csv"))
if not csv_candidates:
    raise FileNotFoundError(
        f"No .csv file found in 'data' after extracting {dataset_path!r}"
    )
csv_file = csv_candidates[0]
df = pd.read_csv(os.path.join("data", csv_file))

# The first column is treated as the text to index.
text_column = df.columns[0]
# Drop missing texts before sampling: astype(str) below would otherwise turn
# NaN into the literal string "nan" and index it as a document.
df = df.dropna(subset=[text_column])
# DataFrame.sample raises ValueError when n exceeds the row count, so cap the
# sample size at the number of available rows.
df = df.sample(n=min(500, len(df)), random_state=42)

# Preprocessing: strip HTML-like tags and flatten newlines to spaces
df['cleaned'] = (
    df[text_column]
    .astype(str)
    .str.replace(r'<[^<]+?>', '', regex=True)
    # regex=False makes the literal replacement explicit; the default for
    # `regex` differs between pandas versions.
    .str.replace('\n', ' ', regex=False)
)
docs = [Document(page_content=txt) for txt in df['cleaned'].tolist()]

# Split text into chunks of at most 500 characters with a 50-character overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = splitter.split_documents(docs)
if not split_docs:
    raise ValueError(f"No text found in column {text_column!r} of {csv_file!r}; nothing to index")

# Embedding
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")

# FAISS index: embed every chunk and persist the index to the "faiss_index" folder
vector_store = FAISS.from_documents(split_docs, embedding_model)
vector_store.save_local("faiss_index")
print("✅ Saved FAISS index.")

✅ Saved FAISS index.


In [12]:
# Step 5: Create Streamlit App
%%writefile app.py
import streamlit as st
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os

# Page header: the same title is used for the browser tab and the on-page heading
APP_TITLE = "✈️ Travel Guide Chatbot"
st.set_page_config(page_title=APP_TITLE)
st.title(APP_TITLE)
st.write("Ask about travel tips and hotel experiences based on real reviews!")

# Vector store: reopen the index stored in the local "faiss_index" folder
@st.cache_resource
def load_vector_store():
    """Load the saved FAISS index from the "faiss_index" folder.

    The index is opened with a HuggingFaceEmbeddings instance for
    "sentence-transformers/paraphrase-MiniLM-L6-v2". The function is wrapped
    in ``st.cache_resource``.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    encoder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-MiniLM-L6-v2"
    )
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk; presumably pickle-based, so only point this
    # at an index you built yourself. Confirm against the LangChain docs.
    index = FAISS.load_local(
        "faiss_index",
        encoder,
        allow_dangerous_deserialization=True,
    )
    return index

# Local LLM: Flan-T5 small wrapped in a transformers pipeline
@st.cache_resource
def load_local_llm():
    """Build the LangChain LLM wrapper around a local Flan-T5 small pipeline.

    Loads the tokenizer and the seq2seq model for "google/flan-t5-small",
    combines them into a "text2text-generation" pipeline with
    ``max_length=512`` and wraps that pipeline in ``HuggingFacePipeline``.
    The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``HuggingFacePipeline`` instance backed by the pipeline.
    """
    checkpoint = "google/flan-t5-small"
    generator = pipeline(
        "text2text-generation",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        max_length=512,
    )
    return HuggingFacePipeline(pipeline=generator)

# Wire the cached vector store and LLM into a retrieval QA chain
vector_store = load_vector_store()
llm = load_local_llm()

# The retriever is configured with k=3 search results per query
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

# Chat UI
# Seed the conversation with a greeting when the session has no history yet
if "messages" not in st.session_state:
    st.session_state.messages = [
        {"role": "assistant", "content": "Hello! Ask me anything about hotels or travel destinations!"}
    ]
history = st.session_state.messages

# Re-render every stored message
for entry in history:
    with st.chat_message(entry["role"]):
        st.markdown(entry["content"])

# Handle a newly submitted question, if any
prompt = st.chat_input("What would you like to know?")
if prompt:
    history.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Run the QA chain inside the assistant bubble while a spinner is shown
    with st.chat_message("assistant"), st.spinner("Thinking..."):
        reply = qa_chain.run(prompt)
        st.markdown(reply)
    history.append({"role": "assistant", "content": reply})



Overwriting app.py


In [13]:
# Step 6: Run the Streamlit App in Colab
!pip install streamlit pyngrok --upgrade
!ngrok authtoken 2wSsy7ImsmmD6od8Tw41BywrTWC_5LycKZaNvg2dEfsVSXGV2

# Run Streamlit in the background
!streamlit run app.py --server.port 8501 &>/dev/null &

# Connect with the new ngrok API format
from pyngrok import ngrok
public_url = ngrok.connect(addr=8501, proto="http")  # Explicitly specify protocol
print("Your Streamlit app is live at:", public_url)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Your Streamlit app is live at: NgrokTunnel: "https://03f1-34-125-138-129.ngrok-free.app" -> "http://localhost:8501"
