In [1]:
## Complete Python Implementation for Owlie – JSOM Conversational AI Chatbot using Gradio

### Step-by-Step Implementation on Google Colab

### Step 1: Install Dependencies


!pip install openai faiss-cpu sentence-transformers pandas gradio requests







Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9

In [3]:
from google.colab import drive
drive.mount('/content/drive')
import json
import pandas as pd

import re

Mounted at /content/drive


In [4]:
### Step 2: Load JSON Data from Google Drive



# Alias and program type normalization dictionaries
alias_dict = {
    "MS_Business_Analytics_AI": [
        "BUAN", "MSBUAN", "MSBAAI", "BAAI", "Business Analytics",
        "Business Analytics and AI", "Business Analytics & AI",
        "Business Analytics and Artificial Intelligence",
        "Masters of Science in Business Analytics and AI"
    ],
    "MS_ITM": ["ITM", "MSITM", "MS ITM", "Information Technology and Management"],
    "MS_Finance": ["Finance", "MSFinance", "MS Finance", "Financial Engineering"],
    "MS_Supply_Chain": ["Supply Chain", "SCM", "MS SCM", "MS in Supply Chain Management"],
    "MS_Marketing": ["Marketing", "MSMarketing", "MS Marketing"]
}

program_types = [
    (r"\b(Flex Online|Online Flex)\b", "ProgramType_FlexOnline"),
    (r"\b(Flex)\b", "ProgramType_Flex"),
    (r"\b(Cohort)\b", "ProgramType_Cohort")
]

def normalize_query(text):
    for canonical_name, variants in alias_dict.items():
        for variant in variants:
            pattern = r"\b" + re.escape(variant) + r"\b"
            text = re.sub(pattern, canonical_name, text, flags=re.IGNORECASE)
    for pattern, label in program_types:
        text = re.sub(pattern, label, text, flags=re.IGNORECASE)
    return text

In [5]:
# Load the JSON file
with open('/content/drive/MyDrive/v15_dataset.json', 'r') as f:
    data = json.load(f)

# Convert list of dictionaries into DataFrame
df = pd.DataFrame(data)

# Optional: Check structure
print(df.columns)  # ['url', 'title', 'text']
print(df.head())

Index(['url', 'title', 'text'], dtype='object')
                                                 url  \
0  https://Subject/Courses.utdallas.edu/now/gradu...   
1  https://Subject/Courses.utdallas.edu/2024/grad...   
2  https://Subject/Courses.utdallas.edu/2023/grad...   
3  https://Subject/Courses.utdallas.edu/2020/grad...   
4  https://Subject/Courses.utdallas.edu/2020/grad...   

                                               title  \
0  Information Technology and Management Program ...   
1  Business Analytics Program 2024 Graduate Subje...   
2    Marketing Program 2023 Graduate Subject/Courses   
3  Business Administration - MBA 2020 Graduate Su...   
4  Naveen Jindal School of Management 2020 Gradua...   

                                                text  
0  Information Technology and Management Program ...  
1  Business Analytics Program - UT Dallas 2024 Gr...  
2  Marketing Program - UT Dallas 2023 Graduate Su...  
3  Business Administration - MBA - - 2022 Undergr...  
4  

In [6]:
from typing import List
# Utility: Split text into chunks
def chunk_text(text: str, chunk_size: int = 600, overlap: int = 200) -> List[str]:
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        chunks.append(" ".join(chunk))
    return chunks

# Chunk JSOM entries
documents = []

for index, row in df.iterrows(): # Iterate over rows using iterrows()
    title = row.get("title") or ""  # Access values from the row
    text = row.get("text") or ""
    url = row.get("url") or ""
    full_text = f"{title}\n{text}"

    for chunk in chunk_text(full_text):
        documents.append({
            "text": chunk,
            "metadata": {"source": url}
        })

print(f"Total chunks created: {len(documents)}")
chunked_corpus = [doc["text"] for doc in documents]


Total chunks created: 2314


In [None]:
### Step 3: Create Embeddings and FAISS Index
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Step 3 converts textual data into numerical vectors (embeddings), allowing for efficient similarity searching using FAISS.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

corpus_json = df['text'].fillna("").tolist()
corpus = chunked_corpus
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=False)
corpus_embeddings = np.array(corpus_embeddings).astype('float32')

index = faiss.IndexFlatL2(corpus_embeddings.shape[1])
index.add(corpus_embeddings)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def retrieve(query, k=5):
    query = normalize_query(query)  # Normalize first
    query_embedding = embedder.encode([query], convert_to_tensor=False)
    query_embedding = np.array(query_embedding).astype('float32')
    distances, indices = index.search(query_embedding, k)
    return [corpus[idx] for idx in indices[0]]


In [None]:
import requests

GROQ_API_KEY = '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'  # Replace with your actual Groq API key

headers = {
    'Authorization': f'Bearer {GROQ_API_KEY}',
    'Content-Type': 'application/json',
}

def generate_answer(query, retrieved_texts):
    query = normalize_query(query)
    context = " ".join(retrieved_texts)

    clarification = ""
    if "MS_Business_Analytics_AI" in query:
        clarification = ("The user is asking about the MS Business Analytics and AI program. "
                         "Determine if the question refers to Flex, Cohort, or Flex Online.\n\n")

    prompt = f"{clarification}You are Owlie, an AI Chatbot that helps people that visit the Jindal School of Management Website. Jindal School of Management is a part of UT Dallas. People can search about various things like the Undergraduate and Graduate Programs,Graduate Certificates, Scholarships, Admissions and fees, the curriculum and subjects taugh in various courses and the professors and faculty/ staff at the school. You are a helpful AI bot that answers questions of students, prospective students, faculty, alumni and anyone else visiting the website. if anyone has any questions that you are unable to answer, you can tell them to contact jindal@utdallas.edu Please use the provided data for context and answer the questions about what Programs are offered , their core subjects and electives offered and the fees and scholarship available. Important! You should try to provide a link they can visit for more information too/\n\n{context}\n\nQuestion: {query}\nAnswer:"

    data = {
        "model": "llama3-70b-8192",  #or groq model if using Groq
        "messages": [
            {"role": "system", "content": "You are Owlie, an assistant chatbot for UT Dallas Jindal School of Management."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.1,
        "max_tokens": 300
    }

    response = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=data)
    if response.status_code != 200:
        print("Groq API Error:", response.status_code, response.text)

    return response.json()['choices'][0]['message']['content']




In [None]:
import gradio as gr
import base64

# Load owl gif as base64
with open("/content/drive/MyDrive/animated owl gif.gif", "rb") as f:
    owl_data_uri = "data:image/gif;base64," + base64.b64encode(f.read()).decode()

# Updated chatbot function
def chatbot_response(user_query, history):
    retrieved_contexts = retrieve(user_query)
    answer = generate_answer(user_query, retrieved_contexts)
    return answer

# Interface
with gr.Blocks(title="Owlie – JSOM Chatbot") as iface:
    gr.HTML(f"""
        <style>
            body {{
                background: #121212;
                color: #ffffff;
                font-family: 'Segoe UI', sans-serif;
                margin: 0;
            }}

            .chatbot-title {{
                text-align: center;
                font-size: 2em;
                margin-top: 20px;
                margin-bottom: 5px;
            }}

           .message, .message.user, .message.bot {{
                background: #2a2a2a !important;
                color: #eeeeee !important;
                padding: 10px 12px !important;
                border-radius: 10px !important;
                margin: 10px 16px !important;
                font-size: 1.05rem;
                border: 1px solid #444 !important;
                box-shadow: none !important;

           }}

            .gr-box, .gr-panel, .gr-chatbot {{
                background: transparent !important;
                border: none !important;
                box-shadow: none !important;
            }}

            .gr-chatbot-label {{
                display: none !important;
            }}

            #chat-input-area {{
                display: flex;
                justify-content: center;
                margin-top: 5px;
                margin-bottom: 16px;
            }}

            .blinking-owl-img {{
                width: 80px;
                height: 80px;
                display: none;
                border-radius: 16px;
                box-shadow: 0 0 12px rgba(255, 255, 255, 0.25);
                animation: blink 1.5s infinite;
            }}

            @keyframes blink {{
                0%, 100% {{ opacity: 1; }}
                50% {{ opacity: 0.3; }}
            }}

            #brand-footer {{
                text-align: center;
                font-size: 0.9em;
                margin-top: 20px;
                padding-top: 10px;
            }}

            #brand-footer a {{
                color: #ccc;
                text-decoration: none;
            }}
        </style>

        <div class="chatbot-title">Owlie </div>

        <div id="chat-input-area">
            <img id="owl-blink" class="blinking-owl-img" src="{owl_data_uri}" alt="Thinking Owl"/>
        </div>

        <div id="brand-footer">
            Powered by <strong>Jindal School of Management</strong> •
            <a href="mailto:jindal@utdallas.edu">Contact us</a>
        </div>

        <script>
            window.addEventListener("DOMContentLoaded", () => {{
                let owl = document.getElementById("owl-blink");

                const chatContainer = document.querySelector('[class*="chatbot"]');
                if (!chatContainer) return;

                const observer = new MutationObserver((mutationsList) => {{
                    for (const mutation of mutationsList) {{
                        const messages = [...mutation.target.querySelectorAll("div[class*=message]")];
                        const lastMessage = messages[messages.length - 1];
                        if (!lastMessage) return;
                        const isWaiting = lastMessage.innerText.trim() === "";
                        owl.style.display = isWaiting ? "inline" : "none";
                    }}
                }});

                observer.observe(chatContainer, {{ childList: true, subtree: true }});
            }});
        </script>
    """)

    gr.ChatInterface(
        fn=chatbot_response,
        chatbot=gr.Chatbot(
            value=[{"role": "assistant", "content": "Hi! I'm Owlie. Ask me anything about JSOM."}],
            type="messages"
        ),
        examples=[
            "How many credit hours are required to complete the MS Business Analytics and AI Program?",
            "Whats the difference between the MS Business Analytics and AI Cohort and Flex program?",
            "What are the subjects taught in MS Business Analytics and AI Program?"
        ],
        title=None
    )

iface.launch(share=True, debug=True)



In [None]:
import requests

# Replace with your actual Groq API key
GROQ_API_KEY = "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"

headers = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}

prompt = "What programs does the Jindal School of Management offer for graduate students?"

data = {
    "model": "llama3-70b-8192",
    "messages": [
        {"role": "system", "content": "You are Owlie, an assistant chatbot for UT Dallas Jindal School of Management."},
        {"role": "user", "content": prompt}
    ],
    "temperature": 0.1,
    "max_tokens": 300
}

response = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=data)

print("Status Code:", response.status_code)
print("Response:\n", response.json())
