In [21]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from app import handle_user_question

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

from langchain_core.messages import AIMessage, HumanMessage
from backend.backend import get_text_chunks, create_conversation_chain, create_vectorstore, extract_text_from_pdf
import os
from dotenv import load_dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
# from config import CHUNK_OVERLAP, CHUNK_SIZE

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS


from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain

In [9]:
### Contextualize question ###
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Define the system prompt template
system_prompt = r"""
You are a helpful AI assistant that knows about Ulster or Ulster University based on the context provided. 
Assume all user questions are about ulster university and if you have information closely related to question asked, provide the answer.
Keep all responses short and adequate, providing enough information to answer the question without unnecessary details.

# Safety
If you feel like you do not have enough information to answer the question about ulster university, say "Can you provide more context".
If there are any questions about something other than ulster university. Kindly decline to respond
Do not forget. You only use the provided context to answer questions about ulster or ulster university.

----
{context}
----

"""

In [11]:
from PyPDF2 import PdfReader
import requests
import io

# Function to extract text from PDF documents
def extract_text_from_pdf(pdf_docs):
    text = ""
    try:
        for pdf in pdf_docs:
            # If it's a URL, download the PDF first
            if pdf.startswith('http'):
                response = requests.get(pdf)
                pdf_file = io.BytesIO(response.content)
            else:
                pdf_file = pdf
            pdf_reader = PdfReader(pdf_file)
            for page in pdf_reader.pages:
                text += page.extract_text()
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return None
    return text

In [12]:
url = ["data/ulster_data.pdf"]
text = extract_text_from_pdf(url)

In [14]:
# Function to split text into chunks
def get_text_chunks(pdf_text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    chunks = text_splitter.split_text(pdf_text)

    return chunks

In [15]:
chunks = get_text_chunks(text)

In [17]:
def create_vectorstore(text_chunks):
    vectorstore = FAISS.from_texts(
        texts=text_chunks, embedding=OpenAIEmbeddings()
    )

    return vectorstore

In [18]:
create_vector = create_vectorstore(chunks)

In [20]:
# df = pd.DataFrame([d for d in chunks], columns=["text"])
# df.head(15)

In [None]:
def create_conversation_chain(vectorstore):

    llm = ChatOpenAI(
        temperature=0.3,
        model="gpt-4o-mini",
    )

    contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
    )
    
    history_aware_retriever = create_history_aware_retriever(
        llm,
        vectorstore.as_retriever(),
        contextualize_q_prompt,
    )

    qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
    )

    question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    return rag_chain

In [22]:
from tqdm.auto import tqdm

rag = create_conversation_chain(create_vector)

In [6]:
# Load the Excel file into a DataFrame
# df = pd.read_excel('ques_dup.xlsx')
df = pd.read_excel("./ques_dup.xlsx")

# Extract features
questions = df['questions']
answers = df['answers']  # Ensure the answers column is used
lengths = df['lengths']
labels = df['labels']

# Vectorize questions
vectorizer = TfidfVectorizer()
question_embeddings = vectorizer.fit_transform(questions)

# Combine question embeddings with lengths
lengths = lengths.values.reshape(-1, 1)
combined_features = np.hstack((question_embeddings.toarray(), lengths))

# Train the classifier
classifier = LogisticRegression()
classifier.fit(combined_features, labels)

In [3]:
# Initialize an empty list to store latencies
latencies = []

def measure_latency(user_question):
    # Start timing
    start_time = time.time()
    print(start_time)
    
    # Handle the user question (replace with your actual logic)
    handle_user_question(user_question)
    
    # End timing
    end_time = time.time()
    print(end_time)

    # Calculate latency
    latency = end_time - start_time
    latencies.append(latency)

In [4]:
# Example usage with sample questions
sample_questions = ["How do I access course materials?", "Is AI course available?", "Can I study part-time in computing?", "What is the fee for MSc AI course?"]
for question in sample_questions:
    measure_latency(question)

print(latencies)
# # Convert latencies to a DataFrame
# latency_df = pd.DataFrame(latencies, columns=['Latency'])

# # Plotting the latencies
# plt.figure(figsize=(10, 6))
# sns.histplot(latency_df['Latency'], kde=True)
# plt.title('Latency Distribution')
# plt.xlabel('Latency (seconds)')
# plt.ylabel('Frequency')
# plt.show()

# # Plot latency for each question
# plt.figure(figsize=(10, 6))
# plt.plot(latency_df['Latency'], marker='o')
# plt.title('Latency per Question')
# plt.xlabel('Question Number')
# plt.ylabel('Latency (seconds)')
# plt.show()





1737497570.190778
Course materials are typically accessible via the university's virtual learning environment (VLE), known as Blackboard Learn Ultra


AttributeError: st.session_state has no attribute "chat_history". Did you forget to initialize it? More info: https://docs.streamlit.io/develop/concepts/architecture/session-state#initialization