In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [11]:
# Step 1: Sample FAQ Data
# Each (question, answer) pair becomes one entry of the mapping. Insertion
# order is preserved, so later cells that list the keys see this order.
faq_data = dict([
    (
        "What courses are available?",
        "You can find a list of courses on our university website.",
    ),
    (
        "What are the admission requirements?",
        "Admission requirements vary by course. Check the admissions page.",
    ),
    (
        "How can I contact the admissions office?",
        "You can email admissions@university.edu or call +123456789.",
    ),
])

In [12]:
# print(faq_questions)

In [13]:
# Step 2: Vectorize FAQ Questions
# The FAQ questions (the keys of `faq_data`, in insertion order) are the only
# text the TF-IDF vocabulary is fitted on; their vectors are kept for the
# similarity look-up done later.
faq_questions = [*faq_data]
vectorizer = TfidfVectorizer()
faq_embeddings = vectorizer.fit_transform(faq_questions)

In [14]:
# Training examples for the FAQ / Non-FAQ classifier, written as
# (question, label) pairs so every label sits next to its question.
# 1 = FAQ, 0 = Non-FAQ
_labelled_questions = [
    ("What courses are available?", 1),
    ("What is the fee structure?", 0),
    ("How can I apply?", 0),
    ("Tell me about research opportunities.", 0),
    ("What are the admission requirements?", 1),
    ("How can I contact the admissions office?", 1),
]

# Parallel lists in the same order as the pairs above
questions = [text for text, _ in _labelled_questions]
labels = [flag for _, flag in _labelled_questions]


In [15]:
# Train the FAQ / Non-FAQ classifier.
# The training questions are only transformed (not re-fitted) with the
# existing `vectorizer`, so they share its feature space.
question_embeddings = vectorizer.transform(questions)
classifier = LogisticRegression()
# Left as the cell's last expression so the fitted estimator is displayed
classifier.fit(X=question_embeddings, y=labels)

In [16]:
def use_rag_pipeline(question):
    """
    Placeholder for the RAG pipeline.

    No retrieval or generation happens here: the question is only echoed
    back inside a fixed status message. Swap the body for a real RAG call
    once one is available.
    """
    placeholder_reply = f"Processing your question using RAG: '{question}'"
    return placeholder_reply

In [17]:
# Step 4: Function to Determine Workflow
def classify_and_answer(user_question, threshold=0.7):
    """
    Route a question to a stored FAQ answer or to the RAG fallback.

    The question is vectorized with the notebook-level `vectorizer` and
    passed to the notebook-level `classifier`. A truthy prediction means
    "FAQ": the question is then compared (cosine similarity) with
    `faq_embeddings` and the answer of the closest FAQ is returned, but only
    if that similarity is strictly greater than `threshold`. A falsy
    prediction sends the question to `use_rag_pipeline`.

    Parameters
    ----------
    user_question : str
        The question text to answer.
    threshold : float, default 0.7
        Exclusive lower bound on the cosine similarity required to accept the
        closest FAQ as a match. The default reproduces the previously
        hard-coded cutoff.

    Returns
    -------
    str
        The stored FAQ answer, a fixed "no matching FAQ" message, or whatever
        `use_rag_pipeline` returns.

    Notes
    -----
    This function reads the globals `vectorizer`, `classifier`,
    `faq_embeddings`, `faq_questions` and `faq_data` at call time. They must
    all come from the same fit; rebinding any of them in a later cell changes
    (or breaks) what this function does.
    """
    # Vectorize user question
    user_embedding = vectorizer.transform([user_question])

    # Anything the classifier does not flag as an FAQ goes straight to RAG
    is_faq = classifier.predict(user_embedding)[0]
    if not is_faq:
        return use_rag_pipeline(user_question)

    # Find the stored FAQ question closest to the user question
    similarities = cosine_similarity(user_embedding, faq_embeddings).flatten()
    best_match_idx = int(np.argmax(similarities))

    # Only answer from the FAQ when the closest match is confident enough
    if similarities[best_match_idx] > threshold:
        return faq_data[faq_questions[best_match_idx]]
    return "I couldn't find a matching FAQ. Please provide more details."


In [18]:
# Step 6: Test the System
# Two probes: a question that is stored verbatim as an FAQ, and one about a
# topic that has no FAQ entry.
for user_input in (
    "What are the admission requirements?",
    "Can you tell me about scholarships?",
):
    response = classify_and_answer(user_input)
    print(response)

Admission requirements vary by course. Check the admissions page.
Processing your question using RAG: 'Can you tell me about scholarships?'


In [19]:
# Probe with a loosely worded question that is not one of the stored FAQ
# questions. In the recorded run below it produced the "couldn't find a
# matching FAQ" message.
user_input = "school requirements for ML course?"
response = classify_and_answer(user_input)
print(response)


I couldn't find a matching FAQ. Please provide more details.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sample dataset
data = {
    'question': [
        "What is the university's contact number?",
        "Can you summarize the course structure for Artificial Intelligence?",
        "What are the admission requirements?",
        "Explain the differences between supervised and unsupervised learning.",
        "When does the semester start?",
        "How does the RAG pipeline work?"
    ],
    'label': ['simple', 'complex', 'simple', 'complex', 'simple', 'complex']
}

# Convert dataset to DataFrame
df = pd.DataFrame(data)

# Encode labels (simple -> 0, complex -> 1)
df['label'] = df['label'].map({'simple': 0, 'complex': 1})

# Split the raw questions BEFORE vectorizing, so the held-out questions cannot
# influence the TF-IDF vocabulary or IDF weights (train/test leakage).
# The row assignment is the same as splitting the vectorized matrix, because
# it depends only on the number of rows, test_size and random_state.
questions_train, questions_test, y_train, y_test = train_test_split(
    df['question'], df['label'], test_size=0.2, random_state=42
)

# Fit TF-IDF on the training questions only; test questions are just transformed
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(questions_train)
X_test = vectorizer.transform(questions_test)

# All questions in the same (train-fitted) feature space; kept so that the
# name `X` stays defined for any cell that still refers to it
X = vectorizer.transform(df['question'])

# Train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Predict and evaluate on the held-out questions
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

# Function to classify a new question
def classify_question(question):
    """
    Label one question as 'simple' or 'complex'.

    The text is transformed with the notebook-level `vectorizer` and scored
    by the notebook-level `classifier`; a predicted 0 maps to 'simple', any
    other prediction to 'complex'.
    """
    features = vectorizer.transform([question])
    # predict() is given a single row, so take its single prediction
    predicted = classifier.predict(features)[0]
    if predicted == 0:
        return 'simple'
    return 'complex'

# Test the function
# Runs one example question through classify_question and prints the
# predicted label inside a sentence.
new_question = "What are the available courses?"
print(f"The question '{new_question}' is classified as {classify_question(new_question)}.")


In [3]:
import json

# Read the JSON file as UTF-8 explicitly. Without `encoding`, open() uses the
# platform's locale encoding, which can mis-decode non-ASCII text in the file.
with open("data.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

In [8]:
# print(data[0])

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [10]:
# Vectorize FAQ Questions
# The keys of the first record in `data` are used as the FAQ questions, and a
# fresh TF-IDF vocabulary is fitted on them.
# NOTE(review): this rebinds `vectorizer`, `faq_questions` and
# `faq_embeddings`, which classify_and_answer reads at call time, while
# `faq_data` and `classifier` are not refreshed in this cell. Confirm they
# are rebuilt before that function is called again.
faq_questions = [*data[0].keys()]
vectorizer = TfidfVectorizer()
faq_embeddings = vectorizer.fit_transform(faq_questions)

In [12]:
# print(faq_embeddings)

In [16]:
# import pandas as pd


# df = pd.DataFrame(list(data[0].keys()), columns=['questions'])
# df['answers'] = list(data[0].values())


# # Sort DataFrame by the length of the questions
# df['length'] = df['questions'].apply(len)
# df = df.sort_values('length').drop('length', axis=1)


# df.to_excel('ques.xlsx', index=False)
# df.to_csv('ques.csv', index=False)

In [18]:
import pandas as pd

df = pd.read_excel("ques.xlsx")

# Character count of each question. The vectorized `.str.len()` yields a
# missing value for an empty cell instead of raising TypeError the way
# `apply(len)` does on NaN.
df['length'] = df['questions'].str.len()

# Write the result to a separate workbook; "ques.xlsx" itself is not modified
df.to_excel('updated_ques.xlsx', index=False)


['What courses are available?', 'What are the admission requirements?', 'How can I contact the admissions office?']


In [25]:
import pandas as pd

# Read the workbook; read_excel gives the frame a default 0..n-1 index
df = pd.read_excel("ques_dup.xlsx")

# Keep just the two columns of interest, limited to the first 50 rows
subset_df = df[['questions', 'answers']].head(50)

# Build a question -> answer mapping from the subset
questions_answers_dict = (
    subset_df
    .set_index('questions')['answers']
    .to_dict()
)

# Show the mapping
print(questions_answers_dict)


FileNotFoundError: [Errno 2] No such file or directory: 'data.xlsx'