In [16]:
# Install Necessary Libraries
!pip install scikit-learn transformers gradio torch pandas sentence-transformers



In [28]:
# Import Libraries
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch
import re
import logging
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [29]:
# Load the Dataset from GitHub
# The CSV is fetched over the network each time this cell runs; the URL points at
# the repository's "development" branch, so its contents can change between runs.
# Later cells read the 'question' and 'answer' columns of this frame.
url = "https://raw.githubusercontent.com/jamesiswanto/DAPHNE-AI/refs/heads/development/Conversation.csv"
data = pd.read_csv(url)

In [30]:
# Data Cleaning
# Data cleaning function that removes null entries, trims whitespace, and standardizes case
def clean_data(data):
    """Return a cleaned copy of the question/answer dataset.

    Rows with a missing 'question' or 'answer' are dropped, and both columns
    are converted to text, stripped of surrounding whitespace and lowercased.

    Args:
        data: DataFrame containing at least 'question' and 'answer' columns.

    Returns:
        A new DataFrame holding the surviving rows (original index labels are
        kept). The input frame is left untouched, so re-running the calling
        cell is safe.

    Raises:
        KeyError: if 'question' or 'answer' is not a column of ``data``.
    """
    # Work on a copy instead of dropna(inplace=True) so the caller's frame is
    # never mutated and the column assignments below cannot touch a view.
    cleaned = data.dropna(subset=['question', 'answer']).copy()
    for column in ('question', 'answer'):
        # astype(str) keeps non-text cells (e.g. numeric answers) usable; the
        # .str accessor alone would turn them into NaN or raise on a numeric column.
        cleaned[column] = cleaned[column].astype(str).str.strip().str.lower()
    return cleaned

# Clean the dataset
# Rebinds `data` to the frame returned by clean_data(); the TF-IDF / embedding
# cell and find_best_response() read 'question' and 'answer' from this `data`.
data = clean_data(data)

In [31]:
# Set Up Logging
# force=True removes any handlers already attached to the root logger before
# configuring it. Notebook environments commonly pre-install one, and without
# force basicConfig() then does nothing, so no messages would reach the file.
logging.basicConfig(filename="chatbot_logs.txt", level=logging.INFO, force=True)

In [32]:
# Preprocessing Function
def preprocess_input(user_input):
    """Normalize a user message before it is matched against the dataset.

    Every character that is neither a word character nor whitespace is deleted,
    then the text is lowercased and leading/trailing whitespace is trimmed.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', user_input)
    lowered = without_punctuation.lower()
    return lowered.strip()

In [33]:
# Initialize Chatbot Model (DialoGPT)
# The tokenizer and the causal-LM weights are loaded from the same checkpoint name.
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Token ids of the conversation so far; stays None until respond_to_user() stores
# the first generation result. This is a single module-level value, so every call
# to respond_to_user() reads and writes the same history.
chat_history_ids = None

In [35]:
# Load Sentence Transformer for Semantic Similarity
# The same encoder embeds the dataset questions and, inside find_best_response(),
# each incoming user message, so their vectors are directly comparable.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

In [36]:
# Prepare TF-IDF Vectorizer and Fit to Questions
# Dataset questions go through preprocess_input() -- the same normalization that
# find_best_response() applies to the user's message -- so both sides of every
# similarity comparison have punctuation removed. Without this, a stored
# "what's up?" and a user's normalized "whats up" yield different tokens.
normalized_questions = [preprocess_input(question) for question in data['question'].tolist()]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(normalized_questions)

# Calculate Sentence Embeddings for the Questions
# Row i of both tfidf_matrix and question_embeddings corresponds to row i
# (positional) of `data`, which is what find_best_response() relies on with .iloc.
question_embeddings = sentence_model.encode(normalized_questions, convert_to_tensor=True)

In [25]:
# Define Function to Find Best Response
def find_best_response(user_input, threshold=0.9):
    """Find the best response from the dataset using TF-IDF and semantic similarity.

    The message is normalized with preprocess_input(), scored against every
    dataset question with (a) TF-IDF cosine similarity and (b) sentence-embedding
    cosine similarity, and the two scores are averaged with equal weight.

    Args:
        user_input: Raw message text from the user.
        threshold: Combined similarity the best match must strictly exceed to
            be returned. Defaults to 0.9, the previously hard-coded cut-off.

    Returns:
        The 'answer' of the best-matching dataset row, or None when the message
        is empty after normalization, there are no questions to compare
        against, or no question scores above ``threshold``.
    """
    user_input = preprocess_input(user_input)

    # Nothing left to match on (e.g. the message consisted only of punctuation).
    if not user_input:
        return None

    # Transform the user input into TF-IDF vector
    user_input_vector = vectorizer.transform([user_input])

    # Calculate cosine similarity using TF-IDF
    tfidf_similarities = cosine_similarity(user_input_vector, tfidf_matrix).flatten()

    # Calculate semantic similarity using Sentence Transformers
    user_input_embedding = sentence_model.encode(user_input, convert_to_tensor=True)
    semantic_similarities = util.pytorch_cos_sim(user_input_embedding, question_embeddings).flatten().cpu().numpy()

    # Combine similarities equally
    combined_similarities = (tfidf_similarities + semantic_similarities) / 2

    # argmax() raises on an empty array, so bail out when there is nothing to rank.
    if combined_similarities.size == 0:
        return None

    # Find the index of the highest similarity (positional, hence .iloc below)
    best_match_index = int(combined_similarities.argmax())

    # Only accept the match if it clears the similarity threshold
    if combined_similarities[best_match_index] > threshold:
        return data['answer'].iloc[best_match_index]

    return None

In [37]:
# Define Chatbot Response Function
def respond_to_user(user_input):
    """Generate a response using the dataset or fallback to the DialoGPT model if necessary.

    Args:
        user_input: Message text typed by the user. None or whitespace-only
            input is rejected with a prompt to enter a message.

    Returns:
        The reply text: the matched dataset answer when find_best_response()
        returns one, otherwise the text generated by the DialoGPT model. A fixed
        apology string is returned if anything raises while producing a reply.

    Side effects:
        Replaces the module-level ``chat_history_ids`` after each model
        generation and logs every user/bot exchange.

    NOTE(review): ``chat_history_ids`` is a single global shared by every
    caller, so simultaneous users of the app share one conversation history.
    Per-session state would require changing this function's interface.
    """
    global chat_history_ids

    # generate() below is capped at max_length=1000 tokens *including* the
    # prompt, so the accumulated history must stay well under that limit or the
    # model is left with no room to produce a reply.
    max_context_tokens = 800

    # Input validation (None has no .strip(), so check it explicitly)
    if user_input is None or not user_input.strip():
        return "Please enter a valid message."

    try:
        # Look for a relevant response in the dataset
        dataset_response = find_best_response(user_input)
        if dataset_response:
            # Log dataset-sourced replies too, not only model-generated ones
            logging.info(f"User: {user_input}")
            logging.info(f"Bot: {dataset_response}")
            return dataset_response  # Use the found dataset response

        # Fallback to the model if no dataset response is found
        new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')
        if chat_history_ids is not None:
            bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1)
        else:
            bot_input_ids = new_input_ids

        # Keep only the most recent tokens so long conversations keep working
        bot_input_ids = bot_input_ids[:, -max_context_tokens:]

        # Generate response with DialoGPT
        chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
        # The output starts with the prompt; decode only the newly generated tail
        bot_response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)

        # Log conversation
        logging.info(f"User: {user_input}")
        logging.info(f"Bot: {bot_response}")

        return bot_response

    except Exception as e:
        # Deliberately broad: the UI should always get a reply. logging.exception
        # records the traceback along with the message.
        logging.exception(f"Error occurred: {str(e)}")
        return "An error occurred while processing your request. Please try again."

In [38]:
# Create Gradio Interface
# Build the two text widgets up front, then wire them to the chatbot callback.
user_message_box = gr.Textbox(
    label="Your Message",
    placeholder="Type your message here...",
    lines=2,
)
bot_reply_box = gr.Textbox(label="DAPHNE AI")

iface = gr.Interface(
    fn=respond_to_user,
    inputs=user_message_box,
    outputs=bot_reply_box,
    title="DAPHNE AI",
    description="Developed by James Siswanto",
    allow_flagging="never",
)

# Start serving the app
iface.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a9702e385ce1b75ad7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


