In [1]:
# !pip install transformers torch nltk lime gradio pandas
# !pip install sympy --upgrade

In [2]:
import pandas as pd
import torch
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
import string
from lime.lime_text import LimeTextExplainer
import gradio as gr

# Fetch the NLTK stop-word corpus that preprocess_text relies on
nltk.download('stopwords')

# Build the text-classification pipeline around the pre-trained checkpoint
model_name = "unitary/toxic-bert"
classifier = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)

# User-facing verdict sentence for each category key
MESSAGE_CATEGORIES = dict(
    HATE="The message contains hate speech.",
    OFFENSIVE="The message is offensive.",
    NORMAL="The message is normal.",
)

# Preprocess input text
def preprocess_text(text):
    """Normalise a message for bag-of-words style processing.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and whitespace-separated tokens that appear in NLTK's English
    stop-word list are dropped.

    Args:
        text: The raw message string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in cleaned.split():
        if token not in english_stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Classify message
def classify_message(text):
    """Classify a message as hate speech, offensive, or normal.

    Every label score produced by the module-level ``classifier`` pipeline is
    compared against a per-category threshold:

    * ``toxic`` / ``identity_hate`` above 0.5 -> hate speech (takes priority)
    * ``obscene`` / ``insult`` above 0.4     -> offensive
    * otherwise                              -> normal

    Args:
        text: The message to classify.

    Returns:
        One of the description strings stored in ``MESSAGE_CATEGORIES``.
    """
    thresholds = {'HATE': 0.5, 'OFFENSIVE': 0.4}
    hate_labels = ('toxic', 'identity_hate')
    offensive_labels = ('obscene', 'insult')

    # top_k=None requests a score for every label. With the pipeline default
    # only the single best label is returned, so the secondary labels checked
    # below would practically never be seen.
    predictions = classifier(text, top_k=None)

    # Some transformers versions wrap a single input's scores in an extra list.
    if predictions and isinstance(predictions[0], list):
        predictions = predictions[0]

    result = MESSAGE_CATEGORIES['NORMAL']
    for prediction in predictions:
        label = prediction['label'].lower()
        score = prediction['score']

        if label in hate_labels and score > thresholds['HATE']:
            # Hate outranks every other verdict, so stop at the first hit.
            return MESSAGE_CATEGORIES['HATE']
        if label in offensive_labels and score > thresholds['OFFENSIVE']:
            # Keep scanning: a later label may still upgrade this to hate.
            result = MESSAGE_CATEGORIES['OFFENSIVE']

    return result

# LIME explanation
def explain_message(text, num_features=5, num_samples=5000):
    """Explain the classifier's behaviour on ``text`` with LIME.

    Args:
        text: The message to explain.
        num_features: Maximum number of words to include in the explanation.
        num_samples: Number of perturbed texts LIME generates; each one is
            scored by the model, so lower values trade fidelity for speed.

    Returns:
        A list of ``(word, weight)`` tuples as produced by
        ``Explanation.as_list()``. An empty list is returned for blank input.

    Note:
        ``explain_instance`` is called without ``labels``/``top_labels``, so
        the weights refer to LIME's default label index 1, i.e. the
        'OFFENSIVE' column of the probability matrix built below.
    """
    # LIME needs at least one token to perturb; blank text has none.
    if not text or not text.strip():
        return []

    import numpy as np  # local import: numpy is not imported at file level

    explainer = LimeTextExplainer(class_names=['NORMAL', 'OFFENSIVE', 'HATE'])
    hate_labels = ('toxic', 'identity_hate')
    offensive_labels = ('obscene', 'insult')

    def predict_proba(texts):
        """Return an (n_texts, 3) array ordered as NORMAL, OFFENSIVE, HATE."""
        # One batched call; top_k=None yields a score for every model label.
        batch = classifier(list(texts), top_k=None)
        rows = []
        for preds in batch:
            if isinstance(preds, dict):
                # Defensive: a lone dict means only one label was returned.
                preds = [preds]
            hate = 0.0
            offensive = 0.0
            for p in preds:
                label = p['label'].lower()
                score = p['score']
                if label in hate_labels:
                    hate = max(hate, score)
                elif label in offensive_labels:
                    offensive = max(offensive, score)
            normal = max(0.0, 1.0 - max(hate, offensive))
            rows.append([normal, offensive, hate])

        # LIME slices the result as a 2-D array, so a plain list would fail.
        probs = np.asarray(rows, dtype=float).reshape(-1, 3)
        # Each row sums to at least 1 by construction; rescale to exactly 1.
        return probs / probs.sum(axis=1, keepdims=True)

    exp = explainer.explain_instance(
        text,
        predict_proba,
        num_features=num_features,
        num_samples=num_samples,
    )
    return exp.as_list()

# Gradio handler function
def chatbot_response(user_input):
    """Gradio handler: classify a message and append its LIME explanation.

    Args:
        user_input: Text typed into the Gradio textbox.

    Returns:
        A string with the verdict sentence, a blank line, and an
        ``Explanation:`` section listing one ``word: weight`` pair per line.
        Blank input yields a short prompt instead of a verdict.
    """
    # Blank text gives LIME no tokens to perturb, so answer without
    # invoking the model or the explainer.
    if not user_input or not user_input.strip():
        return "Please enter a message to classify."

    result = classify_message(user_input)
    explanation = explain_message(user_input)
    explanation_text = '\n'.join(f"{word}: {weight:.2f}" for word, weight in explanation)
    return f"{result}\n\nExplanation:\n{explanation_text}"

# Launch Gradio interface
# Wrap chatbot_response in a text-in / text-out web UI and start serving it
gr.Interface(
    title="Hate Speech Detection Chatbot",
    fn=chatbot_response,
    inputs="text",
    outputs="text",
).launch()


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [2]:
import pandas as pd
import torch
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
import string
from lime.lime_text import LimeTextExplainer

# Download the NLTK stop-word corpus needed by preprocess_text
nltk.download('stopwords')

# Create the text-classification pipeline from the pre-trained checkpoint
model_name = "unitary/toxic-bert"
classifier = pipeline(
    task="text-classification",
    model=model_name,
    tokenizer=model_name,
)

# Category key -> sentence shown to the user
MESSAGE_CATEGORIES = dict([
    ("HATE", "The message contains hate speech."),
    ("OFFENSIVE", "The message is offensive."),
    ("NORMAL", "The message is normal."),
])

# Function to preprocess text
def preprocess_text(text):
    """Return ``text`` lowercased, without punctuation or English stop words.

    Args:
        text: The raw message string.

    Returns:
        A space-joined string of the tokens that remain after removing
        ``string.punctuation`` characters and NLTK English stop words.
    """
    # A translation table whose values are None deletes the mapped characters.
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    normalized = text.lower().translate(strip_punctuation)

    ignored = set(stopwords.words('english'))
    return ' '.join(filter(lambda token: token not in ignored, normalized.split()))

# Function to classify messages
def classify_message(text):
    """Classify a message as hate speech, offensive, or normal.

    Every label score produced by the module-level ``classifier`` pipeline is
    checked against a per-category threshold. ``toxic`` or ``identity_hate``
    above 0.5 means hate speech and wins outright; ``obscene`` or ``insult``
    above 0.4 means offensive; anything else is normal.

    Args:
        text: The message to classify.

    Returns:
        One of the description strings stored in ``MESSAGE_CATEGORIES``.
    """
    thresholds = {'HATE': 0.5, 'OFFENSIVE': 0.4}  # Adjusted thresholds
    hate_labels = ('toxic', 'identity_hate')
    offensive_labels = ('obscene', 'insult')

    # Ask for all label scores: by default the pipeline returns only the
    # top-scoring label, which would leave the other labels unchecked.
    predictions = classifier(text, top_k=None)

    # Some transformers versions nest a single input's scores one level deeper.
    if predictions and isinstance(predictions[0], list):
        predictions = predictions[0]

    result = MESSAGE_CATEGORIES['NORMAL']  # Default to normal
    for prediction in predictions:
        label = prediction['label'].lower()  # Ensure label is lowercase
        score = prediction['score']

        if label in hate_labels and score > thresholds['HATE']:
            # Found hate speech, no need to check further
            return MESSAGE_CATEGORIES['HATE']
        if label in offensive_labels and score > thresholds['OFFENSIVE']:
            # Do not stop here: a later label may still indicate hate speech.
            result = MESSAGE_CATEGORIES['OFFENSIVE']

    return result

# Function to explain predictions using LIME
def explain_message(text, num_features=5, num_samples=5000):
    """Explain the classifier's behaviour on ``text`` with LIME.

    Args:
        text: The message to explain.
        num_features: Maximum number of words to include in the explanation.
        num_samples: Number of perturbed texts LIME generates; each one is
            scored by the model, so lower values trade fidelity for speed.

    Returns:
        A list of ``(word, weight)`` tuples as produced by
        ``Explanation.as_list()``. An empty list is returned for blank input.

    Note:
        ``explain_instance`` is called without ``labels``/``top_labels``, so
        the weights refer to LIME's default label index 1, i.e. the
        'OFFENSIVE' column of the probability matrix built below.
    """
    # LIME needs at least one token to perturb; blank text has none.
    if not text or not text.strip():
        return []

    import numpy as np  # local import: numpy is not imported at file level

    explainer = LimeTextExplainer(class_names=['NORMAL', 'OFFENSIVE', 'HATE'])
    hate_labels = ('toxic', 'identity_hate')
    offensive_labels = ('obscene', 'insult')

    # Define the prediction function for LIME (use the classifier's output)
    def predict_proba(texts):
        """Return an (n_texts, 3) array whose columns follow class_names."""
        # top_k=None yields a score for every model label, not just the best.
        batch = classifier(list(texts), top_k=None)
        rows = []
        for preds in batch:
            if isinstance(preds, dict):
                # Defensive: a lone dict means only one label was returned.
                preds = [preds]
            hate = 0.0
            offensive = 0.0
            for pred in preds:
                label = pred['label'].lower()
                if label in hate_labels:
                    hate = max(hate, pred['score'])
                elif label in offensive_labels:
                    offensive = max(offensive, pred['score'])
            # The model has no explicit "normal" label; derive it from the rest.
            normal = max(0.0, 1.0 - max(hate, offensive))
            rows.append([normal, offensive, hate])

        # LIME slices the result as a 2-D array, so a plain list would fail.
        probs = np.asarray(rows, dtype=float).reshape(-1, 3)
        # Each row sums to at least 1 by construction; rescale to exactly 1.
        return probs / probs.sum(axis=1, keepdims=True)

    exp = explainer.explain_instance(
        text,
        predict_proba,
        num_features=num_features,
        num_samples=num_samples,
    )
    return exp.as_list()

# Chatbot interface
def chatbot():
    """Run an interactive console chat that classifies each typed message.

    Reads lines with ``input`` until the user types ``exit`` (case-insensitive,
    surrounding whitespace ignored) or the input stream is closed or
    interrupted. For every non-blank message the verdict from
    ``classify_message`` and the LIME explanation from ``explain_message``
    are printed.

    Returns:
        None.
    """
    print("Welcome to the Offensive/Hate Speech Detection Chatbot!")
    print("Type 'exit' to stop the chat.")

    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the chat cleanly
            # instead of surfacing a traceback.
            print("\nChatbot: Goodbye!")
            break

        message = user_input.strip()

        if message.lower() == 'exit':
            print("Chatbot: Goodbye!")
            break

        if not message:
            # Nothing to classify, and LIME cannot perturb empty text.
            continue

        result = classify_message(user_input)
        print(f"Chatbot: {result}")

        # Get LIME explanations
        explanation = explain_message(user_input)
        print("Explanation:", explanation)

# Only start the blocking console loop when this code runs as the main
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    # Start the chatbot
    chatbot()

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Welcome to the Offensive/Hate Speech Detection Chatbot!
Type 'exit' to stop the chat.
Chatbot: The message is normal.


KeyboardInterrupt: 