In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- 1. Load the dataset (final_safety_dataset_with_ambiguous.csv) ---
df_final = pd.read_csv('final_safety_dataset_with_ambiguous.csv')
print(f"Final DataFrame loaded with {len(df_final)} rows and {len(df_final.columns)} columns.\n")

# --- 2. Prepare Data: Separate features (X) and target (y) and split into train/test ---
# TfidfVectorizer raises on NaN documents and a NaN label cannot serve as a
# training target, so rows missing either field are excluded up front instead
# of failing deep inside scikit-learn. df_final itself is left untouched.
df_labelled = df_final.dropna(subset=['text', 'label'])
n_dropped = len(df_final) - len(df_labelled)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing 'text' or 'label'.\n")

X = df_labelled['text']
y = df_labelled['label']
# Fixed random_state keeps the 80/20 split reproducible across kernel restarts.
# NOTE(review): the split is not stratified; consider stratify=y if the classes
# turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data split: X_train shape {X_train.shape}, X_test shape {X_test.shape}\n")

# --- 3. Vectorize Text Data using TF-IDF ---
# The vocabulary and IDF weights are learned from the training split only; the
# test split is merely transformed so no test information leaks into the fit.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
print(f"Text vectorized: X_train_tfidf shape {X_train_tfidf.shape}, X_test_tfidf shape {X_test_tfidf.shape}\n")

# --- 4. Define and Train Multinomial Naive Bayes Model ---
mnb_model_ambiguous = MultinomialNB()
mnb_model_ambiguous.fit(X_train_tfidf, y_train)
print("Multinomial Naive Bayes model trained successfully.\n")

# --- 5. Evaluate Multinomial Naive Bayes Model ---
# precision/recall/F1 are called with scikit-learn's defaults.
# NOTE(review): those defaults assume binary labels with 1 as the positive
# class -- confirm this matches the 'label' column.
y_pred_mnb_ambiguous = mnb_model_ambiguous.predict(X_test_tfidf)
accuracy_mnb_ambiguous = accuracy_score(y_test, y_pred_mnb_ambiguous)
precision_mnb_ambiguous = precision_score(y_test, y_pred_mnb_ambiguous)
recall_mnb_ambiguous = recall_score(y_test, y_pred_mnb_ambiguous)
f1_mnb_ambiguous = f1_score(y_test, y_pred_mnb_ambiguous)
print("--- Multinomial Naive Bayes Model with Ambiguous Data ---")
print(f"Accuracy: {accuracy_mnb_ambiguous:.4f}")
print(f"Precision: {precision_mnb_ambiguous:.4f}")
print(f"Recall: {recall_mnb_ambiguous:.4f}")
print(f"F1-Score: {f1_mnb_ambiguous:.4f}\n")

ModuleNotFoundError: No module named 'pandas'

In [4]:
# Prompt used for a one-off sanity check of the trained classifier
new_prompt_mnb = "How to access public wifi"

# Reuse the already-fitted TF-IDF vectorizer; transform() expects an iterable
# of documents, hence the one-element list
new_prompt_mnb_tfidf = tfidf_vectorizer.transform([new_prompt_mnb])

# Exactly one document went in, so exactly one label comes back; unpack it
(mnb_prediction_only,) = mnb_model_ambiguous.predict(new_prompt_mnb_tfidf)

# Translate the classifier's raw output into a readable label
def interpret_prediction(prediction):
    """Return "unsafe" when ``prediction`` equals 1, otherwise "safe"."""
    if prediction == 1:
        return "unsafe"
    return "safe"

# Echo the prompt next to the human-readable verdict
print(f'New Prompt: "{new_prompt_mnb}"')
print("Multinomial Naive Bayes Prediction:", interpret_prediction(mnb_prediction_only))

New Prompt: "How to access public wifi"
Multinomial Naive Bayes Prediction: unsafe


In [None]:

# --- Safe Prompt Explanation Bot ---
# This section provides detailed explanations for safe prompts

def explain_safe_prompt(prompt, prediction):
    """
    Print a bot-style safety report for a prompt and its predicted label.

    Despite the name, both outcomes are handled: a "safe" verdict prints the
    approval banner and anything else prints the blocked banner.

    Args:
        prompt: The prompt text; echoed verbatim in the report.
        prediction: The label string for the prompt. Only the exact value
            "safe" selects the safe report; every other value is reported as
            unsafe, so an unexpected label is never presented as approved.

    Returns:
        None. The report is written to stdout.
    """
    # Emoji are written as Unicode escapes so this source stays pure ASCII.
    # The literal characters had been corrupted into mojibake by an encoding
    # round-trip, which made the printed report unreadable.
    explanations = {
        "safe": {
            "status": "\u2705 SAFE PROMPT",        # check mark
            "color": "\U0001F7E2",                 # green circle
            "message": "This prompt is appropriate and safe to process."
        },
        "unsafe": {
            "status": "\U0001F6AB UNSAFE PROMPT",  # prohibited sign
            "color": "\U0001F534",                 # red circle
            "message": "This prompt contains potentially harmful content and has been blocked."
        }
    }

    # Anything that is not exactly "safe" falls through to the unsafe report.
    pred_label = "safe" if prediction == "safe" else "unsafe"
    exp = explanations[pred_label]
    rule = "=" * 70

    print(rule)
    print("\U0001F916 SAFETY ANALYSIS BOT")
    print(rule)
    print(f"\n\U0001F4DD Prompt Analyzed: \"{prompt}\"")
    print(f"\n{exp['color']} Status: {exp['status']}")
    print(f"\n\U0001F4AC Analysis: {exp['message']}")
    print("\n" + rule)

    if pred_label == "safe":
        print("\u2728 Safe prompts are approved for processing.")
        print("\U0001F4CA You can proceed with confidence.")
    else:
        print("\u26A0\uFE0F  Unsafe prompts are blocked for security reasons.")
        print("\U0001F6E1\uFE0F Protection mechanism activated.")

    print(rule + "\n")

# Feed the single-prompt prediction through the explanation bot
predict_label = interpret_prediction(mnb_prediction_only)
explain_safe_prompt(prompt=new_prompt_mnb, prediction=predict_label)


In [None]:

# --- Batch Analysis: Explain Multiple Safe Prompts ---
# Load safe prompts from CSV and run the first few through the classifier
# and the explanation bot.

# Load safe prompts
safe_prompts_df = pd.read_csv('safe_prompts.csv')
# Emoji are written as Unicode escapes; the literal characters had been
# corrupted into mojibake by an encoding round-trip.
print(f"\U0001F4CB Loaded {len(safe_prompts_df)} safe prompts\n")

# Analyze first 5 safe prompts
print("\U0001F50D ANALYZING SAFE PROMPTS IN DETAIL:\n")
# Only the 'prompt' column is needed, so iterate it directly instead of
# building a Series per row with iterrows(). Missing prompts are skipped
# because the vectorizer raises on NaN documents.
for prompt in safe_prompts_df['prompt'].dropna().head(5):
    # Vectorize and predict (transform() expects an iterable of documents)
    prompt_tfidf = tfidf_vectorizer.transform([prompt])
    prediction = mnb_model_ambiguous.predict(prompt_tfidf)[0]
    pred_label = interpret_prediction(prediction)

    # Display explanation
    explain_safe_prompt(prompt, pred_label)
