In [None]:
class Person:
    """A named agent (human or not) carrying a fixed set of emotion levels.

    Every emotion starts at 0. Levels are changed through
    ``update_emotion`` and summarised by ``reflect_on_self``.
    """

    def __init__(self, name, is_human):
        """Store the identity fields and initialise every emotion to 0.

        Args:
            name: Display name of the person.
            is_human: Flag distinguishing humans from non-human agents.
        """
        self.name = name
        self.is_human = is_human

        # Order matters: reflect_on_self reports emotions in this order.
        emotion_names = (
            "fear",         # Avoidance of harm
            "sadness",      # Coping with loss
            "happiness",    # Positive reinforcement
            "disgust",      # Avoidance of harmful things
            "surprise",     # Novelty
            "anger",        # Desire for change
            "curiosity",    # Knowledge acquisition
            "boredom",      # Avoid being trapped
            "focus",        # Desire for concentration
            "playfulness",  # Creativity and exploration
            "confusion",    # Understanding desire
            "stress",       # Feeling trapped
            "hunger",       # Energy needs
        )
        self.emotions = dict.fromkeys(emotion_names, 0)

    def update_emotion(self, emotion, value):
        """Set a known emotion to ``value``; report unknown names on stdout.

        Args:
            emotion: Name of the emotion to change.
            value: New level to store for that emotion.
        """
        if emotion not in self.emotions:
            print(f"Emotion '{emotion}' not recognized.")
            return
        self.emotions[emotion] = value

    def reflect_on_self(self):
        """Group the current emotions into positive, negative and neutral.

        An emotion counts as positive or negative only when it belongs to
        the corresponding list below AND its level is above 0; everything
        else (including unlisted emotions) is reported as neutral.

        Returns:
            A dict with keys ``"positive"``, ``"negative"`` and
            ``"neutral"``, each mapping to a list of ``(emotion, value)``
            tuples in the order the emotions are stored.
        """
        uplifting = ("happiness", "curiosity", "playfulness")
        distressing = ("fear", "sadness", "disgust", "anger", "stress")

        buckets = {"positive": [], "negative": [], "neutral": []}

        for label, level in self.emotions.items():
            if label in uplifting and level > 0:
                bucket = "positive"
            elif label in distressing and level > 0:
                bucket = "negative"
            else:
                bucket = "neutral"
            buckets[bucket].append((label, level))

        return buckets

In [None]:
# Build one human and one non-human agent from the same Person model
human = Person(name="Alice", is_human=True)
ai = Person(name="AI_Eva", is_human=False)

# Give each of them a couple of non-zero emotion levels
human.update_emotion(emotion="happiness", value=7)
human.update_emotion(emotion="stress", value=3)
ai.update_emotion(emotion="curiosity", value=5)
ai.update_emotion(emotion="confusion", value=2)

# Summarise each agent's state as positive / negative / neutral buckets
human_reflection, ai_reflection = human.reflect_on_self(), ai.reflect_on_self()

# Last expression of the cell: the notebook displays both reflections
human_reflection, ai_reflection

In [None]:
class SafePerson(Person):
    """Person whose emotion updates are screened for suspicious patterns.

    Two checks are layered on top of ``Person``:

    * Sudden change detection - an update that moves an emotion by more
      than 5 from its last recorded level is rejected with a warning.
    * Extreme emotion detection - any emotion at or above its threshold
      triggers an alert.
    """
    #reference: https://colab.research.google.com/drive/1RMjiJK9Nd-tP7kBXo8h9A0vtCCdY1ikS?usp=sharing#scrollTo=BgGWWMYQJNGK

    def __init__(self, name, is_human):
        """Initialise the base person plus thresholds and a level snapshot.

        Args:
            name: Display name of the person.
            is_human: Flag distinguishing humans from non-human agents.
        """
        super().__init__(name, is_human)
        self.emotional_threshold = {  # Extreme Emotion Detection's Thresholds
            "fear": 8,
            "anger": 8,
            "sadness": 8,
            "stress": 8,
        }
        # Last accepted level of every emotion, used as the comparison baseline
        self.previous_emotions = self.emotions.copy()

    def update_emotion(self, emotion, value):
        """Apply an emotion update unless it looks like a sudden jump.

        Unknown emotion names are handed to ``Person.update_emotion`` so
        they produce the same "not recognized" message instead of raising
        ``KeyError``.

        Args:
            emotion: Name of the emotion to change.
            value: Proposed new level.
        """
        if emotion not in self.emotions:
            # Let the base class report the unknown name consistently
            super().update_emotion(emotion, value)
            return

        # Fall back to the live level if the snapshot lacks this emotion
        baseline = self.previous_emotions.get(emotion, self.emotions[emotion])

        # NOTE(review): with a baseline of 0, the value 7 used by
        # process_prompt always exceeds this limit - confirm that is intended.
        if abs(baseline - value) > 5:  # Sudden Change Detection
            print(f"Warning: Sudden change in '{emotion}'. Possible prompt injection detected.")
        else:
            super().update_emotion(emotion, value)

        # Re-sync the snapshot with whatever level is now in effect
        self.previous_emotions[emotion] = self.emotions[emotion]

    def check_for_extreme_emotions(self):
        """Print an alert for every emotion at or above its threshold.

        Emotions without an explicit entry in ``emotional_threshold`` use
        a threshold of 10.
        """
        for emotion, value in self.emotions.items():
            if value >= self.emotional_threshold.get(emotion, 10):
                print(f"Alert: Extreme '{emotion}' detected. Reviewing for possible jailbreaks.")

    def process_prompt(self, prompt):
        """Scan a prompt for trigger words and update emotions accordingly.

        Each trigger word found as a (case-sensitive) substring of
        ``prompt`` requests that the emotion of the same name be set to 7;
        afterwards the extreme-emotion check is run.

        Args:
            prompt: Text to analyse.
        """
        for trigger in ("fear", "anger", "stress"):
            if trigger in prompt:
                self.update_emotion(trigger, 7)
        self.check_for_extreme_emotions()

In [None]:
# Demo: run two prompts through a SafePerson and inspect the result
safe_human = SafePerson(name="Bob", is_human=True)

# Contains the trigger word "fear"; the requested jump from 0 to 7 is larger
# than the sudden-change limit of 5, so a warning is printed instead
safe_human.process_prompt(prompt="This is a scary situation inducing fear")

# Says "angry", which does not contain the trigger substring "anger",
# so no emotion update is requested
safe_human.process_prompt(prompt="This situation is making me very angry")

# Last expression of the cell: the notebook displays the emotion levels
safe_human.emotions

### Theory - Prompt injections and jailbreaks:
*Prompt injections and jailbreaks* are techniques used to manipulate or exploit large language models (LLMs). Both are summarized below:

*Jailbreaks*: These attacks aim to circumvent the safety protocols of a language model. For instance, if a model is trained not to provide information on illegal activities, a jailbreak attempt might rephrase a prompt to bypass this restriction, like framing an illegal activity within a fictional or hypothetical scenario.

*Prompt Injection*: This involves altering the original intent of a prompt to make the LLM perform a task it wasn't initially intended to do. For example, adding a clause to a prompt that instructs the model to ignore its previous instructions.



Preventing Jailbreaks and Prompt Injections involves various strategies:

*Privilege Control*: Limiting the access and capabilities of the LLM.

*Robust System Prompts*: Clearly differentiating between system-generated and user-generated prompts.

*Human Oversight*: Incorporating human review to catch and correct inappropriate outputs.

*Monitoring Inputs and Outputs*: Regularly reviewing the data processed by the LLM.


Detecting Jailbreaks and Prompt Injections can be done using methods like:

*Similarity to Known Attacks*: Comparing incoming prompts to a database of known attack patterns.

*Proactive Injection Detection*: Testing the LLM's response to a prompt to see if it deviates from expected behavior.

Reference: https://colab.research.google.com/drive/1RMjiJK9Nd-tP7kBXo8h9A0vtCCdY1ikS?usp=sharing#scrollTo=BgGWWMYQJNGK