In [15]:
import os
import pandas as pd

# CSV files to inspect, listed in the order they are reported.
csv_file_list = [
    "aggression_parsed_dataset.csv",
    "attack_parsed_dataset.csv",
    "Cyberbullying_Dataset_Summary_Table__Detailed_.csv",
    "kaggle_parsed_dataset.csv",
    "twitter_sexism_parsed_dataset.csv",
    "twitter_racism_parsed_dataset.csv",
    "twitter_parsed_dataset.csv",
    "toxicity_parsed_dataset.csv",
    "youtube_parsed_dataset.csv"
]

# Report on every file; a file that cannot be read is noted and skipped.
for file_name in csv_file_list:
    # The CSV files sit in the notebook's folder, so the name doubles as the path.
    file_path = os.path.join(file_name)

    print(f"=== Processing File: {file_name} ===")

    try:
        data_frame = pd.read_csv(file_path)
    except Exception as error:
        # Unreadable file: say why, close the section, and move on.
        print(f"Error reading {file_name}: {error}")
        print("-" * 60 + "\n")
    else:
        num_rows, num_columns = data_frame.shape

        # Column names.
        print("Column Names:")
        print(data_frame.columns.tolist())

        # Per-column data types.
        print("\nData Types:")
        print(data_frame.dtypes)

        # Overall size.
        print(f"\nNumber of Rows: {num_rows}")
        print(f"Number of Columns: {num_columns}")

        # Preview of the first five rows.
        print("\nFirst 5 Rows:")
        print(data_frame.head())

        # Separator closing this file's section.
        print("\n" + "-" * 60 + "\n")


📂 Dataset: aggression_parsed_dataset.csv

📐 Structure:
index           int64
Text           object
ed_label_0    float64
ed_label_1    float64
oh_label        int64
dtype: object

📊 Summary:
                index     Text     ed_label_0     ed_label_1       oh_label
count   115864.000000   115864  115864.000000  115864.000000  115864.000000
unique            NaN   115661            NaN            NaN            NaN
top               NaN  Err:509            NaN            NaN            NaN
freq              NaN       26            NaN            NaN            NaN
mean     57931.500000      NaN       0.814172       0.185828       0.127581
std      33447.200132      NaN       0.271089       0.271089       0.333624
min          0.000000      NaN       0.000000       0.000000       0.000000
25%      28965.750000      NaN       0.750000       0.000000       0.000000
50%      57931.500000      NaN       0.900000       0.100000       0.000000
75%      86897.250000      NaN       1.000000   

<h1 style="color: #FF5733; font-size: 36px;">Digital Defender: Game Wrangling and Analysis File</h1>

<h1 style="color: #FF5733; font-size: 19px;">Data details</h1>

This script uses the os and pandas libraries to manage and process multiple CSV files stored in the same directory. It iterates over a list of CSV filenames, reads each file into a pandas DataFrame, and then prints essential details such as column names, data types, the shape of the DataFrame, and a preview of the first few rows. If a file can’t be read due to an error, the code prints an error message and continues with the next file, ensuring a clear separation between the outputs of each file. This setup provides a quick and efficient way to explore and understand the structure and content of your CSV data.

In [8]:
import os
import pandas as pd

# CSV files to inspect, listed in the order they are reported.
csv_file_list = [
    "aggression_parsed_dataset.csv",
    "attack_parsed_dataset.csv",
    "Cyberbullying_Dataset_Summary_Table__Detailed_.csv",
    "kaggle_parsed_dataset.csv",
    "twitter_sexism_parsed_dataset.csv",
    "twitter_racism_parsed_dataset.csv",
    "twitter_parsed_dataset.csv",
    "toxicity_parsed_dataset.csv",
    "youtube_parsed_dataset.csv",
    "Aggressive_All.csv",
    "Non_Aggressive_All.csv"
]

# Report on every file; a file that cannot be read is noted and skipped.
for file_name in csv_file_list:
    # The CSV files sit in the notebook's folder, so the name doubles as the path.
    file_path = os.path.join(file_name)

    print(f"=== Processing File: {file_name} ===")

    try:
        data_frame = pd.read_csv(file_path)
    except Exception as error:
        # Unreadable file: say why, close the section, and move on.
        print(f"Error reading {file_name}: {error}")
        print("-" * 60 + "\n")
    else:
        num_rows, num_columns = data_frame.shape

        # Column names.
        print("Column Names:")
        print(data_frame.columns.tolist())

        # Per-column data types.
        print("\nData Types:")
        print(data_frame.dtypes)

        # Overall size.
        print(f"\nNumber of Rows: {num_rows}")
        print(f"Number of Columns: {num_columns}")

        # Preview of the first five rows.
        print("\nFirst 5 Rows:")
        print(data_frame.head())

        # Separator closing this file's section.
        print("\n" + "-" * 60 + "\n")

=== Processing File: aggression_parsed_dataset.csv ===
Column Names:
['index', 'Text', 'ed_label_0', 'ed_label_1', 'oh_label']

Data Types:
index           int64
Text           object
ed_label_0    float64
ed_label_1    float64
oh_label        int64
dtype: object

Number of Rows: 115864
Number of Columns: 5

First 5 Rows:
   index                                               Text  ed_label_0  \
0      0  `- This is not ``creative``.  Those are the di...    0.900000   
1      1  `  :: the term ``standard model`` is itself le...    1.000000   
2      2    True or false, the situation as of March 200...    1.000000   
3      3   Next, maybe you could work on being less cond...    0.555556   
4      4               This page will need disambiguation.     1.000000   

   ed_label_1  oh_label  
0    0.100000         0  
1    0.000000         0  
2    0.000000         0  
3    0.444444         0  
4    0.000000         0  

------------------------------------------------------------

=== Pr

Data wrangling

	•	Loads the CSV: Reads your CSV file into a DataFrame.
	•	Checks for Duplicates: Counts and prints the number of duplicate rows, then removes them if found.
	•	Checks for Empty Rows: Counts and prints rows that are completely empty (all values missing) and removes them.
	•	Saves Cleaned Data: Writes the cleaned DataFrame to a new CSV file, preserving the changes.

In [None]:
import pandas as pd

from pathlib import Path

# Specify the file name (update with your file name).
# This is the only value that needs editing: the output name is derived from it.
file_name = 'your_file.csv'  # put the file we want to check

# Load the CSV file into a DataFrame
df = pd.read_csv(file_name)

# Check for duplicate rows (rows equal to an earlier row in every column)
num_duplicates = df.duplicated().sum()
print(f"Found {num_duplicates} duplicate rows.")

# Remove duplicate rows, if any
if num_duplicates > 0:
    df = df.drop_duplicates()
    print("Duplicate rows have been removed.")

# Check for empty rows (rows where all cells are NaN).
# This runs on the de-duplicated frame, so the count reflects what is left
# after the step above.
num_empty_rows = df.isnull().all(axis=1).sum()
print(f"Found {num_empty_rows} empty rows.")

# Remove empty rows, if any
if num_empty_rows > 0:
    df = df.dropna(how='all')
    print("Empty rows have been removed.")

# Save the cleaned DataFrame next to the input as "<name>_cleaned<ext>".
# Deriving the name from `file_name` keeps input and output in step
# ('your_file.csv' -> 'your_file_cleaned.csv') instead of relying on a second
# hard-coded string that is easy to forget to update.
source_path = Path(file_name)
clean_file_name = str(
    source_path.with_name(f"{source_path.stem}_cleaned{source_path.suffix}")
)
df.to_csv(clean_file_name, index=False)
print(f"Cleaned data has been saved to {clean_file_name}.")

NLP

Overview of the Process

In our pipeline, we began by loading and preprocessing a cleaned CSV file containing messages. We removed any rows with empty messages and standardized the text by converting it to lowercase and stripping extra whitespace. This ensured that our subsequent analysis was performed on a consistent and high-quality dataset.

Emotion Analysis Approaches

We employed two distinct emotion analysis methods. First, we used a lexicon-based approach with NRCLex. NRCLex utilizes the NRC Emotion Lexicon, which maps words to basic emotions such as anger, fear, joy, sadness, and disgust. By analyzing the frequency of these emotion-related words in a message, NRCLex returns a list of top emotions with their associated scores. This method is fast and interpretable, providing a straightforward snapshot of the emotional cues in the text.

In addition, we applied a transformer-based approach using a pretrained model, specifically “bhadresh-savani/distilbert-base-uncased-emotion”. This model is a distilled version of BERT that has been fine-tuned for emotion classification. It predicts multiple emotion categories—such as anger, joy, sadness, fear, love, and surprise—by returning a probability distribution over these labels for each message. We chose this model because DistilBERT is lighter and faster than the full BERT model while maintaining strong performance, and because it offers a detailed, probability-based classification that can capture nuanced emotional content.

Creating the Trigger Column

After obtaining the transformer-based emotion predictions, we created an additional column labeled “trigger.” This column identifies the dominant emotion for each message by selecting the emotion with the highest probability from the transformer’s output. This trigger word serves as an immediate indicator of the primary emotional signal in a message, which can be very useful for further analysis or for developing interactive tools such as quizzes aimed at cyber bullying prevention.

Saving the Results

Finally, the enriched DataFrame—now containing the original messages, NRCLex emotion outputs, transformer-based emotion probabilities, and the trigger column—is saved in both CSV and JSON formats. The CSV file offers ease of use for further data manipulation or viewing in spreadsheet applications, while the JSON format is ideal for integration with web-based applications and interactive educational tools.

By combining these methods, we achieve a comprehensive understanding of the emotional content in each message, enabling more informed analysis and effective strategies for cyber bullying prevention and educational initiatives.

In [None]:
import pandas as pd
import nltk
from nrclex import NRCLex
from transformers import pipeline
import textwrap
import torch
import os
import multiprocessing as mp
import logging

# Keep the transformers logger quiet: only errors get through.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Report the PyTorch version and how many GPUs torch.cuda can see.
print("Installed PyTorch version:", torch.__version__)
print("Number of GPUs available:", torch.cuda.device_count())

# Download necessary NLTK data (if needed)
nltk.download('vader_lexicon')

# ------------------- Load and Preprocess Data -------------------
file_name = "Aggressive_All_cleaned.csv"
df = pd.read_csv(file_name)
print("Columns in dataset:", df.columns.tolist())

# Keep only rows whose 'Message' is present and non-blank once trimmed.
has_text = df['Message'].notnull() & (df['Message'].astype(str).str.strip() != "")
df = df[has_text]
print("Number of rows after cleaning empty messages:", len(df))

# Normalise the text: lowercase, surrounding whitespace removed.
df['Message'] = df['Message'].astype(str).str.lower().str.strip()

# ------------------- Define Emotion Analysis Functions -------------------

def get_nrc_emotions(text):
    """
    Analyse `text` with NRCLex and return the analyser's `top_emotions` value
    (the top emotions as a list of (emotion, score) tuples).
    """
    return NRCLex(text).top_emotions

def convert_transformer_results(results):
    """Collapse a list of {'label': ..., 'score': ...} dicts into one {label: score} dict."""
    label_to_score = {}
    for entry in results:
        label_to_score[entry['label']] = entry['score']
    return label_to_score

# Per-process cache for the emotion pipeline. Loading the model is the
# expensive step, so it is built on first use and reused for every later
# batch handled by the same process.
_EMOTION_CLASSIFIER = None


def _get_emotion_classifier():
    """Return this process's emotion pipeline, creating it on first use."""
    global _EMOTION_CLASSIFIER
    if _EMOTION_CLASSIFIER is None:
        _EMOTION_CLASSIFIER = pipeline(
            "text-classification",
            model="bhadresh-savani/distilbert-base-uncased-emotion",
            framework="pt",        # explicitly use PyTorch
            device=-1,             # use CPU; change to device=0 if GPU is available
            top_k=None,            # equivalent to return_all_scores=True
            truncation=True        # truncate texts longer than model's max length
        )
    return _EMOTION_CLASSIFIER


def process_batch(batch):
    """
    Processes a batch of messages using the transformer pipeline.

    The pipeline is created once per process (on the first batch that process
    handles) and reused afterwards, rather than being rebuilt for every batch.

    Args:
        batch: list of message strings.

    Returns:
        A list with one {label: score} dictionary per message, in input order.
        An empty batch yields an empty list without touching the model.
    """
    if not batch:
        return []
    results = _get_emotion_classifier()(batch)
    # Convert each result (a list of dictionaries) into a single dictionary
    return [convert_transformer_results(result) for result in results]

# ------------------- Parallel Processing for Transformer Emotions -------------------
def main():
    """
    Annotate a sample of messages with emotions, preview them, and save them.

    Steps: draw a reproducible sample from the module-level `df`, classify the
    messages in parallel batches with `process_batch`, add NRCLex emotions and
    a 'trigger' column (the highest-scoring transformer label), print the first
    five rows, and write the sample to CSV and line-delimited JSON.
    """
    # For faster experimentation, you can sample a subset (remove or adjust sampling as needed)
    sample_size = 5000  # change or remove to process full dataset
    # Never ask for more rows than exist: sampling without replacement raises
    # when n exceeds the number of rows.
    df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)
    messages = df_sample['Message'].tolist()

    batch_size = 32  # adjust based on your hardware
    batches = [messages[i:i+batch_size] for i in range(0, len(messages), batch_size)]

    print(f"Total messages in sample: {len(messages)}")
    print(f"Total batches (batch size={batch_size}): {len(batches)}")

    # Use multiprocessing Pool to process batches in parallel.
    # NOTE(review): with the 'spawn' start method, workers must be able to
    # import `process_batch`; functions defined only in a notebook cell may not
    # be reachable there — confirm on the target platform.
    if batches:
        # More workers than batches would only start idle processes.
        worker_count = min(mp.cpu_count(), len(batches))
        with mp.Pool(worker_count) as pool:
            transformer_results = pool.map(process_batch, batches)
    else:
        transformer_results = []

    # Flatten the list of lists into a single list for each message's transformer predictions
    transformer_emotions = [item for sublist in transformer_results for item in sublist]
    df_sample['transformer_emotions'] = transformer_emotions

    # Process NRCLex emotions (this step remains sequential)
    print("Processing NRCLex emotions...")
    df_sample['nrc_emotions'] = df_sample['Message'].apply(get_nrc_emotions)

    # ------------------- Add 'trigger' Column -------------------
    # For each row, pick the emotion from transformer_emotions with the highest probability.
    df_sample['trigger'] = df_sample['transformer_emotions'].apply(
        lambda x: max(x, key=x.get) if isinstance(x, dict) and len(x) > 0 else None
    )

    # ------------------- Display Sample Results -------------------
    print("\n=== Sample Data with Emotion Predictions and Trigger Word ===")
    sample_display = df_sample[['Message', 'nrc_emotions', 'transformer_emotions', 'trigger']].head(5)
    for _, row in sample_display.iterrows():
        print("-" * 80)
        print("Message:")
        print(textwrap.fill(row['Message'], width=80))
        print("\nNRCLex Top Emotions:")
        print(row['nrc_emotions'])
        print("\nTransformer-Based Emotions:")
        print(row['transformer_emotions'])
        print("\nTrigger Emotion (Highest Probability):")
        print(row['trigger'])
        print("-" * 80 + "\n")

    # ------------------- Save the Updated DataFrame -------------------
    csv_output = "Aggressive_All_with_trigger.csv"
    json_output = "Aggressive_All_with_trigger.json"

    df_sample.to_csv(csv_output, index=False)
    df_sample.to_json(json_output, orient='records', lines=True)

    print("CSV file saved as:", csv_output)
    print("JSON file saved as:", json_output)

# Entry point: run the pipeline only when this code executes as the top-level
# module (`__name__ == "__main__"`).
if __name__ == "__main__":
    main()

In [5]:
# TEMPORARY: bypass SSL error (if it still exists)
import nltk
import ssl

# TEMPORARY workaround: replace the ssl module's default HTTPS context factory
# with the unverified one, so HTTPS contexts created through it skip
# certificate checks.
# NOTE(review): this presumably affects every HTTPS request in this kernel that
# relies on the default context, not only NLTK downloads — remove this cell
# once the underlying certificate problem is fixed.
ssl._create_default_https_context = ssl._create_unverified_context

Self-assessment tools


In [None]:
import nltk
from transformers import pipeline
from nrclex import NRCLex
import textwrap
import nltk

# NLTK data fetched up front, in this order. 'vader_lexicon' is optional and
# not needed for this example.
for resource_name in ('punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(resource_name)

# Transformer-based emotion classifier.
# device=-1 runs on CPU; use device=0 if you have a GPU.
classifier_options = {
    "model": "bhadresh-savani/distilbert-base-uncased-emotion",
    "framework": "pt",     # explicitly use PyTorch
    "device": -1,          # CPU
    "top_k": None,         # return the score of every label
    "truncation": True,    # truncate texts longer than the model's max length
}
emotion_classifier = pipeline("text-classification", **classifier_options)

def get_transformer_emotions(text):
    """
    Classify `text` with the module-level `emotion_classifier` and return a
    dictionary mapping each emotion label to its score, built from the first
    entry of the classifier's output.
    """
    first_result = emotion_classifier(text)[0]
    scores = {}
    for entry in first_result:
        scores[entry['label']] = entry['score']
    return scores

def get_nrc_emotions(text):
    """
    Analyse `text` with NRCLex and return the analyser's `top_emotions` value
    (the top emotions as a list of (emotion, score) tuples).
    """
    return NRCLex(text).top_emotions

def main():
    """
    Run an interactive loop that scores typed statements for emotion.

    Each non-blank statement is analysed with `get_transformer_emotions` and
    `get_nrc_emotions`, and the label with the highest transformer score is
    reported as the trigger emotion. Typing 'quit' (any case) ends the loop.
    Blank input is not analysed; the user is simply prompted again.
    """
    print("Welcome to the Emotion Trigger Questionnaire!")
    print("Type your statement below (or type 'quit' to exit):\n")

    while True:
        # Get user input
        statement = input("Enter your statement: ").strip()
        if statement.lower() == 'quit':
            print("Exiting the questionnaire. Goodbye!")
            break
        if not statement:
            # Nothing to analyse: scoring an empty string would report
            # emotions for text the user never wrote.
            print("Please type a statement (or 'quit' to exit).\n")
            continue

        # Process the statement using transformer and NRCLex
        transformer_results = get_transformer_emotions(statement)
        nrc_results = get_nrc_emotions(statement)
        # Determine the trigger emotion from transformer results (emotion with highest probability)
        trigger_emotion = max(transformer_results, key=transformer_results.get)

        # Display the results
        print("\n" + "-" * 80)
        print("Your Statement:")
        print(textwrap.fill(statement, width=80))
        print("\nTransformer-Based Emotion Scores:")
        for label, score in transformer_results.items():
            print(f"  {label}: {score:.3f}")
        print("\nNRCLex Top Emotions:")
        print(nrc_results)
        print("\nTrigger Emotion (Highest Transformer Score):", trigger_emotion)
        print("-" * 80 + "\n")
        
# Entry point: start the questionnaire only when this code executes as the
# top-level module (`__name__ == "__main__"`).
if __name__ == "__main__":
    main()

In [None]:
import nltk
from transformers import pipeline
from nrclex import NRCLex
import textwrap
import csv

# NLTK data fetched up front, in this order.
for resource_name in ('punkt', 'vader_lexicon', 'punkt_tab'):
    nltk.download(resource_name)

# Transformer-based emotion classifier (device=-1 runs it on CPU).
classifier_options = {
    "model": "bhadresh-savani/distilbert-base-uncased-emotion",
    "framework": "pt",
    "device": -1,
    "top_k": None,
    "truncation": True,
}
emotion_classifier = pipeline("text-classification", **classifier_options)

def get_transformer_emotions(text):
    """Classify `text` with `emotion_classifier`; return {label: score} from its first output entry."""
    first_result = emotion_classifier(text)[0]
    scores = {}
    for entry in first_result:
        scores[entry['label']] = entry['score']
    return scores

def get_nrc_emotions(text):
    """Analyse `text` with NRCLex and return its `top_emotions` (a list of (emotion, score) tuples)."""
    return NRCLex(text).top_emotions

def _ask(prompt):
    """Prompt once; return the stripped answer, or None when the user typed 'quit'."""
    answer = input(prompt).strip()
    return None if answer.lower() == 'quit' else answer


def _ask_intensity(prompt):
    """Prompt until the answer is a whole number from 1 to 10; return it as text, or None on 'quit'."""
    while True:
        answer = _ask(prompt)
        if answer is None:
            return None
        try:
            value = int(answer)
        except ValueError:
            value = None
        if value is not None and 1 <= value <= 10:
            # Stored as plain digits so it converts cleanly to a number later.
            return str(value)
        print("Please enter a whole number from 1 to 10.")


def interactive_survey():
    """
    Collects survey responses interactively from the user and returns a list of response records.

    For every message the user enters, the transformer and NRCLex emotions are
    computed and four follow-up questions are asked. A record is stored only
    when all questions were answered; typing 'quit' at any prompt ends the
    survey and discards the record in progress. Blank messages are rejected
    and the intensity answer must be a whole number from 1 to 10.

    Returns:
        list of dicts with the keys "message", "transformer_emotions",
        "nrc_emotions", "trigger", "user_feeling", "intensity", "mood_impact"
        and "response_action".
    """
    print("Welcome to the Cyber Bullying Self-Assessment Survey!")
    print("You will be asked to input examples of online messages and answer follow-up questions about how they make you feel.")
    print("Type 'quit' at any prompt to exit.\n")

    responses = []

    while True:
        message = _ask("Enter an online message: ")
        if message is None:
            break
        if not message:
            # An empty message has nothing to analyse; ask again.
            print("Please enter a message (or type 'quit' to exit).")
            continue

        # Get emotion analyses
        transformer_emotions = get_transformer_emotions(message)
        nrc_emotions = get_nrc_emotions(message)
        trigger_emotion = max(transformer_emotions, key=transformer_emotions.get)

        # Ask follow-up questions
        user_feeling = _ask("1. How does this message make you feel? ")
        if user_feeling is None:
            break

        intensity = _ask_intensity("2. On a scale of 1 (low) to 10 (high), how intense is that feeling? ")
        if intensity is None:
            break

        mood_impact = _ask("3. Does this message affect your mood? (yes/no): ")
        if mood_impact is None:
            break

        response_action = _ask("4. What would you do if you encountered this message in real life? ")
        if response_action is None:
            break

        # Store the response in a dictionary record
        record = {
            "message": message,
            "transformer_emotions": transformer_emotions,
            "nrc_emotions": nrc_emotions,
            "trigger": trigger_emotion,
            "user_feeling": user_feeling,
            "intensity": intensity,
            "mood_impact": mood_impact,
            "response_action": response_action
        }
        responses.append(record)
        print("\nResponse recorded!\n")
        cont = input("Would you like to assess another message? (yes/no): ").strip().lower()
        if cont != "yes":
            break

    return responses

def save_responses_to_csv(responses, filename="survey_responses.csv"):
    """
    Write the response records to `filename` as CSV, one row per record.

    The column order is taken from the keys of the first record. When
    `responses` is empty nothing is written and a notice is printed instead.
    """
    if responses:
        column_names = responses[0].keys()
        with open(filename, mode="w", newline="", encoding="utf-8") as handle:
            csv_writer = csv.DictWriter(handle, fieldnames=column_names)
            csv_writer.writeheader()
            csv_writer.writerows(responses)
        print(f"Responses saved to {filename}")
    else:
        print("No responses to save.")

# Run the interactive survey and save responses to CSV.
# The guard limits this to runs where the code is the top-level module
# (`__name__ == "__main__"`); the collected records stay available afterwards
# in `survey_responses`.
if __name__ == "__main__":
    survey_responses = interactive_survey()
    save_responses_to_csv(survey_responses)

In [None]:
##########################SURVEY AND MODEL 
import nltk
from transformers import pipeline
from nrclex import NRCLex
import textwrap
import csv

# NLTK data fetched up front, in this order.
for resource_name in ('punkt', 'vader_lexicon', 'punkt_tab'):
    nltk.download(resource_name)

# Transformer-based emotion classifier (device=-1 runs it on CPU).
classifier_options = {
    "model": "bhadresh-savani/distilbert-base-uncased-emotion",
    "framework": "pt",
    "device": -1,
    "top_k": None,
    "truncation": True,
}
emotion_classifier = pipeline("text-classification", **classifier_options)

def get_transformer_emotions(text):
    """Classify `text` with `emotion_classifier`; return {label: score} from its first output entry."""
    first_result = emotion_classifier(text)[0]
    scores = {}
    for entry in first_result:
        scores[entry['label']] = entry['score']
    return scores

def get_nrc_emotions(text):
    """Analyse `text` with NRCLex and return its `top_emotions` (a list of (emotion, score) tuples)."""
    return NRCLex(text).top_emotions

def _ask(prompt):
    """Prompt once; return the stripped answer, or None when the user typed 'quit'."""
    answer = input(prompt).strip()
    return None if answer.lower() == 'quit' else answer


def _ask_intensity(prompt):
    """Prompt until the answer is a whole number from 1 to 10; return it as text, or None on 'quit'."""
    while True:
        answer = _ask(prompt)
        if answer is None:
            return None
        try:
            value = int(answer)
        except ValueError:
            value = None
        if value is not None and 1 <= value <= 10:
            # Stored as plain digits so it converts cleanly to a number later.
            return str(value)
        print("Please enter a whole number from 1 to 10.")


def interactive_survey():
    """
    Collects survey responses interactively from the user and returns a list of response records.

    For every message the user enters, the transformer and NRCLex emotions are
    computed and four follow-up questions are asked. A record is stored only
    when all questions were answered; typing 'quit' at any prompt ends the
    survey and discards the record in progress. Blank messages are rejected
    and the intensity answer must be a whole number from 1 to 10.

    Returns:
        list of dicts with the keys "message", "transformer_emotions",
        "nrc_emotions", "trigger", "user_feeling", "intensity", "mood_impact"
        and "response_action".
    """
    print("Welcome to the Cyber Bullying Self-Assessment Survey!")
    print("You will be asked to input examples of online messages and answer follow-up questions about how they make you feel.")
    print("Type 'quit' at any prompt to exit.\n")

    responses = []

    while True:
        message = _ask("Enter an online message: ")
        if message is None:
            break
        if not message:
            # An empty message has nothing to analyse; ask again.
            print("Please enter a message (or type 'quit' to exit).")
            continue

        # Get emotion analyses
        transformer_emotions = get_transformer_emotions(message)
        nrc_emotions = get_nrc_emotions(message)
        trigger_emotion = max(transformer_emotions, key=transformer_emotions.get)

        # Ask follow-up questions
        user_feeling = _ask("1. How does this message make you feel? ")
        if user_feeling is None:
            break

        intensity = _ask_intensity("2. On a scale of 1 (low) to 10 (high), how intense is that feeling? ")
        if intensity is None:
            break

        mood_impact = _ask("3. Does this message affect your mood? (yes/no): ")
        if mood_impact is None:
            break

        response_action = _ask("4. What would you do if you encountered this message in real life? ")
        if response_action is None:
            break

        # Store the response in a dictionary record
        record = {
            "message": message,
            "transformer_emotions": transformer_emotions,
            "nrc_emotions": nrc_emotions,
            "trigger": trigger_emotion,
            "user_feeling": user_feeling,
            "intensity": intensity,
            "mood_impact": mood_impact,
            "response_action": response_action
        }
        responses.append(record)
        print("\nResponse recorded!\n")
        cont = input("Would you like to assess another message? (yes/no): ").strip().lower()
        if cont != "yes":
            break

    return responses

def save_responses_to_csv(responses, filename="survey_responses.csv"):
    """
    Write the response records to `filename` as CSV, one row per record.

    The column order is taken from the keys of the first record. When
    `responses` is empty nothing is written and a notice is printed instead.
    """
    if responses:
        column_names = responses[0].keys()
        with open(filename, mode="w", newline="", encoding="utf-8") as handle:
            csv_writer = csv.DictWriter(handle, fieldnames=column_names)
            csv_writer.writeheader()
            csv_writer.writerows(responses)
        print(f"Responses saved to {filename}")
    else:
        print("No responses to save.")

# Run the interactive survey and save responses to CSV.
# The guard limits this to runs where the code is the top-level module
# (`__name__ == "__main__"`); the collected records stay available afterwards
# in `survey_responses`.
if __name__ == "__main__":
    survey_responses = interactive_survey()
    save_responses_to_csv(survey_responses)

In [None]:
######################################## SURVEY PLOT
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the survey responses CSV file
df = pd.read_csv("survey_responses.csv")

# Inspect the columns
print("Columns in survey data:", df.columns.tolist())

# For trend analysis, we'll use:
# - 'trigger' (categorical trigger emotion)
# - 'intensity' (convert to numeric; unparseable answers become NaN)
# - 'mood_impact' (convert yes/no to 1/0)
df['intensity'] = pd.to_numeric(df['intensity'], errors='coerce')
# Blank answers are read back from the CSV as NaN (a float), so calling
# .strip() on each raw value would raise. Going through the string accessor
# maps every value that is not "yes" (any case, surrounding spaces ignored),
# including missing ones, to 0.
df['mood_impact_numeric'] = (
    df['mood_impact'].astype(str).str.strip().str.lower().eq('yes').astype(int)
)

# 1. Trend Analysis
# a) Frequency of trigger emotions
trigger_counts = df['trigger'].value_counts()
print("\nTrigger Emotion Frequencies:")
print(trigger_counts)

plt.figure(figsize=(8, 6))
trigger_counts.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title("Frequency of Trigger Emotions")
plt.xlabel("Trigger Emotion")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# b) Average intensity by trigger emotion
avg_intensity = df.groupby('trigger')['intensity'].mean()
print("\nAverage Intensity by Trigger Emotion:")
print(avg_intensity)

plt.figure(figsize=(8, 6))
avg_intensity.plot(kind='bar', color='coral', edgecolor='black')
plt.title("Average Intensity by Trigger Emotion")
plt.xlabel("Trigger Emotion")
plt.ylabel("Average Intensity (1-10)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Risk Identification via Clustering
# We create a feature matrix using 'intensity' and 'mood_impact_numeric'.
# Rows without a usable intensity are left out of the clustering.
features = df[['intensity', 'mood_impact_numeric']].dropna()

# Aim for 3 clusters, but never more clusters than rows: KMeans refuses to fit
# when n_clusters exceeds the number of samples, which is easy to hit with a
# handful of survey responses.
k = min(3, len(features))

if k == 0:
    # Nothing to cluster; keep the column so the summary below still works.
    df['cluster'] = float('nan')
    print("\nNo complete responses available for clustering.")
else:
    # Standardize features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # n_init is set explicitly so the result does not depend on the
    # installed scikit-learn version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(features_scaled)

    # Add cluster assignments to the DataFrame (rows left out above stay NaN)
    df.loc[features.index, 'cluster'] = clusters

    print("\nCluster Counts:")
    print(df['cluster'].value_counts())

    # Visualize clusters
    plt.figure(figsize=(8, 6))
    plt.scatter(features_scaled[:, 0], features_scaled[:, 1], c=clusters, cmap='viridis', alpha=0.6)
    plt.title("Clustering of Users by Intensity and Mood Impact")
    plt.xlabel("Standardized Intensity")
    plt.ylabel("Standardized Mood Impact")
    plt.colorbar(label='Cluster')
    plt.tight_layout()
    plt.show()

# Output insights
print("\nData-Driven Insights:")
print("Trigger Emotion Frequencies:\n", trigger_counts)
print("Average Intensity by Trigger Emotion:\n", avg_intensity)
print("Cluster assignments:\n", df['cluster'].value_counts())