# Phase 1: Exploratory Data Analysis
Analysis of the synthetic social engineering dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Global plot appearance. Keep this order: `set_palette` changes matplotlib's
# color cycle, which a later `style.use` call would overwrite.
plt.style.use('ggplot')
sns.set_palette('viridis')

# Load the raw dataset (path is resolved against the current working
# directory) and show its first rows as a sanity check.
raw_csv_path = '../data/raw/synthetic_data.csv'
df = pd.read_csv(raw_csv_path)
preview = df.head()
display(preview)

## 1. Label Distribution
How balanced are our multi-label targets?

In [None]:
# Target columns examined throughout the notebook.
labels = ['urgency', 'authority', 'fear', 'impersonation']

# Column-wise totals, largest first. For 0/1 indicator columns this is the
# number of positive rows per label (assumes binary encoding -- TODO confirm).
label_counts = df[labels].sum()
label_counts = label_counts.sort_values(ascending=False)

# One bar per label, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)
ax.set_title('Distribution of Social Engineering Labels')
ax.set_ylabel('Count')
plt.show()

## 2. Text Length Analysis
Are certain attack types longer or shorter?

In [None]:
# Character count of each message. `Series.str.len()` is the vectorized
# equivalent of `apply(len)` and, unlike it, does not raise TypeError when a
# row's text is missing (NaN): such rows get a NaN length instead.
df['text_length'] = df['text'].str.len()

# Histogram of message lengths (20 bins) with a kernel density estimate overlay.
plt.figure(figsize=(10, 6))
sns.histplot(df['text_length'], bins=20, kde=True)
plt.title('Distribution of Message Lengths')
plt.show()

## 3. Co-occurrence Matrix
Which labels often appear together? (e.g., Urgency and Fear)

In [None]:
# Entry (i, j) is the dot product of label columns i and j. For 0/1 indicator
# columns that is the number of rows carrying both labels, and the diagonal
# holds each label's own total (assumes integer 0/1 labels -- TODO confirm;
# the 'd' annotation format below also expects integer values).
label_matrix = df[labels]
co_occurrence = label_matrix.T @ label_matrix

# Annotated heatmap of the label-by-label matrix.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(co_occurrence, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Label Co-occurrence Matrix')
plt.show()