In [12]:
# Further analysis of filtered_email_no_vendors and public_domain_df to identify true positives
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer 

In [13]:
lemmatizer = WordNetLemmatizer()

# Load the English stopword list once. Calling stopwords.words('english')
# inside the per-token filter rebuilt the list for every token and tested
# membership with a linear scan; a frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one email body into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with NLTK's ``word_tokenize``,
    drop English stopwords, and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str or missing value
        Raw email content. Anything ``pd.isnull`` treats as missing
        (``None``, ``NaN``) is accepted.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    if pd.isnull(text):  # Handle missing values
        return ""
    # Characters are deleted, not replaced by a space, so "don't" becomes
    # "dont" and "a,b" becomes "ab".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    tokens = word_tokenize(letters_only)
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]
    return " ".join(lemmas)

In [14]:
# Load the pickled object stored at ../data/combined_data.pkl (path is relative
# to the kernel's working directory). Later cells treat it as a DataFrame with
# a 'content' column.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
combined_data = pd.read_pickle('../data/combined_data.pkl')

In [15]:
# Run every raw email body through preprocess_text and keep the result
# alongside the original in a new 'cleaned_email' column.
combined_data['cleaned_email'] = combined_data['content'].apply(preprocess_text)

In [16]:
# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Calculate sentiment scores on the RAW email text, not on 'cleaned_email'.
# VADER's rules rely on punctuation, capitalization and negation/intensifier
# words; preprocess_text lowercases the text, strips punctuation and removes
# stopwords (NLTK's English list includes words such as "not" and "very"),
# so scoring the cleaned text can flip or flatten the polarity of a message.
# Missing content is scored as an empty string, which yields a compound
# score of 0.0.
combined_data['sentiment_score'] = (
    combined_data['content']
    .fillna('')
    .map(lambda raw_text: sia.polarity_scores(raw_text)['compound'])
)

In [17]:
# Define sensitive keywords
sensitive_keywords = [
    'password', 'confidential', 'secret', 'token', 'login', 'access',
    'invoice', 'payment', 'tax', 'contract', 'NDA', 'urgent',
    'link', 'click', 'phishing', 'SSN', 'ID number', 'personal',
    'blueprint', 'source code', 'repository', 'algorithm',
    'private', 'delete this', 'keep this secret', 'burn after reading']

# One case-insensitive, whole-word pattern covering every keyword. It is
# compiled here, so re-run this cell after editing sensitive_keywords.
_sensitive_keyword_pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in sensitive_keywords) + r')\b',
    re.IGNORECASE,
)


# Keyword detection function
def detect_keywords(content):
    """Return True if ``content`` contains any sensitive keyword or phrase.

    Matching is case-insensitive, so upper-case entries such as 'NDA' and
    'SSN' also match lowercased text. Matching is on whole words, so 'tax'
    does not fire on "syntax" and 'nda' does not fire on "monday".

    Parameters
    ----------
    content : str
        Text to scan. Non-string values (``None``, ``NaN``) are treated as
        containing no keywords.

    Returns
    -------
    bool
        True when at least one entry of ``sensitive_keywords`` occurs in
        ``content``, otherwise False.

    Notes
    -----
    NOTE(review): multi-word phrases that contain stopwords (for example
    'delete this') presumably cannot match text that has already had
    stopwords removed; scan the raw content if those phrases should fire.
    """
    if not isinstance(content, str):
        return False
    return _sensitive_keyword_pattern.search(content) is not None

# Flag each email whose cleaned text trips the sensitive-keyword detector;
# the boolean result is stored in a new 'keyword_threat' column.
combined_data['keyword_threat'] = combined_data['cleaned_email'].map(detect_keywords)