In [1]:
# Further analysis of filtered_email_no_vendors and public_domain_df to identify the true positives
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer 

from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Single WordNet lemmatizer instance, created once and reused by preprocess_text below.
lemmatizer = WordNetLemmatizer()

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a cached frozenset.

    Building the set once avoids re-creating the list (and scanning it
    linearly) for every single token of every email.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalise one email body into a space-separated string of lemmas.

    Steps: lower-case the text, drop every character that is not an ASCII
    letter or whitespace, tokenize with ``word_tokenize``, discard English
    stop words, and lemmatize the remaining tokens with the module-level
    ``lemmatizer``.

    Parameters
    ----------
    text : str or scalar
        Raw text. Missing values (``None``/``NaN``) yield ``""``; other
        non-string scalars are converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``""`` when nothing is left).
    """
    if pd.isnull(text):  # Handle missing values
        return ""
    if not isinstance(text, str):
        # e.g. a numeric cell in the content column; avoids AttributeError on .lower()
        text = str(text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    if not text.strip():
        # Nothing but whitespace left: skip tokenization entirely.
        return ""

    stop_words = _english_stopwords()
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return " ".join(words)

In [3]:
# Load the combined dataset from a pickle file located relative to the notebook.
# NOTE: unpickling can execute arbitrary code, so only load this file if it comes from a trusted source.
combined_data = pd.read_pickle('../data/combined_data.pkl')

In [4]:
# Clean every raw email body and keep the result next to the original text.
raw_bodies = combined_data['content']
combined_data['cleaned_email'] = raw_bodies.apply(preprocess_text)

In [5]:
# Create the sentiment analyzer once; it is reused for every email.
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for `text`."""
    return sia.polarity_scores(text)['compound']


# Score each cleaned email.
# NOTE(review): scoring runs on `cleaned_email`, i.e. after lower-casing and removal of
# punctuation and stop words -- confirm this is intended rather than scoring raw `content`.
combined_data['sentiment_score'] = combined_data['cleaned_email'].map(_compound_score)

In [6]:
# Sensitive keywords list
sensitive_keywords = [
    # General sensitive terms
    'password', 'confidential', 'secret', 'token', 'login', 'access',
    'invoice', 'payment', 'tax', 'contract', 'NDA', 'urgent', 'credentials',
    'restricted', 'classified', 'proprietary', 'secure', 'encrypted',

    # Phishing and malicious intent
    'link', 'click', 'phishing', 'malware', 'attachment', 'download', 'verify', 'authentication', 'reset',
    'security breach', 'unauthorized', 'exposure', 'compromise', 'hack', 'exploit',

    # Personal information
    'SSN', 'ID number', 'passport', 'driver’s license', 'address', 'bank account', 'credit card',
    'personal', 'medical', 'insurance', 'privacy',

    # Technical terms
    'blueprint', 'source code', 'repository', 'algorithm', 'intellectual property', 'prototype', 'specification',
    'server', 'database', 'API', 'framework', 'architecture', 'pipeline',

    # Urgent or secretive phrases
    'private', 'delete this', 'keep this secret', 'burn after reading', 'for your eyes only',
    'do not share', 'immediate attention', 'asap', 'strictly confidential',

    # Financial and operational terms
    'wire transfer', 'invoice', 'ledger', 'audit', 'compliance', 'budget', 'expense', 'profit',
    'margin', 'forecast', 'valuation', 'merger', 'acquisition',

    # Industry-specific terms (customize as needed)
    'trade secret', 'contract negotiation', 'strategic plan', 'market analysis',

    # Additional potential triggers
    'fraud', 'scam', 'deceptive', 'embezzlement', 'money laundering', 'corruption'
]
# NOTE(review): phrases that contain common stop words ('keep this secret', 'do not share', ...)
# and non-lemma forms ('credentials') may never occur in text that went through
# preprocess_text, which removes stop words and lemmatizes -- verify against the cleaned data.


def _normalize_for_matching(text):
    """Lower-case `text` and drop everything except ASCII letters and whitespace.

    This mirrors the character clean-up applied to the emails, so keywords and
    email text are compared in the same form.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())


def _build_keyword_matchers(keywords):
    """Split `keywords` into plain substrings and one whole-word regex.

    Keywords that are already in normalised form keep the plain substring
    semantics. Keywords that need normalising (upper-case acronyms such as
    'NDA', or phrases with punctuation such as "driver’s license") could never
    match lower-cased, letters-only text; they are normalised and matched as
    whole words (optional trailing 's' per word) so that short acronyms do not
    fire inside ordinary words like 'monday' or 'rapid'.

    Returns a ``(substrings, compiled_regex_or_None)`` tuple.
    """
    substrings = []
    whole_word_parts = []
    for keyword in keywords:
        if keyword == _normalize_for_matching(keyword):
            if keyword.strip() and keyword not in substrings:
                substrings.append(keyword)
            continue
        # Drop a possessive "'s" first so "driver’s" becomes "driver", not "drivers".
        without_possessive = re.sub(r"['’]s\b", '', keyword)
        words = _normalize_for_matching(without_possessive).split()
        if not words:
            continue
        part = r'\s+'.join(re.escape(word) + 's?' for word in words)
        if part not in whole_word_parts:
            whole_word_parts.append(part)

    whole_word_re = None
    if whole_word_parts:
        whole_word_re = re.compile(r'\b(?:' + '|'.join(whole_word_parts) + r')\b')
    return tuple(substrings), whole_word_re


# Built once from the list above; re-run this cell after editing `sensitive_keywords`.
_SUBSTRING_KEYWORDS, _WHOLE_WORD_KEYWORD_RE = _build_keyword_matchers(sensitive_keywords)


def detect_keywords(content):
    """Return True when `content` contains at least one sensitive keyword.

    Matching is case-insensitive and ignores punctuation and digits. Values
    that are not strings (e.g. NaN for a missing email) are reported as False.
    """
    if not isinstance(content, str):
        return False
    text = _normalize_for_matching(content)
    if any(keyword in text for keyword in _SUBSTRING_KEYWORDS):
        return True
    if _WHOLE_WORD_KEYWORD_RE is None:
        return False
    return _WHOLE_WORD_KEYWORD_RE.search(text) is not None

# Keyword-based threat detection: flag each cleaned email that contains a sensitive keyword.
cleaned_bodies = combined_data['cleaned_email']
combined_data['keyword_threat'] = cleaned_bodies.map(detect_keywords)


In [7]:
combined_data['keyword_threat'].value_counts()
# About 17% of the dataset (7,072 of 40,674 emails) was flagged as containing potential threats based on the keyword list.

keyword_threat
False    33602
True      7072
Name: count, dtype: int64

In [8]:
# Anomaly detection: an email is flagged when its sentiment is strongly positive
# (compound score above 0.8) or when the keyword scan flagged it.
strongly_positive = combined_data['sentiment_score'].gt(0.8)
combined_data['anomaly'] = strongly_positive | combined_data['keyword_threat']

# Keep the flagged rows in their own frame for closer inspection, e.g.
# anomalous_emails[['cleaned_email', 'sentiment_score', 'keyword_threat', 'anomaly']].head()
anomalous_emails = combined_data.loc[combined_data['anomaly']]

In [9]:
# How many emails were flagged as anomalous vs. not.
combined_data['anomaly'].value_counts()

anomaly
False    27749
True     12925
Name: count, dtype: int64

In [10]:
# List the columns available after the feature-engineering steps above.
combined_data.columns

Index(['user', 'content', 'num_recipients', 'threat_flag', 'cleaned_email',
       'sentiment_score', 'keyword_threat', 'anomaly'],
      dtype='object')

In [11]:
# Target: the `threat_flag` column is the label the classifiers below are trained to predict.
y = combined_data['threat_flag']

In [12]:
# Display the working DataFrame (rendered as a truncated preview).
# NOTE(review): consider combined_data.head() to keep the output small and limit exposure of user IDs.
combined_data

Unnamed: 0,user,content,num_recipients,threat_flag,cleaned_email,sentiment_score,keyword_threat,anomaly
0,ABC0174,gold negotiating 13 vice entry coach memorial ...,1,True,gold negotiating vice entry coach memorial mid...,0.7096,False,False
1,ABC0174,future always planets poorer jupiters only soo...,3,True,future always planet poorer jupiter soon specu...,-0.6249,True,True
2,ABC0174,100 bruins eight 1997 intensive fan reprimande...,1,True,bruin eight intensive fan reprimanded corey mv...,0.8402,False,True
3,ABC0174,nhls 01 season teammate home minnesota win str...,2,True,nhls season teammate home minnesota win streng...,0.8271,False,True
4,ABC0174,naturally formed nuclei rising hours york expe...,4,True,naturally formed nucleus rising hour york expe...,-0.0258,False,False
...,...,...,...,...,...,...,...,...
40669,RHC0335,benefits team passion dynamic technologies ski...,1,False,benefit team passion dynamic technology skill ...,0.9674,False,True
40670,IYB0918,offices inch alec load 1999 i delaware birthda...,2,False,office inch alec load delaware birthday freedo...,0.5106,False,False
40671,BWH0813,passion platform process technologies customer...,1,False,passion platform process technology customer s...,0.8979,False,True
40672,BWH0813,team contribute equivalent contribute benefits...,4,False,team contribute equivalent contribute benefit ...,0.9442,False,True


In [13]:
# NOTE(review): repeats the earlier `y = combined_data['threat_flag']` assignment; one of the two cells can be dropped.
y = combined_data['threat_flag']

In [14]:
# Initialize the CountVectorizer (default settings)
vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned emails and build the sparse document-term count matrix.
# NOTE(review): the vocabulary is fitted on all rows, before the train/test split further down;
# fitting on the training rows only would keep the test set fully unseen.
X = vectorizer.fit_transform(combined_data['cleaned_email'])

In [64]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes look imbalanced in the
# recorded report -- consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: logistic regression with class weights balanced against the label frequencies.
model = LogisticRegression(
    class_weight='balanced', max_iter=500, random_state=42
).fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate the model on the held-out rows
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
print("Classification Report:\n", baseline_report)

Accuracy: 0.7165334972341734
Classification Report:
               precision    recall  f1-score   support

       False       0.88      0.78      0.83      7052
        True       0.17      0.28      0.21      1083

    accuracy                           0.72      8135
   macro avg       0.52      0.53      0.52      8135
weighted avg       0.78      0.72      0.75      8135



In [None]:


# Hyper-parameter search for logistic regression: 5-fold CV, candidates ranked by macro-averaged F1.
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'class_weight': ['balanced', None],
    'max_iter': [4000],
}

base_estimator = LogisticRegression(random_state=42)
grid = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)

# From here on `model` refers to the best estimator found by the search.
model = grid.best_estimator_

y_pred = model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred)
tuned_report = classification_report(y_test, y_pred)
print("Accuracy:", tuned_accuracy)
print("Classification Report:\n", tuned_report)

In [77]:



# Class weighting: ratio of negative to positive training labels, passed to XGBoost as scale_pos_weight.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
pos_weight = n_negative / n_positive

xgb_model = XGBClassifier(random_state=42, scale_pos_weight=pos_weight)
xgb_model.fit(X_train, y_train)

# Probability of the positive class, turned into labels with a custom cut-off instead of 0.5.
threshold = 0.3
y_prob = xgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > threshold).astype(int)

# Evaluate
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_report = classification_report(y_test, y_pred)
xgb_auc = roc_auc_score(y_test, y_prob)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", xgb_report)
print("ROC AUC Score:", xgb_auc)

Accuracy: 0.28211432083589427
Classification Report:
               precision    recall  f1-score   support

       False       0.89      0.20      0.32      7052
        True       0.14      0.84      0.24      1083

    accuracy                           0.28      8135
   macro avg       0.51      0.52      0.28      8135
weighted avg       0.79      0.28      0.31      8135

ROC AUC Score: 0.5547720560469149


In [69]:


# Oversample the minority class on the training rows only; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Build the estimator explicitly instead of refitting whatever `model` happens to be bound to:
# depending on which cells ran last, that could be the baseline or the grid-search winner
# (which would then be refitted in place).
# NOTE(review): hyper-parameters mirror the baseline logistic regression; confirm that this is
# the configuration intended for the SMOTE experiment.
model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=500)
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.7888137676705593
Classification Report:
               precision    recall  f1-score   support

       False       0.87      0.88      0.88      7052
        True       0.19      0.18      0.18      1083

    accuracy                           0.79      8135
   macro avg       0.53      0.53      0.53      8135
weighted avg       0.78      0.79      0.79      8135



In [18]:
## Logistic Regression: simple and interpretable; a good baseline.
## Random Forest: handles non-linear relationships and works well with mixed data types.
## XGBoost: high-performance gradient boosting; supports class weighting (scale_pos_weight) for imbalanced datasets.