# Exploratory Data Analysis (EDA) for NLP Dataset

# Part 1. ML Dataset EDA

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import os

# --- 1. Setup and Load Data ---
print("--- 1. Loading Data ---")
out_dir = "eda_outputs"
os.makedirs(out_dir, exist_ok=True)
sns.set(style="whitegrid", context="notebook")

# This line is now active and will download the 'stopwords' resource.
print("Checking NLTK stopwords resource...")
nltk.download('stopwords')
print("Download complete.")
# ---------------------

stop_words_set = set(stopwords.words('english'))

try:
    df = pd.read_csv("ml_dataset_final.csv")
    print(f"Loaded {df.shape[0]} rows from ml_dataset_final.csv")
except FileNotFoundError:
    print("Error: ml_dataset_final.csv not found. Exiting.")
    exit()


# --- 2. Plot Target Distribution ---
print("--- 2. Plotting Target Distribution ---")
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='label')
plt.title('Distribution of Labels (0=Not Phishing, 1=Phishing)')
plt.savefig(os.path.join(out_dir, "target_label_distribution.png"), dpi=150)
plt.close()


# --- 3. Metadata Feature Engineering ---
print("--- 3. Engineering Metadata Features ---")

df['text_length'] = df['text'].apply(lambda x: len(str(x)))
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
df['punct_count'] = df['text'].apply(lambda x: sum([1 for char in str(x) if char in string.punctuation]))
df['upper_count'] = df['text'].apply(lambda x: len([word for word in str(x).split() if word.isupper()]))

print("Metadata features created.")


# --- 4. Visualize Metadata Distributions ---
print("--- 4. Plotting Metadata Features ---")

# Map labels for clearer plot legends
df['label_name'] = df['label'].map({0: 'Not Phishing (0)', 1: 'Phishing (1)'})

# (Optional) To see which feature is causing the error, uncomment this line:
# print(df.groupby('label_name')[['text_length', 'word_count', 'punct_count', 'upper_count']].var())

meta_features = ['text_length', 'word_count', 'punct_count', 'upper_count']
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.flatten()

for i, feature in enumerate(meta_features):
    # --- THIS IS THE FIX ---
    # Changed kde=True to kde=False to avoid the LinAlgError
    sns.histplot(data=df, x=feature, hue='label_name', kde=False, ax=axes[i], bins=50, element="step")
    # ---------------------

    # Truncate x-axis at 99th percentile for better plot readability
    axes[i].set_xlim(0, df[feature].quantile(0.99))
    axes[i].set_title(f'Distribution of {feature}')

plt.tight_layout()
plt.savefig(os.path.join(out_dir, "nlp_metadata_histograms.png"), dpi=150)
plt.close()


# --- 5. Correlate Engineered Features ---
print("--- 5. Plotting Metadata Correlation Heatmap ---")

numeric_nlp_features = ['label', 'text_length', 'word_count', 'punct_count', 'upper_count']
corr_matrix = df[numeric_nlp_features].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="vlag", center=0)
plt.title('Correlation of Engineered Features and Label')
plt.tight_layout()
plt.savefig(os.path.join(out_dir, "engineered_features_heatmap.png"), dpi=150)
plt.close()


# --- 6. Source File Analysis ---
print("--- 6. Plotting Source File Analysis ---")

# Get top 10 most common source files for a clean plot
top_sources = df['source_file'].value_counts().head(10).index
df_top_sources = df[df['source_file'].isin(top_sources)]

plt.figure(figsize=(10, 8))
sns.countplot(data=df_top_sources, y='source_file', hue='label', order=top_sources)
plt.title('Label Distribution by Top 10 Source Files')
plt.tight_layout()
plt.savefig(os.path.join(out_dir, "source_file_by_label.png"), dpi=150)
plt.close()


# --- 7. N-Gram Content Analysis ---
print("--- 7. Analyzing N-Gram Frequencies ---")

def get_top_ngrams(corpus, n_gram_range=(1, 1), top_k=20):
    vec = CountVectorizer(ngram_range=n_gram_range,
                          stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_k]

def plot_top_ngrams(ngram_list, title, ax):
    ngrams = [item[0] for item in ngram_list]
    frequencies = [item[1] for item in ngram_list]
    sns.barplot(x=frequencies, y=ngrams, ax=ax, palette='viridis')
    ax.set_title(title)

# Separate corpus by label
spam_corpus = df[df['label'] == 1]['text'].astype(str)
ham_corpus = df[df['label'] == 0]['text'].astype(str)

# Get top n-grams
top_spam_unigrams = get_top_ngrams(spam_corpus, n_gram_range=(1, 1), top_k=20)
top_ham_unigrams = get_top_ngrams(ham_corpus, n_gram_range=(1, 1), top_k=20)
top_spam_bigrams = get_top_ngrams(spam_corpus, n_gram_range=(2, 2), top_k=20)
top_ham_bigrams = get_top_ngrams(ham_corpus, n_gram_range=(2, 2), top_k=20)

# Plot n-grams
fig, axes = plt.subplots(2, 2, figsize=(20, 18))
plot_top_ngrams(top_spam_unigrams, 'Top 20 Phishing Unigrams (Label 1)', axes[0, 0])
plot_top_ngrams(top_ham_unigrams, 'Top 20 Not Phishing Unigrams (Label 0)', axes[0, 1])
plot_top_ngrams(top_spam_bigrams, 'Top 20 Phishing Bigrams (Label 1)', axes[1, 0])
plot_top_ngrams(top_ham_bigrams, 'Top 20 Not Phishing Bigrams (Label 0)', axes[1, 1])

plt.tight_layout()
plt.savefig(os.path.join(out_dir, "nlp_ngram_analysis.png"), dpi=150)
plt.close()


# --- 8. Word Clouds ---
print("--- 8. Generating Word Clouds ---")

spam_text = " ".join(text for text in spam_corpus)
ham_text = " ".join(text for text in ham_corpus)

if spam_text and ham_text:
    wordcloud_spam = WordCloud(stopwords=stop_words_set, background_color="white", max_words=100, width=800, height=400).generate(spam_text)
    wordcloud_ham = WordCloud(stopwords=stop_words_set, background_color="white", max_words=100, width=800, height=400).generate(ham_text)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
    ax1.imshow(wordcloud_spam, interpolation='bilinear')
    ax1.set_title('Phishing (Label 1)', fontsize=20)
    ax1.axis("off")

    ax2.imshow(wordcloud_ham, interpolation='bilinear')
    ax2.set_title('Not Phishing (Label 0)', fontsize=20)
    ax2.axis("off")

    plt.savefig(os.path.join(out_dir, "nlp_word_clouds.png"), dpi=150)
    plt.close()
else:
    print("Skipping word clouds (empty corpus).")

print(f"\n--- NLP EDA Complete ---")
print(f"All outputs saved to: {out_dir}")

--- 1. Loading Data ---
Checking NLTK stopwords resource...
Download complete.


[nltk_data] Downloading package stopwords to /home/jk5279/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded 206883 rows from ml_dataset_final.csv
--- 2. Plotting Target Distribution ---
--- 3. Engineering Metadata Features ---
Metadata features created.
--- 4. Plotting Metadata Features ---


  axes[i].set_xlim(0, df[feature].quantile(0.99))


--- 5. Plotting Metadata Correlation Heatmap ---
--- 6. Plotting Source File Analysis ---
--- 7. Analyzing N-Gram Frequencies ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=frequencies, y=ngrams, ax=ax, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=frequencies, y=ngrams, ax=ax, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=frequencies, y=ngrams, ax=ax, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=frequencies, y=ngrams, ax=ax, palette='viridis')


--- 8. Generating Word Clouds ---

--- NLP EDA Complete ---
All outputs saved to: eda_outputs
