# 02 Sentiment Analysis

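This notebook performs sentiment analysis on the email dataset using NLP techniques. We'll classify emails as Positive, Negative, or Neutral using both a rule-based approach (TextBlob) and machine learning models. Note that the machine learning models are trained on the TextBlob-derived labels rather than on manually annotated data, so they approximate the rule-based output rather than an independent ground truth.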

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Silence every warning category for the rest of the session so library
# warnings do not clutter the cell output.
# NOTE(review): this also hides warnings that may point at real problems;
# consider filtering specific categories instead.
warnings.filterwarnings('ignore')

# Set style for plots
# NOTE(review): the palette is set after the style sheet, presumably so the
# "husl" colours are not replaced by the ggplot colour cycle; confirm before
# reordering these two calls.
plt.style.use('ggplot')
sns.set_palette("husl")

## 2. Load Processed Data

In [None]:
# Load the pre-processed email table (path is relative to the working directory).
df = pd.read_csv('../data/processed/email_data_processed.csv')

# Fail fast with a readable message: every later cell reads 'combined_text',
# so a missing column should surface here rather than as a KeyError further down.
assert 'combined_text' in df.columns, (
    "Expected a 'combined_text' column in email_data_processed.csv; "
    f"found: {list(df.columns)}"
)

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()

## 3. Rule-Based Sentiment Analysis (TextBlob)

In [None]:
def get_sentiment_textblob(text, threshold=0.1):
    """Classify text as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    Args:
        text: Value to analyse. It is converted with ``str()`` first, so
            non-string inputs are accepted.
        threshold: Half-width of the neutral band. A polarity strictly above
            ``threshold`` is 'Positive', strictly below ``-threshold`` is
            'Negative', and anything in between (bounds included) is
            'Neutral'. Defaults to 0.1, the cutoff used throughout this
            notebook.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.
    """
    polarity = TextBlob(str(text)).sentiment.polarity

    # Guard-clause form: the first matching band wins, Neutral is the fallback.
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

def get_polarity_score(text):
    """Return the TextBlob polarity of ``text`` (converted with ``str()`` first)."""
    sentiment = TextBlob(str(text)).sentiment
    return sentiment.polarity

# Score every email with TextBlob: a categorical label and the raw polarity
email_text = df['combined_text']
df['sentiment_textblob'] = email_text.apply(get_sentiment_textblob)
df['polarity_score'] = email_text.apply(get_polarity_score)

# Summarise the label distribution: absolute counts first, then share of all rows
sentiment_counts = df['sentiment_textblob'].value_counts()
sentiment_percentages = sentiment_counts / len(df) * 100

print("Sentiment Distribution (TextBlob):")
print(sentiment_counts)
print(f"\nPercentages:")
print(sentiment_percentages)

## 4. Sentiment Analysis Visualizations

In [None]:
# 2x2 overview figure: label shares, label counts, polarity histogram and
# (when the column is available) sentiment per email domain.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Sentiment distribution pie chart
sentiment_counts.plot(kind='pie', ax=axes[0,0], autopct='%1.1f%%', startangle=90)
axes[0,0].set_title('Sentiment Distribution')
axes[0,0].set_ylabel('')

# Sentiment distribution bar chart
# value_counts() orders the labels by frequency, not in a fixed order, so the
# colour has to be looked up per label; a positional colour list would paint
# e.g. 'Neutral' green whenever it is the most common class.
sentiment_colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'}
bar_colors = [sentiment_colors.get(label, 'gray') for label in sentiment_counts.index]
sentiment_counts.plot(kind='bar', ax=axes[0,1], color=bar_colors)
axes[0,1].set_title('Sentiment Counts')
axes[0,1].set_xlabel('Sentiment')
axes[0,1].set_ylabel('Count')
axes[0,1].tick_params(axis='x', rotation=45)

# Polarity score distribution
axes[1,0].hist(df['polarity_score'], bins=50, color='skyblue', alpha=0.7)
axes[1,0].set_title('Polarity Score Distribution')
axes[1,0].set_xlabel('Polarity Score')
axes[1,0].set_ylabel('Frequency')

# Sentiment by email domain (top 10)
if 'email_domain' in df.columns:
    top_domains = df['email_domain'].value_counts().head(10).index
    domain_sentiment = df[df['email_domain'].isin(top_domains)].groupby(['email_domain', 'sentiment_textblob']).size().unstack(fill_value=0)
    domain_sentiment.plot(kind='bar', stacked=True, ax=axes[1,1])
    axes[1,1].set_title('Sentiment by Email Domain (Top 10)')
    axes[1,1].set_xlabel('Email Domain')
    axes[1,1].set_ylabel('Count')
    axes[1,1].tick_params(axis='x', rotation=45)
else:
    # No domain column: hide the unused panel instead of showing an empty frame
    axes[1,1].axis('off')

plt.tight_layout()
plt.show()

## 5. Machine Learning-Based Sentiment Analysis

In [None]:
# The TextBlob labels stand in for ground truth here (a real project would use
# manually labelled data). Only rows whose polarity lies outside the
# [-0.1, 0.1] band are kept, i.e. the rows get_sentiment_textblob labels
# 'Neutral' by default are left out of training.
clear_sentiment_mask = df['polarity_score'].abs().gt(0.1)
df_train = df.loc[clear_sentiment_mask].copy()

label_distribution = df_train['sentiment_textblob'].value_counts()
print(f"Training data shape: {df_train.shape}")
print(f"Training sentiment distribution:\n{label_distribution}")

# Features are the email text, targets are the TextBlob labels
X, y = df_train['combined_text'], df_train['sentiment_textblob']

# Hold out 20% for evaluation; stratify so both parts keep the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training
# part only and then reused to transform the held-out part
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF feature matrix shape: {X_train_tfidf.shape}")

In [None]:
# Fit both classifiers on the same TF-IDF training matrix
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train)

# Predict the held-out split with each model
nb_pred = nb_model.predict(X_test_tfidf)
lr_pred = lr_model.predict(X_test_tfidf)

# Print the same accuracy + per-class report for each model, with a
# separator line between consecutive reports
model_reports = [
    ("Naive Bayes Results:", nb_pred),
    ("Logistic Regression Results:", lr_pred),
]
for report_index, (heading, predictions) in enumerate(model_reports):
    if report_index > 0:
        print("\n" + "="*50)
    print(heading)
    print(f"Accuracy: {accuracy_score(y_test, predictions):.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

## 6. Apply Best Model to Full Dataset and Save Results

In [None]:
# Apply the logistic regression model to the full dataset.
# Unlike the training subset, the full frame is unfiltered and may contain
# missing text. TfidfVectorizer raises on NaN documents, so blanks are
# substituted in a temporary Series (the stored 'combined_text' column is left
# untouched).
full_text = df['combined_text'].fillna('').astype(str)
X_full_tfidf = tfidf.transform(full_text)

# NOTE: the model was trained only on rows with |polarity| > 0.1, so it can
# only emit the labels present in that subset; rows TextBlob calls 'Neutral'
# are forced into one of the trained classes here.
df['sentiment_ml'] = lr_model.predict(X_full_tfidf)
# Confidence of the predicted class = highest class probability per row
df['sentiment_ml_proba'] = lr_model.predict_proba(X_full_tfidf).max(axis=1)

# Compare TextBlob vs ML results
comparison = pd.crosstab(df['sentiment_textblob'], df['sentiment_ml'], margins=True)
print("Sentiment Comparison (TextBlob vs ML):")
print(comparison)

# Save the dataset with sentiment analysis results
df.to_csv('../data/processed/email_data_with_sentiment.csv', index=False)
print("\nDataset with sentiment analysis saved to '../data/processed/email_data_with_sentiment.csv'")

# Display sample results
print("\nSample Results:")
sample_cols = ['Subject', 'sentiment_textblob', 'sentiment_ml', 'polarity_score', 'sentiment_ml_proba']
# Show only the columns that exist, so a dataset without e.g. 'Subject'
# does not end the notebook with a KeyError after the results were saved.
sample_cols = [col for col in sample_cols if col in df.columns]
print(df[sample_cols].head(10))