# 02 Sentiment Analysis

This notebook performs sentiment analysis on the email dataset using NLP techniques. We'll classify emails into Positive, Negative, or Neutral sentiments using both rule-based and machine learning approaches.

## 1. Import Required Libraries

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
# Suppress every warning for the rest of the session
warnings.filterwarnings(action='ignore')

# Plot appearance: ggplot base style first, then the seaborn "husl" palette on top
plt.style.use('ggplot')
sns.set_palette(palette='husl')

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon only when it is not already installed.
# NLTK stores the lexicon under the "sentiment" category, so the lookup has to
# use the full resource path. A bare 'vader_lexicon' never resolves, which
# made this cell attempt a download on every run.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Initialize VADER analyzer (shared by the scoring helpers defined later)
vader_analyzer = SentimentIntensityAnalyzer()

## 2. Load Processed Data

In [None]:
# Read the processed email CSV into the working frame
df = pd.read_csv(
    '../data/processed/email_data_processed.csv'
)

# Quick structural summary: (rows, columns) and the column names
print("Dataset shape: {}".format(df.shape))
print("Columns: {}".format(df.columns.tolist()))
df.head()

## 3. Rule-Based Sentiment Analysis (TextBlob and VADER)

In [None]:
def get_sentiment_textblob(text):
    """Label a text as 'Positive', 'Negative' or 'Neutral' using TextBlob.

    The input is coerced with ``str()`` before scoring. A polarity strictly
    above 0.1 is 'Positive', strictly below -0.1 is 'Negative', and anything
    in between (both bounds included) is 'Neutral'.
    """
    score = TextBlob(str(text)).sentiment.polarity

    if score > 0.1:
        return 'Positive'
    if score < -0.1:
        return 'Negative'
    return 'Neutral'

def get_sentiment_vader(text):
    """Label a text as 'Positive', 'Negative' or 'Neutral' using VADER.

    The input is coerced with ``str()`` and scored with the module-level
    ``vader_analyzer``. A compound score of 0.05 or more is 'Positive',
    -0.05 or less is 'Negative', and anything strictly in between is
    'Neutral'.
    """
    compound_score = vader_analyzer.polarity_scores(str(text))['compound']

    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'

def get_combined_sentiment(text):
    """Merge the TextBlob and VADER labels into a single sentiment label.

    Decision table:
      * both methods agree            -> that shared label
      * exactly one says 'Neutral'    -> the other (non-neutral) label
      * both non-neutral but differ   -> the VADER label

    Every row of that table reduces to one rule: take VADER's label unless
    VADER is 'Neutral', in which case take TextBlob's.
    """
    label_textblob = get_sentiment_textblob(text)
    label_vader = get_sentiment_vader(text)

    return label_textblob if label_vader == 'Neutral' else label_vader

def get_polarity_scores(text):
    """Return the raw TextBlob and VADER scores for a text as a flat dict.

    The input is coerced with ``str()``. Keys, in order:
    ``textblob_polarity``, ``textblob_subjectivity``, ``vader_compound``,
    ``vader_positive``, ``vader_negative``, ``vader_neutral``.
    """
    as_text = str(text)
    tb_sentiment = TextBlob(as_text).sentiment
    vader = vader_analyzer.polarity_scores(as_text)

    scores = {}
    scores['textblob_polarity'] = tb_sentiment.polarity
    scores['textblob_subjectivity'] = tb_sentiment.subjectivity
    scores['vader_compound'] = vader['compound']
    scores['vader_positive'] = vader['pos']
    scores['vader_negative'] = vader['neg']
    scores['vader_neutral'] = vader['neu']
    return scores

# Apply sentiment analysis to the dataset
print("Applying sentiment analysis...")
df['sentiment_textblob'] = df['combined_text'].apply(get_sentiment_textblob)
df['sentiment_vader'] = df['combined_text'].apply(get_sentiment_vader)
df['sentiment_final'] = df['combined_text'].apply(get_combined_sentiment)

# Get detailed scores for analysis (one dict of raw scores per row)
polarity_scores = df['combined_text'].apply(get_polarity_scores)

# Build the score frame on df's own index so the column-wise concat below
# aligns row-for-row even if df does not carry a default RangeIndex.
polarity_df = pd.DataFrame(polarity_scores.tolist(), index=df.index)

# Drop score columns left over from a previous execution before attaching the
# fresh ones. Without this, re-running the cell duplicates the columns and
# df['vader_compound'] turns into a DataFrame instead of a Series.
df = pd.concat(
    [df.drop(columns=polarity_df.columns, errors='ignore'), polarity_df],
    axis=1,
)

print("Sentiment analysis completed!")
print("\nSentiment Distribution (TextBlob):")
print(df['sentiment_textblob'].value_counts())
print("\nSentiment Distribution (VADER):")
print(df['sentiment_vader'].value_counts())
print("\nSentiment Distribution (Combined - FINAL):")
print(df['sentiment_final'].value_counts())

## 4. Sentiment Analysis Visualizations

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Label counts per method (reused by the pie charts and the comparison bars)
sentiment_tb = df['sentiment_textblob'].value_counts()
sentiment_vader = df['sentiment_vader'].value_counts()
sentiment_final = df['sentiment_final'].value_counts()

# Top row: one pie chart per labelling method
pie_panels = [
    (sentiment_tb, 'TextBlob Sentiment Distribution'),
    (sentiment_vader, 'VADER Sentiment Distribution'),
    (sentiment_final, 'Combined Sentiment Distribution (FINAL)'),
]
for pie_ax, (label_counts, panel_title) in zip(axes[0], pie_panels):
    label_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title(panel_title)
    pie_ax.set_ylabel('')

# Comparison bar chart; a label missing from one method shows up as 0
comparison_data = pd.DataFrame({
    'TextBlob': sentiment_tb,
    'VADER': sentiment_vader,
    'Combined': sentiment_final
}).fillna(0)

comparison_data.plot(kind='bar', ax=axes[1,0])
axes[1,0].set_title('Sentiment Method Comparison')
axes[1,0].set_xlabel('Sentiment')
axes[1,0].set_ylabel('Count')
axes[1,0].tick_params(axis='x', rotation=45)

# VADER compound score distribution, with the +/-0.05 labelling thresholds marked
axes[1,1].hist(df['vader_compound'], bins=50, color='skyblue', alpha=0.7)
axes[1,1].set_title('VADER Compound Score Distribution')
axes[1,1].set_xlabel('Compound Score')
axes[1,1].set_ylabel('Frequency')
axes[1,1].axvline(x=0.05, color='green', linestyle='--', label='Positive threshold')
axes[1,1].axvline(x=-0.05, color='red', linestyle='--', label='Negative threshold')
axes[1,1].legend()

# Sentiment by email domain (top 10)
if 'email_domain' in df.columns:
    top_domains = df['email_domain'].value_counts().head(10).index
    domain_sentiment = df[df['email_domain'].isin(top_domains)].groupby(['email_domain', 'sentiment_final']).size().unstack(fill_value=0)
    domain_sentiment.plot(kind='bar', stacked=True, ax=axes[1,2])
    axes[1,2].set_title('Final Sentiment by Email Domain (Top 10)')
    axes[1,2].set_xlabel('Email Domain')
    axes[1,2].set_ylabel('Count')
    axes[1,2].tick_params(axis='x', rotation=45)
else:
    # No domain column available: hide the panel instead of leaving empty axes
    axes[1,2].set_visible(False)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
figure_path = Path('../visualizations/sentiment_analysis_comparison.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Agreement analysis: rows are TextBlob labels, columns are VADER labels
agreement_analysis = pd.crosstab(df['sentiment_textblob'], df['sentiment_vader'], margins=True)
print("\nAgreement between TextBlob and VADER:")
print(agreement_analysis)

# Calculate agreement percentage (share of rows where both methods give the same label)
total_agreements = (df['sentiment_textblob'] == df['sentiment_vader']).sum()
agreement_percentage = (total_agreements / len(df)) * 100
print(f"\nOverall agreement between TextBlob and VADER: {agreement_percentage:.2f}%")

## 5. Machine Learning-Based Sentiment Analysis

In [None]:
# Use TextBlob sentiment as labels for training (in real scenario, you'd have manually labeled data)
# For demonstration, we'll use a subset of data with clear sentiments.
# The filter must use the same TextBlob polarity the labels were derived from.
# That score is stored in 'textblob_polarity'; the 'polarity_score' column
# referenced before is never created in this notebook.
clear_sentiment_mask = df['textblob_polarity'].abs() > 0.1
df_train = df[clear_sentiment_mask].copy()

print(f"Training data shape: {df_train.shape}")
print(f"Training sentiment distribution:\n{df_train['sentiment_textblob'].value_counts()}")

# Prepare data for ML models.
# Coerce to str, matching the str() coercion applied when the labels were
# computed, so the vectorizer never receives a non-string document.
X = df_train['combined_text'].astype(str)
y = df_train['sentiment_textblob']

# Split the data (stratified so both splits keep the label proportions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorization: fit on the training split only, then reuse the
# fitted vocabulary on the test split
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF feature matrix shape: {X_train_tfidf.shape}")

In [None]:
# Fit both classifiers on the same TF-IDF features and predict on the held-out split
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)

lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)

# Print accuracy and the per-class report for each model, with a divider between reports
model_predictions = [("Naive Bayes", nb_pred), ("Logistic Regression", lr_pred)]
for report_idx, (model_label, predicted) in enumerate(model_predictions):
    if report_idx > 0:
        print("\n" + "=" * 50)
    print(f"{model_label} Results:")
    print(f"Accuracy: {accuracy_score(y_test, predicted):.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predicted))

## 6. Save Results and Summary Report

In [None]:
# Persist the frame, including every sentiment column added above
df.to_csv('../data/processed/email_data_with_sentiment.csv', index=False)
print("Dataset with sentiment analysis saved to '../data/processed/email_data_with_sentiment.csv'")

# ---- Summary report ----
report_rule = "=" * 60
print("\n" + report_rule)
print("TASK 1: SENTIMENT LABELING - SUMMARY REPORT")
print(report_rule)

# Dataset overview: row count, distinct 'from' values, and the 'date' min/max
print("\nDataset Overview:")
print(f"- Total messages analyzed: {len(df):,}")
print(f"- Unique employees: {df['from'].nunique():,}")
print(f"- Date range: {df['date'].min()} to {df['date'].max()}")

# Count and share of each final label
print("\nFinal Sentiment Distribution:")
sentiment_counts = df['sentiment_final'].value_counts()
for label, n_messages in sentiment_counts.items():
    share = (n_messages / len(df)) * 100
    print(f"- {label}: {n_messages:,} messages ({share:.1f}%)")

# agreement_percentage comes from the agreement analysis in the visualization cell
print("\nMethod Comparison:")
print(f"- TextBlob-VADER Agreement: {agreement_percentage:.2f}%")
print("- Combined method used for final labels")

# First ten rows of the key result columns, printed as plain text
print("\nSample Results:")
sample_cols = ['Subject', 'sentiment_final', 'vader_compound', 'textblob_polarity']
sample_data = df[sample_cols].head(10)
print(sample_data.to_string(index=False))

print("\nOutput Files:")
print("- Processed dataset: ../data/processed/email_data_with_sentiment.csv")
print("- Visualization: ../visualizations/sentiment_analysis_comparison.png")

print("\n" + report_rule)
print("Task 1 completed successfully!")
print("Next: Run Task 2 (Exploratory Data Analysis)")
print(report_rule)