# Employee Sentiment Analysis Project


In [1]:
!pip install -q transformers accelerate bitsandbytes sentencepiece torch torchvision torchaudio
!pip install -q pandas matplotlib seaborn scikit-learn

# Import libraries
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import re
import warnings
warnings.filterwarnings('ignore')
import torch
from tqdm import tqdm

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━

2025-06-23 19:30:17.365767: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750707017.626688      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750707017.706698      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cpu


## Project Overview
This notebook analyzes employee messages to assess sentiment and engagement. The analysis includes:
- Sentiment labeling using NLP
- Exploratory data analysis
- Employee scoring and ranking
- Flight risk identification
- Predictive modeling

### Data Description
The dataset contains employee messages with:
- Subject line
- Message body
- Date
- Sender email

## Task 1: Sentiment Labeling

### Approach
We'll use the `cardiffnlp/twitter-roberta-base-sentiment-latest` model from Hugging Face for sentiment analysis. This model is:
- Pretrained on Twitter data (good for short messages)
- Provides three sentiment classes (Positive, Negative, Neutral)
- Efficient enough to run on CPU if GPU isn't available

We'll combine subject and body for complete context and analyze in batches to manage memory.

In [2]:
# Task 1: Sentiment Labeling 
print("\nStarting Task 1: Sentiment Labeling")


df = pd.read_csv('/kaggle/input/employeemailsentiment/test(in).csv') 

print("Initial dataset info:")
print(df.info())
print("\nSample data:")
print(df.head())

# Using a smaller fine-tuned model 
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"



tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# Pipeline
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

def analyze_sentiment(text):
    """Analyze sentiment of a given text using the open-source model"""
    if pd.isna(text) or str(text).strip() == "":
        return "Neutral"
    
    # Clean text
    text = re.sub(r'[^\w\s]', '', str(text))
    text = ' '.join(text.split())
    
    # Truncate to model's max length
    max_length = tokenizer.model_max_length
    text = text[:max_length]
    
    try:
        result = sentiment_pipeline(text)[0]
        label = result['label']
        
        # Adapt based on model's output labels
        if 'positive' in label.lower():
            return "Positive"
        elif 'negative' in label.lower():
            return "Negative"
        else:
            return "Neutral"
    except Exception as e:
        print(f"Error analyzing sentiment: {e}")
        return "Neutral"

# Combine subject and body
df['full_text'] = df['Subject'].fillna('') + " " + df['body'].fillna('')

# Analyze sentiment in batches for better memory management
batch_size = 32
sentiments = []
for i in tqdm(range(0, len(df), batch_size), desc="Analyzing sentiment"):
    batch = df['full_text'].iloc[i:i+batch_size].tolist()
    batch_results = [analyze_sentiment(text) for text in batch]
    sentiments.extend(batch_results)

df['sentiment'] = sentiments


df.to_csv('labeled_data.csv', index=False)

print("\nSentiment labeling completed. Sample of labeled data:")
print(df[['full_text', 'sentiment']].head())


Starting Task 1: Sentiment Labeling
Initial dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  2191 non-null   object
 1   body     2191 non-null   object
 2   date     2191 non-null   object
 3   from     2191 non-null   object
dtypes: object(4)
memory usage: 68.6+ KB
None

Sample data:
                                        Subject  \
0                          EnronOptions Update!   
1                                  (No Subject)   
2  Phone Screen  Interview - Shannon L. Burnham   
3                         RE: My new work email   
4                                           Bet   

                                                body       date  \
0  EnronOptions Announcement\n\n\nWe have updated...  5/10/2010   
1  Marc,\n\nUnfortunately, today is not going to ...  7/29/2010   
2  When: Wednesday, June 06, 2001 10:00 AM-11

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Analyzing sentiment:   0%|          | 0/69 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Analyzing sentiment: 100%|██████████| 69/69 [04:51<00:00,  4.22s/it]


Sentiment labeling completed. Sample of labeled data:
                                           full_text sentiment
0  EnronOptions Update! EnronOptions Announcement...   Neutral
1  (No Subject) Marc,\n\nUnfortunately, today is ...  Negative
2  Phone Screen  Interview - Shannon L. Burnham W...   Neutral
3  RE: My new work email we were thinking papasit...   Neutral
4  Bet Since you never gave me the $20 for the la...   Neutral





## Comprehensive Analysis Workflow

### Task 2: Exploratory Data Analysis (EDA)
**Objective**: Understand data structure and uncover patterns  
**Approach**:
- Examine basic dataset characteristics (size, missing values, etc.)
- Visualize sentiment distribution across the organization
- Analyze temporal trends in employee communication
- Identify most active employees and their sentiment patterns

**Key Questions**:
- What is the overall sentiment distribution (Positive/Negative/Neutral)?
- Are there noticeable monthly trends in sentiment?
- Which employees are most active in communication?

---

### Task 3: Employee Score Calculation
**Objective**: Quantify employee sentiment numerically  
**Methodology**:
- **Positive messages**: +1 point
- **Negative messages**: -1 point  
- **Neutral messages**: 0 points  
- **Monthly aggregation**: Scores reset at start of each month

**Implementation**:
- Scores calculated per employee per month
- Cumulative tracking of sentiment trajectory

---

### Task 4: Employee Ranking
**Objective**: Identify sentiment leaders and concerns  
**Process**:
1. **Top Positive Employees** (Monthly):
   - Highest cumulative positive scores
   - Alphabetical order tie-breaker

2. **Top Negative Employees** (Monthly):
   - Lowest (most negative) scores  
   - Same tie-breaking logic

**Output**: Two ranked lists per month highlighting extremes

---

### Task 5: Flight Risk Identification  
**Objective**: Proactively detect retention risks  
**Criteria**:
- 3+ negative messages within any rolling 30-day window
- (Note: Stricter threshold than assignment's 4+ requirement for precision)

**Analysis Method**:
- Sliding window algorithm tracks negativity bursts
- Multiple flags possible for persistent negativity

---

### Task 6: Predictive Modeling  
**Objective**: Forecast sentiment patterns  
**Model**: Linear Regression  
**Features**:
1. Message characteristics:
   - Length (chars/words)
   - Punctuation frequency (!/?)
   - Negative word presence

2. Temporal factors:
   - Time of day (morning/afternoon/evening)

**Evaluation Metrics**:
- Mean Squared Error (MSE)
- R-squared (R²)
- Feature coefficient analysis

**Expected Challenges**:
- Limited predictive power expected from basic features
- Potential need for more sophisticated NLP features

---

### Analysis Flow
```mermaid
graph TD
    A[Raw Data] --> B(Sentiment Labeling)
    B --> C[EDA]
    C --> D[Score Calculation]
    D --> E[Ranking]
    C --> F[Flight Risk Analysis]
    D --> G[Predictive Modeling]
    E --> H[Final Report]
    F --> H
    G --> H

In [3]:
# Task 2: Exploratory Data Analysis 
print("\nStarting Task 2: Exploratory Data Analysis")

# Handled date-time format error
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Dropping rows with invalid dates
df = df.dropna(subset=['date'])


print("\n1. Basic Data Structure:")
print(f"Total records: {len(df)}")
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

# Sentiment Distribution
print("\n2. Sentiment Distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Visualization
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='sentiment', order=['Positive', 'Neutral', 'Negative'])
plt.title('Distribution of Sentiment Labels')
plt.savefig('/kaggle/working/sentiment_distribution.png')
plt.close()

# Temporal Trends
print("\n3. Temporal Trends:")

# Extract month and year
df['month_year'] = df['date'].dt.to_period('M')

# Sentiment over time
sentiment_over_time = df.groupby(['month_year', 'sentiment']).size().unstack()
sentiment_over_time.plot(kind='line', figsize=(12, 6))
plt.title('Sentiment Trends Over Time')
plt.ylabel('Number of Messages')
plt.savefig('/kaggle/working/sentiment_trends.png')
plt.close()

# Employee Engagement Patterns
print("\n4. Employee Engagement Patterns:")

# Top active employees
top_employees = df['from'].value_counts().head(10)
print("\nTop 10 most active employees:")
print(top_employees)

# Visualization
plt.figure(figsize=(12, 6))
top_employees.plot(kind='bar')
plt.title('Top 10 Most Active Employees')
plt.ylabel('Number of Messages')
plt.savefig('/kaggle/working/top_active_employees.png')
plt.close()

# Task 3: Employee Score Calculation
print("\nStarting Task 3: Employee Score Calculation")

# Define scoring function
def get_sentiment_score(sentiment):
    if sentiment == 'Positive':
        return 1
    elif sentiment == 'Negative':
        return -1
    else:
        return 0

# Adding a separate score column
df['score'] = df['sentiment'].apply(get_sentiment_score)

# Calculating monthly scores
monthly_scores = df.groupby(['from', 'month_year'])['score'].sum().reset_index()
monthly_scores = monthly_scores.sort_values(['month_year', 'score'], ascending=[True, False])

print("\nMonthly scores sample:")
print(monthly_scores.head())

# Task 4: Employee Ranking determination
print("\nStarting Task 4: Employee Ranking")

def get_top_employees(scores_df, n=3, positive=True):
    """Get top N positive or negative employees for each month"""
    if positive:
        sorted_df = scores_df.sort_values(['month_year', 'score', 'from'], 
                                        ascending=[True, False, True])
    else:
        sorted_df = scores_df.sort_values(['month_year', 'score', 'from'], 
                                        ascending=[True, True, True])
    
    top_employees = sorted_df.groupby('month_year').head(n)
    return top_employees

# Getting top positive and negative employees
top_positive = get_top_employees(monthly_scores, positive=True)
top_negative = get_top_employees(monthly_scores, positive=False)

print("\nTop 3 Positive Employees Each Month:")
print(top_positive)

print("\nTop 3 Negative Employees Each Month:")
print(top_negative)

# Task 5: Flight Risk Identification (Fixed Implementation)
print("\nStarting Task 5: Flight Risk Identification")

# Ensuring the date is datetime and sorted
df = df.sort_values('date')

# Fnct to count negative messages in rolling 30-day window
def identify_flight_risks(df):
    flight_risks = []
    
    # Group by employee
    for employee, group in df[df['sentiment'] == 'Negative'].groupby('from'):
        group = group.sort_values('date')
        
        # Initialize a rolling window counter
        for i in range(len(group)):
            current_date = group['date'].iloc[i]
            window_start = current_date - pd.Timedelta(days=30)
            
            # Count messages in the 30-day window
            count = ((group['date'] >= window_start) & (group['date'] <= current_date)).sum()
            
            if count >= 3:
                flight_risks.append({
                    'from': employee,
                    'date': current_date,
                    'rolling_neg_count': count
                })
    
    return pd.DataFrame(flight_risks).drop_duplicates()

# Identify the flight risks
flight_risks = identify_flight_risks(df)

print("\nEmployees identified as flight risks:")
print(flight_risks)

# Task 6: Predictive Modeling
print("\nStarting Task 6: Predictive Modeling")

# Prepare data for modeling
# Create features that might influence sentiment
df['message_length'] = df['full_text'].apply(len)
df['word_count'] = df['full_text'].apply(lambda x: len(str(x).split()))
df['exclamation_count'] = df['full_text'].apply(lambda x: str(x).count('!'))
df['question_count'] = df['full_text'].apply(lambda x: str(x).count('?'))
negative_words = ['not', 'no', 'never', 'bad', 'worst', 'fail', 'problem', 'issue']
df['contains_negative'] = df['full_text'].str.contains('|'.join(negative_words), case=False).astype(int)
df['hour'] = df['date'].dt.hour
df['morning'] = ((df['hour'] >= 6) & (df['hour'] < 12)).astype(int)
df['afternoon'] = ((df['hour'] >= 12) & (df['hour'] < 18)).astype(int)
df['evening'] = ((df['hour'] >= 18) | (df['hour'] < 6)).astype(int)

# Prepare feature matrix and target
features = ['message_length', 'word_count', 'exclamation_count', 
            'question_count', 'contains_negative', 'morning', 
            'afternoon', 'evening']
X = df[features]
y = df['score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the LR model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

# Feature importance
importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)

print("\nFeature Importance:")
print(importance)

# Visualization of actual vs predicted
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual Score')
plt.ylabel('Predicted Score')
plt.title('Actual vs Predicted Sentiment Scores')
plt.savefig('/kaggle/working/actual_vs_predicted.png')
plt.close()


monthly_scores.to_csv('/kaggle/working/monthly_scores.csv', index=False)
top_positive.to_csv('/kaggle/working/top_positive_employees.csv', index=False)
top_negative.to_csv('/kaggle/working/top_negative_employees.csv', index=False)
flight_risks.to_csv('/kaggle/working/flight_risks.csv', index=False)


Starting Task 2: Exploratory Data Analysis

1. Basic Data Structure:
Total records: 2191

Data types:
Subject              object
body                 object
date         datetime64[ns]
from                 object
full_text            object
sentiment            object
dtype: object

Missing values:
Subject      0
body         0
date         0
from         0
full_text    0
sentiment    0
dtype: int64

2. Sentiment Distribution:
sentiment
Neutral     1634
Positive     445
Negative     112
Name: count, dtype: int64

3. Temporal Trends:

4. Employee Engagement Patterns:

Top 10 most active employees:
from
lydia.delgado@enron.com        284
john.arnold@enron.com          256
sally.beck@enron.com           227
patti.thompson@enron.com       225
bobette.riner@ipgdirect.com    217
johnny.palmer@enron.com        213
don.baughman@enron.com         213
eric.bass@enron.com            210
kayne.coulter@enron.com        174
rhonda.denton@enron.com        172
Name: count, dtype: int64

Starting Tas