# Task 6: Predictive Modeling

This notebook implements **Task 6** of the Employee Sentiment Analysis project. The objective is to develop a linear regression model that analyzes sentiment trends and predicts each employee's monthly sentiment score from message-activity features.

## Model Objectives:
- **Analyze sentiment trends** using various independent variables
- **Predict sentiment scores** based on message characteristics
- **Identify significant factors** that influence sentiment

## Selected Features:
- **Message frequency** (messages per month)
- **Message length** (total character count per month)
- **Average message length** (mean and standard deviation of character count per month)
- **Word count** (total and average)
- **Temporal features** (month and proportion of weekend messages; day of week enters only through the weekend flag)
- **Employee email domain** (label-encoded as an integer category)

## Requirements:
- Split data into training and testing sets
- Develop and validate linear regression model
- Evaluate using appropriate metrics
- Interpret results and significance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Read the two processed CSV files
email_csv = '../data/processed/email_data_with_sentiment.csv'
scores_csv = '../data/processed/monthly_scores.csv'
df = pd.read_csv(email_csv)
monthly_scores = pd.read_csv(scores_csv)

# Report the (rows, columns) shape of each loaded frame
print(f"Dataset loaded: {df.shape}")
print(f"Monthly scores loaded: {monthly_scores.shape}")

# Feature Engineering for Predictive Modeling
def create_features_for_prediction(df):
    """Build one row of modeling features per employee per calendar month.

    Messages are grouped by sender (``from``) and by the month of ``date``;
    each group is reduced to the aggregates described under Returns.

    Args:
        df: Message-level DataFrame with the columns ``from``, ``date``,
            ``sentiment_score``, ``combined_text_length``,
            ``subject_word_count``, ``body_word_count``, ``sentiment_final``,
            ``is_weekend`` and ``email_domain``. The frame is only read;
            it is not modified.

    Returns:
        tuple: ``(monthly_features, le)``. ``monthly_features`` has the
        columns ``employee``, ``year_month``, ``monthly_score``,
        ``avg_msg_length``, ``std_msg_length``, ``total_msg_length``,
        ``avg_subject_words``, ``total_subject_words``, ``avg_body_words``,
        ``total_body_words``, ``message_frequency``, ``weekend_proportion``,
        ``email_domain``, ``domain_encoded``, ``month`` and ``year``.
        ``le`` is the ``LabelEncoder`` fitted on ``email_domain``.
    """
    # Parse dates into a separate Series so the caller's DataFrame keeps its
    # original ``date`` column instead of being overwritten in place.
    year_month = (
        pd.to_datetime(df['date']).dt.to_period('M').rename('year_month')
    )

    # Named aggregation binds each output column to its source column and
    # reducer, so the result no longer depends on positional renaming.
    monthly_features = (
        df.groupby(['from', year_month])
        .agg(
            monthly_score=('sentiment_score', 'sum'),  # target variable
            avg_msg_length=('combined_text_length', 'mean'),
            std_msg_length=('combined_text_length', 'std'),
            total_msg_length=('combined_text_length', 'sum'),
            avg_subject_words=('subject_word_count', 'mean'),
            total_subject_words=('subject_word_count', 'sum'),
            avg_body_words=('body_word_count', 'mean'),
            total_body_words=('body_word_count', 'sum'),
            # Count of non-null sentiment labels, used as message frequency
            message_frequency=('sentiment_final', 'count'),
            # Mean of the weekend flag, i.e. share of weekend messages
            weekend_proportion=('is_weekend', 'mean'),
            email_domain=('email_domain', 'first'),
        )
        .reset_index()
        .rename(columns={'from': 'employee'})
    )

    # The standard deviation is NaN for groups with a single message;
    # treat that as zero spread.
    monthly_features['std_msg_length'] = (
        monthly_features['std_msg_length'].fillna(0)
    )

    # Encode email domain as integer labels
    le = LabelEncoder()
    monthly_features['domain_encoded'] = le.fit_transform(
        monthly_features['email_domain']
    )

    # Add temporal features derived from the monthly period
    monthly_features['month'] = monthly_features['year_month'].dt.month
    monthly_features['year'] = monthly_features['year_month'].dt.year

    return monthly_features, le

# Build the employee-month feature table and keep the fitted domain encoder
features_df, domain_encoder = create_features_for_prediction(df)

print(f"Features created: {features_df.shape}")
print(f"Columns: {features_df.columns.tolist()}")

# Predictor columns fed to the regression, one per line
feature_columns = [
    'message_frequency',
    'avg_msg_length',
    'std_msg_length',
    'total_msg_length',
    'avg_subject_words',
    'total_subject_words',
    'avg_body_words',
    'total_body_words',
    'weekend_proportion',
    'domain_encoded',
    'month',
]

# Design matrix and target (monthly sentiment score)
y = features_df['monthly_score']
X = features_df[feature_columns]

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features with statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ordinary least squares on the standardized training features
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Predict on both partitions
y_train_pred = lr_model.predict(X_train_scaled)
y_test_pred = lr_model.predict(X_test_scaled)

# Score each partition with MSE, MAE and R²
train_mse, train_mae, train_r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
test_mse, test_mae, test_r2 = (
    metric(y_test, y_test_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\n=== MODEL PERFORMANCE ===")
for heading, mse, mae, r2 in (
    ("Training Set:", train_mse, train_mae, train_r2),
    ("\nTesting Set:", test_mse, test_mae, test_r2),
):
    print(heading)
    print(f"- MSE: {mse:.4f}")
    print(f"- MAE: {mae:.4f}")
    print(f"- R²: {r2:.4f}")

# Rank features by the magnitude of their fitted coefficient
coefficients = lr_model.coef_
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'coefficient': coefficients,
})
feature_importance['abs_coefficient'] = feature_importance['coefficient'].abs()
feature_importance = feature_importance.sort_values(
    by='abs_coefficient', ascending=False
)

print("\n=== FEATURE IMPORTANCE ===")
print(feature_importance)

# Visualizations: a 2x2 grid of diagnostic plots for the test-set predictions
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_fit, ax_resid), (ax_coef, ax_dist) = axes

# 1. Actual vs Predicted (Test Set), with the y = x reference line
score_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_test_pred, alpha=0.6)
ax_fit.plot(score_range, score_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Monthly Score')
ax_fit.set_ylabel('Predicted Monthly Score')
ax_fit.set_title(f'Actual vs Predicted (Test Set)\nR² = {test_r2:.3f}')

# 2. Residuals (actual minus predicted) against the predicted value
residuals = y_test - y_test_pred
ax_resid.scatter(y_test_pred, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Monthly Score')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot')

# 3. Absolute coefficients of the eight highest-ranked features
top_features = feature_importance.head(8)
ax_coef.barh(top_features['feature'], top_features['abs_coefficient'])
ax_coef.set_xlabel('Absolute Coefficient Value')
ax_coef.set_title('Feature Importance (Top 8)')

# 4. Overlaid density histograms of actual and predicted scores
for scores, series_label in ((y_test, 'Actual'), (y_test_pred, 'Predicted')):
    ax_dist.hist(scores, bins=20, alpha=0.5, label=series_label, density=True)
ax_dist.set_xlabel('Monthly Score')
ax_dist.set_ylabel('Density')
ax_dist.set_title('Distribution: Actual vs Predicted')
ax_dist.legend()

# Write the figure to disk before displaying it
plt.tight_layout()
plt.savefig('../visualizations/predictive_model_results.png', dpi=300, bbox_inches='tight')
plt.show()

# Collect the headline metrics and sample counts into a single record
model_results = dict(
    model_type='Linear Regression',
    train_r2=train_r2,
    test_r2=test_r2,
    train_mse=train_mse,
    test_mse=test_mse,
    train_mae=train_mae,
    test_mae=test_mae,
    feature_count=len(feature_columns),
    training_samples=len(X_train),
    testing_samples=len(X_test),
)

# Persist the one-row performance table and the coefficient ranking as CSV
performance_table = pd.DataFrame([model_results])
performance_table.to_csv('../data/processed/model_performance.csv', index=False)
feature_importance.to_csv('../data/processed/feature_importance.csv', index=False)

# Final console report: fit quality, top features and output locations
banner = "=" * 60

# Map the test R² onto a coarse quality label
if test_r2 > 0.7:
    fit_quality = 'Good'
elif test_r2 > 0.5:
    fit_quality = 'Moderate'
else:
    fit_quality = 'Poor'

print("\n" + banner)
print("TASK 6: PREDICTIVE MODELING - SUMMARY REPORT")
print(banner)
print("Model successfully trained and evaluated!")
print(f"Test R² Score: {test_r2:.3f} ({fit_quality} fit)")
print("Most important features:")
# feature_importance keeps its pre-sort index labels, so number the rows by
# their position in the ranking rather than by index label.
top_three = feature_importance.head(3)
for rank, (name, coefficient) in enumerate(
    zip(top_three['feature'], top_three['coefficient']), start=1
):
    print(f"  {rank}. {name}: {coefficient:.4f}")

print("\nOutput files:")
print("- Model performance: ../data/processed/model_performance.csv")
print("- Feature importance: ../data/processed/feature_importance.csv")
print("- Visualizations: ../visualizations/predictive_model_results.png")

print("\n" + banner)
print("ALL TASKS COMPLETED SUCCESSFULLY!")
print("Review notebooks 01-07 for complete analysis")
print(banner)