In [12]:
import re
import json
from datetime import datetime
from collections import Counter
import pandas as pd
from afinn import Afinn

afinn = Afinn()

def tokenize(text):
    """Split text into lowercase word tokens made only of ASCII letters.

    A token is a run of letters delimited by word boundaries, so letters
    glued to digits or underscores (e.g. ``abc123``) are not returned.
    Anything that is not a ``str`` produces an empty list.
    """
    if isinstance(text, str):
        lowered = text.lower()
        return re.findall(r'\b[a-zA-Z]+\b', lowered)
    return []

def get_sentiment_words(text):
    """Map each sentiment-bearing word in *text* to its AFINN score.

    The text is split with ``tokenize`` and every distinct token is scored
    once with ``afinn.score``; only tokens with a non-zero score are kept.

    Returns:
        A dict of ``word -> score`` ordered by first occurrence in the text.
        It is empty when no token scores non-zero (including when
        ``tokenize`` yields nothing, e.g. for non-string input).
    """
    sentiment_words = {}
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so a
    # repeated word is scored once rather than twice per occurrence.
    for word in dict.fromkeys(tokenize(text)):
        score = afinn.score(word)
        if score != 0:
            sentiment_words[word] = score
    return sentiment_words

def process_emails(data):
    """Flatten all email categories into one DataFrame with a ``status`` column.

    Args:
        data: Mapping that may contain the keys ``rejection_emails``,
            ``feedback_rejection`` and ``ghosted_applications``, each holding
            a list of email records (dicts). A missing key, or a key whose
            value is ``None``, is treated as an empty list.

    Returns:
        A ``pandas.DataFrame`` with one row per record, in the category order
        listed above. Each row carries a ``status`` value of ``'rejection'``,
        ``'feedback_rejection'`` or ``'ghosted'``.
    """
    categories = (
        ('rejection_emails', 'rejection'),
        ('feedback_rejection', 'feedback_rejection'),
        ('ghosted_applications', 'ghosted'),
    )

    emails = []
    for status_key, status_label in categories:
        # `or []` covers a key that is present but null in the JSON.
        for email in data.get(status_key) or []:
            # Build a new dict so the caller's records are left untouched.
            emails.append({**email, 'status': status_label})

    return pd.DataFrame(emails)

# --- Load data ---
# Pass the encoding explicitly so reading the JSON file does not depend on
# the platform's default locale encoding.
with open('../data/email.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per email record, tagged with its status by process_emails.
df = process_emails(data)

# --- Compute sentiment scores ---
# Score only real, non-empty strings. A record without email_text shows up
# as NaN in the DataFrame; NaN is truthy, so a plain truthiness check would
# hand it to afinn.score. Anything that is not a non-empty string scores 0.
df['afinn_score'] = df['email_text'].apply(
    lambda text: afinn.score(text) if isinstance(text, str) and text else 0
)

# Per-email breakdown for every row whose status is not 'ghosted'.
results = []
replied = df[df['status'] != 'ghosted']
for _, record in replied.iterrows():
    body = record.get('email_text', '')
    scored = get_sentiment_words(body)

    # Split the scored words by the sign of their AFINN score.
    positive, negative = {}, {}
    for term, value in scored.items():
        if value > 0:
            positive[term] = value
        elif value < 0:
            negative[term] = value

    results.append(dict(
        company_id=record.get('company_id'),
        position_applied=record.get('position_applied'),
        status=record['status'],
        afinn_score=record['afinn_score'],
        positive_words=positive,
        negative_words=negative,
        positive_count=len(positive),
        negative_count=len(negative),
        email_length=len(tokenize(body)),
    ))

results_df = pd.DataFrame(results)

# --- Aggregate most frequent positive/negative words ---
# Each counter records in how many non-ghosted emails a word appears:
# get_sentiment_words returns a dict, so a word counts once per email.
all_positive, all_negative = Counter(), Counter()

non_ghosted_texts = df.loc[df['status'] != 'ghosted', 'email_text']
for body in non_ghosted_texts:
    for term, polarity in get_sentiment_words(body).items():
        bucket = all_positive if polarity > 0 else all_negative
        bucket[term] += 1

# --- Save results ---
# NOTE(review): timestamp is computed but not used in the output path below,
# so the CSV is overwritten on every run — confirm whether a timestamped
# filename was intended.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Serialize on a copy so results_df keeps its dict-valued columns; the CSV
# receives JSON strings instead.
results_df_csv = results_df.copy()
for column in ('positive_words', 'negative_words'):
    results_df_csv[column] = results_df_csv[column].apply(json.dumps)

results_df_csv.to_csv('../data/afinn_results.csv', index=False)

# Report the ten most common words on each side of the sentiment split.
top_positive = all_positive.most_common(10)
top_negative = all_negative.most_common(10)

print("✅ Analysis complete.")
print(f"Top positive words: {top_positive}")
print(f"Top negative words: {top_negative}")




✅ Analysis complete.
Top positive words: [('thank', 10), ('interest', 9), ('best', 7), ('wish', 5), ('appreciate', 4), ('opportunities', 4), ('like', 4), ('better', 4), ('good', 4), ('luck', 4)]
Top negative words: [('sorry', 3), ('regret', 2), ('lack', 1), ('hard', 1), ('mistakes', 1), ('no', 1), ('disappointing', 1), ('forget', 1), ('forced', 1)]
