## Data Cleaning

In [None]:
import json
import pandas as pd
import re
import string
# import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Download required NLTK data.
# Each entry pairs the path checked with nltk.data.find with the package
# name handed to nltk.download when that lookup fails.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('corpora/omw-1.4', 'omw-1.4'),
)

for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        # find() raises LookupError when the resource is not available locally.
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

In [None]:

class RedditTextPreprocessor:
    """Clean raw Reddit text ahead of sentiment / emotion analysis.

    Every ``remove_*`` / ``handle_*`` / ``clean_*`` method takes a string and
    returns a new string; ``preprocess_text`` chains them in a fixed order.
    The instance holds a WordNet lemmatizer and an English stopword set from
    which negations and intensifiers have been removed so they survive
    stopword filtering.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

        # Words that NLTK treats as stopwords but that carry sentiment
        # (negation, contrast, intensity, quantity); keep them in the text.
        emotion_words = {
            'not', 'no', 'nor', 'but', 'however', 'although', 'though',
            'very', 'really', 'quite', 'too', 'so', 'more', 'most',
            'all', 'any', 'both', 'each', 'few', 'other',
            'some', 'such', 'only', 'own', 'same', 'than',
        }
        self.stop_words = set(stopwords.words('english')) - emotion_words

        print(f"Initialized preprocessor with {len(self.stop_words)} stopwords")

    def remove_urls(self, text):
        """Remove http(s) URLs, www URLs and subreddit references."""
        # Remove http/https URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        # Remove www URLs
        text = re.sub(r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        # Remove subreddit references in both the "/r/name" and bare "r/name"
        # forms. The bare form must start at a word boundary so that text such
        # as "for/them" is left alone.
        text = re.sub(r'(?:/|\b)r/[A-Za-z0-9_]+', '', text)
        return text

    def remove_mentions_hashtags(self, text):
        """Remove @mentions and strip the '#' from hashtags."""
        # Remove @mentions. The lookbehind skips an '@' that directly follows
        # an email local-part character, so addresses such as
        # "john@example.com" stay intact for anonymize_identifiers to mask.
        text = re.sub(r'(?<![A-Za-z0-9._%+-])@[A-Za-z0-9_]+', '', text)
        # Remove #hashtags but preserve the word (e.g., #BombsAway -> BombsAway)
        text = re.sub(r'#([A-Za-z0-9_]+)', r'\1', text)
        return text

    def remove_html_tags(self, text):
        """Remove HTML tags (anything between '<' and the next '>')."""
        return re.sub(r'<[^>]+>', '', text)

    def handle_reddit_formatting(self, text):
        """Strip Reddit markdown: links, user references, emphasis, quotes."""
        # Markdown links [text](url) -> text
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
        # User references "u/name" or "/u/name". Requiring a leading slash or
        # a word boundary prevents mangling ordinary text such as "you/them".
        text = re.sub(r'(?:/|\b)u/[A-Za-z0-9_-]+', '', text)
        # Markdown emphasis: **bold** first, then *italic*
        text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
        text = re.sub(r'\*([^\*]+)\*', r'\1', text)
        # Quote markers at the start of any line
        text = re.sub(r'^>', '', text, flags=re.MULTILINE)
        return text

    def convert_emojis(self, text):
        """Replace emojis with their space-separated English names.

        If the ``emoji`` package cannot be imported the text is returned
        unchanged.
        """
        try:
            import emoji
        except ImportError:
            # If emoji package is not available, just return the text as is
            return text

        # Wrap each emoji name in control characters that do not occur in
        # normal text, so only real emoji names are rewritten below (a
        # ":name:" pattern could also match ordinary text such as "10:30:45").
        start, end = '\x02', '\x03'
        text = emoji.demojize(text, delimiters=(start, end))
        # "smiling_face" -> " smiling face " so the name reads as plain words.
        return re.sub(
            start + r'([^' + end + r']*)' + end,
            lambda m: ' ' + m.group(1).replace('_', ' ') + ' ',
            text,
        )

    def clean_special_characters(self, text):
        """Remove punctuation, keeping '!' and '?' as standalone tokens.

        Runs of '!' or '?' collapse to a single space-padded mark. All other
        non-word, non-space characters become spaces. Apostrophes are deleted
        so contractions stay a single token ("don't" -> "dont").
        """
        # Treat the typographic apostrophe like the ASCII one; otherwise it
        # would be replaced by a space and split contractions in two.
        text = text.replace('\u2019', "'")

        # Collapse repeated marks and pad them so they tokenize on their own.
        text = re.sub(r'!+', ' ! ', text)
        text = re.sub(r'\?+', ' ? ', text)

        # Drop every other punctuation character. '!' and '?' are excluded
        # directly here instead of round-tripping through placeholder words,
        # which would rewrite a literal "EXCLAMATION"/"QUESTION" in the input.
        text = re.sub(r"[^\w\s'!?]", ' ', text)

        # Handle contractions by removing apostrophes after processing
        return text.replace("'", '')

    def normalize_whitespace(self, text):
        """Collapse whitespace runs to one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def remove_stopwords(self, text):
        """Lower-case the text and drop tokens found in ``self.stop_words``."""
        lowered = text.lower()
        try:
            words = word_tokenize(lowered)
        except Exception:
            # Fallback: simple split if tokenization fails
            words = lowered.split()
        return ' '.join(word for word in words if word not in self.stop_words)

    def lemmatize_text(self, text):
        """Lemmatize each token with ``self.lemmatizer`` and re-join."""
        try:
            words = word_tokenize(text)
        except Exception:
            # Fallback: simple split if tokenization fails
            words = text.split()
        return ' '.join(self.lemmatizer.lemmatize(word) for word in words)

    def anonymize_identifiers(self, text):
        """Mask phone numbers, email addresses and username-like tokens.

        Matches are replaced with ``[PHONE]``, ``[EMAIL]`` and ``[USERNAME]``.
        """
        # Phone numbers: 3-3-4 digits, optionally separated by '-' or '.'
        text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)
        # Email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)
        # Username-like tokens: 8+ characters that mix letters with at least
        # one digit or underscore. Plain alphabetic words are left alone so
        # that ordinary vocabulary ("terrible", "disappointed") is not masked.
        text = re.sub(
            r'\b(?=[A-Za-z0-9_]*[0-9_])(?=[A-Za-z0-9_]*[A-Za-z])[A-Za-z0-9_]{8,}\b',
            '[USERNAME]',
            text,
        )
        return text

    def preprocess_text(self, text):
        """Run the full cleaning pipeline on one text.

        Returns an empty string when ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            return ""

        # Step 1: Handle Reddit-specific formatting
        text = self.handle_reddit_formatting(text)

        # Step 2: Remove URLs
        text = self.remove_urls(text)

        # Step 3: Remove mentions and hashtags
        text = self.remove_mentions_hashtags(text)

        # Step 4: Remove HTML tags
        text = self.remove_html_tags(text)

        # Step 5: Convert emojis
        text = self.convert_emojis(text)

        # Step 6: Anonymize identifiers
        text = self.anonymize_identifiers(text)

        # Step 7: Clean special characters (preserve ! and ?)
        text = self.clean_special_characters(text)

        # Step 8: Normalize whitespace
        text = self.normalize_whitespace(text)

        # Step 9: Convert to lowercase
        text = text.lower()

        # Step 10: Remove stopwords
        text = self.remove_stopwords(text)

        # Step 11: Lemmatize
        text = self.lemmatize_text(text)

        # Final cleanup
        return self.normalize_whitespace(text)

In [None]:

def _print_sample_results(final_df):
    """Print the original and cleaned text of up to three rows."""
    print("\n" + "="*60)
    print("SAMPLE PREPROCESSING RESULTS")
    print("="*60)

    for i in range(min(3, len(final_df))):
        sample = final_df.iloc[i]
        print(f"\nSample {i+1}:")
        print(f"Original: {sample['original_text'][:150]}...")
        print(f"Cleaned:  {sample['text_content'][:150]}...")
        print("-" * 40)


def _print_dataset_summary(final_df):
    """Print entry counts plus length, word-count, score and subjectivity stats."""
    print("\n" + "="*60)
    print("DATASET SUMMARY")
    print("="*60)

    print(f"Total entries: {len(final_df)}")
    print(f"Posts: {len(final_df[final_df['type'] == 'post'])}")
    print(f"Comments: {len(final_df[final_df['type'] == 'comment'])}")

    # Text length statistics
    text_lengths = final_df['text_content'].str.len()
    print("\nText length statistics (after cleaning):")
    print(f"  Average: {text_lengths.mean():.1f} characters")
    print(f"  Median: {text_lengths.median():.1f} characters")
    print(f"  Min: {text_lengths.min()} characters")
    print(f"  Max: {text_lengths.max()} characters")

    # Word count statistics
    word_counts = final_df['text_content'].str.split().str.len()
    print("\nWord count statistics (after cleaning):")
    print(f"  Average: {word_counts.mean():.1f} words")
    print(f"  Median: {word_counts.median():.1f} words")
    print(f"  Min: {word_counts.min()} words")
    print(f"  Max: {word_counts.max()} words")

    # Score and subjectivity statistics
    print("\nScore statistics:")
    print(f"  Average: {final_df['score'].mean():.2f}")
    print(f"  Range: {final_df['score'].min()} to {final_df['score'].max()}")

    print("\nSubjectivity statistics:")
    print(f"  Average: {final_df['subjectivity'].mean():.3f}")
    print(f"  Range: {final_df['subjectivity'].min():.3f} to {final_df['subjectivity'].max():.3f}")


def load_and_preprocess_data(input_path='combined_y_labeled_data.json',
                             output_path='cleaned_reddit_posts.csv'):
    """Load the Reddit JSON data, clean every text and save the result as CSV.

    Args:
        input_path: JSON file to read. The loaded data is passed to
            ``pd.DataFrame`` and must yield the columns ``id``,
            ``text_content``, ``type``, ``score`` and ``subjectivity``.
        output_path: Destination CSV file for the cleaned dataset.

    Returns:
        The cleaned DataFrame (rows whose cleaned text is shorter than three
        characters are dropped), or ``None`` when ``input_path`` does not
        exist.
    """
    print("Loading Reddit posts data...")

    # Load the JSON data
    try:
        with open(input_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Error: {input_path} not found!")
        return None
    print(f"Loaded {len(data)} Reddit posts/comments")

    # Initialize preprocessor
    preprocessor = RedditTextPreprocessor()

    # Create DataFrame
    df = pd.DataFrame(data)

    print("Starting text preprocessing...")
    print("This may take a few minutes...")

    # Apply preprocessing, reporting progress every tenth text
    processed_texts = []
    for i, text in enumerate(df['text_content']):
        if i % 10 == 0:
            print(f"Processing {i+1}/{len(df)} texts...")
        processed_texts.append(preprocessor.preprocess_text(text))

    # Create final dataset: cleaned text alongside the untouched original
    final_df = pd.DataFrame({
        'id': df['id'],
        'text_content': processed_texts,
        'original_text': df['text_content'],
        'type': df['type'],
        'score': df['score'],
        'subjectivity': df['subjectivity'],
    })

    # Remove entries where cleaned text is empty or too short
    final_df = final_df[final_df['text_content'].str.len() >= 3]

    print("\nPreprocessing complete!")
    print(f"Original dataset: {len(df)} entries")
    print(f"Final dataset: {len(final_df)} entries")
    print(f"Removed {len(df) - len(final_df)} entries with insufficient content")

    # Save to CSV
    final_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"\nCleaned dataset saved to: {output_path}")

    _print_sample_results(final_df)
    _print_dataset_summary(final_df)

    return final_df

In [None]:
print("Reddit Posts Text Preprocessing")
print("=" * 50)

# Install required packages if not available
try:
    import emoji
except ImportError:
    print("Installing emoji package...")
    import subprocess
    import sys
    # Run pip through the interpreter executing this code. A bare "pip" is
    # resolved via PATH and may install into a different Python environment,
    # in which case the import below would still fail.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'emoji'])
    import emoji

# Run preprocessing
cleaned_data = load_and_preprocess_data()

# load_and_preprocess_data returns None when the input file is missing.
if cleaned_data is not None:
    print("\n" + "="*60)
    print("✅ PREPROCESSING COMPLETE!")
    print("="*60)
    print("Your dataset is now ready for sentiment and emotion analysis.")
    print("Output file: cleaned_reddit_posts.csv")
    print("\nNext steps:")
    print("1. Review the cleaned data")
    print("2. Apply sentiment analysis models")
    print("3. Apply emotion classification models")
    print("4. Analyze results for PR crisis insights")

## Data Annotation

In [None]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
def setup_models():
    """Create the VADER analyzer and the four transformer pipelines.

    Returns:
        A 5-tuple ``(vader_analyzer, bertweet_sentiment_pipe,
        bertweet_emotion_pipe, roberta_sentiment_pipe, roberta_emotion_pipe)``.

    NOTE(review): the "BERTweet" and "RoBERTa" slots are configured with the
    same checkpoints below, so their predictions will be identical. Confirm
    which checkpoints were intended for the BERTweet slots.
    """
    # VADER analyzer
    vader_analyzer = SentimentIntensityAnalyzer()

    # BERTweet models
    print("Loading BERTweet models...")
    bertweet_sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    bertweet_emotion_model = "j-hartmann/emotion-english-distilroberta-base"

    # RoBERTa models
    print("Loading RoBERTa models...")
    roberta_sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    roberta_emotion_model = "j-hartmann/emotion-english-distilroberta-base"

    # Build each distinct (task, checkpoint) pipeline only once. While two
    # slots name the same checkpoint they share one pipeline object instead of
    # loading the same weights twice; distinct checkpoints still get their own.
    loaded_pipes = {}

    def _get_pipe(task, checkpoint):
        key = (task, checkpoint)
        if key not in loaded_pipes:
            loaded_pipes[key] = pipeline(task,
                                         model=checkpoint,
                                         tokenizer=checkpoint,
                                         max_length=512,
                                         truncation=True)
        return loaded_pipes[key]

    # Create pipelines
    bertweet_sentiment_pipe = _get_pipe("sentiment-analysis", bertweet_sentiment_model)
    bertweet_emotion_pipe = _get_pipe("text-classification", bertweet_emotion_model)
    roberta_sentiment_pipe = _get_pipe("sentiment-analysis", roberta_sentiment_model)
    roberta_emotion_pipe = _get_pipe("text-classification", roberta_emotion_model)

    print("All models loaded successfully!")
    return vader_analyzer, bertweet_sentiment_pipe, bertweet_emotion_pipe, roberta_sentiment_pipe, roberta_emotion_pipe

def analyze_vader_sentiment(text, analyzer):
    """Classify ``text`` from the 'compound' value of ``analyzer.polarity_scores``.

    Returns "Positive" for compound >= 0.05, "Negative" for compound <= -0.05
    and "Neutral" otherwise.
    """
    compound_score = analyzer.polarity_scores(text)['compound']

    if compound_score >= 0.05:
        return "Positive"
    return "Negative" if compound_score <= -0.05 else "Neutral"

def analyze_bertweet_sentiment(text, pipe):
    """Return the sentiment name for the first prediction of ``pipe(text)``.

    Known labels are mapped to "Negative" / "Neutral" / "Positive"; any other
    label is returned title-cased. If anything raises, the error is printed
    and "Neutral" is returned.
    """
    # Raw labels grouped by the sentiment name they stand for.
    label_groups = (
        (('LABEL_0', 'NEGATIVE'), "Negative"),
        (('LABEL_1', 'NEUTRAL'), "Neutral"),
        (('LABEL_2', 'POSITIVE'), "Positive"),
    )
    try:
        predicted = pipe(text)[0]['label']
        for aliases, sentiment in label_groups:
            if predicted in aliases:
                return sentiment
        # Unrecognised label: fall back to its title-cased form.
        return predicted.title()
    except Exception as e:
        print(f"Error in BERTweet sentiment: {e}")
        return "Neutral"

def analyze_emotion(text, pipe):
    """Return the emotion name for the first prediction of ``pipe(text)``.

    The label is looked up case-insensitively in a fixed table; labels not in
    the table are returned title-cased. If anything raises, the error is
    printed and "Neutral" is returned.
    """
    # Map to consistent emotion labels
    display_names = dict(
        joy='Joy',
        sadness='Sadness',
        anger='Anger',
        fear='Fear',
        surprise='Surprise',
        no_emotion='No Emotion',
    )
    try:
        raw_label = pipe(text)[0]['label']
        lookup_key = raw_label.lower()
        if lookup_key in display_names:
            return display_names[lookup_key]
        return raw_label.title()
    except Exception as e:
        print(f"Error in emotion analysis: {e}")
        return "Neutral"

def analyze_roberta_sentiment(text, pipe):
    """Return the sentiment name for the first prediction of ``pipe(text)``.

    Known labels are mapped to "Negative" / "Neutral" / "Positive"; any other
    label is returned title-cased. If anything raises, the error is printed
    and "Neutral" is returned.
    """
    # Raw labels grouped by the sentiment name they stand for.
    label_groups = (
        (('LABEL_0', 'NEGATIVE'), "Negative"),
        (('LABEL_1', 'NEUTRAL'), "Neutral"),
        (('LABEL_2', 'POSITIVE'), "Positive"),
    )
    try:
        predicted = pipe(text)[0]['label']
        for aliases, sentiment in label_groups:
            if predicted in aliases:
                return sentiment
        # Unrecognised label: fall back to its title-cased form.
        return predicted.title()
    except Exception as e:
        print(f"Error in RoBERTa sentiment: {e}")
        return "Neutral"

def process_dataset(input_file, output_file):
    """Annotate every row of a cleaned CSV with sentiment and emotion labels.

    Args:
        input_file: CSV to read; must contain a ``text_content`` column.
        output_file: CSV path the annotated DataFrame is written to.

    Returns:
        The DataFrame with the added columns ``sentiment_vader``,
        ``emotion_vader``, ``sentiment_bertweet``, ``emotion_bertweet``,
        ``sentiment_roberta`` and ``emotion_roberta``.
    """
    print(f"Loading dataset from {input_file}...")
    df = pd.read_csv(input_file)
    print(f"Dataset loaded: {len(df)} entries")

    # Setup models
    vader_analyzer, bertweet_sentiment_pipe, bertweet_emotion_pipe, roberta_sentiment_pipe, roberta_emotion_pipe = setup_models()

    # Initialize new columns
    df['sentiment_vader'] = ""
    df['emotion_vader'] = "NA"  # VADER doesn't do emotion
    df['sentiment_bertweet'] = ""
    df['emotion_bertweet'] = ""
    df['sentiment_roberta'] = ""
    df['emotion_roberta'] = ""

    print("🔄 Processing annotations...")

    # Process each row with progress bar
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Annotating"):
        raw_text = row['text_content']
        # read_csv turns empty cells (and strings such as "null") into NaN.
        # str(NaN) would be the 3-character text "nan", which passes the
        # length check below and would be scored as if it were real content.
        text = "" if pd.isna(raw_text) else str(raw_text)

        # Skip if text is empty or too short
        if len(text.strip()) < 3:
            df.at[idx, 'sentiment_vader'] = "Neutral"
            df.at[idx, 'sentiment_bertweet'] = "Neutral"
            df.at[idx, 'emotion_bertweet'] = "Neutral"
            df.at[idx, 'sentiment_roberta'] = "Neutral"
            df.at[idx, 'emotion_roberta'] = "Neutral"
            continue

        # 1. VADER Analysis
        df.at[idx, 'sentiment_vader'] = analyze_vader_sentiment(text, vader_analyzer)

        # 2. BERTweet Analysis
        df.at[idx, 'sentiment_bertweet'] = analyze_bertweet_sentiment(text, bertweet_sentiment_pipe)
        df.at[idx, 'emotion_bertweet'] = analyze_emotion(text, bertweet_emotion_pipe)

        # 3. RoBERTa Analysis
        df.at[idx, 'sentiment_roberta'] = analyze_roberta_sentiment(text, roberta_sentiment_pipe)
        df.at[idx, 'emotion_roberta'] = analyze_emotion(text, roberta_emotion_pipe)

    # Save annotated dataset
    print(f"💾 Saving annotated dataset to {output_file}...")
    df.to_csv(output_file, index=False)
    print(f"✅ Complete! Annotated dataset saved with {len(df)} entries")

    # Print summary statistics
    print("\n📈 ANNOTATION SUMMARY:")
    print("=" * 50)

    print("\n🎯 VADER Sentiment Distribution:")
    print(df['sentiment_vader'].value_counts())

    print("\n🐦 BERTweet Sentiment Distribution:")
    print(df['sentiment_bertweet'].value_counts())

    print("\n🐦 BERTweet Emotion Distribution:")
    print(df['emotion_bertweet'].value_counts())

    print("\n🤖 RoBERTa Sentiment Distribution:")
    print(df['sentiment_roberta'].value_counts())

    print("\n🤖 RoBERTa Emotion Distribution:")
    print(df['emotion_roberta'].value_counts())

    return df

In [None]:
print("🚀 Starting Sentiment & Emotion Annotation Pipeline")
print("=" * 55)

# Input produced by the cleaning step; output receives the annotated rows.
input_file = "cleaned_reddit_posts.csv"
output_file = "annotated_reddit_posts.csv"

try:
    annotated_df = process_dataset(input_file, output_file)
    print(f"\n🎉 SUCCESS! Annotated dataset ready: {output_file}")

    # Preview the identifier, the text and every non-constant annotation column
    print("\n📋 Sample of annotated data:")
    sample_cols = ['id', 'text_content']
    sample_cols += ['sentiment_vader', 'sentiment_bertweet', 'emotion_bertweet']
    sample_cols += ['sentiment_roberta', 'emotion_roberta']
    print(annotated_df[sample_cols].head())

except Exception as err:
    # Report the failure in the cell output, then let it propagate.
    print(f"❌ Error: {err}")
    raise