## Data Cleaning

In [None]:
import json
import pandas as pd
import re
import string
# import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Make sure every NLTK resource used by the cleaning pipeline is installed.
# Each entry pairs the lookup path understood by nltk.data.find with the
# package name understood by nltk.download; a package is fetched only when
# its lookup fails.
_NLTK_REQUIREMENTS = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('corpora/omw-1.4', 'omw-1.4'),
)

for _resource_path, _package_name in _NLTK_REQUIREMENTS:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

In [None]:

class RedditTextPreprocessor:
    """Text-cleaning pipeline for Reddit posts and comments.

    Every public method performs one cleaning step on a string and returns the
    cleaned string. ``preprocess_text`` chains the steps in a fixed order and
    is the entry point used by the rest of the notebook.
    """

    def __init__(self):
        """Create the lemmatizer and build the stopword set.

        Prints the size of the resulting stopword set.
        """
        self.lemmatizer = WordNetLemmatizer()

        # Negations, contrast words, intensifiers and quantifiers are taken
        # out of the stopword list so they survive stopword removal and keep
        # contributing to sentiment.
        emotion_words = {
            'not', 'no', 'nor', 'but', 'however', 'although', 'though',
            'very', 'really', 'quite', 'too', 'so', 'more', 'most',
            'all', 'any', 'both', 'each', 'few', 'other',
            'some', 'such', 'only', 'own', 'same', 'than',
        }
        self.stop_words = set(stopwords.words('english')) - emotion_words

        print(f"Initialized preprocessor with {len(self.stop_words)} stopwords")

    def remove_urls(self, text):
        """Strip http(s) URLs, ``www.`` URLs and subreddit references.

        Subreddit references are removed both with and without the leading
        slash (``/r/name`` and ``r/name``); a bare ``r/`` only counts when it
        starts a word, so text such as ``color/red`` is left alone.
        """
        # Remove http/https URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        # Remove www URLs
        text = re.sub(r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        # Remove subreddit links: "/r/name" anywhere, "r/name" at a word start
        text = re.sub(r'(?:/|\b)r/[A-Za-z0-9_]+', '', text)
        return text

    def remove_mentions_hashtags(self, text):
        """Drop ``@mentions`` and unwrap ``#hashtags`` (``#Word`` -> ``Word``).

        An ``@`` that directly follows a character allowed in the local part
        of an e-mail address is not treated as a mention, so addresses stay
        intact for ``anonymize_identifiers`` to replace later.
        """
        text = re.sub(r'(?<![A-Za-z0-9._%+-])@[A-Za-z0-9_]+', '', text)
        # Keep the hashtag word itself (e.g., #BombsAway -> BombsAway)
        text = re.sub(r'#([A-Za-z0-9_]+)', r'\1', text)
        return text

    def remove_html_tags(self, text):
        """Remove anything that looks like an HTML tag (``<...>``)."""
        return re.sub(r'<[^>]+>', '', text)

    def handle_reddit_formatting(self, text):
        """Undo Reddit markdown and remove user references.

        Handles, in order: ``[text](url)`` links (keeps the link text),
        ``u/name`` and ``/u/name`` user references, ``**bold**`` and
        ``*italic*`` markers, and leading ``>`` quote markers on each line.
        """
        # Markdown links [text](url) -> text
        text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
        # User references. A bare "u/" must start a word, otherwise ordinary
        # text such as "menu/options" would be truncated.
        text = re.sub(r'(?:/|\b)u/[A-Za-z0-9_-]+', '', text)
        # Markdown emphasis: **bold** first, then *italic*
        text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
        text = re.sub(r'\*([^\*]+)\*', r'\1', text)
        # Quote markers at the start of any line
        text = re.sub(r'^>', '', text, flags=re.MULTILINE)
        return text

    def convert_emojis(self, text):
        """Replace emojis with their textual names when ``emoji`` is installed.

        Names are emitted as plain words separated by spaces (underscores
        become spaces). Without the optional ``emoji`` package the text is
        returned unchanged.
        """
        try:
            import emoji
        except ImportError:
            # Optional dependency: leave the text as is.
            return text

        # Keep the colon delimiters so the emoji names can be located, and pad
        # them with spaces so they never fuse with neighbouring words.
        text = emoji.demojize(text, delimiters=(" :", ": "))
        # ":red_heart:" -> " red heart "
        return re.sub(
            r':([^\s:]+):',
            lambda match: ' ' + match.group(1).replace('_', ' ') + ' ',
            text,
        )

    def clean_special_characters(self, text):
        """Remove punctuation while keeping ``!`` and ``?`` as separate tokens.

        Runs of ``!`` or ``?`` collapse to a single space-padded mark, every
        other non-word, non-space character becomes a space, and apostrophes
        are deleted so contractions stay one token (``It's`` -> ``Its``).
        """
        # Collapse repeated marks and pad them so they tokenize on their own.
        text = re.sub(r'!+', ' ! ', text)
        text = re.sub(r'\?+', ' ? ', text)

        # Blank out all other punctuation; apostrophes are handled separately
        # so that contractions are joined rather than split.
        text = re.sub(r"[^\w\s'!?]", ' ', text)
        text = text.replace("'", '')

        return text

    def normalize_whitespace(self, text):
        """Collapse whitespace runs to one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def _tokenize(self, text):
        """Tokenize with NLTK, falling back to whitespace splitting on failure."""
        try:
            return word_tokenize(text)
        except Exception:
            # Best effort: e.g. the tokenizer models are unavailable.
            return text.split()

    def remove_stopwords(self, text):
        """Lower-case ``text`` and drop every token found in ``self.stop_words``."""
        tokens = self._tokenize(text.lower())
        return ' '.join(token for token in tokens if token not in self.stop_words)

    def lemmatize_text(self, text):
        """Lemmatize every token of ``text`` with ``self.lemmatizer``."""
        return ' '.join(self.lemmatizer.lemmatize(token) for token in self._tokenize(text))

    def anonymize_identifiers(self, text):
        """Replace phone numbers, e-mail addresses and username-like tokens.

        The placeholders are ``[PHONE]``, ``[EMAIL]`` and ``[USERNAME]``.
        A token counts as username-like only when it is at least 8 characters
        long and mixes letters with digits or underscores; ordinary long words
        such as "completely" are left untouched.
        """
        # Phone numbers: 3-3-4 digits, optionally separated by "-" or "."
        text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]', text)
        # E-mail addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)
        # Username-like handles: need a letter AND a digit/underscore
        text = re.sub(
            r'\b(?=[A-Za-z0-9_]*[A-Za-z])(?=[A-Za-z0-9_]*[0-9_])[A-Za-z0-9_]{8,}\b',
            '[USERNAME]',
            text,
        )
        return text

    def preprocess_text(self, text):
        """Run the full cleaning pipeline on one post or comment.

        Args:
            text: Raw text. Any non-string value yields an empty string.

        Returns:
            The cleaned, lower-cased, stopword-filtered and lemmatized text.
            Because punctuation removal and lower-casing run after
            anonymization, placeholders appear in the result as the plain
            words ``phone``, ``email`` and ``username``.
        """
        if not isinstance(text, str):
            return ""

        # Step 1: Handle Reddit-specific formatting
        text = self.handle_reddit_formatting(text)

        # Step 2: Remove URLs
        text = self.remove_urls(text)

        # Step 3: Remove mentions and hashtags
        text = self.remove_mentions_hashtags(text)

        # Step 4: Remove HTML tags
        text = self.remove_html_tags(text)

        # Step 5: Convert emojis
        text = self.convert_emojis(text)

        # Step 6: Anonymize identifiers
        text = self.anonymize_identifiers(text)

        # Step 7: Clean special characters (preserve ! and ?)
        text = self.clean_special_characters(text)

        # Step 8: Normalize whitespace
        text = self.normalize_whitespace(text)

        # Step 9: Convert to lowercase
        text = text.lower()

        # Step 10: Remove stopwords
        text = self.remove_stopwords(text)

        # Step 11: Lemmatize
        text = self.lemmatize_text(text)

        # Final cleanup
        return self.normalize_whitespace(text)

In [None]:

def load_and_preprocess_data(input_path='combined_y_labeled_data.json',
                             output_path='cleaned_reddit_posts.csv',
                             min_length=3):
    """Load labeled Reddit data, clean every text and save the result as CSV.

    Args:
        input_path: JSON file holding a list of records. Each record must
            provide the keys ``id``, ``text_content``, ``type``, ``score`` and
            ``subjectivity``.
        output_path: Destination CSV file for the cleaned dataset.
        min_length: Rows whose cleaned text is shorter than this many
            characters are dropped.

    Returns:
        A DataFrame with the columns ``id``, ``text_content`` (cleaned),
        ``original_text``, ``type``, ``score`` and ``subjectivity``, or
        ``None`` when ``input_path`` does not exist.

    Raises:
        KeyError: If the loaded data lacks one of the required columns.
    """
    # Load the JSON data
    try:
        with open(input_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Error: {input_path} not found!")
        return None
    print(f"Loaded {len(data)} Reddit posts/comments")

    df = pd.DataFrame(data)

    # Fail before the slow cleaning loop rather than after it.
    required_columns = ['id', 'text_content', 'type', 'score', 'subjectivity']
    missing_columns = [column for column in required_columns if column not in df.columns]
    if missing_columns:
        raise KeyError(f"Input data is missing required columns: {missing_columns}")

    preprocessor = RedditTextPreprocessor()

    print("Starting text preprocessing...")
    print("This may take a few minutes...")

    # Clean every text, reporting progress on every tenth item.
    total = len(df)
    processed_texts = []
    for position, raw_text in enumerate(df['text_content']):
        if position % 10 == 0:
            print(f"Processing {position+1}/{total} texts...")
        processed_texts.append(preprocessor.preprocess_text(raw_text))

    # Create final dataset
    final_df = pd.DataFrame({
        'id': df['id'],
        'text_content': processed_texts,
        'original_text': df['text_content'],
        'type': df['type'],
        'score': df['score'],
        'subjectivity': df['subjectivity']
    })

    # Remove entries where cleaned text is empty or too short
    final_df = final_df[final_df['text_content'].str.len() >= min_length]

    print("\nPreprocessing complete!")
    print(f"Original dataset: {len(df)} entries")
    print(f"Final dataset: {len(final_df)} entries")
    print(f"Removed {len(df) - len(final_df)} entries with insufficient content")

    # Save to CSV
    final_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"\nCleaned dataset saved to: {output_path}")

    # Display sample results
    print("\n" + "="*60)
    print("SAMPLE PREPROCESSING RESULTS")
    print("="*60)

    for sample_number, (_, row) in enumerate(final_df.head(3).iterrows(), start=1):
        print(f"\nSample {sample_number}:")
        print(f"Original: {row['original_text'][:150]}...")
        print(f"Cleaned:  {row['text_content'][:150]}...")
        print("-" * 40)

    return final_df

In [None]:
# Announce the run so this cell's output is easy to spot in the notebook.
print("Reddit Posts Text Preprocessing")

# Run preprocessing
# NOTE: load_and_preprocess_data() returns None when the input JSON file is
# missing, so cleaned_data should be checked before it is used further.
cleaned_data = load_and_preprocess_data()

## Data Preprocessing

In [1]:
pip install -r requirements.txt

INFO: pip is looking at multiple versions of datasets to determine which version is compatible with other requirements. This could take a while.
Collecting cycler==0.12.1 (from -r requirements.txt (line 11))
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting contourpy==1.3.2 (from -r requirements.txt (line 10))
  Using cached contourpy-1.3.2-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting colorama==0.4.6 (from -r requirements.txt (line 9))
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting click==8.2.1 (from -r requirements.txt (line 8))
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting charset-normalizer==3.4.2 (from -r requirements.txt (line 7))
  Using cached charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting certifi==2025.6.15 (from -r requirements.txt (line 6))
  Using cached certifi-2025.6.15-py3-none-any.whl.metadata (2.4 kB)
Collecting attrs==25.3.0 (from -r requirement

ERROR: Cannot install datasets==3.6.0 and fsspec==2025.5.1 because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts


In [3]:
import pandas as pd

# Load the annotated posts that the rest of this section works on.
df = pd.read_csv('annotated_reddit_posts.csv')

# Class distribution of the 'sentiment' labels, followed by a blank separator.
sentiment_counts = df['sentiment'].value_counts()
print('Sentiment counts:', sentiment_counts, '\n', sep='\n')

# Class distribution of the 'emotion' labels.
emotion_counts = df['emotion'].value_counts()
print('Emotion counts:', emotion_counts, sep='\n')


Sentiment counts:
sentiment
Negative    52
Positive    22
Neutral     21
Name: count, dtype: int64


Emotion counts:
emotion
Anger         24
No Emotion    22
Sadness       21
Surprise      12
Joy           12
Fear           4
Name: count, dtype: int64


In [5]:
# Preview the first five annotated rows (five is the default for head()).
df.head()

Unnamed: 0,id,text_content,original_text,type,score,subjectivity,sentiment,emotion
0,d8kzu3m,ya screw username really looking forward note ...,Ya this screws me over completely. I was reall...,comment,2,0.464167,Negative,Sadness
1,5jd0fx,username samsung galaxy note7 still more user ...,Cancelled Samsung Galaxy Note7 still has more ...,post,95,0.5,Neutral,Surprise
2,dbfyq3r,traded note 7 s7 edge really hope samsung user...,I traded my Note 7 in for an S7 Edge. I reall...,comment,9,0.414286,Negative,Sadness
3,dbgcdgl,reading username report battery design failed ...,From reading the independent report of why its...,comment,2,0.40625,Positive,Sadness
4,dbftpbx,maybe phone unique username feature explode us...,Maybe the phone's unique exploding feature (or...,comment,-4,0.8,Negative,Anger


In [6]:
import pandas as pd

def oversample_minority_classes(df, label_column, random_state=42):
    """Balance ``df`` by random oversampling up to the largest class size.

    Every original row is kept exactly once. Classes smaller than the largest
    one are topped up with rows drawn from that class with replacement, so
    the largest class is returned unchanged and no original sample is lost.

    Args:
        df: Input DataFrame.
        label_column: Name of the column holding the class labels. Rows whose
            label is missing are not part of any group and are dropped.
        random_state: Seed used for both the resampling and the final shuffle.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index in which every class
        has as many rows as the largest class. An input without any labeled
        row yields an empty frame with the same columns.
    """
    class_sizes = df[label_column].value_counts()
    if class_sizes.empty:
        # Nothing to balance; pd.concat would fail on an empty list.
        return df.iloc[0:0].reset_index(drop=True)

    # Find the size of the largest class
    max_size = int(class_sizes.max())

    frames = []
    for _, group in df.groupby(label_column):
        # Always keep the real observations ...
        frames.append(group)
        # ... and only draw duplicates for the missing part.
        deficit = max_size - len(group)
        if deficit > 0:
            frames.append(group.sample(deficit, replace=True, random_state=random_state))

    # NOTE(review): when this data feeds a train/test split, oversampling
    # should presumably happen after the split so duplicated rows cannot end
    # up on both sides - confirm against the training code.
    balanced = pd.concat(frames)
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)


import os

# Balance the dataset once per label column; the two results are independent
# of each other because each call starts from the same annotated frame.
oversampled_sentiment = oversample_minority_classes(df, 'sentiment')
print("Sentiment oversampled counts:")
print(oversampled_sentiment['sentiment'].value_counts())

oversampled_emotion = oversample_minority_classes(df, 'emotion')
print("\nEmotion oversampled counts:")
print(oversampled_emotion['emotion'].value_counts())

# Save to new CSVs. to_csv does not create missing directories, so make sure
# the output folder exists first.
os.makedirs('data', exist_ok=True)
oversampled_sentiment.to_csv('data/oversampled_sentiment.csv', index=False)
oversampled_emotion.to_csv('data/oversampled_emotion.csv', index=False)

Sentiment oversampled counts:
sentiment
Neutral     52
Positive    52
Negative    52
Name: count, dtype: int64

Emotion oversampled counts:
emotion
Sadness       24
Anger         24
No Emotion    24
Joy           24
Surprise      24
Fear          24
Name: count, dtype: int64
