# Data Preprocessing for Multilingual Stance Detection

This notebook handles the preprocessing of our collected Reddit data, including:
1. Text cleaning
2. Stance determination
3. Feature extraction
4. `train`/`val`/`test` splitting

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import os
import joblib
import glob

In [2]:
# Load the data
print("Loading data...")
latest_file = max(glob.glob('../data/raw/*.csv'), key=os.path.getctime)
data = pd.read_csv(latest_file)
print(f"Loaded {len(data)} samples")

# Display first few rows
data.head()

Loading data...
Loaded 8081 samples


Unnamed: 0,id,title,body,score,num_comments,created_utc,language,subreddit,collected_at,title_length,body_length,language_verified
0,wu9zny,The r/climatechange Verified User Flair Program,r/climatechange is a community centered around...,43,20,2022-08-21 16:53:23,en,climatechange,2024-11-06 21:54:55.841587,47,3164,True
1,1glbafl,I’m incredibly sad for our environment today.,Trump has all but signed a death warrant for o...,538,170,2024-11-06 17:38:25,en,climatechange,2024-11-06 21:54:57.573950,46,527,True
2,1glc2te,I’m sad. We really do live in a post-truth wor...,,211,31,2024-11-06 18:13:11,en,climatechange,2024-11-06 21:54:57.577986,109,0,True
3,1gl6msi,The US is about to make a sharp turn on climat...,,130,21,2024-11-06 14:21:42,en,climatechange,2024-11-06 21:54:57.581039,54,0,True
4,1gl7j9u,The US just abdicated from any role in climate...,Climate engineering is now the only chance hum...,105,14,2024-11-06 14:59:18,en,climatechange,2024-11-06 21:54:57.583582,151,70,True


In [3]:
def determine_stance(text):
    """Determine stance based on text content"""
    text = text.lower()
    
    # Keywords for stance detection
    positive_keywords = [
        'support', 'agree', 'real', 'urgent', 'action', 'crisis', 
        'emergency', 'important', 'necessary', 'must', 'believe', 
        'serious', 'threat', 'danger'
    ]
    
    negative_keywords = [
        'hoax', 'fake', 'exaggerated', 'myth', 'scam', 'conspiracy',
        'alarmist', 'overblown', 'propaganda', 'false'
    ]
    
    # Count keyword occurrences
    positive_count = sum(1 for keyword in positive_keywords if keyword in text)
    negative_count = sum(1 for keyword in negative_keywords if keyword in text)
    
    # Determine stance
    if positive_count > negative_count:
        return 0  # Positive stance
    elif negative_count > positive_count:
        return 1  # Negative stance
    else:
        return 2  # Neutral stance

# Prepare text and determine stance
print("Preparing text and determining stance...")
texts = data['title'] + ' ' + data['body'].fillna('')
languages = data['language']
data['stance'] = texts.apply(determine_stance)

# Display stance distribution
print("\nStance distribution:")
print(data['stance'].value_counts())
print("\nStance distribution by language:")
print(pd.crosstab(data['language'], data['stance']))

Preparing text and determining stance...

Stance distribution:
stance
2    7002
0     996
1      83
Name: count, dtype: int64

Stance distribution by language:
stance      0   1     2
language               
de         54   9  1930
en        842  73  3057
es          4   0   126
fr         66   0   922
it         30   1   967


In [4]:
# Create separate vectorizers for each language
print("Creating TF-IDF features...")
X_by_language = {}

for lang in set(languages):
    print(f"Processing {lang} texts...")
    # Get texts for this language
    lang_mask = languages == lang
    lang_texts = texts[lang_mask]
    
    # Create and fit vectorizer
    vectorizer = TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 3),
        lowercase=True,
        strip_accents='unicode'
    )
    
    # Transform texts
    X_lang = vectorizer.fit_transform(lang_texts)
    X_by_language[lang] = {
        'vectorizer': vectorizer,
        'features': X_lang,
        'indices': np.where(lang_mask)[0]
    }
    
    print(f"{lang}: {X_lang.shape[1]} features")

Creating TF-IDF features...
Processing es texts...
es: 4455 features
Processing it texts...
it: 15000 features
Processing fr texts...
fr: 15000 features
Processing de texts...
de: 15000 features
Processing en texts...
en: 15000 features


In [5]:
# Combine all features into one sparse matrix
print("Combining features...")
n_samples = len(texts)
n_features = max(X_lang['features'].shape[1] for X_lang in X_by_language.values())
X = sp.lil_matrix((n_samples, n_features))

for lang_data in X_by_language.values():
    features = lang_data['features']
    indices = lang_data['indices']
    X[indices, :features.shape[1]] = features

# Convert to CSR format for efficiency
X = X.tocsr()
print(f"Combined feature matrix shape: {X.shape}")

Combining features...
Combined feature matrix shape: (8081, 15000)


In [6]:
# Get labels
y = data['stance']

# Split the data
print("Splitting data...")
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

Splitting data...
Training samples: 5656
Validation samples: 1212
Test samples: 1213


In [None]:
# Save the processed data
print("Saving processed data...")
os.makedirs('../data/processed', exist_ok=True)

processed_data = {
    'train': (X_train, y_train),
    'val': (X_val, y_val),
    'test': (X_test, y_test),
    'vectorizers': {lang: data['vectorizer'] for lang, data in X_by_language.items()},
    'feature_size': n_features
}

joblib.dump(processed_data, '../data/processed/processed_data.joblib')

print("\nPreprocessing complete!")
print(f"Feature dimensionality: {n_features}")

Saving processed data...

Preprocessing complete!
Feature dimensionality: 15000
