# Data Preprocessing for Multilingual Stance Detection

This notebook handles the preprocessing of our collected Reddit data, including:
1. Text cleaning
2. Stance determination
3. Feature extraction
4. `train`/`val`/`test` splitting

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import os
import joblib
import glob

In [4]:
# Load the data
print("Loading data...")
csv_paths = glob.glob('../data/processed/preprocessed_stance/*.csv')
if not csv_paths:
    # Without this guard, max() on an empty list fails with the opaque
    # "max() arg is an empty sequence" and hides the real problem.
    raise FileNotFoundError(
        "No CSV files match '../data/processed/preprocessed_stance/*.csv'"
    )

# Pick the most recent file by ctime. Note that ctime is the creation time on
# Windows but the last metadata-change time on Unix, so a chmod/rename can
# change which file counts as "latest".
latest_file = max(csv_paths, key=os.path.getctime)
data = pd.read_csv(latest_file)
print(f"Loaded {len(data)} samples")

# Display first few rows
# NOTE(review): an 'Unnamed: 0' column in the preview presumably means the CSV
# was written with its index; it is not used by the cells below.
data.head()

Loading data...
Loaded 8079 samples


Unnamed: 0,id,title,body,score,num_comments,language,subreddit,collected_at,title_length,body_length,language_verified,verification_status,total_length,stance
0,wu9zny,The r/climatechange Verified User Flair Program,r/climatechange is a community centered around...,41,20,en,climatechange,2024-11-07 18:05:59.306484,47,3164,True,verified,3212,0
1,1gm271u,1.5C is dead. The climate fight isn’t.,,79,18,en,climatechange,2024-11-07 18:06:00.729802,38,0,True,too_short,38,0
2,1glbafl,I’m incredibly sad for our environment today.,Trump has all but signed a death warrant for o...,1521,359,en,climatechange,2024-11-07 18:06:00.729809,46,527,True,verified,573,1
3,1glc2te,I’m sad. We really do live in a post-truth wor...,,787,102,en,climatechange,2024-11-07 18:06:00.731326,109,0,True,verified,108,1
4,1glsfp1,Anthropogenic warming has ushered in an era of...,,34,1,en,climatechange,2024-11-07 18:06:00.732643,95,0,True,verified,95,0


In [5]:
# Display stance distribution
# Compute both summaries first, then print them under their headings.
stance_counts = data['stance'].value_counts()
stance_by_language = pd.crosstab(data['language'], data['stance'])

# Overall number of samples per stance label
print("\nStance distribution:")
print(stance_counts)

# Rows are language codes, columns are stance labels
print("\nStance distribution by language:")
print(stance_by_language)


Stance distribution:
stance
 0    6787
 1    1081
-1     211
Name: count, dtype: int64

Stance distribution by language:
stance     -1     0    1
language                
de         38  1829  126
en        148  3146  677
es          0   117   13
fr         20   879   88
it          5   816  177


In [6]:
# Prepare text
# Fill missing titles as well as missing bodies: NaN + str stays NaN, and
# TfidfVectorizer refuses NaN documents.
texts = data['title'].fillna('') + ' ' + data['body'].fillna('')
languages = data['language']

# Create separate vectorizers for each language
# NOTE(review): each vectorizer is fitted on ALL rows of its language, i.e.
# before the train/val/test split performed later in this notebook, so the
# vocabulary and IDF weights also see validation and test documents. Fitting
# on the training rows only would avoid that leakage.
print("Creating TF-IDF features...")
X_by_language = {}

# Rows without a language code cannot be matched to a vectorizer; they are
# skipped here instead of producing an empty document set for a NaN "language".
n_unlabelled = int(languages.isna().sum())
if n_unlabelled:
    print(f"Skipping {n_unlabelled} rows with no language code")

# Sorted unique codes give the same processing order on every run
# (iteration order of a set of strings is not stable across interpreter runs).
for lang in sorted(languages.dropna().unique()):
    print(f"Processing {lang} texts...")
    # Get texts for this language
    lang_mask = languages == lang
    lang_texts = texts[lang_mask]

    # Create and fit vectorizer: up to 15000 uni-/bi-/tri-gram features,
    # lower-cased and with accents stripped.
    vectorizer = TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 3),
        lowercase=True,
        strip_accents='unicode'
    )

    # Transform texts
    X_lang = vectorizer.fit_transform(lang_texts)
    X_by_language[lang] = {
        'vectorizer': vectorizer,
        'features': X_lang,
        # Positional row numbers of this language's samples in `data`
        'indices': np.flatnonzero(lang_mask.to_numpy())
    }

    print(f"{lang}: {X_lang.shape[1]} features")

Creating TF-IDF features...
Processing en texts...
en: 15000 features
Processing es texts...
es: 4455 features
Processing de texts...
de: 15000 features
Processing fr texts...
fr: 15000 features
Processing it texts...
it: 15000 features


In [7]:
# Combine all features into one sparse matrix
# Every language block is written into columns [0, n_lang_features) of the
# rows that belong to that language; the matrix is as wide as the widest block.
# NOTE(review): the vectorizers have independent vocabularies, so the same
# column index stands for a different n-gram in each language. A model trained
# on X will treat them as one feature - confirm this is intended. Giving each
# language its own column range would change the saved matrix shape, so the
# layout is left unchanged here.
print("Combining features...")
n_samples = len(texts)
n_features = max(X_lang['features'].shape[1] for X_lang in X_by_language.values())

# Collect (row, col, value) triplets instead of assigning sparse blocks into a
# LIL matrix: LIL fancy assignment densifies each block, which costs hundreds
# of MB for a few thousand rows x 15000 columns.
row_parts, col_parts, value_parts = [], [], []
for lang_data in X_by_language.values():
    block = lang_data['features'].tocoo()
    # Map block-local row numbers back to row positions in the full dataset
    row_parts.append(lang_data['indices'][block.row])
    col_parts.append(block.col)
    value_parts.append(block.data)

# Build the CSR matrix directly; float64 matches the previous LIL default
X = sp.csr_matrix(
    (
        np.concatenate(value_parts),
        (np.concatenate(row_parts), np.concatenate(col_parts)),
    ),
    shape=(n_samples, n_features),
    dtype=np.float64,
)
# Drop explicitly stored zeros so the stored entries match element-wise assignment
X.eliminate_zeros()
print(f"Combined feature matrix shape: {X.shape}")

Combining features...
Combined feature matrix shape: (8079, 15000)


In [8]:
# Get labels
y = data['stance']

# Split the data
# Two stratified passes with the same seed: first hold out 30 % of the rows,
# then cut that hold-out in half to obtain validation and test sets.
print("Splitting data...")
split_options = {'random_state': 42}

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, **split_options
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, **split_options
)

# Report the number of rows in each partition
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

Splitting data...
Training samples: 5655
Validation samples: 1212
Test samples: 1212


In [9]:
# Save the processed data
print("Saving processed data...")
os.makedirs('../data/processed', exist_ok=True)

# One fitted vectorizer per language code, pulled out of the per-language
# records (loop variable named `entry` so it does not read like the
# `data` DataFrame).
vectorizers_by_language = {}
for lang, entry in X_by_language.items():
    vectorizers_by_language[lang] = entry['vectorizer']

# Bundle the three (features, labels) splits, the vectorizers and the feature
# matrix width into a single object and write it with joblib.
processed_data = {
    'train': (X_train, y_train),
    'val': (X_val, y_val),
    'test': (X_test, y_test),
    'vectorizers': vectorizers_by_language,
    'feature_size': n_features
}

joblib.dump(processed_data, '../data/processed/processed_data.joblib')

print("\nPreprocessing complete!")
print(f"Feature dimensionality: {n_features}")

Saving processed data...

Preprocessing complete!
Feature dimensionality: 15000
