In [3]:
import pandas as pd
from datasets import load_dataset

# Load the "train_all" split of cardiffnlp/tweet_topic_multi from the Hugging Face
# Hub via the `datasets` library.
dataset = load_dataset("cardiffnlp/tweet_topic_multi", split="train_all")

# Convert the loaded split to a pandas DataFrame.
tweets_raw = dataset.to_pandas()

# Filter the dataset to text only (no numbers)
def filter_to_text_only(dataframe, text_col='text', label_col='label_name', label_num_col='label'):
    """
    Reduce a tweet DataFrame to its text/label columns and strip digits from the text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is not modified; the selected columns are copied.
    text_col : str
        Name of the column holding the tweet text.
    label_col : str
        Name of the column holding the label names.
    label_num_col : str
        Name of the column holding the numeric labels; carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns [text_col, label_col, label_num_col].
        In text_col every run of digits is removed, whitespace runs are collapsed
        to a single space and leading/trailing whitespace is stripped.

    Raises
    ------
    KeyError
        If one of the three columns is missing from `dataframe`.
    """
    df_filtered = dataframe[[text_col, label_col, label_num_col]].copy()

    # Remove digit runs first, then tidy up the whitespace the removal leaves behind.
    df_filtered[text_col] = (
        df_filtered[text_col]
        .str.replace(r'\d+', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # Label names stay untouched when they are stored as Python lists (decided
    # from the first row only); anything else is cast to str. The length guard
    # avoids the IndexError that `.iloc[0]` raises on a frame with zero rows.
    # NOTE(review): isinstance(..., list) is False for numpy arrays, so
    # array-valued cells are stringified here — confirm that this is intended.
    labels_are_lists = len(df_filtered) > 0 and isinstance(df_filtered[label_col].iloc[0], list)
    if not labels_are_lists:
        df_filtered[label_col] = df_filtered[label_col].astype(str)

    return df_filtered

# Filter the raw tweets held in `tweets_raw`. Passing that explicit name (rather
# than a variable that only exists as leftover kernel state) keeps this cell
# runnable after Restart Kernel -> Run All.
tweets_text_only = filter_to_text_only(tweets_raw)

print("\n✓ Dataset successfully loaded and filtered to text only")


✓ Dataset successfully loaded and filtered to text only


In [2]:
# Download required NLTK data
import nltk
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Load SpaCy model
import subprocess
import sys

import spacy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load signals a missing model package with OSError; any other
    # exception (including KeyboardInterrupt) is allowed to propagate.
    print("Installing SpaCy model...")
    # Use the kernel's own interpreter so the model is installed into the
    # environment this notebook runs in; check=True surfaces a failed download
    # instead of silently continuing.
    subprocess.run(
        [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
        check=True,
    )
    nlp = spacy.load('en_core_web_sm')
print("✓ SpaCy model loaded successfully")

def is_latin_alphabet(word):
    """
    Check whether a word consists solely of the letters a-z (case-insensitive).

    Used to filter out words containing Cyrillic, Arabic, Chinese, accented
    or otherwise non-basic-Latin characters.

    Parameters
    ----------
    word : str
        The word to inspect.

    Returns
    -------
    bool
        True if `word` is non-empty and every character lowercases to a single
        character in the range 'a'..'z'; False otherwise (including for empty
        or falsy input).
    """
    if not word:
        return False
    # str.lower() can expand one character into several (e.g. U+0130 'İ'
    # becomes 'i' + a combining dot). Such characters are not basic Latin
    # letters, and the length check rejects them explicitly instead of
    # crashing in ord() as a per-character ord(c.lower()) would.
    return all(
        len(lowered) == 1 and 'a' <= lowered <= 'z'
        for lowered in map(str.lower, word)
    )

def segment_camelcase(text):
    """
    Insert spaces at CamelCase boundaries without using regular expressions.

    Example: 'GameOfThrones' -> 'Game Of Thrones'. This matters for hashtags
    such as #GameOfThrones once the leading '#' has been removed.

    A space is inserted after a character when either
      * it is lowercase and the next character is uppercase, or
      * it and the next character are uppercase and the one after that is
        lowercase (end of an acronym, e.g. 'HTMLParser' -> 'HTML Parser').

    Falsy input (empty string, None) is returned unchanged.
    """
    if not text:
        return text

    last = len(text) - 1
    pieces = []

    for pos in range(len(text)):
        here = text[pos]
        pieces.append(here)

        # Nothing follows the final character, so no boundary can start there.
        if pos == last:
            break

        following = text[pos + 1]

        # lower -> UPPER boundary (the 'e' | 'O' in 'GameOf')
        if here.islower() and following.isupper():
            pieces.append(' ')
            continue

        # UPPER, UPPER, lower: the acronym ends at `here` (the 'L' | 'P' in 'HTMLParser')
        if (
            pos + 2 <= last
            and here.isupper()
            and following.isupper()
            and text[pos + 2].islower()
        ):
            pieces.append(' ')

    return ''.join(pieces)

def _clean_tweet_tokens(text):
    """Split `text` on whitespace and drop retweet markers, URLs and mentions; unwrap hashtags."""
    url_markers = ('https://', 'http://', 'www.')
    kept = []
    for word in text.split():
        # A retweet marker is the stand-alone token RT / rt. Matching whole
        # tokens (rather than the substring 'rt ') keeps words that merely end
        # in "rt" -- "support", "smart", "art" -- intact.
        if word in ('RT', 'rt'):
            continue

        # Cut the token at the first URL marker; text glued in front of the
        # marker is kept, the URL itself is discarded.
        hits = [pos for pos in map(word.find, url_markers) if pos != -1]
        if hits:
            word = word[:min(hits)]

        # Mentions: tokens starting with '@' or '{@'.
        if word.startswith(('{@', '@')):
            continue

        # Hashtags keep their text: #Gaming -> Gaming, #GameOfThrones -> GameOfThrones
        if word.startswith('#'):
            word = word[1:]

        if word:
            kept.append(word)
    return kept


def preprocess_tweet(text):
    """
    Topic-oriented preprocessing of a single tweet for classification.

    Pipeline:
      1. Replace the '{{URL}}' and '{{USERNAME}}' placeholders with a space.
      2. Token-wise: drop stand-alone RT/rt markers, URLs (anything from
         'https://', 'http://' or 'www.' to the end of the token) and mentions
         ('@...' / '{@...'); strip the leading '#' from hashtags.
      3. Split CamelCase words via `segment_camelcase` (before lowercasing,
         which would destroy the case information).
      4. Normalize whitespace and lowercase.
      5. Tokenize with the module-level spaCy pipeline `nlp` and keep the lemma
         of every token that is alphabetic, at least two characters long, not a
         stopword and accepted by `is_latin_alphabet`.

    Parameters
    ----------
    text : Any
        The raw tweet. Non-string input yields an empty string.

    Returns
    -------
    str
        The kept lemmas joined by single spaces ('' if nothing survives).

    Notes
    -----
    Relies on the module-level names `nlp`, `segment_camelcase` and
    `is_latin_alphabet`.
    """
    if not isinstance(text, str):
        return ""

    # Placeholders are swapped for a space so neighbouring words do not merge.
    text = text.replace('{{URL}}', ' ').replace('{{USERNAME}}', ' ')

    text = ' '.join(_clean_tweet_tokens(text))

    # IMPORTANT: segment CamelCase BEFORE lowercasing (GameOfThrones -> Game Of Thrones)
    text = segment_camelcase(text)

    text = ' '.join(text.split()).lower()

    processed_tokens = []
    for token in nlp(text):
        # Punctuation and anything non-alphabetic (special characters, emojis, numbers)
        if token.is_punct or not token.is_alpha:
            continue
        # Tokens shorter than two characters
        if len(token.text) < 2:
            continue
        # Stopwords, as flagged by spaCy
        if token.is_stop:
            continue
        # Words written in a non-Latin script (Cyrillic, Arabic, Chinese, ...)
        if not is_latin_alphabet(token.text):
            continue
        processed_tokens.append(token.lemma_)

    return ' '.join(processed_tokens)

# Work on a copy so that `tweets_text_only` itself is left untouched
tweets_preprocessed_train = tweets_text_only.copy()

# Run preprocess_tweet on every entry of the 'text' column
tweets_preprocessed_train['text'] = tweets_preprocessed_train['text'].apply(preprocess_tweet)

print("\n✓ Preprocessing complete!")
print(f"✓ Processed {len(tweets_preprocessed_train)} tweets")
print(f"✓ Original 'tweets_text_only' unchanged | Processed data in 'tweets_preprocessed_train'")

# Save the DataFrame in the Data folder
import os

# Create the Data folder if it does not exist yet
os.makedirs('Data', exist_ok=True)

# Write the frame as Parquet, without the index
output_path = 'Data/tweets_preprocessed_train.parquet'
tweets_preprocessed_train.to_parquet(output_path, index=False)

print(f"\n✓ DataFrame for training saved under path: {output_path}")
print(f"✓ Features: {list(tweets_preprocessed_train.columns)}")
print(f"✓ Shape: {tweets_preprocessed_train.shape}")

✓ SpaCy model loaded successfully

✓ Preprocessing complete!
✓ Processed 6090 tweets
✓ Original 'tweets_text_only' unchanged | Processed data in 'tweets_preprocessed_train'

✓ DataFrame for training saved under path: Data/tweets_preprocessed_train.parquet
✓ Features: ['text', 'label_name', 'label']
✓ Shape: (6090, 3)

✓ Preprocessing complete!
✓ Processed 6090 tweets
✓ Original 'tweets_text_only' unchanged | Processed data in 'tweets_preprocessed_train'

✓ DataFrame for training saved under path: Data/tweets_preprocessed_train.parquet
✓ Features: ['text', 'label_name', 'label']
✓ Shape: (6090, 3)
