# Minimal Preprocessing For Top2Vec and BERTopic

## Dependencies

In [1]:
import datetime
import html
import re
from pathlib import Path

import emoji
import pandas as pd

## Preprocess Function

In [4]:
def simple_preprocess(text):
    """Lightly normalise one tweet for embedding-based topic models.

    Steps, in order: decode HTML entities, lowercase, remove emojis,
    remove URLs, remove @mentions, unwrap #hashtags into plain words,
    and collapse whitespace.

    Args:
        text (str): Raw tweet text.

    Returns:
        str: The cleaned text. May be an empty string when the input
        contained nothing but URLs, mentions, emojis or whitespace.
    """
    # Decode entities BEFORE lowercasing: entity names are case-sensitive
    # (&Dagger; and &dagger; are different characters), and characters that
    # only appear after decoding (e.g. &#72; -> "H") must still be lowercased.
    text = html.unescape(text)  # Convert HTML entities (e.g., &amp; → &)
    text = text.lower()  # Lowercase for consistency
    text = emoji.replace_emoji(text, replace="")  # Remove emojis
    # Require a word boundary plus "http(s)://" or "www." so that ordinary
    # words containing "www"/"http" (e.g. "awwww") are not truncated.
    text = re.sub(r"\b(?:https?://|www\.)\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#(\w+)", r"\1", text)  # Convert hashtags to normal words
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text

## Minimal Cleaning Script

In [5]:
# Load data
df = pd.read_csv('../data/raw/merged_tweets_en.csv')
print(f"Initial rows: {len(df)}")

# Sample for testing (optional): set SAMPLE_SIZE to an int (e.g. 100) for a
# quick dry run; None keeps every row, so the count printed below is honest
# in both modes.
SAMPLE_SIZE = None
if SAMPLE_SIZE is not None:
    # min() guards against asking for more rows than the file contains.
    df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=41)
print(f"After sampling: {len(df)}")

# Keep relevant columns
df = df[["date", "content"]]
print(f"After column filtering: {len(df)}")

# Drop missing content
df = df.dropna(subset=["content"])
print(f"After dropping missing content: {len(df)}")

# Drop duplicates based on content (raw text; rows that only become identical
# after preprocessing are kept)
df = df.drop_duplicates(subset=["content"])
print(f"After dropping duplicates: {len(df)}")

# Apply text preprocessing into final_text. `.assign` returns a new frame
# rather than writing a column into a filtered slice, which avoids pandas'
# SettingWithCopyWarning.
df = df.assign(final_text=df["content"].apply(simple_preprocess))
print(f"After text preprocessing: {len(df)}")

# Drop empty/blank rows after preprocessing
df = df[df["final_text"].str.strip().astype(bool)]
print(f"After removing blank processed_text rows: {len(df)}")

# Save clean data under a timestamped name so earlier runs are not overwritten
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")
output_path = Path("../data/processed") / f"{date_today}_minimal_clean_merged_tweets.csv"
# Create the target folder if needed so the run does not fail at the last step.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Show preview
df.head()

Initial rows: 499123
After sampling: 499123
After column filtering: 499123
After dropping missing content: 499123
After dropping duplicates: 497007
After text preprocessing: 497007
After removing blank processed_text rows: 497007


Unnamed: 0,date,content,final_text
0,2022-04-29 23:59:28+00:00,"@KunstJonas ""Singing lullabies like 'Twinkle T...","""singing lullabies like 'twinkle twinkle littl..."
1,2022-04-29 23:58:50+00:00,"Useful, but one of the problems with Long COVI...","useful, but one of the problems with long covi..."
2,2022-04-29 23:58:28+00:00,@AnneMaine3 @Reactively @freemarketrules @Dail...,i am but i know people with long covid who are...
3,2022-04-29 23:58:22+00:00,@NSWHealth Dr Moy ( VP AMA)- C19 not done with...,dr moy ( vp ama)- c19 not done with us yet. au...
4,2022-04-29 23:57:56+00:00,@MandateMasksNY @GovKathyHochul @DrMaryTBasset...,over 4000 cases in fingerlakes region last wee...
