In [1]:
import re
import emoji
import pandas as pd

In [3]:
# Patterns used by clean_tweet, compiled once at definition time.
_URL_RE = re.compile(r"http\S+")
_HANDLE_RE = re.compile(r"@\w+")
# Textual emoji alias such as ":face_with_tears_of_joy:".  The lookahead
# demands at least one ASCII letter between the colons so that purely numeric
# colon-delimited text (clock times like "10:30:45", ratios like "1:2:3")
# is not mistaken for an alias and silently deleted.
_EMOJI_ALIAS_RE = re.compile(r":(?=[a-zA-Z0-9_&\-]*[a-zA-Z])[a-zA-Z0-9_&\-]+:")
_WHITESPACE_RE = re.compile(r"\s+")
_DOT_RUN_RE = re.compile(r"\.{3,}")


def clean_tweet(text: str) -> str:
    """Normalize a raw tweet string for downstream text processing.

    Steps, in order:
      1. Every token starting with ``http`` is replaced by the literal ``http``.
      2. Every ``@handle`` is replaced by ``@user``.
      3. Textual emoji aliases (``:some_name:``) are removed.  An alias must
         contain at least one letter, so times such as ``10:30:45`` survive.
         As a consequence, all-digit shortcodes (e.g. ``:100:``) are kept.
      4. Unicode emoji are removed via ``emoji.replace_emoji``.
      5. Pipe characters are turned into spaces.
      6. Whitespace runs collapse to one space; runs of 3+ dots become ``...``.
      7. Leading and trailing whitespace is stripped.

    Args:
        text: The raw tweet.  Any non-``str`` value (e.g. NaN from pandas)
            is treated as missing.

    Returns:
        The cleaned tweet, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = _URL_RE.sub("http", text)
    text = _HANDLE_RE.sub("@user", text)
    text = _EMOJI_ALIAS_RE.sub("", text)
    text = emoji.replace_emoji(text, replace="")

    # A single replacement covers "||" as well: the resulting double space
    # is collapsed by the whitespace normalization just below.
    text = text.replace("|", " ")

    text = _WHITESPACE_RE.sub(" ", text)
    text = _DOT_RUN_RE.sub("...", text)

    return text.strip()

# ----------------------------------------------------------------------
# Example usage
# ----------------------------------------------------------------------
# NOTE(review): INPUT_CSV is an absolute, machine-specific path; point it at
# your own copy of the dataset before running this cell elsewhere.
INPUT_CSV = "/Users/hasancan/Desktop/irony_detection/datasets/additional_dataset_2.csv"
OUTPUT_CSV = "additional_dataset_2_clean_no_emoji.csv"

df = pd.read_csv(INPUT_CSV)
# Keep the raw "text" column and add the normalized version alongside it.
df["text_clean"] = df["text"].apply(clean_tweet)
df.to_csv(OUTPUT_CSV, index=False)
# Report the path that was actually written (the old message named a
# different file than the one passed to to_csv).
print(f"Cleaned dataset saved to {OUTPUT_CSV}")

Cleaned dataset saved to additional_dataset_2_clean.csv
