# Loading Dataset

In [2]:
import pandas as pd

# Read the three CSV exports into separate frames (df1, df2, df3), in file order.
df1, df2, df3 = map(pd.read_csv, ("data_1.csv", "data_2.csv", "data_3.csv"))
df3 = pd.read_csv("data_3.csv")

In [3]:
# Stack the three frames vertically; the original row labels are discarded and
# replaced with a fresh 0..n-1 index.
merged_df = pd.concat((df1, df2, df3), axis=0, ignore_index=True)

In [9]:
# Keep only the first row seen for each distinct "Title" value.
df_unique = merged_df.drop_duplicates(subset=["Title"], keep="first")

In [24]:
# Discard rows whose "Title" is missing; the surviving rows keep their index labels.
df = df_unique.dropna(axis="index", how="any", subset=["Title"])

In [25]:
# Show the frame left after de-duplicating on "Title" and dropping missing titles.
df

Unnamed: 0,Query Id,Query Name,Date,Title,Snippet,Url,Domain,Sentiment,Page Type,Language,...,Subreddit,Subreddit NSFW,Subreddit Subscribers,Subreddit Topics,Subscriptions,Tiktok Comments,Tiktok Likes,Tiktok Shares,Weblog Title,Emotion
0,2002928223,wildfire,2025-01-11 04:59:42.0,RT @npdcoalition Are you ready to manage burns...,RT @npdcoalition Are you ready to manage burns...,http://twitter.com/pfrostamis/statuses/1877943...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,"pfrostamis (Patricia Frost PNP, Pediatric Disa...",
1,2002928223,wildfire,2025-01-11 04:59:34.0,RT @MattWallace888 üö®üö®üö® CALIFORNIA FIRE PLANE C...,RT @MattWallace888 üö®üö®üö® CALIFORNIA FIRE PLANE C...,http://twitter.com/mbmenlo/statuses/1877943412...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,mbmenlo (Mary Bethüá∫üá∏ üá∫üá∏üá∫üá∏),
2,2002928223,wildfire,2025-01-11 04:59:32.0,RT @dog_rates This dog was found near a devast...,RT @dog_rates This dog was found near a devast...,http://twitter.com/Ogre_42/statuses/1877943404...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,Ogre_42 (EMJ-Ogre_42),
3,2002928223,wildfire,2025-01-11 04:59:17.0,RT @GavinNewsom California is preventing insur...,RT @GavinNewsom California is preventing insur...,http://twitter.com/GellerLaurie/statuses/18779...,twitter.com,negative,twitter,en,...,,False,,,,0,0,0,GellerLaurie (Laurie Geller ‚òÆÔ∏è),
4,2002928223,wildfire,2025-01-11 04:59:16.0,RT @Weather_West Due to increased activity on ...,RT @Weather_West Due to increased activity on ...,http://twitter.com/FreeSpkr/statuses/187794333...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,FreeSpkr (Cheri Hierbaum),
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131263,2002928223,wildfire,2025-01-18 20:00:55.0,No Doubt to reunite for performance at FireAid...,No Doubt to reunite for performance at FireAid...,http://twitter.com/THR/statuses/18807069578220...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,THR (The Hollywood Reporter),
131267,2002928223,wildfire,2025-01-18 20:00:03.0,@TMZ Full story üîó https://t.co/eIs7xovUh8,@TMZ Full story üîó https://t.co/eIs7xovUh8,http://twitter.com/TMZ/statuses/18807067394497...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,TMZ (TMZ),Joy
131268,2002928223,wildfire,2025-01-18 20:00:02.0,Vanessa Bryant gifted L.A. wildfire victims Ni...,Vanessa Bryant gifted L.A. wildfire victims Ni...,http://twitter.com/TMZ/statuses/18807067378476...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,TMZ (TMZ),
131269,2002928223,wildfire,2025-01-18 20:00:01.0,JP Saxe Announces His Single Release Show Is N...,JP Saxe Announces His Single Release Show Is N...,http://twitter.com/StarlightPR1/statuses/18807...,twitter.com,neutral,twitter,en,...,,False,,,,0,0,0,StarlightPR1 (Starlight PR‚Ñ¢),


# Cleaning Dataset

In [19]:
import re
import unicodedata
import pandas as pd
from typing import List

# ====== Load & Prepare Source ======
# Build the working frame from `df` (the de-duplicated frame with missing titles dropped).
# `assign` returns a new frame, so columns added from here on never touch `df` itself.
# `raw_text` is the "Title" column cast to str; every later step reads from it.
df_source = df.assign(raw_text=df["Title"].astype(str))

# ====== Regex Helpers ======
url_re = re.compile(r"(https?://\S+|www\.\S+)", flags=re.IGNORECASE)
mention_re = re.compile(r"@\w+")
rt_re = re.compile(r"\bRT\b:?")          # remove "RT" (optional trailing colon)
hashtag_token_re = re.compile(r"#\w+", flags=re.UNICODE)  # remove hashtags from text to avoid tag influence
# Invisible characters that get replaced by a plain space: zero-width space (U+200B),
# zero-width non-joiner (U+200C), BOM / zero-width no-break space (U+FEFF) and NBSP (U+00A0).
# U+200D (zero-width joiner) is deliberately NOT listed: it glues multi-codepoint emoji
# together, and replacing it would split one emoji into several unrelated ones.
zw_nbsp_re = re.compile(r"[\u200B\u200C\uFEFF\u00A0]")

# ====== Cleaning Function (KEEP emojis) ======
def clean_text_title_llm(text: str) -> str:
    """Normalize a title string while leaving emojis intact.

    Steps, in order:
      1. NFKC Unicode normalization.
      2. Zero-width characters / NBSP become a space (the zero-width joiner is
         kept so that joined emoji sequences survive).
      3. URLs become the token ``[URL]`` and ``@mentions`` become ``[USER]``.
      4. Stand-alone ``RT`` markers (with an optional trailing colon) are removed.
      5. ``#hashtag`` tokens are removed from the text.
      6. Runs of spaces/tabs collapse to one space, whitespace around newlines
         is trimmed, and the result is stripped.

    Args:
        text: The raw title text.

    Returns:
        The cleaned text.
    """
    # 1) Unicode normalize
    text = unicodedata.normalize("NFKC", text)
    # 2) Replace invisible separators with a space (ZWJ is preserved, see zw_nbsp_re)
    text = zw_nbsp_re.sub(" ", text)
    # 3) Normalize URLs and mentions; URLs go first so an "@" inside a URL is not
    #    turned into a [USER] token
    text = url_re.sub(" [URL] ", text)
    text = mention_re.sub(" [USER] ", text)
    # 4) Remove retweet markers
    text = rt_re.sub(" ", text)
    # 5) Remove hashtags from the text
    text = hashtag_token_re.sub(" ", text)
    # 6) Emojis are intentionally left untouched
    # 7) Whitespace tidy
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\s*\n\s*", "\n", text)
    return text.strip()

# ====== Hashtag Extraction (saved separately to keep optional use) ======
# Extract from text that has gone through the same first steps as the cleaner
# (NFKC normalization, URLs masked). This keeps the saved tags in line with what
# hashtag_token_re strips from clean_text: a full-width "＃tag" is captured, and a
# "#fragment" inside a URL is not mistaken for a hashtag.
hashtag_source = (
    df_source["raw_text"]
    .str.normalize("NFKC")
    .str.replace(url_re, " ", regex=True)
)

# ASCII-only tags. The negative lookahead rejects a match that would stop in the
# middle of a longer token (e.g. "caf" out of "#caféfire").
hashtags_simple = hashtag_source.str.findall(r"#([A-Za-z0-9_]+)(?!\w)")
# Unicode-aware tags: same token shape (# followed by \w+) that hashtag_token_re removes,
# so "#LA_Fires" is captured whole instead of being cut at the underscore.
hashtags_extra = hashtag_source.str.findall(r"#(\w+)")

def merge_hashtags(a: List[str] | None, b: List[str] | None) -> list[str]:
    """Return the sorted union of two hashtag lists.

    Args:
        a: First list of hashtag strings, or None (treated as empty).
        b: Second list of hashtag strings, or None (treated as empty).

    Returns:
        A sorted list of the distinct hashtags found in either input.
    """
    s = set((a or []) + (b or []))
    return sorted(s)

# One list of distinct, sorted hashtags per row, aligned positionally with df_source.
df_source["Hashtags"] = [merge_hashtags(a, b) for a, b in zip(hashtags_simple, hashtags_extra)]

# ====== Apply Cleaning ======
# Destination for the cleaned table.
output_path = "data_clean_title_llm.csv"

# Run the cleaner over every raw title and store the result alongside the raw text.
cleaned_titles = df_source["raw_text"].map(clean_text_title_llm)
df_source["clean_text"] = cleaned_titles

# Persist the whole frame; the row index is not written.
df_source.to_csv(output_path, index=False)

In [22]:
# Show the working frame, which by now also carries the raw_text, Hashtags and clean_text columns.
df_source

Unnamed: 0,Query Id,Query Name,Date,Title,Snippet,Url,Domain,Sentiment,Page Type,Language,...,Subreddit Subscribers,Subreddit Topics,Subscriptions,Tiktok Comments,Tiktok Likes,Tiktok Shares,Weblog Title,Emotion,raw_text,clean_text
0,2002928223,wildfire,2025-01-11 04:59:42.0,RT @npdcoalition Are you ready to manage burns...,RT @npdcoalition Are you ready to manage burns...,http://twitter.com/pfrostamis/statuses/1877943...,twitter.com,neutral,twitter,en,...,,,,0,0,0,"pfrostamis (Patricia Frost PNP, Pediatric Disa...",,RT @npdcoalition Are you ready to manage burns...,[USER] Are you ready to manage burns? This res...
1,2002928223,wildfire,2025-01-11 04:59:34.0,RT @MattWallace888 üö®üö®üö® CALIFORNIA FIRE PLANE C...,RT @MattWallace888 üö®üö®üö® CALIFORNIA FIRE PLANE C...,http://twitter.com/mbmenlo/statuses/1877943412...,twitter.com,neutral,twitter,en,...,,,,0,0,0,mbmenlo (Mary Bethüá∫üá∏ üá∫üá∏üá∫üá∏),,RT @MattWallace888 üö®üö®üö® CALIFORNIA FIRE PLANE C...,[USER] üö®üö®üö® CALIFORNIA FIRE PLANE CAUGHT ON CAM...
2,2002928223,wildfire,2025-01-11 04:59:32.0,RT @dog_rates This dog was found near a devast...,RT @dog_rates This dog was found near a devast...,http://twitter.com/Ogre_42/statuses/1877943404...,twitter.com,neutral,twitter,en,...,,,,0,0,0,Ogre_42 (EMJ-Ogre_42),,RT @dog_rates This dog was found near a devast...,[USER] This dog was found near a devastating L...
3,2002928223,wildfire,2025-01-11 04:59:17.0,RT @GavinNewsom California is preventing insur...,RT @GavinNewsom California is preventing insur...,http://twitter.com/GellerLaurie/statuses/18779...,twitter.com,negative,twitter,en,...,,,,0,0,0,GellerLaurie (Laurie Geller ‚òÆÔ∏è),,RT @GavinNewsom California is preventing insur...,[USER] California is preventing insurance comp...
4,2002928223,wildfire,2025-01-11 04:59:16.0,RT @Weather_West Due to increased activity on ...,RT @Weather_West Due to increased activity on ...,http://twitter.com/FreeSpkr/statuses/187794333...,twitter.com,neutral,twitter,en,...,,,,0,0,0,FreeSpkr (Cheri Hierbaum),,RT @Weather_West Due to increased activity on ...,[USER] Due to increased activity on eastern fl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131263,2002928223,wildfire,2025-01-18 20:00:55.0,No Doubt to reunite for performance at FireAid...,No Doubt to reunite for performance at FireAid...,http://twitter.com/THR/statuses/18807069578220...,twitter.com,neutral,twitter,en,...,,,,0,0,0,THR (The Hollywood Reporter),,No Doubt to reunite for performance at FireAid...,No Doubt to reunite for performance at FireAid...
131267,2002928223,wildfire,2025-01-18 20:00:03.0,@TMZ Full story üîó https://t.co/eIs7xovUh8,@TMZ Full story üîó https://t.co/eIs7xovUh8,http://twitter.com/TMZ/statuses/18807067394497...,twitter.com,neutral,twitter,en,...,,,,0,0,0,TMZ (TMZ),Joy,@TMZ Full story üîó https://t.co/eIs7xovUh8,[USER] Full story üîó [URL]
131268,2002928223,wildfire,2025-01-18 20:00:02.0,Vanessa Bryant gifted L.A. wildfire victims Ni...,Vanessa Bryant gifted L.A. wildfire victims Ni...,http://twitter.com/TMZ/statuses/18807067378476...,twitter.com,neutral,twitter,en,...,,,,0,0,0,TMZ (TMZ),,Vanessa Bryant gifted L.A. wildfire victims Ni...,Vanessa Bryant gifted L.A. wildfire victims Ni...
131269,2002928223,wildfire,2025-01-18 20:00:01.0,JP Saxe Announces His Single Release Show Is N...,JP Saxe Announces His Single Release Show Is N...,http://twitter.com/StarlightPR1/statuses/18807...,twitter.com,neutral,twitter,en,...,,,,0,0,0,StarlightPR1 (Starlight PR‚Ñ¢),,JP Saxe Announces His Single Release Show Is N...,JP Saxe Announces His Single Release Show Is N...
