In [23]:
import pandas as pd
import re

# Locations of the raw exports read by the loading cells below (relative paths).
gdelt_path = "../data/gdelt_raw.csv"      # GDELT export
youtube_path = "../data/youtube_raw.csv"  # YouTube comments export

In [24]:
# Load the raw YouTube export and report its (rows, columns) shape.
yt = pd.read_csv(youtube_path)
yt_raw_shape = yt.shape
print("YouTube raw shape:", yt_raw_shape)

YouTube raw shape: (99620, 9)


In [25]:
# Load the raw GDELT export and report its (rows, columns) shape.
gd = pd.read_csv(gdelt_path)
gd_raw_shape = gd.shape
print("GDELT raw shape:", gd_raw_shape)

GDELT raw shape: (81477, 10)


In [26]:
# Discard columns in which every value is missing; rows are left untouched.
yt = yt.dropna(axis="columns", how="all")
yt_shape_after_column_drop = yt.shape
print("YouTube after dropping empty columns:", yt_shape_after_column_drop)

YouTube after dropping empty columns: (99620, 9)


In [27]:
# Discard columns in which every value is missing; rows are left untouched.
gd = gd.dropna(axis="columns", how="all")
gd_shape_after_column_drop = gd.shape
print("GDELT after dropping empty columns:", gd_shape_after_column_drop)

GDELT after dropping empty columns: (81477, 6)


In [28]:
# Parse the compact `published_at` strings (e.g. "20240131T120000Z") into
# timezone-aware UTC datetimes stored in a new column. Values that do not match
# the format become NaT instead of raising, because of errors="coerce".
gdelt_timestamp_format = "%Y%m%dT%H%M%SZ"
gd["published_at_dt"] = pd.to_datetime(
    gd["published_at"], format=gdelt_timestamp_format, utc=True, errors="coerce"
)

In [29]:
# Keep only rows whose `language` equals "english", compared case-insensitively.
# Rows with a missing language compare as False and are therefore dropped too.
# .copy() makes the result an independent frame: a later cell adds a column to
# `gd`, and assigning into the result of a boolean filter can raise
# SettingWithCopyWarning (whether it does depends on whether the parent frame
# is still referenced, so the explicit copy makes the behavior deterministic).
gd = gd[gd["language"].str.lower() == "english"].copy()
print("GDELT after English filter:", gd.shape)

GDELT after English filter: (71419, 7)


In [30]:
def clean_text(text: str) -> str:
    """Normalize a raw text value for length filtering and deduplication.

    The value is lowercased, then URLs (anything starting with ``http`` up to
    the next whitespace), ``@mentions`` and ``#hashtags`` are removed, and
    finally every run of whitespace is collapsed to a single space with
    leading/trailing whitespace stripped.

    Parameters
    ----------
    text
        The raw value. Non-string values are converted with ``str()``.
        Missing scalars (``None``, ``NaN``, ``pd.NA``, ``NaT``) are treated as
        empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing.
    """
    # Without this guard str() would turn a missing value into the literal
    # word "nan"/"none", which then looks like real text to later steps.
    # is_scalar() keeps pd.isna() from returning an array for list-like input.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)       # remove URLs
    text = re.sub(r"@\w+", "", text)          # remove mentions
    text = re.sub(r"#\w+", "", text)          # remove hashtags
    text = re.sub(r"\s+", " ", text).strip()  # normalize spaces
    return text

# Add a normalized-text column to each dataset by running clean_text on every
# value; the original `title` / `comment_text` columns are left as they are.
gd["clean_title"] = gd["title"].map(clean_text)
yt["clean_comment_text"] = yt["comment_text"].map(clean_text)

In [31]:
# Keep only comments whose cleaned text is longer than 10 characters.
yt_long_enough = yt["clean_comment_text"].str.len().gt(10)
yt = yt.loc[yt_long_enough]
print("YouTube after short-text removal:", yt.shape)

YouTube after short-text removal: (92170, 10)


In [32]:
# Keep only rows whose cleaned title is longer than 10 characters.
gd_long_enough = gd["clean_title"].str.len().gt(10)
gd = gd.loc[gd_long_enough]
print("GDELT after short-text removal:", gd.shape)

GDELT after short-text removal: (71314, 8)


In [33]:
# Rows sharing the same cleaned comment text count as duplicates; the first
# occurrence is kept and later repeats are dropped.
yt = yt.drop_duplicates(subset="clean_comment_text", keep="first")
yt_shape_after_dedup = yt.shape
print("YouTube after deduplication:", yt_shape_after_dedup)

YouTube after deduplication: (88436, 10)


In [34]:
# Rows sharing the same cleaned title count as duplicates; the first
# occurrence is kept and later repeats are dropped.
gd = gd.drop_duplicates(subset="clean_title", keep="first")
gd_shape_after_dedup = gd.shape
print("GDELT after deduplication:", gd_shape_after_dedup)

GDELT after deduplication: (53248, 8)


In [35]:
# Count the missing values remaining in each YouTube column after cleaning.
yt_missing_per_column = yt.isna().sum()
print("YouTube missing values:\n", yt_missing_per_column)

YouTube missing values:
 video_id                0
video_title             0
channel                 0
video_published_at      0
comment_id              0
comment_text            0
comment_likes           0
comment_published_at    0
keyword                 0
clean_comment_text      0
dtype: int64


In [36]:
# Count the missing values remaining in each GDELT column after cleaning.
gd_missing_per_column = gd.isna().sum()
print("\nGDELT missing values:\n", gd_missing_per_column)


GDELT missing values:
 url                0
title              0
language           0
published_at       0
keyword            0
source             0
published_at_dt    0
clean_title        0
dtype: int64


In [37]:
# Write each cleaned dataset to CSV; index=False leaves the row index out of the file.
yt_clean_path = "../data/youtube_clean.csv"
yt.to_csv(yt_clean_path, index=False)

gd_clean_path = "../data/gdelt_clean.csv"
gd.to_csv(gd_clean_path, index=False)

# The last expression echoes both output paths as the cell result.
(yt_clean_path, gd_clean_path)


('../data/youtube_clean.csv', '../data/gdelt_clean.csv')