### Import

In [1]:
from pathlib import Path

import pandas as pd

# Folder that holds one "<subreddit>_data.csv" file per subreddit.
DATA_DIR = Path("../1_OriginalData")

# Subreddits to load; their rows are stacked in this order in the combined frame.
SUBREDDITS = ["news", "worldnews", "politics", "technology", "worldpolitics", "TrueReddit"]

# Load the collected data: one DataFrame per subreddit, keyed by subreddit name
# (e.g. raw_frames["news"]). Adding a subreddit only requires extending SUBREDDITS.
raw_frames = {name: pd.read_csv(DATA_DIR / f"{name}_data.csv") for name in SUBREDDITS}

# Combine all into one DataFrame; ignore_index=True renumbers the rows 0..n-1
# instead of repeating each file's own index.
df = pd.concat(list(raw_frames.values()), ignore_index=True)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (3000, 6)


Unnamed: 0,title,score,url,created_utc,num_comments,subreddit
0,Joe Biden elected president of the United States,365122,https://apnews.com/article/election-2020-joe-b...,1604767000.0,28194,news
1,"Chauvin found guilty of murder, manslaughter i...",250268,https://kstp.com/news/former-minneapolis-polic...,1618953000.0,27550,news
2,President Donald Trump says he has tested posi...,233319,https://www.cnbc.com/2020/10/02/president-dona...,1601615000.0,33133,news
3,Blizzard Employees Staged a Walkout After the ...,226328,https://www.thedailybeast.com/blizzard-employe...,1570654000.0,9392,news
4,Trump has left the White House for the last ti...,222636,https://edition.cnn.com/politics/live-news/bid...,1611149000.0,11623,news


### Clean and Keep Relevant Columns

In [2]:
# Show the column labels of the combined frame before any subsetting.
column_labels = df.columns
print("Columns:", column_labels)

Columns: Index(['title', 'score', 'url', 'created_utc', 'num_comments', 'subreddit'], dtype='object')


In [3]:
# Restrict the frame to the listed columns; every other column is discarded.
columns_to_keep = ['title', 'score', 'num_comments', 'created_utc', 'subreddit']
df = df.loc[:, columns_to_keep]

# Tally the missing (NaN/None) entries column by column and report them.
missing_counts = df.isnull().sum()
print("Missing values per column:", missing_counts, sep="\n")

Missing values per column:
title           0
score           0
num_comments    0
created_utc     0
subreddit       0
dtype: int64


In [4]:
# Flag every row whose (subreddit, title) pair occurs more than once.
# keep=False marks all members of a repeated group, not only the later ones.
dup_mask = df.duplicated(subset=['subreddit', 'title'], keep=False)

# Pull the flagged rows and order them so repeats sit next to each other.
duplicates = df.loc[dup_mask].sort_values(by=['subreddit', 'title'])

# The figure printed here is the number of rows that take part in a repeat,
# not the number of distinct repeated titles.
print(f"Number of duplicate titles within subreddits: {len(duplicates)}")
duplicates.head(10)

Number of duplicate titles within subreddits: 23


Unnamed: 0,title,score,num_comments,created_utc,subreddit
2562,Anne Frank and her family were also denied ent...,3912,341,1485631000.0,TrueReddit
2732,Anne Frank and her family were also denied ent...,2518,396,1448467000.0,TrueReddit
2553,Study Reveals It Costs Less to Give the Homele...,4107,731,1404391000.0,TrueReddit
2700,Study Reveals It Costs Less to Give the Homele...,2697,194,1596846000.0,TrueReddit
1046,Georgia Judge Throws Out Trump Campaign Lawsui...,106393,2597,1604594000.0,politics
1093,Georgia Judge Throws Out Trump Campaign Lawsui...,97311,4093,1604872000.0,politics
1060,Warren reintroduces bill to bar lawmakers from...,101877,2341,1608388000.0,politics
1170,Warren reintroduces bill to bar lawmakers from...,89617,1709,1609092000.0,politics
1609,I know you’re tired of hearing about net neutr...,74764,1690,1525528000.0,technology
1911,I know you’re tired of hearing about net neutr...,53856,1256,1526400000.0,technology


In [5]:
# Within each subreddit, keep only the first row seen for a given title:
# `is_repeat` is True for the second and later occurrences, which are filtered out.
is_repeat = df.duplicated(subset=['subreddit', 'title'], keep='first')
df = df[~is_repeat]

### Convert Timestamps

In [6]:
# Interpret 'created_utc' as seconds since the Unix epoch and store it as datetimes.
timestamps = pd.to_datetime(df['created_utc'], unit='s')
df['created_date'] = timestamps

# Monthly period (e.g. 2020-11) derived from the same timestamps, for time-based grouping.
df['year_month'] = timestamps.dt.to_period('M')

df.head()

Unnamed: 0,title,score,num_comments,created_utc,subreddit,created_date,year_month
0,Joe Biden elected president of the United States,365122,28194,1604767000.0,news,2020-11-07 16:28:37,2020-11
1,"Chauvin found guilty of murder, manslaughter i...",250268,27550,1618953000.0,news,2021-04-20 21:07:44,2021-04
2,President Donald Trump says he has tested posi...,233319,33133,1601615000.0,news,2020-10-02 05:04:17,2020-10
3,Blizzard Employees Staged a Walkout After the ...,226328,9392,1570654000.0,news,2019-10-09 20:45:17,2019-10
4,Trump has left the White House for the last ti...,222636,11623,1611149000.0,news,2021-01-20 13:16:44,2021-01


### Basic Text Cleaning for NLP

In [7]:
# Basic text cleanup
import re

def clean_text(text):
    """Normalize a title to lowercase a-z words separated by single spaces.

    Steps, in order: lowercase, strip URLs, delete every character that is
    not a-z or whitespace (digits, punctuation, apostrophes and accented
    letters are removed, not replaced by a space), then collapse whitespace.

    Parameters
    ----------
    text : str
        Raw title. Any non-string value (e.g. NaN from a missing title) is
        tolerated.

    Returns
    -------
    str or None
        The cleaned title, or None when `text` is not a string. Returning
        None instead of raising lets a later isinstance(..., str) check
        count and drop such rows, and mirrors the isinstance guard used by
        clean_and_tokenize.
    """
    if not isinstance(text, str):
        return None

    text = text.lower()
    text = re.sub(r"http\S+", "", text)             # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)            # remove non-letter characters
    text = re.sub(r"\s+", " ", text).strip()        # remove extra whitespace
    return text

# Run clean_text over every title, element by element, and store the result.
df['clean_title'] = df['title'].map(clean_text)

In [8]:
# Sanity check: mark rows whose clean_title is a str, report how many are not,
# then keep only the string rows (as an independent copy).
is_string = df["clean_title"].map(lambda value: isinstance(value, str))
bad_rows = df[~is_string]
print(f"Non-string clean_title rows dropped: {len(bad_rows)}")

df = df[is_string].copy()

Non-string clean_title rows dropped: 0


In [9]:
# Tokenization and stopword removal
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus if it is not already present locally.
nltk.download('stopwords')

# English stop words shipped with NLTK.
# NOTE(review): clean_text deletes apostrophes before tokenization, so any
# apostrophe-containing entries in this list (presumably forms like "don't")
# can never match a token -- which is likely why 'dont' is listed by hand
# below. Confirm whether other contractions should be covered as well.
BASE_STOP   = set(stopwords.words('english'))
# Extra words excluded on top of the NLTK list (hand-picked for this dataset).
CUSTOM_STOP = {
    'schlop','upvote','fuck','says','dont','get','like','one','us','man',
    'people','years','year','america','white','black',
    'make','take','know','think','going','want'
}

def clean_and_tokenize(text: str) -> list[str]:
    """Split `text` into lowercase word tokens and drop stop words.

    A token is a run of ASCII letters and apostrophes. Tokens found in
    BASE_STOP or CUSTOM_STOP are discarded; the remaining tokens are
    returned in their original order. Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    kept = []
    for word in re.findall(r"\b[a-zA-Z']+\b", text.lower()):
        # Skip anything listed in either stop-word set.
        if word in BASE_STOP or word in CUSTOM_STOP:
            continue
        kept.append(word)
    return kept

# Tokenize every cleaned title, then keep only the str entries of each token list.
token_lists = df['clean_title'].map(clean_and_tokenize)
df['tokens'] = token_lists.map(lambda toks: [tok for tok in toks if isinstance(tok, str)])

# Keep only rows that still have at least one token (as an independent copy).
has_tokens = df['tokens'].str.len() > 0
df = df[has_tokens].copy()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/marynakyslytsyna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Save Cleaned Data

In [10]:
# Write the cleaned frame to CSV in the notebook's working directory, without the index.
# NOTE(review): 'tokens' holds Python lists, which CSV stores as their text
# representation -- confirm that downstream readers parse them back.
output_path = "cleaned_reddit_posts.csv"
df.to_csv(output_path, index=False)
print("Cleaned data saved successfully.")

Cleaned data saved successfully.
