### Import Libraries and Load Data

In [21]:
import re
import unicodedata

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the collected data: one CSV file per subreddit, read in the order listed
csv_paths = (
    "news_data.csv",
    "worldnews_data.csv",
    "politics_data.csv",
    "technology_data.csv",
    "worldpolitics_data.csv",
    "TrueReddit_data.csv",
)
(
    df_news,
    df_worldnews,
    df_politics,
    df_technology,
    df_worldpolitics,
    df_TrueReddit,
) = [pd.read_csv(path) for path in csv_paths]

# Stack all six frames into one DataFrame; ignore_index=True renumbers the rows 0..n-1
subreddit_frames = [
    df_news,
    df_worldnews,
    df_politics,
    df_technology,
    df_worldpolitics,
    df_TrueReddit,
]
df = pd.concat(subreddit_frames, ignore_index=True)

# Show the combined shape followed by the first rows
print("Dataset shape:", df.shape)
df.head()


Dataset shape: (3000, 6)


Unnamed: 0,title,score,url,created_utc,num_comments,subreddit
0,Joe Biden elected president of the United States,365130,https://apnews.com/article/election-2020-joe-b...,1604767000.0,28247,news
1,"Chauvin found guilty of murder, manslaughter i...",250275,https://kstp.com/news/former-minneapolis-polic...,1618953000.0,27554,news
2,President Donald Trump says he has tested posi...,233315,https://www.cnbc.com/2020/10/02/president-dona...,1601615000.0,33153,news
3,Blizzard Employees Staged a Walkout After the ...,226336,https://www.thedailybeast.com/blizzard-employe...,1570654000.0,9395,news
4,Trump has left the White House for the last ti...,222635,https://edition.cnn.com/politics/live-news/bid...,1611149000.0,11633,news


### Clean and Keep Relevant Columns

In [22]:
# List the column labels of the combined frame before selecting a subset
column_index = df.columns
print("Columns:", column_index)

Columns: Index(['title', 'score', 'url', 'created_utc', 'num_comments', 'subreddit'], dtype='object')


In [23]:
# Restrict the frame to the columns relevant to the analysis
columns_to_keep = ['title', 'score', 'num_comments', 'created_utc', 'subreddit']
df = df.loc[:, columns_to_keep]

# Tally the missing (NaN) entries column by column
missing_counts = df.isnull().sum()

# Print the heading and the per-column counts on separate lines
print("Missing values per column:", missing_counts, sep="\n")

Missing values per column:
title           0
score           0
num_comments    0
created_utc     0
subreddit       0
dtype: int64


In [24]:
# Flag every row whose (subreddit, title) pair occurs more than once.
# keep=False marks all occurrences, so the count printed below is the number
# of rows involved in a repost, not the number of distinct repeated titles.
dup_mask = df.duplicated(subset=['subreddit', 'title'], keep=False)

# Pull out the flagged rows, ordered so that copies of a title sit next to each other
duplicates = df.loc[dup_mask].sort_values(by=['subreddit', 'title'])

# Report the count and preview the first ten rows
print(f"Number of duplicate titles within subreddits: {len(duplicates)}")
duplicates.head(10)

Number of duplicate titles within subreddits: 23


Unnamed: 0,title,score,num_comments,created_utc,subreddit
2556,Anne Frank and her family were also denied ent...,3909,341,1485631000.0,TrueReddit
2723,Anne Frank and her family were also denied ent...,2518,396,1448467000.0,TrueReddit
2548,Study Reveals It Costs Less to Give the Homele...,4104,731,1404391000.0,TrueReddit
2691,Study Reveals It Costs Less to Give the Homele...,2697,194,1596846000.0,TrueReddit
1046,Georgia Judge Throws Out Trump Campaign Lawsui...,106398,2597,1604594000.0,politics
1093,Georgia Judge Throws Out Trump Campaign Lawsui...,97318,4093,1604872000.0,politics
1060,Warren reintroduces bill to bar lawmakers from...,101880,2341,1608388000.0,politics
1170,Warren reintroduces bill to bar lawmakers from...,89622,1710,1609092000.0,politics
1608,I know you’re tired of hearing about net neutr...,74765,1698,1525528000.0,technology
1907,I know you’re tired of hearing about net neutr...,53861,1256,1526400000.0,technology


In [25]:
# Reposts share a title within a subreddit but differ in num_comments and
# created_utc. Keep the first occurrence of each (subreddit, title) pair and
# discard the later ones; the surviving rows keep their original index labels.
dedup_keys = ['subreddit', 'title']
is_later_copy = df.duplicated(subset=dedup_keys, keep='first')
df = df[~is_later_copy]


### Convert Timestamps

In [26]:
# Interpret 'created_utc' as seconds since the UNIX epoch and turn it into datetimes
created = pd.to_datetime(df['created_utc'], unit='s')
df['created_date'] = created

# Derive a monthly period (e.g. 2020-11) from the datetime for time-based grouping
df['year_month'] = created.dt.to_period('M')

In [27]:
# Sanity check: preview the first five rows, including the two new date columns
df.head(5)

Unnamed: 0,title,score,num_comments,created_utc,subreddit,created_date,year_month
0,Joe Biden elected president of the United States,365130,28247,1604767000.0,news,2020-11-07 16:28:37,2020-11
1,"Chauvin found guilty of murder, manslaughter i...",250275,27554,1618953000.0,news,2021-04-20 21:07:44,2021-04
2,President Donald Trump says he has tested posi...,233315,33153,1601615000.0,news,2020-10-02 05:04:17,2020-10
3,Blizzard Employees Staged a Walkout After the ...,226336,9395,1570654000.0,news,2019-10-09 20:45:17,2019-10
4,Trump has left the White House for the last ti...,222635,11633,1611149000.0,news,2021-01-20 13:16:44,2021-01


### Basic Text Cleaning for NLP

In [28]:
import re

# Compiled once at definition time so that applying clean_text to every title
# does not have to look the patterns up again on each call.
_URL_PATTERN = re.compile(r"http\S+")
_NON_LETTER_PATTERN = re.compile(r"[^a-z\s]")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(text):
    """Normalize a post title into lowercase, letters-only text for NLP.

    Steps, in order:
      1. Non-string input (e.g. NaN or None from a missing title) yields "".
      2. Accented letters are decomposed (Unicode NFD) so the base letter
         survives the letters-only filter ("café" -> "cafe", not "caf").
      3. The text is lowercased.
      4. URLs (anything starting with "http" up to the next whitespace) are removed.
      5. Every character that is not a-z or whitespace is removed.
      6. Runs of whitespace collapse to one space; the ends are trimmed.

    Parameters
    ----------
    text : str or any
        The raw title. Values that are not ``str`` are treated as missing.

    Returns
    -------
    str
        The cleaned title, possibly empty.
    """
    # Guard against NaN/None: calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ""

    # NFD splits "é" into "e" + a combining accent; the accent is then dropped
    # by the non-letter filter below, leaving the plain ASCII letter behind.
    text = unicodedata.normalize("NFD", text).lower()
    text = _URL_PATTERN.sub("", text)          # remove URLs
    text = _NON_LETTER_PATTERN.sub("", text)   # remove non-letter characters
    return _WHITESPACE_PATTERN.sub(" ", text).strip()  # collapse extra whitespace

# Run clean_text on each title and store the result in a new column
raw_titles = df['title']
df['clean_title'] = raw_titles.map(clean_text)

In [29]:
# Preview the first five rows to confirm the clean_title column looks right
df.head(5)

Unnamed: 0,title,score,num_comments,created_utc,subreddit,created_date,year_month,clean_title
0,Joe Biden elected president of the United States,365130,28247,1604767000.0,news,2020-11-07 16:28:37,2020-11,joe biden elected president of the united states
1,"Chauvin found guilty of murder, manslaughter i...",250275,27554,1618953000.0,news,2021-04-20 21:07:44,2021-04,chauvin found guilty of murder manslaughter in...
2,President Donald Trump says he has tested posi...,233315,33153,1601615000.0,news,2020-10-02 05:04:17,2020-10,president donald trump says he has tested posi...
3,Blizzard Employees Staged a Walkout After the ...,226336,9395,1570654000.0,news,2019-10-09 20:45:17,2019-10,blizzard employees staged a walkout after the ...
4,Trump has left the White House for the last ti...,222635,11633,1611149000.0,news,2021-01-20 13:16:44,2021-01,trump has left the white house for the last ti...


### Save Cleaned Data

In [30]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file
output_path = "cleaned_reddit_posts.csv"
df.to_csv(output_path, index=False)
print("Cleaned data saved successfully.")

Cleaned data saved successfully.
