Preprocess all comment files in the directory: remove NaNs, duplicates, and deleted/removed comments; normalize author names; convert all text to lowercase; and merge the cleaned comments with the pre-processed submissions into a single CSV file.

This can be quite computationally expensive, so we advise using cloud services for the task.

In [None]:
import pandas as pd
import re
import glob
import os

In [None]:
# Input locations: a directory of per-file comment CSVs and the CSV of
# pre-processed submissions. Replace the /PATH placeholders before running.
comments_dir = "/PATH/comments"
submissions = "/PATH/merged_submissions.csv"

# Destination of the final merged dataset.
output_path = os.path.join("/OUTPUT_PATH", "merged_dataset.csv")

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the files are processed in the same order on every run and machine.
files = sorted(glob.glob(os.path.join(comments_dir, "*.csv")))

# Collects one cleaned DataFrame per comment file.
cleaned_dfs = []

In [None]:
def clean_comments(df: pd.DataFrame) -> pd.DataFrame:
    """Clean one raw comments table.

    Steps, in order:
      1. Drop rows whose ``body`` is missing.
      2. Normalize ``body``: lower-case it, remove every character other
         than ASCII letters, digits, whitespace and ``' . , ! ? -``, collapse
         whitespace runs to a single space, and trim both ends.
      3. Drop rows whose body is empty or is a removed/deleted placeholder.
      4. Normalize ``author``: cast to ``str``, trim, and strip a leading
         ``u/`` prefix.
      5. Drop duplicate rows on (``author``, ``body``, ``created``).

    Args:
        df: Frame with at least ``author``, ``body`` and ``created`` columns.

    Returns:
        A new cleaned DataFrame; the input frame is not modified. Surviving
        rows keep their original index labels.
    """
    df = df.dropna(subset=["body"]).copy()

    df["body"] = (
        df["body"]
        .astype(str)
        .str.lower()
        # Filter characters first: removing a symbol between two spaces (or
        # at either end) would otherwise leave double / leading / trailing
        # whitespace behind.
        .str.replace(r"[^a-zA-Z0-9\s'.,!?-]", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

    # The character filter strips square brackets, so "[removed]" and
    # "[deleted]" arrive here as "removed" / "deleted". Bodies that consisted
    # only of filtered characters are now empty and carry no text either.
    df = df[~df["body"].isin(["", "removed", "deleted"])].copy()

    # NOTE(review): depending on the pandas version, astype(str) may turn a
    # missing author into the literal string "nan" - confirm this is acceptable.
    df["author"] = df["author"].astype(str).str.strip().str.replace(r"^u/", "", regex=True)

    # De-duplicate on the normalized values so rows that differ only in case,
    # whitespace or a "u/" prefix collapse into one (first occurrence kept).
    df = df.drop_duplicates(subset=["author", "body", "created"])

    return df

In [None]:
# Show the list of comment CSV paths matched by the glob, as a sanity check.
files

In [None]:
# Load every comment CSV, clean it, and collect the cleaned frame for merging.
for file in files:
    print(f"Cleaning {file} ...")
    cleaned_dfs.append(clean_comments(pd.read_csv(file)))

In [None]:
# Stack all cleaned per-file frames into one table; ignore_index=True discards
# the per-file index labels and renumbers rows from 0.
merged_comments = pd.concat(cleaned_dfs, ignore_index=True)

In [None]:
# Load the submissions CSV and rename its "text" column to "body" so the
# column name is consistent with the comments table.
submissions_df = pd.read_csv(submissions).rename(columns={"text": "body"})

In [None]:
# Stack comments on top of submissions, then keep only the first occurrence of
# each (author, body, created) combination.
merged_final = pd.concat(
    [merged_comments, submissions_df],
    ignore_index=True,
).drop_duplicates(subset=["author", "body", "created"])

In [None]:
# Parse "created" into datetimes (values that cannot be parsed become NaT),
# then order rows from oldest to newest and renumber the index from 0.
merged_final = (
    merged_final
    .assign(created=pd.to_datetime(merged_final["created"], errors="coerce"))
    .sort_values(by="created", ascending=True)
    .reset_index(drop=True)
)

In [None]:
# Total number of rows (comments plus submissions) in the merged dataset.
len(merged_final)

In [None]:
# Count how many columns there are of each dtype.
merged_final.dtypes.value_counts()

In [None]:
# Rows with a non-null "title" are counted as submissions; all other rows are
# counted as comments.
has_title = merged_final["title"].notna()
num_posts = has_title.sum()
num_comments = (~has_title).sum()

print(f"Comments: {num_comments:,}")
print(f"Submissions: {num_posts:,}")

In [None]:
# Print a summary of the merged frame: columns, non-null counts, dtypes, memory.
merged_final.info()

In [None]:
# Preview the first rows (the earliest "created" values after sorting).
merged_final.head()

In [None]:
# Write the merged dataset to output_path; index=False omits the row index column.
merged_final.to_csv(output_path, index=False)