# THIS NOTEBOOK IS FOR PREPARING, CHECKING AND CLEANING THE DATASET

Import statements

In [1]:
import praw
import json
import os
from dotenv import load_dotenv
load_dotenv()

True

Get the dataset by scraping the top 100 Reddit posts of a subreddit (in this case r/PHbuildapc)

In [7]:

# Scrape the top posts of one subreddit and store them in a JSON dataset file.
# Each stored post has its title, its selftext and a list of comment bodies.
DATASET_PATH = "reddit_data.json"

# Credentials are read from the environment (populated from .env by load_dotenv()).
reddit = praw.Reddit(
    client_id = os.getenv('CLIENT_ID'),
    client_secret = os.getenv('CLIENT_SECRET'),
    user_agent = "retldr"
)
subreddit = reddit.subreddit("PHbuildapc") # subreddit of choice

if os.path.exists(DATASET_PATH):
    # Other cells in this notebook write "index" and "summary" fields into this
    # same file, so scraping again would overwrite that work. Reuse the saved
    # dataset instead; delete the file to force a fresh scrape.
    with open(DATASET_PATH, "r", encoding="utf-8") as file:
        data = json.load(file)
    print(f"{DATASET_PATH} already exists; loaded {len(data)} posts without scraping")
else:
    data = []
    for submission in subreddit.top(limit=100): # only fetch the top 100 posts
        post = {
            "title": submission.title,
            "selftext": submission.selftext,
            # Keep only praw.models.Comment objects; anything else yielded by
            # submission.comments is skipped (presumably PRAW's "load more
            # comments" placeholders -- confirm against the PRAW docs).
            "comments": [
                comment.body for comment in submission.comments
                if isinstance(comment, praw.models.Comment)
            ]
        }
        data.append(post)

    # save dataset to json
    with open(DATASET_PATH, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)


Open the dataset (json) file

In [2]:
# Load the saved dataset into `data` (a list with one dict per post).
# The file is only read here, so it is opened read-only: the previous 'r+'
# mode also requested write access and would fail on a read-only file.
with open('reddit_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

Check how many summaries have been added to the dataset

In [13]:
# Report which posts already carry a "summary" entry.
# A post is identified by its "index" field; when that field has not been
# added yet, the post's position in `data` is used instead (the indexing cell
# assigns exactly that position), so this check no longer raises KeyError on
# un-indexed data.
post_w_summary = [
    post.get("index", position)
    for position, post in enumerate(data)
    if "summary" in post
]
summary_counter = len(post_w_summary)

print(f"{summary_counter} posts have summaries added to it")
print(f"posts with index {post_w_summary} have summaries added to it")

0 posts have summaries added to it
posts with index [] have summaries added to it


Add index numbers to the dataset

In [14]:
# Give every post an "index" field equal to its position in the dataset,
# then persist the updated dataset back to the JSON file.
for position, entry in enumerate(data):
    entry['index'] = position

serialized = json.dumps(data, indent=4)
with open('reddit_data.json', 'w') as file:
    file.write(serialized)

print("index numbers added and saved to json file.")

index numbers added and saved to json file.


Attempt to automate adding summaries to each post entry in the dataset

In [26]:
# Attach a hand-written summary to the post whose "index" field equals `index`
# and save the updated dataset.
index = 20  # "index" value of the post that `summary` describes
summary = {
  "title": "r/PHBuildaPC Goes Dark in Protest Against Reddit API Pricing",
  "selftext": {
    "description": [
      "This post on r/PHBuildaPC discusses the subreddit going dark indefinitely as a part of a coordinated protest against Reddit's decision to raise API prices, which negatively affects third-party applications (3PAs) like Apollo, Sync, and Reddit is Fun.",
      "The subreddit will remain closed unless Reddit changes its stance on the API pricing.",
      "The post includes a FAQ section addressing concerns about the subreddit’s future, alternatives to Reddit like Tildes, and the broader context of the protest."
    ]
  },
  "comments": [
    {
      "comment": "A comment thanks the mod, u/Ryvaeus, for all the efforts and wishes them luck in future endeavors."
    },
    {
      "comment": "Another user expresses gratitude for the help in building their PC with input from the subreddit."
    },
    {
      "comment": "A commenter shares their sadness about the closure but acknowledges the importance of the protest."
    },
    {
      "comment": "Some users mention they’ve already built their PCs but still feel disappointed by the situation."
    }
  ]
}

# .get() tolerates posts that have no "index" field yet instead of raising KeyError.
matching_posts = [post for post in data if post.get("index") == index]
if not matching_posts:
    # Previously a wrong index was silently ignored while the file was still
    # rewritten, so the missing summary went unnoticed.
    raise ValueError(
        f"no post with index {index} found in the dataset; nothing was saved"
    )

for post in matching_posts:
    post["summary"] = summary

# Only reached when the summary was actually attached.
with open('reddit_data.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)