# THIS NOTEBOOK IS FOR PREPARING, CHECKING AND CLEANING THE DATASET

Import statements

In [2]:
import praw
import json
import os
from dotenv import load_dotenv
load_dotenv()

True

Get the dataset by scraping the top 100 Reddit posts in a subreddit (in this case r/PHbuildapc)

In [7]:

# reddit_data.json also stores the "index" and "summary" fields that later
# cells add to each post. Re-scraping would overwrite the file and discard
# them, so only scrape when the file does not exist yet. Delete or rename
# the file to force a fresh scrape.
if os.path.exists("reddit_data.json"):
    with open("reddit_data.json", "r") as file:
        data = json.load(file)
    print("reddit_data.json already exists; loaded it instead of re-scraping.")
else:
    # Credentials are read from the environment (populated by load_dotenv()).
    reddit = praw.Reddit(
        client_id = os.getenv('CLIENT_ID'),
        client_secret = os.getenv('CLIENT_SECRET'),
        user_agent = "retldr"
    )

    data = []
    subreddit = reddit.subreddit("PHbuildapc") # subreddit of choice

    for submission in subreddit.top(limit=100): # only fetch the top 100 posts
        post = {
            "title": submission.title,
            "selftext": submission.selftext,
            # Keep only real comments; other entries in the top-level
            # comment listing are filtered out by the isinstance check.
            "comments": [
                comment.body for comment in submission.comments
                if isinstance(comment, praw.models.Comment)
            ]
        }
        data.append(post)

    # save dataset to json
    with open("reddit_data.json", "w") as file:
        json.dump(data, file, indent=4)


Open the dataset (JSON) file

In [3]:
# The dataset is only read here, so open it read-only; 'r+' would also
# request write access and fail on a read-only file for no benefit.
with open('reddit_data.json', 'r') as file:
    data = json.load(file)

Check how many summaries have been added to the dataset

In [5]:
# Collect the "index" of every post that already carries a "summary" entry,
# then derive the count from that list.
post_w_summary = [entry["index"] for entry in data if "summary" in entry]
summary_counter = len(post_w_summary)

print(f"{summary_counter} posts have summaries added to it")
print(f"posts with index {post_w_summary} have summaries added to it")

22 posts have summaries added to it
posts with index [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] have summaries added to it


Add index numbers to the dataset (run only once)

In [14]:
# Number the posts 0, 1, 2, ... in their current order in the dataset.
for position, entry in enumerate(data):
    entry['index'] = position

# Persist the numbered dataset back to the same JSON file.
with open('reddit_data.json', 'w') as file:
    json.dump(data, file, indent=4)

print("index numbers added and saved to json file.")

index numbers added and saved to json file.


Attempt to automate adding summaries to each post entry in the dataset

In [14]:
# Index of the post that should receive the summary below.
index = 30
# Hand-written structured summary for that post.
summary = {
  "title": "11/11 Deals Cheat Sheet - Lazada & Amazon Offers",
  "post": {
    "overview": "The post provides a cheat sheet with updated deals and offers for 11/11, particularly on Lazada and Amazon. Key highlights include Lazada and Amazon deals, voucher instructions, and a link to a Google Sheets document with more details.",
    "details": {
      "lazada_deals": {
        "special_offers": [
          "0% Spaylater voucher",
          "Ways to maximize coin usage",
          "Free LazRewards for Lazada members (Nov 8-13)"
        ],
        "additional_vouchers": [
          "LAZADA300",
          "LAZADA500"
        ],
        "product_deals": [
          "CPUs",
          "GPUs",
          "SSDs",
          "RAM",
          "Accessories"
        ]
      },
      "amazon_deals": {
        "discounted_components": [
          "RAM",
          "SSDs",
          "Monitors"
        ],
        "product_categories": [
          "Budget-friendly gaming components",
          "Productivity components"
        ]
      },
      "voucher_instructions": "Details on using specific Lazada and Amazon vouchers for better discounts.",
      "link_to_google_sheet": "A link to the updated Google Sheets document containing more details on these offers."
    },
    "user_input_encouraged": "The post invites users to share additional deals that should be included."
  }
}

# Attach the summary to the post whose "index" matches. Index values are
# expected to be unique per post, so stop at the first match.
for post in data:
    if post.get("index") == index:
        post["summary"] = summary
        break
else:
    # No match (wrong index, or index numbers not added yet): fail loudly
    # instead of silently rewriting an unchanged file.
    raise ValueError(f"no post with index {index} found in the dataset")

# Persist the updated dataset back to the same JSON file.
with open('reddit_data.json', 'w') as file:
    json.dump(data, file, indent=4)