# THIS NOTEBOOK IS FOR PREPARING, CHECKING AND CLEANING THE DATASET

Import statements

In [3]:
import praw
import json
import os
from dotenv import load_dotenv
load_dotenv()

True

Get the dataset by scraping the top 100 Reddit posts in a subreddit (in this case r/PHbuildapc).

In [7]:

# Reddit client; the app credentials are read from the CLIENT_ID and
# CLIENT_SECRET environment variables.
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent="retldr",
)

subreddit = reddit.subreddit("PHbuildapc")  # subreddit of choice


def _comment_bodies(submission):
    """Return the body text of every praw Comment found in ``submission.comments``.

    Entries of ``submission.comments`` that are not ``praw.models.Comment``
    instances are skipped.
    """
    return [
        entry.body
        for entry in submission.comments
        if isinstance(entry, praw.models.Comment)
    ]


# One record per post; only the top 100 posts of the subreddit are fetched.
data = [
    {
        "title": submission.title,
        "selftext": submission.selftext,
        "comments": _comment_bodies(submission),
    }
    for submission in subreddit.top(limit=100)
]

# Save the scraped posts as indented JSON (overwrites any existing file).
with open("reddit_data.json", "w") as file:
    json.dump(data, file, indent=4)


Open the dataset (JSON) file

In [19]:
# Load the saved posts into `data`.
# The file is only read here, so it is opened in plain read mode ('r') rather
# than read/write ('r+'); this way the cell does not require write permission
# on the file. The encoding is pinned to UTF-8 so decoding does not depend on
# the platform's default locale encoding.
with open('reddit_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

Check how many summaries have been added to the dataset

In [13]:
# Gather the "index" of every post that already has a "summary" key,
# then derive the count from that list.
post_w_summary = [post["index"] for post in data if "summary" in post]
summary_counter = len(post_w_summary)

print(f"{summary_counter} posts have summaries added to it")
print(f"posts with index {post_w_summary} have summaries added to it")

0 posts have summaries added to it
posts with index [] have summaries added to it


Add index numbers to the dataset

In [14]:
# Number the posts 0, 1, 2, ... in their current list order and store the
# number under each post's "index" key.
for index_num, post in enumerate(data):
    post['index'] = index_num

# Write the indexed dataset back to the same JSON file.
with open('reddit_data.json', 'w') as file:
    json.dump(data, file, indent=4)

print("index numbers added and saved to json file.")

index numbers added and saved to json file.


Attempt to automate adding summaries to each post entry in the dataset

In [24]:
# Index of the post the summary below belongs to.
index = 2
summary = {
  "title": {
    "description": "The user is looking for a laptop with decent specs for gaming, particularly for games like Valorant, PUBG, Warframe, and Overwatch 2, with a budget of around 50k to 60k pesos. They are currently experiencing poor FPS performance on their old laptop, resulting in a laggy gaming experience."
  },
  "sections": [
    "Lenovo LOQ and Acer gaming laptops for a budget-friendly option.",
    "Lenovo Legion series with RTX 3050/3050Ti for better performance.",
    "Consider second-hand laptops as they depreciate quickly, which might offer better specs within the budget.",
    "MSI GF63 Thin, Acer Predator, Asus TUF, and Gigabyte laptops were also recommended for their value and specs.",
    "Some users suggested extending the budget to 75k for more options, such as RTX 4060 models.",
    "Several users highlighted the importance of getting a laptop with at least 16GB RAM and a GTX 1650 GPU or better.",
    "A few users also mentioned the option of building a PC for better long-term performance at a lower cost."
  ]
}


def add_summary(posts, post_index, post_summary):
    """Attach ``post_summary`` to every post whose "index" equals ``post_index``.

    Args:
        posts: list of post dicts (mutated in place).
        post_index: value to compare against each post's "index" entry.
        post_summary: object stored under the post's "summary" key.

    Returns:
        The number of posts that were updated (0 when nothing matched).
    """
    updated = 0
    for post in posts:
        # .get() instead of [] so that a post without an "index" key is
        # skipped instead of aborting the whole cell with a KeyError.
        if post.get("index") == post_index:
            post["summary"] = post_summary
            updated += 1
    return updated


updated_count = add_summary(data, index, summary)

# The dataset is written back in every case, exactly as before.
with open('reddit_data.json', 'w') as file:
    json.dump(data, file, indent=4)

# Report the outcome so a wrong or missing index does not go unnoticed.
if updated_count:
    print(f"summary added to {updated_count} post(s) with index {index} and saved to json file.")
else:
    print(f"no post with index {index} was found; no summary was added.")