# THIS NOTEBOOK IS FOR PREPARING, CHECKING AND CLEANING THE DATASET

Import statements

In [2]:
# praw is the Reddit API client used for scraping; json and os are used for
# saving the dataset and for reading credentials from the environment.
import praw
import json
import os
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment so
# the os.getenv('CLIENT_ID') / os.getenv('CLIENT_SECRET') lookups in the
# scraping cell can find them.
load_dotenv()

True

Get the dataset by scraping the top 100 Reddit posts in a subreddit (in this case r/PHbuildapc)

In [7]:

# Build the Reddit client from credentials stored in the environment.
reddit = praw.Reddit(
    client_id=os.getenv('CLIENT_ID'),
    client_secret=os.getenv('CLIENT_SECRET'),
    user_agent="retldr",
)

subreddit = reddit.subreddit("PHbuildapc")  # subreddit of choice
data = []

# Walk the top 100 submissions, keeping the title, the body text and the
# bodies of the comments attached to each submission.
for submission in subreddit.top(limit=100):
    title = submission.title
    selftext = submission.selftext

    comment_bodies = []
    for comment in submission.comments:
        # Entries that are not real Comment objects are skipped.
        if not isinstance(comment, praw.models.Comment):
            continue
        comment_bodies.append(comment.body)

    data.append({
        "title": title,
        "selftext": selftext,
        "comments": comment_bodies,
    })

# Persist the scraped posts as pretty-printed JSON.
with open("../data/reddit_data.json", "w") as out_file:
    json.dump(data, out_file, indent=4)


Open the dataset (json) file

In [11]:
# Load the saved dataset back into memory.
# Plain read mode is enough here: 'r+' would additionally require write
# permission on the file even though this cell never writes to it. The
# encoding is pinned so that hand-edited, non-ASCII text in the file is not
# decoded with a platform-dependent default.
with open('../data/reddit_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

Check how many summaries have been added to the dataset

In [4]:
# Report how many posts already have a real summary, and which ones.
# The mere presence of a "summary" key is not enough: the cell that adds
# empty "summary" keys stores None as a placeholder, and those entries must
# not be counted as summarised.
post_w_summary = []
for position, post in enumerate(data):
    if post.get("summary") is None:
        continue
    # Fall back to the list position when the "index" key has not been
    # written yet, instead of failing with a KeyError.
    post_w_summary.append(post.get("index", position))

summary_counter = len(post_w_summary)

print(f"{summary_counter} posts have summaries added to it")
print(f"posts with index {post_w_summary} have summaries added to it")

31 posts have summaries added to it
posts with index [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] have summaries added to it


Add index numbers to the dataset (run only once)

In [10]:
# Tag every post with its zero-based position in the list.
for index_num, post in enumerate(data):
    post['index'] = index_num

# Write the indexed dataset back to disk.
with open('../data/reddit_data.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)

print("index numbers added and saved to json file.")

index numbers added and saved to json file.


Add empty "summary" keys to the dataset

In [28]:
# Give every post from position 46 onwards a "summary" key with a None
# placeholder, then save the dataset.
# NOTE(review): the start position 46 is hard-coded; presumably the earlier
# posts already have the key - confirm before re-running.
first_without_key = 46
for post in data[first_without_key:]:
    # setdefault only fills in the placeholder when the key is missing, so
    # re-running this cell cannot wipe a summary that was already written.
    post.setdefault("summary", None)

with open('../data/reddit_data.json', 'w') as file:
    json.dump(data, file, indent=4)

print("empty summary keys added and saved to json file.")

empty summary keys added and saved to json file.


Attempt to automate adding summaries to each post entry in the dataset

In [12]:
# Attach a hand-written summary to the post whose "index" equals `index`,
# then save the dataset. Edit `index` and `summary` before each run.
index = 55
summary = {
  "title": "Positive Experience Buying Used GPU (RTX 3060) from Facebook Marketplace",
  "selftext": "This post highlights a positive experience with purchasing a used GPU (RTX 3060) from Facebook Marketplace. The user was searching for a GPU for two months and found an excellent deal on an RTX 3060 8GB GAMING OC priced at 11k PHP, much lower than its usual price. The seller offered a personal warranty and even offered to come to the user's house to test the product, making the transaction more convenient. After agreeing on a price, the seller reduced it further to 8.5k PHP, which was considered an unusually good deal.\n\nThe user checked the receipt and found the GPU was originally purchased for 21k PHP in January 2023. After purchasing the GPU, the user had no issues and praised the seller's professionalism. The post ends with a recommendation to explore the used market for PC parts, particularly GPUs.",
  "comments": [
    "Users congratulate the OP and share their own experiences of finding good deals in the used market.",
    "Some users mention the competitive nature of the used market, where people offer significantly lower prices.",
    "Others praise the generosity of sellers who go the extra mile, like offering personal warranties or testing the product on-site."
  ],
  "sentiment": "Positive and encouraging, with a focus on patience, research, and finding great deals in the used market."
}

# Look up the first post carrying the requested index. Using .get avoids a
# KeyError on entries that have no "index" key.
target_post = next(
    (post for post in data if post.get("index") == index),
    None,
)

if target_post is None:
    # Previously a wrong index was silently ignored and the file was rewritten
    # anyway; make the miss visible and leave the file alone instead.
    print(f"no post with index {index} found; summary was not saved.")
else:
    target_post["summary"] = summary
    with open('../data/reddit_data.json', 'w') as file:
        json.dump(data, file, indent=4)