# THIS NOTEBOOK IS FOR PREPARING, CHECKING AND CLEANING THE DATASET

Import statements

In [1]:
import praw
import json
import os
from dotenv import load_dotenv
load_dotenv()

True

Build the dataset by scraping the top 100 Reddit posts of a subreddit (in this case r/PHbuildapc).

In [7]:

# Scrape the top posts of the chosen subreddit into reddit_data.json.
#
# The scrape is skipped when the file already exists: other cells in this
# notebook store "index" and "summary" fields in that same file, and an
# unconditional re-scrape would overwrite them. Delete the file to re-scrape.
if os.path.exists("reddit_data.json"):
    with open("reddit_data.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    print("reddit_data.json already exists; skipping scrape (delete the file to re-scrape).")
else:
    # Credentials come from the environment (populated by load_dotenv()).
    reddit = praw.Reddit(
        client_id = os.getenv('CLIENT_ID'),
        client_secret = os.getenv('CLIENT_SECRET'),
        user_agent = "retldr"
    )

    data = []
    subreddit = reddit.subreddit("PHbuildapc") # subreddit of choice

    for submission in subreddit.top(limit=100): # only fetch the top 100 posts
        post = {
            "title": submission.title,
            "selftext": submission.selftext,
            # Keep only entries that are real Comment objects; anything else
            # in the listing (e.g. PRAW's MoreComments placeholders) is skipped.
            "comments": [
                comment.body for comment in submission.comments
                if isinstance(comment, praw.models.Comment)
            ]
        }
        data.append(post)

    # save dataset to json
    with open("reddit_data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)


Open the dataset (json) file

In [2]:
# Load the dataset into `data`. Read-only mode is enough here ('r+' would also
# demand write permission), and the encoding is pinned so the result does not
# depend on the platform default.
with open('reddit_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

Check how many summaries have been added to the dataset

In [4]:
# Report how many posts already carry a "summary" entry, and which ones.
# A post is identified by its "index" field; if that field has not been written
# yet, fall back to the post's position in the list, which is the value the
# notebook's indexing cell assigns anyway. This avoids a KeyError on datasets
# that have not been indexed.
post_w_summary = [
    post.get("index", position)
    for position, post in enumerate(data)
    if "summary" in post
]
summary_counter = len(post_w_summary)

print(f"{summary_counter} posts have summaries added to it")
print(f"posts with index {post_w_summary} have summaries added to it")

31 posts have summaries added to it
posts with index [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30] have summaries added to it


Add index numbers to the dataset (run only once)

In [14]:
# Stamp every post with its position in the list under the "index" key.
for position, post in enumerate(data):
    post['index'] = position

# Write the indexed dataset back to the same JSON file.
with open('reddit_data.json', 'w') as dataset_file:
    json.dump(data, dataset_file, indent=4)

print("index numbers added and saved to json file.")

index numbers added and saved to json file.


Attempt to automate adding summaries to each post entry in the dataset

In [14]:
# Value of the "index" field of the post that should receive the summary below.
# The code later in this cell compares it against each post's "index".
index = 40
# Summary payload stored under the matching post's "summary" key.
# Keys used here: "title", "selftext", "links_to_deals" (list of
# product/price/link dicts), "comments" (list of topic/comment dicts)
# and "sentiment".
# NOTE(review): presumably both `index` and this dict are edited by hand for
# each post before re-running the cell — confirm the intended workflow.
summary = {
  "title": "We are flooded with USED GPUs right now, please be careful ✌️",
  "selftext": "The user discusses the increasing availability of used GPUs, especially those released by miners in the Philippines. They highlight the current prices of used GPUs, such as the 2060 costing 9k PHP and the 2060 Super costing 12k PHP. The user advises potential buyers to thoroughly test used GPUs, checking both their physical appearance and performance under load using stress testing apps like Furmark and Unigine Heaven. They mention that this may be the best time to buy a used GPU if you're willing to take the risk.",
  "links_to_deals": [
    {
      "product": "Used 2060",
      "price": "9k PHP",
      "link": "https://imgur.com/Bj0jwjt"
    },
    {
      "product": "Used 2060 Super",
      "price": "12k PHP",
      "link": "https://www.lazada.com.ph/products/used-colorful-geforce-rtx-2060-super-8g-graphics-card-sp-2176-8gb-gddr6-256bit-1470mhz-1650mhz-dphddvi-video-card-i3342718056-s17004144960.html?clickTrackInfo=query%253A2060super%253Bnid%253A3342718056%253Bsrc%253ALazadaMainSrp%253Brn%253A42ac091acc1b5be0eb1f7fa0438df410%253Bregion%253Aph%253Bsku%253A3342718056_PH%253Bprice%253A12799.00%253Bclient%253Adesktop%253Bsupplier_id%253A100067020%253Basc_category_id%253A5157%253Bitem_id%253A3342718056%253Bsku_id%253A17004144960%253Bshop_id%253A86237&freeshipping=1&fs_ab=2&fuse_fs=1&search=1&spm=a2o4l.searchlist.list.i68.48b27a08Cb5SW7"
    }
  ],
  "comments": [
    {
      "topic": "Wear and tear of mining GPUs",
      "comment": "Users express concerns about the potential wear and tear of GPUs used for mining. They mention that mining GPUs often experience more stress due to prolonged 24/7 use, which could lead to issues like broken capacitors and voltage regulators."
    },
    {
      "topic": "Risk vs reward of buying mining GPUs",
      "comment": "Some users mention that while buying mining GPUs can be risky, there are instances where the cards may still function well, especially if they are priced significantly lower than new ones."
    },
    {
      "topic": "Alternative stress-testing apps",
      "comment": "Others suggest using different stress-testing apps like Unigine Valley and 3DMark Firestrike/Timespy to check GPU performance."
    },
    {
      "topic": "Used 6700XT deal",
      "comment": "One user considers buying a used 6700XT with more than two years of warranty left for only $180 as an appealing option."
    }
  ],
  "sentiment": "The post encourages cautiousness when buying used GPUs, particularly those used for mining. The tone is informative and emphasizes the importance of testing and evaluating the product thoroughly before making a purchase. There is a sense of risk associated with buying used GPUs, but some users also share good deals they have found."
}





# Attach `summary` to every post whose "index" field equals `index`, then save.
# `.get` is used so posts that have no "index" field are simply not matched
# instead of raising KeyError.
matching_posts = [post for post in data if post.get("index") == index]

if not matching_posts:
    # Nothing was changed, so leave the file alone and say so rather than
    # silently rewriting an unchanged dataset.
    print(f"no post with index {index} found; reddit_data.json left unchanged")
else:
    for post in matching_posts:
        post["summary"] = summary

    with open('reddit_data.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)