# THIS NOTEBOOK IS FOR PREPARING, CHECKING AND CLEANING THE DATASET

Import statements

In [2]:
import praw
import json
import os
from dotenv import load_dotenv
load_dotenv()

True

Get the dataset by scraping the top posts of a subreddit (the cell below currently fetches the top 10 posts of r/ShopeePH)

In [2]:

# Scrape settings: which subreddit to read and how many of its top posts to fetch.
# Edit these two values instead of the code below.
SUBREDDIT_NAME = "ShopeePH"
TOP_POST_LIMIT = 10

# Reddit API client; credentials are read from the environment (populated by
# load_dotenv() in the import cell), so no secret is hard-coded here.
reddit = praw.Reddit(
    client_id = os.getenv('CLIENT_ID'),
    client_secret = os.getenv('CLIENT_SECRET'),
    user_agent = "retldr"
)

data = []
subreddit = reddit.subreddit(SUBREDDIT_NAME)

for submission in subreddit.top(limit=TOP_POST_LIMIT):
    post = {
        "title": submission.title,
        "selftext": submission.selftext,
        # Keep the body of every praw Comment; any other object yielded by the
        # comment listing (e.g. "load more" placeholders) is skipped.
        "comments": [
            comment.body for comment in submission.comments
            if isinstance(comment, praw.models.Comment)
        ]
    }
    data.append(post)

# Save the dataset to JSON. The encoding is stated explicitly so the file does
# not depend on the platform default.
with open("../data/reddit_eval.json", "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4)


In [14]:
# Fetch one specific Reddit post and append it to the references file.

# Reddit API client built from the credentials found in the environment.
reddit = praw.Reddit(
    user_agent="retldr",
    client_id=os.getenv('CLIENT_ID'),
    client_secret=os.getenv('CLIENT_SECRET'),
)

post_url = "https://www.reddit.com/r/phtravel/comments/1ieyug7/is_there_a_cheaper_alternative_way_to_go_to/"
submission = reddit.submission(url=post_url)

# Same record layout as the scraped dataset: title, selftext, comments.
post_data = {"title": submission.title, "selftext": submission.selftext}

# Collect comment bodies, skipping anything that is not a praw Comment.
comment_bodies = []
for comment in submission.comments:
    if isinstance(comment, praw.models.Comment):
        comment_bodies.append(comment.body)
post_data["comments"] = comment_bodies

output_file = "../data/references.json"

# Start from the entries already stored on disk, if the file exists and is
# non-empty; otherwise start a fresh list.
data = []
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
    with open(output_file, "r", encoding="utf-8") as file:
        data = json.load(file)

data.append(post_data)

# Rewrite the whole file with the new entry included.
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

print(f"Post data saved to {output_file}")


Post data saved to ../data/references.json


Open the dataset (json) file

In [8]:
# Load the scraped posts into `data`. The file is only read here, so it is
# opened read-only (no write permission needed) and decoded as UTF-8.
with open('../data/reddit_eval.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

Check how many summaries have been added to the dataset

In [9]:
# Count the posts that already carry a real summary and remember which ones.
summary_counter = 0
post_w_summary = []
for position, post in enumerate(data):
    # A missing key, None, or an empty value all mean "no summary yet", so
    # placeholder entries are not counted as finished summaries.
    if post.get("summary"):
        summary_counter += 1
        # Posts may not have an "index" key yet (it is assigned in a separate
        # cell); fall back to the position in the list instead of raising.
        post_w_summary.append(post.get("index", position))

print(f"{summary_counter} posts have summaries added to it")
print(f"posts with index {post_w_summary} have summaries added to it")

0 posts have summaries added to it
posts with index [] have summaries added to it


Add index numbers to the dataset (run only once)

In [15]:
# Tag every post with its zero-based position in `data`.
for position, post in enumerate(data):
    post['index'] = position

# NOTE(review): this writes to references.json; confirm that `data` was loaded
# from that same file before running, otherwise its contents are replaced.
with open('../data/references.json', 'w') as file:
    json.dump(data, file, indent=4)

print("index numbers added and saved to json file.")

print("index numbers added and saved to json file.")

index numbers added and saved to json file.


Add empty "summary" keys (value `None`) to the dataset, starting at post 46

In [28]:
# Give every post from position 46 onward a "summary" key set to None.
# NOTE(review): 46 is presumably the first post without a summary — confirm
# before re-running, since any existing value at these positions is replaced.
for post in data[46:]:
    post["summary"] = None

with open('../data/reddit_data.json', 'w') as file:
    json.dump(data, file, indent=4)

print("empty summary keys added and saved to json file.")

print("empty summary keys added and saved to json file.")

empty summary keys added and saved to json file.


Attempt to automate adding summaries to each post entry in the dataset

In [23]:
# Manually attach a summary to one post: set `index` to the target post and
# `summary` to the value to store, then run the cell.
index = 0
summary = {}

# Every post whose "index" equals `index` receives the same `summary` object.
for post in (entry for entry in data if entry["index"] == index):
    post["summary"] = summary

# Persist the updated dataset.
with open('../data/reddit_eval.json', 'w') as file:
    json.dump(data, file, indent=4)

Extract the summaries and indexes and write them into another file as the reference summaries

In [6]:
# Reference summaries: one {"index", "summary"} record per post.
# A post that has no "summary" key yet is exported with None instead of
# aborting the whole extraction with a KeyError.
extracted_data = [
    {"index": post["index"], "summary": post.get("summary")}
    for post in data
]

In [7]:
# Serialise the extracted reference summaries and store them in ref_eval.json.
reference_json = json.dumps(extracted_data, indent=4)
with open('../data/ref_eval.json', 'w') as out_handle:
    out_handle.write(reference_json)