## LOAD SUBREDDITS

In [None]:
import glob

# glob.glob returns matches in arbitrary, filesystem-dependent order; sort the
# result so the subreddit dumps are always processed in the same order.
json_files = sorted(glob.glob('./data/*.json'))
print(json_files)

import json

def getRedditSubreddits(json_files):
  """Load the per-subreddit post dumps and index them by subreddit name.

  Args:
    json_files: iterable of paths to JSON files. Each file is expected to
      hold a dict of posts whose values carry a 'subreddit' key.

  Returns:
    dict mapping subreddit name -> parsed content of its file. The name is
    read from the first post in the file. Files without any post are skipped
    because they carry no subreddit name. If several files report the same
    subreddit, the last one wins.

  Raises:
    OSError: a file cannot be opened.
    json.JSONDecodeError: a file does not contain valid JSON.
    KeyError: the first post of a file has no 'subreddit' field.
  """
  subreddits = {}
  for json_file in json_files:
    # only the parse needs the open handle; close the file right after
    with open(json_file, encoding='utf-8') as f:
      data = json.load(f)
    if not data:
      # An empty dump (e.g. every post was culled) has no first post to read
      # the subreddit name from; next(iter(...)) would raise StopIteration.
      continue
    firstKey = next(iter(data))
    subredditName = data[firstKey]['subreddit']
    subreddits[subredditName] = data
  return subreddits

def getTopComments(subreddit_name, post_id, data_dir='./data'):
  """Load the saved top comments of one post.

  The comments are read from
  <data_dir>/<subreddit_name>/<post_id>_top_comments.json.

  Args:
    subreddit_name: name of the subreddit sub-directory.
    post_id: id of the post whose comments should be loaded.
    data_dir: root directory of the scraped data; defaults to './data'.

  Returns:
    The parsed JSON content of the comments file.

  Raises:
    FileNotFoundError: no comments file exists for this post.
    json.JSONDecodeError: the file does not contain valid JSON.
  """
  import os  # local import so this definition does not rely on a file-level import

  json_file = os.path.join(data_dir, subreddit_name, f"{post_id}_top_comments.json")
  with open(json_file, encoding='utf-8') as f:
    return json.load(f)

def saveredditSubreddits(subreddits):
  """Write each subreddit's posts to ./data/<name>_top_posts.json.

  Args:
    subreddits: dict mapping subreddit name -> JSON-serialisable posts.

  Existing files are overwritten; the JSON is indented by 4 spaces.
  """
  for name, posts in subreddits.items():
    target = './data/' + name + '_top_posts.json'
    with open(target, 'w') as handle:
      json.dump(posts, handle, indent=4)

# Load every dump found by the glob above; `subreddits` maps
# subreddit name -> parsed content of that subreddit's JSON file.
subreddits = getRedditSubreddits(json_files)

# Show which subreddits were loaded.
print("subreddits", subreddits.keys())


## Cull Posts without Comments

In [None]:
# Remove every post whose top-comments file cannot be loaded, then write the
# trimmed subreddit data back to disk if anything was removed.
total_deleted = 0
for subreddit in subreddits:
  deleted = 0
  # Iterate over a snapshot of the post ids so entries can be deleted from the
  # dict while looping.
  for post_id in reversed(list(subreddits[subreddit])):
    try:
      getTopComments(subreddit, post_id)
    except (OSError, ValueError):
      # OSError: the comments file is missing or unreadable.
      # ValueError: the file is not valid JSON (json.JSONDecodeError is a
      # ValueError subclass). Anything else (KeyboardInterrupt, programming
      # errors) is no longer swallowed and propagates.
      del subreddits[subreddit][post_id]
      print("post_id", post_id, "removed")
      deleted += 1
  if deleted > 0:
    print("subreddit", subreddit, "deleted", deleted, "posts")
    total_deleted += deleted
# Only rewrite the files when at least one post was culled.
if total_deleted > 0:
  saveredditSubreddits(subreddits)

In [None]:
def diveInPost(subreddit_name, post_id):
  """Collect the reply paths of every comment of a post.

  For each comment id in the post's top-comments file, diveRecursion is run
  with that comment as the starting point, and the paths it gathered in the
  module-level commentPathsTemp list are stored as one entry of the tree.

  Args:
    subreddit_name: subreddit the post belongs to.
    post_id: id of the post.

  Returns:
    (topComments, commentTree): the loaded comments and a list holding, per
    comment, the list of paths (lists of comment ids) that start at it.
  """
  global commentPathsTemp
  topComments = getTopComments(subreddit_name, post_id)
  commentTree = []
  # start from a clean accumulator in case an earlier run left paths behind
  commentPathsTemp = []
  for start_id in topComments:
    diveRecursion(topComments, start_id, [])
    # snapshot the paths found for this comment, then reset the accumulator
    commentTree.append(list(commentPathsTemp))
    commentPathsTemp = []
  return topComments, commentTree
def is_nested(array):
  """Return True if at least one element of `array` is a list."""
  for element in array:
    if isinstance(element, list):
      return True
  return False

def normalizeNestedArray(array):
  """Return the first element of `array` if it contains any list, else `array` itself."""
  if is_nested(array):
    return array[0]
  return array

# Accumulator filled by diveRecursion: one list of comment ids per finished path.
commentPathsTemp = []
# Depth-first walk over the replies of a comment.
#   - Recursive case: every comment whose parent_id is "t1_<comment_id>" is
#     visited with its own copy of the path walked so far.
#   - Base case: a comment nobody lists as parent is a leaf; the path that led
#     to it is appended to commentPathsTemp.
def diveRecursion(topComments, comment_id, commentArray):
  """Record every path from `comment_id` down to the leaves below it.

  Args:
    topComments: dict mapping comment id -> comment dict with a 'parent_id' key.
    comment_id: id of the comment to visit; must be a key of topComments.
    commentArray: ids walked so far; `comment_id` is appended to it in place.

  Returns:
    None. Finished paths are appended to the module-level commentPathsTemp.
  """
  global commentPathsTemp
  topComments[comment_id]  # fail fast with KeyError when the id is unknown
  commentArray.append(comment_id)
  wanted_parent = f"t1_{comment_id}"
  is_leaf = True
  for candidate_id, candidate in topComments.items():
    if candidate['parent_id'] == wanted_parent:
      # each reply continues on its own copy so sibling branches stay separate
      diveRecursion(topComments, candidate_id, list(commentArray))
      is_leaf = False
  if is_leaf and commentArray:
    commentPathsTemp.append(list(commentArray))

def displayCommentTree(commentTree, topComments, indent=0):
  """Print every path of `commentTree`, one comment id per line.

  Each id is indented by one space per level of depth inside its path, and
  every path is followed by a print of a newline character.

  Args:
    commentTree: iterable of paths, each a list of comment ids.
    topComments: dict mapping comment id -> comment dict with an 'id' key.
    indent: accepted but not used by the function body.
  """
  for path in commentTree:
    for depth, comment_id in enumerate(path):
      print(' ' * depth + topComments[comment_id]['id'])
    print('\n')


In [None]:
def normalizeUpvotes(max_upvotes, upvotes):
  """Scale `upvotes` relative to `max_upvotes`, rounded to 2 decimal places.

  Args:
    max_upvotes: score used as the reference (the divisor).
    upvotes: score to normalize.

  Returns:
    round(upvotes / max_upvotes, 2), or 0.0 when max_upvotes is 0, where the
    ratio is undefined and the division would raise ZeroDivisionError.
  """
  if max_upvotes == 0:
    return 0.0
  return round(upvotes / max_upvotes, 2)

def getIndexOfFirstDeletedComment(commentPath, topComments):
  """Return the position of the first unusable comment in `commentPath`.

  A comment counts as unusable when its author is suspended, its author name
  is '[deleted]', or its body is '[deleted]' or '[removed]' (name and body are
  compared after stripping surrounding whitespace).

  Args:
    commentPath: list of comment ids.
    topComments: dict mapping comment id -> comment dict with 'author'
      ('is_suspended', 'author_name') and 'body' entries.

  Returns:
    Index of the first unusable comment, or -1 if there is none.
  """
  for position, comment_id in enumerate(commentPath):
    entry = topComments[comment_id]
    author = entry['author']
    if author['is_suspended']:
      return position
    if author['author_name'].strip() == '[deleted]':
      return position
    if entry['body'].strip() in ('[deleted]', '[removed]'):
      return position
  return -1

def removeDeletedComments(commentPath, topComments):
  """Cut `commentPath` off at its first deleted comment.

  The first deleted comment (as reported by getIndexOfFirstDeletedComment)
  and everything after it are dropped.

  Returns:
    `commentPath` itself when nothing is deleted, otherwise a new list with
    the comments that precede the first deleted one.
  """
  cut = getIndexOfFirstDeletedComment(commentPath, topComments)
  return commentPath if cut == -1 else commentPath[:cut]

def removeDeletedComments_old(commentPath, topComments):
  """Remove every deleted comment from `commentPath` in place.

  A comment is treated as deleted when its author is suspended, its author
  name is '[deleted]', or its body is '[deleted]' or '[removed]' (name and
  body are compared after stripping surrounding whitespace).

  Returns:
    The same `commentPath` list object, with the deleted ids removed.
  """
  # walk a reversed snapshot so removing entries does not disturb the iteration
  for comment_id in commentPath[::-1]:
    entry = topComments[comment_id]
    author = entry['author']
    gone = (
      author['is_suspended']
      or author['author_name'].strip() == '[deleted]'
      or entry['body'].strip() in ('[deleted]', '[removed]')
    )
    if gone:
      commentPath.remove(comment_id)
  return commentPath

In [None]:
# Truncate every comment path at its first deleted comment and drop the paths
# that end up empty.
# NOTE(review): the cleaned trees only live in this loop's local variables;
# the export cell below calls diveInPost and removeDeletedComments again
# instead of reusing them - confirm whether this cell is still needed.
for subreddit_name in subreddits:
  for post_id in subreddits[subreddit_name]:
    topComments, commentTree = diveInPost(subreddit_name, post_id)
    for commentPaths in commentTree:
      trimmed = [removeDeletedComments(path, topComments) for path in commentPaths]
      # Rebuild the list in place (slice assignment keeps the object that
      # commentTree references) rather than deleting by value with
      # list.remove(), which removes the first *equal* element, not index i.
      commentPaths[:] = [path for path in trimmed if path]

In [None]:
# Build the training records for the JSONL export: one record per comment path,
# taken from the first 5 posts of every loaded subreddit.
# Prompt layout: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
jsonl = []
_instruction = "You are a Reddit user comment generator. In the conversation you change your Reddit username often to simulate different users."
_followUpInput = "Generate a follow up comment in the format: USERNAME - COMMENT"
# how many entries of a post's comment tree are sampled from its head, middle and tail
startingRange = 5
middleRange = 10
endingRange = 5
for subreddit_name in subreddits:
  # only the first 5 posts of each subreddit are used
  posts = dict(list(subreddits[subreddit_name].items())[:5])
  for post_id in posts:
    topComments, fullCommentTree = diveInPost(subreddit_name, post_id)
    treeSize = len(fullCommentTree)
    middleRangeStart = max(0, treeSize // 2 - middleRange // 2)
    middleRangeEnd = min(treeSize, treeSize // 2 + middleRange // 2)
    # Collect the sampled indices in a set: on trees with fewer than 20 entries
    # the three windows overlap, and plain slice concatenation would emit the
    # same paths several times (a negative middle start would also wrap around).
    selected = set(range(min(startingRange, treeSize)))
    selected.update(range(middleRangeStart, middleRangeEnd))
    selected.update(range(max(0, treeSize - endingRange), treeSize))
    commentTree = [fullCommentTree[index] for index in sorted(selected)]

    for commentPaths in commentTree:
      for commentPath in commentPaths:
        commentPath = removeDeletedComments(commentPath, topComments)
        if len(commentPath) <= 1:
          # Empty paths carry nothing. Single-comment paths were never turned
          # into a conversation and only produced records with an empty
          # "output", so they are skipped instead of exported.
          # NOTE(review): confirm single comments should not become samples.
          continue

        # The subreddit comes from the loop instead of a hard-coded /r/facepalm.
        # TODO(review): media type, post description and OP username are still
        # fixed placeholder values; no per-post source for them is visible here.
        _firstUserInput = (
          f"You are on the subreddit /r/{subreddit_name}.\n"
          f"Post title: {posts[post_id]['title']}\n"
          "Post media type: image\n"
          "The post submission is about: a man is being held by a police officer in front of a crowd\n"
          "The Original Poster(OP) username is: thewrongun\n"
          "Your username is made up by you. Generate a comment in the format: USERNAME - COMMENT"
        )

        # one "USERNAME - COMMENT" assistant turn per comment of the path
        outputs = [
          f"{topComments[comment_id]['author']['author_name']} - {topComments[comment_id]['body'].strip()}"
          for comment_id in commentPath
        ]
        # the first turn gets the full post prompt, every later turn the follow-up prompt
        userInputs = [_firstUserInput] + [_followUpInput] * (len(outputs) - 1)

        output_text = "".join(
          f"User: {_input}\nAssistant: {_output}\n"
          for _input, _output in zip(userInputs, outputs)
        )

        jsonl.append({
          "instruction": _instruction,
          "output": output_text,
          "text": f"{_instruction}{output_text}",
        })
        



In [None]:
# Write the collected records as JSON Lines: one JSON object per line.
with open('reddit_comments_vicuna_1.1.jsonl', 'w') as jsonl_file:
  jsonl_file.writelines(json.dumps(record) + '\n' for record in jsonl)