In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import csv
import json

In [2]:
# Dyad table (one parent/child comment pair per row), downloaded as CSV
# Adjust path as needed
dyad_csv_path = "/content/RedditAntiAbuseUCSD.csv"
dyad = pd.read_csv(dyad_csv_path)

In [3]:
# Table of every comment whose parent is the post itself, downloaded as CSV
# Its 'id' column is therefore the list of all root comments
# Adjust path as needed
roots_csv_path = "/content/UCSDRoots.csv"
parents = pd.read_csv(roots_csv_path)

In [4]:
# Report (rows, columns) for the dyad table, then the roots table
for frame in (dyad, parents):
  print(frame.shape)

(53881, 5)
(67250, 3)


In [5]:
# Preview the first five parent/child rows
dyad.head(n=5)

Unnamed: 0,post_id,parent_id,parent_content,child_id,child_content
0,t3_eiacy6,fcoq5mb,I’m pretty sure I have heard of them accepting...,fcoud8c,Ty I’ll try waiving with it
1,t3_eibln2,fcovsx7,FYI I emailed them to get a voucher last quart...,fcowzcq,u gotta go in to college advising office
2,t3_ehznm9,fcozrto,"Stats is much more employable, pure math sorta...",fcozvy8,Gotcha. I guess I look at it this way...if yo...
3,t3_ehznm9,fcozvy8,Gotcha. I guess I look at it this way...if yo...,fcp01am,This is what I needed to hear. Thank you kind ...
4,t3_ehznm9,fcp01am,This is what I needed to hear. Thank you kind ...,fcp2yr0,"Of course! Happy new year!\n\nAlso yeah, that ..."


In [6]:
# Preview the first five root-comment rows
parents.head(n=5)

Unnamed: 0,id,post_id,post_link
0,fm3iyf9,doqqcm,/r/UCSD/comments/doqqcm/ask_me_anything_about_...
1,fcsmguw,dqagex,/r/UCSD/comments/dqagex/winter_2020_enrollment...
2,fcotp0r,ebzqrl,/r/UCSD/comments/ebzqrl/ahhhhhhhwhyahhhhhhh/fc...
3,fcove2d,ecj85u,/r/UCSD/comments/ecj85u/busy_successful_studen...
4,fcq9mqp,edwqpd,/r/UCSD/comments/edwqpd/smash_ultimate_rainbow...


In [7]:
# Create the lookup dictionaries from the dyad table:
#   child_to_parent    : child comment id  -> parent comment id
#   parent_to_children : parent comment id -> list of child comment ids (row order)
#   child_to_post      : child comment id  -> post id
# They stay defaultdicts because later cells index them with ids that may be missing.
child_to_parent = defaultdict(str)
parent_to_children = defaultdict(list)
child_to_post = defaultdict(str)

# Iterate the three columns in lockstep instead of calling .iloc once per row;
# if a child id occurs in several rows, the last row wins (same as plain assignment).
for child, parent, post in zip(dyad['child_id'], dyad['parent_id'], dyad['post_id']):
  child_to_parent[child] = parent
  parent_to_children[parent].append(child)
  child_to_post[child] = post

# Create root_to_post dictionary: root comment id -> post id
root_to_post = defaultdict(str)
root_to_post.update(zip(parents['id'], parents['post_id']))


In [36]:
# post_id to post_link
# Built in one pass over the two columns; when a post has several rows in
# `parents`, the link from the last row is the one that is kept.
# NOTE(review): the sample links appear to end in a root comment id rather
# than at the post itself - confirm this is the link you want to display.
id_to_link = defaultdict(str)
id_to_link.update(zip(parents['post_id'], parents['post_link']))

In [8]:
# A leaf is a comment that shows up as a child but never as a parent
leaves = [node for node in child_to_parent if node not in parent_to_children]
len(leaves)

32224

In [9]:
# A root is a comment that replies to the post itself (t3_) rather than to
# another comment (t1_); keep only the roots that actually have replies
roots = [node for node in parent_to_children if node in root_to_post]
len(roots)

22029

# Bottom Up Approach
O(l*h) where l is the number of leaves and h is the height of the tree

In [28]:
def leafToRoot(leaf):
    """Walk upward from `leaf` and return the path [post, root, ..., leaf].

    Reads the module-level lookups `child_to_parent`, `parent_to_children`
    and `root_to_post`.

    Returns an empty list when the chain is broken, i.e. a node that is not a
    root has no recorded parent, or when the parent links loop back on
    themselves.
    """
    path = []
    visited = set()
    currentNode = leaf
    # Traverse through the tree until reaching a root.
    # A root is a node listed in root_to_post that also has at least one
    # child - the same condition used to build `roots` - checked here with
    # O(1) dictionary lookups instead of rebuilding set(roots) on every step.
    while not (currentNode in root_to_post and parent_to_children.get(currentNode)):
        path.append(currentNode)
        visited.add(currentNode)
        # .get() so that a missing node is not silently inserted into the defaultdict
        parent = child_to_parent.get(currentNode)
        # Node does not have a parent yet is not a root (or the links form a cycle)
        # This means the conversation chain was broken
        # Return blank path, to be filtered out by the caller
        if not parent or parent in visited:
            return []
        # If node does have a parent, continue
        currentNode = parent
    # Add the root and post_id and reverse the path so that it is [post,root,...,leaf]
    path.append(currentNode)
    path.append(root_to_post[currentNode])
    path.reverse()
    return path


In [29]:
# Heads up: this cell takes a while to run
# Build one leaf-to-root path per leaf; leafToRoot() answers with [] for a
# broken chain, and those empty paths are left out
conversationsBottomUp = [
  path for path in map(leafToRoot, tqdm(leaves)) if len(path) > 0
]
len(conversationsBottomUp)

100%|██████████| 32224/32224 [03:22<00:00, 159.27it/s]


29780

# Top Down Approach
O(r*n) where r is the number of roots and n is the number of nodes in the tree

In [12]:
def rootToLeaf(currentNode, conversation, rootConvos):
  """Depth-first walk that records every path ending in a leaf.

  currentNode  -- id of the node being visited
  conversation -- path walked so far; extended during the call and restored
                  to its original contents before returning
  rootConvos   -- output list; a copy of the path is appended for each leaf

  Reads the module-level `parent_to_children` and `child_to_parent` lookups.
  """
  conversation.append(currentNode)
  # .get() avoids inserting an empty child list for every leaf, which the
  # defaultdict would do on plain indexing and which would corrupt later
  # "is this node a parent?" membership checks.
  children = parent_to_children.get(currentNode)
  if children:
    # Recursion for each child, in the order the children were recorded
    for child in children:
      rootToLeaf(child, conversation, rootConvos)
  elif currentNode in child_to_parent:
    # No children and known as somebody's child: this is a leaf, so the
    # path is complete. Same condition that defines `leaves`, but answered
    # with O(1) dictionary lookups instead of scanning the `leaves` list.
    rootConvos.append(list(conversation))
  # Go back up to the previous node
  conversation.pop()

# Collect every root-to-leaf path that starts at one given root
# The result is a list of paths, and each path is itself a list
def allPaths(root):
  """Return all paths [post, root, ..., leaf] that begin at `root`."""
  pathsFromRoot = []
  # Each path starts with the id of the post the root belongs to
  startingPath = [root_to_post[root]]
  rootToLeaf(root, startingPath, pathsFromRoot)
  return pathsFromRoot

In [15]:
# Gather the paths of every root into one flat list
conversationsTopDown = []
for root in tqdm(roots):
  # allPaths() hands back a list of paths, so extend instead of append
  conversationsTopDown.extend(allPaths(root))

len(conversationsTopDown)

100%|██████████| 22029/22029 [00:48<00:00, 453.85it/s]


29780

In [30]:
# Sanity check: both traversals should find exactly the same conversations,
# i.e. every conversation has the shape post, root <-> ... <-> leaf
# Broken chains are not included
# Either list can be used downstream
sorted(conversationsBottomUp) == sorted(conversationsTopDown)

True

# Save file

In [17]:
# One conversation per CSV row: post id, root id, ..., leaf id
with open('ucsd_conversations.csv', mode='w', newline='') as csv_file:
    csv.writer(csv_file).writerows(conversationsTopDown)

In [18]:
# Same conversations as a JSON list of lists
with open('ucsd_conversations.json', 'w') as json_file:
    json.dump(conversationsTopDown, fp=json_file)

# Printing Conversations

In [19]:
def getIndex(df,col,val):
  """Return the index label of the first row of `df` whose `col` equals `val`.

  Raises IndexError when no row matches.
  """
  matching_labels = df.index[df[col] == val]
  return matching_labels[0]

In [20]:
# This function assumes that the root is the first element in the list
def list_to_conversation(lst):
  """Print a conversation given as [root, child, ..., leaf] comment ids.

  Prints the post link first, then the numbered comment texts looked up in
  the `dyad` table. getIndex() raises IndexError if an id is not in `dyad`.
  """
  # Print POST LINK
  # root_to_post only yields the post id, so resolve it to the link through
  # id_to_link; .get() keeps unknown ids from being inserted into the defaultdicts
  post_id = root_to_post.get(lst[0], '')
  print('POST: reddit.com' + id_to_link.get(post_id, ''))
  # Print ROOT
  count = 1
  parentIndex = getIndex(dyad,'parent_id',lst[0])
  print(count, ':',dyad['parent_content'][parentIndex])
  # Print CHILDREN, numbered from 2 onwards
  for count, comment_id in enumerate(lst[1:], start=2):
    childIndex = getIndex(dyad,'child_id',comment_id)
    print('\n',count, ':',dyad['child_content'][childIndex])

# Deleted/Removed Comments

In [31]:
# Comment id -> comment text, for every id seen as a child or as a parent.
# The first text encountered for an id is kept; later rows never overwrite it.
id_to_content = defaultdict(str)
dyad_rows = zip(dyad['child_id'], dyad['child_content'],
                dyad['parent_id'], dyad['parent_content'])
# Walk the four columns in lockstep instead of calling .iloc once per cell
for child, child_body, parent, parent_body in dyad_rows:
  id_to_content.setdefault(child, child_body)
  id_to_content.setdefault(parent, parent_body)

In [47]:
def convoToContent(conversation):
  """Turn [post_id, comment_id, ...] into [post link, comment text, ...].

  The link is 'reddit.com' followed by the id_to_link entry of the post;
  each remaining id is replaced by its id_to_content entry.
  """
  post_link = 'reddit.com' + id_to_link[conversation[0]]
  comment_texts = [id_to_content[comment_id] for comment_id in conversation[1:]]
  return [post_link] + comment_texts

In [38]:
def printConvo(conversation_words):
  """Print each entry followed by a space and a blank line."""
  for entry in conversation_words:
    # Same bytes as print(entry, '\n'): the separator space, the '\n'
    # argument, then print's own newline
    print(entry, end=' \n\n')

In [49]:
# Print every conversation that contains a '[deleted]' or '[removed]' marker.
# Each matching conversation is printed once, however many comments match.
for convo in conversationsTopDown:
  # convo[0] is the post id, not a comment, so only look at convo[1:];
  # .get() keeps unknown ids from being inserted into id_to_content
  bodies = (id_to_content.get(comment_id, '') for comment_id in convo[1:])
  # isinstance guard: skip entries whose content is not text
  if any(isinstance(body, str) and ('[deleted]' in body or '[removed]' in body)
         for body in bodies):
    printConvo(convoToContent(convo))
# The examples below aren't really what we are looking for
# But it shows how the code works

reddit.com/r/UCSD/comments/k8sur9/math_department_response_to_math_183_ezzati/gf14ikc/ 

This was posted on the class Piazza and the professor has [removed](https://i.imgur.com/hweNjef.png) it. 

Damn it was on there half an hour ago. She also made an announcement regarding it. 

Let’s see the screenshot! 

reddit.com/r/UCSD/comments/k8sur9/math_department_response_to_math_183_ezzati/gf14ikc/ 

This was posted on the class Piazza and the professor has [removed](https://i.imgur.com/hweNjef.png) it. 

Dear all,  
We cannot allow any posts on Piazza that is not relevant to the course materials. Piazza is to discuss the course materials and course related announcements only. If any other questions/concerns - please consider using the office hours or direct email to address them first before posting on Piazza. Please understand majority of class are doing well and need to focus on their studies than being distracted.  
All best,  
Parinaz 

