In [295]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [296]:
# UCSD comment dyads (one parent comment / child reply pair per row), exported as CSV.
# Point DYAD_CSV_PATH at your own copy of the file before running.
DYAD_CSV_PATH = "/content/RedditAntiAbuseUCSD.csv"
dyad = pd.read_csv(DYAD_CSV_PATH)

In [297]:
'''
Query:
SELECT
id,
SUBSTRING(parent_id,4) as post_id,
permalink as post_link
FROM `gcp-toxic-behavior.reddit_anti_abuse.UCSD`
WHERE parent_id LIKE 't3_%';
'''
# The query above selects every comment whose parent is the post itself (t3_ prefix),
# so the 'id' column of its result is the list of all root comments.
# The result was exported as CSV; point POSTS_CSV_PATH at your own copy.
POSTS_CSV_PATH = "/content/posts.csv"
parents = pd.read_csv(POSTS_CSV_PATH)

In [298]:
# Preview the first rows of the dyad table to check its columns
dyad.head()

Unnamed: 0,post_id,parent_id,parent_content,child_id,child_content
0,t3_eiacy6,fcoq5mb,I’m pretty sure I have heard of them accepting...,fcoud8c,Ty I’ll try waiving with it
1,t3_eibln2,fcovsx7,FYI I emailed them to get a voucher last quart...,fcowzcq,u gotta go in to college advising office
2,t3_ehznm9,fcozrto,"Stats is much more employable, pure math sorta...",fcozvy8,Gotcha. I guess I look at it this way...if yo...
3,t3_ehznm9,fcozvy8,Gotcha. I guess I look at it this way...if yo...,fcp01am,This is what I needed to hear. Thank you kind ...
4,t3_ehznm9,fcp01am,This is what I needed to hear. Thank you kind ...,fcp2yr0,"Of course! Happy new year!\n\nAlso yeah, that ..."


In [299]:
# Preview the first rows of the root-comment table to check its columns
parents.head()

Unnamed: 0,id,post_id,post_link
0,fm3iyf9,doqqcm,/r/UCSD/comments/doqqcm/ask_me_anything_about_...
1,fcsmguw,dqagex,/r/UCSD/comments/dqagex/winter_2020_enrollment...
2,fcotp0r,ebzqrl,/r/UCSD/comments/ebzqrl/ahhhhhhhwhyahhhhhhh/fc...
3,fcove2d,ecj85u,/r/UCSD/comments/ecj85u/busy_successful_studen...
4,fcq9mqp,edwqpd,/r/UCSD/comments/edwqpd/smash_ultimate_rainbow...


In [300]:
# Create child_to_parent, parent_to_children dictionaries
# Both stay defaultdicts so that lookups of unknown ids keep returning '' / []
child_to_parent = defaultdict(str)
parent_to_children = defaultdict(list)

# Iterate over the column values directly. dyad['child_id'][i] is a *label* lookup,
# which only lines up with the row position while dyad has its default RangeIndex
# (it breaks as soon as dyad is filtered or re-indexed) and is slow per element.
for child, parent in zip(dyad['child_id'], dyad['parent_id']):
  # If a child id appears on several rows, the last row wins
  child_to_parent[child] = parent
  parent_to_children[parent].append(child)

# Create root_to_post dictionary: root comment id -> value of its post_link column
root_to_post = defaultdict(str)
for root, link in zip(parents['id'], parents['post_link']):
  root_to_post[root] = link


In [301]:
# A leaf is a node that appears as a child but never as a parent
leaves = [node for node in child_to_parent if node not in parent_to_children]
len(leaves)

32224

In [302]:
# A root is a comment whose parent is the post (t3_) rather than another comment (t1_).
# Only roots that appear as a parent in the dyad table are kept.
roots = [node for node in parent_to_children if node in root_to_post]
len(roots)

22029

# Bottom Up Approach

In [303]:
def leafToRoot(leaf):
    """Walk upwards from `leaf` via child_to_parent and return the path [root, ..., leaf].

    Returns [] when the conversation chain is broken, i.e. when a node that is
    not in `roots` has no recorded parent, or when the parent links loop back
    on themselves. The lookup tables are only read, never modified.
    """
    path = []
    visited = set()
    currentNode = leaf
    # Traverse up the tree until reaching a root
    # NOTE(review): `roots` is built as a list, so this membership test is linear in its size
    while currentNode not in roots:
        # Seeing a node twice means the parent links form a cycle;
        # treat it as a broken chain instead of looping forever
        if currentNode in visited:
            return []
        visited.add(currentNode)
        path.append(currentNode)
        # .get() rather than [] so that a missing id is not inserted
        # into the child_to_parent defaultdict as a side effect
        parent = child_to_parent.get(currentNode)
        # Node does not have a parent yet is not a root:
        # the chain is broken, return a blank path for the caller to filter out
        if not parent:
            return []
        # If node does have a parent, continue
        currentNode = parent
    # Add the root and reverse the path so that it is [root,...,leaf]
    path.append(currentNode)
    path.reverse()
    return path


In [304]:
# This takes a second to run!
# Walk up from every leaf and keep only the complete chains:
# leafToRoot() returns [] for a broken chain, and those empty paths are dropped here
conversationsBottomUp = [path for path in map(leafToRoot, leaves) if len(path) > 0]
len(conversationsBottomUp)

29780

# Top Down Approach

In [305]:
def rootToLeaf(currentNode, conversation, rootConvos):
  """Depth-first walk below `currentNode`, recording every path that ends in a leaf.

  conversation: list of node ids on the current path; extended on entry and
      restored before returning.
  rootConvos: output list; a copy of `conversation` is appended to it each
      time a leaf is reached.

  A leaf is a node that is somebody's child (a key of child_to_parent) and has
  no children of its own, the same definition used to build `leaves`. It is
  tested here with dictionary lookups instead of scanning the `leaves` list
  for every visited node.
  """
  conversation.append(currentNode)
  # .get() rather than [] so that leaves are not inserted into the
  # parent_to_children defaultdict as a side effect of being visited
  children = parent_to_children.get(currentNode, [])
  # If the node is a leaf, we are at the end of the path!
  # Add a copy of the path to the list of all paths for this root
  if not children and currentNode in child_to_parent:
    rootConvos.append(list(conversation))
  # Recursion for each child
  for child in children:
    rootToLeaf(child, conversation, rootConvos)
  # Go back up to the previous node
  conversation.pop()

def allPaths(root):
  """Return every path that rootToLeaf() records when started at `root`.

  The result is a list of paths, each of which is itself a list.
  """
  pathsFromRoot = []
  rootToLeaf(root, [], pathsFromRoot)
  return pathsFromRoot

In [306]:
# Collect the paths of every root into one flat list.
# allPaths() returns a list of paths, so they are flattened here rather than nested.
conversationsTopDown = [path for root in roots for path in allPaths(root)]

len(conversationsTopDown)

29780

In [307]:
# Sanity check: the top-down and bottom-up approaches should find the same conversations.
# i.e. each conversation identified follows root <-> ... <-> leaf structure
# Broken chains are not included by either approach
# Both lists are sorted first so the comparison ignores the order the paths were found in
# NOTE(review): the author reports about a minute per method; not re-measured here
sorted(conversationsTopDown)==sorted(conversationsBottomUp)

True

# Printing Conversations

In [308]:
def getIndex(df,col,val):
  """Return the index label of the first row of `df` whose `col` value equals `val`.

  Raises IndexError (naming the column and value) when no row matches.
  """
  matches = df.index[df[col]==val]
  # Fail with a message that says what was being looked up, instead of the
  # opaque "index 0 is out of bounds" raised by indexing an empty Index
  if len(matches) == 0:
    raise IndexError('no row where column {!r} equals {!r}'.format(col, val))
  return matches[0]

In [309]:
# This function assumes that the root is the first element in the list
def list_to_conversation(lst):
  """Print a conversation given as a list of comment ids, [root, ..., leaf].

  Prints the post link recorded for the root, then the root's text, then each
  reply's text, numbered from 1. Prints nothing for an empty list.
  Relies on the notebook-level `dyad`, `root_to_post` and `getIndex`.
  """
  # Nothing to print for an empty conversation
  if not lst:
    return
  rootId = lst[0]
  # Print POST LINK
  # .get() rather than [] so that an unknown id is not inserted into the
  # root_to_post defaultdict as a side effect of printing
  print('POST: reddit.com' + root_to_post.get(rootId, ''))
  # Print ROOT: looked up on a row where it is the parent, reading parent_content
  rootRow = getIndex(dyad,'parent_id',rootId)
  print(1, ':',dyad.loc[rootRow,'parent_content'])
  # Print CHILDREN: each one is looked up on the row where it is the child
  for count, commentId in enumerate(lst[1:], start=2):
    replyRow = getIndex(dyad,'child_id',commentId)
    print('\n',count, ':',dyad.loc[replyRow,'child_content'])

In [310]:
# Example: print the conversation with the most comments
# (max() returns the first such path if several share the maximum length)
longestConvo = max(conversationsTopDown,key=len)
list_to_conversation(longestConvo)

POST: reddit.com/r/UCSD/comments/hmfy80/sevp_modifications_affecting_all_international/fx550ru/
1 : If you think about it, this notification has absolutely no basis or reasoning whatsoever other than trying to be a satanic asshole. I mean what are they even gaining by deporting international students -legally in the US on an F1 visa-  who are stuck in a global pandemic like the rest of the world?

I mean the worst colors of this country are coming out at the worst times.

 2 : It seems that the effort is to prevent schools nationwide from going entirely online and going bankrupt in the process. We do run the risk of damaging our higher education system if schools across the nation take irreparable financial damage.

Edit: Never said I agreed with this measure but ok

 3 : I think Trump's recent tweets are fairly suggestive that this is about him wanting things to be open again around the election.

If it's about keeping schools solvent, that's a good goal, but there are easier and more