In [47]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [48]:
# UCSD Dyad downloaded as CSV
# Adjust path as needed
# The cells below read the columns parent_id, parent_content, child_id and
# child_content from this table.
dyad = pd.read_csv("/content/dyad.csv")

In [49]:
# Query used to produce posts.csv:
#
#   SELECT
#   id,
#   SUBSTRING(parent_id,4) as post_id,
#   permalink as post_link
#   FROM `gcp-toxic-behavior.reddit_anti_abuse.UCSD`
#   WHERE parent_id LIKE 't3_%';
#
# Table of all comments whose parent is the post, downloaded as CSV
# Adjust path as needed
# header=0 treats the file's first row as its header and `names` replaces it,
# so that row never appears as data and no manual slicing is needed (the
# resulting index starts at 0). dtype=str keeps every id and link as text.
parents = pd.read_csv("/content/posts.csv", header=0, names=["id","post","link"], dtype=str)

In [50]:
# Preview the first rows of the dyad (parent comment / child comment) table
dyad.head()

Unnamed: 0,parent_id,parent_content,child_id,child_content
0,fcoq5mb,I’m pretty sure I have heard of them accepting...,fcoud8c,Ty I’ll try waiving with it
1,fcovsx7,FYI I emailed them to get a voucher last quart...,fcowzcq,u gotta go in to college advising office
2,fcozrto,"Stats is much more employable, pure math sorta...",fcozvy8,Gotcha. I guess I look at it this way...if yo...
3,fcozvy8,Gotcha. I guess I look at it this way...if yo...,fcp01am,This is what I needed to hear. Thank you kind ...
4,fcp01am,This is what I needed to hear. Thank you kind ...,fcp2yr0,"Of course! Happy new year!\n\nAlso yeah, that ..."


In [51]:
# Preview the first rows of the table of comments that reply directly to a post
parents.head()

Unnamed: 0,id,post,link
1,fm3iyf9,doqqcm,/r/UCSD/comments/doqqcm/ask_me_anything_about_...
2,fcsmguw,dqagex,/r/UCSD/comments/dqagex/winter_2020_enrollment...
3,fcotp0r,ebzqrl,/r/UCSD/comments/ebzqrl/ahhhhhhhwhyahhhhhhh/fc...
4,fcove2d,ecj85u,/r/UCSD/comments/ecj85u/busy_successful_studen...
5,fcq9mqp,edwqpd,/r/UCSD/comments/edwqpd/smash_ultimate_rainbow...


In [52]:
# Create child_to_parent, parent_to_children dictionaries
# child_to_parent:    child comment id  -> id of the comment it replies to
# parent_to_children: parent comment id -> list of ids of its replies
# NOTE: both are defaultdicts, so indexing a missing key with [] silently
# inserts it; use `in` or .get() when only probing.
child_to_parent = defaultdict(str)
parent_to_children = defaultdict(list)

# Iterate over the two columns directly instead of indexing by row label, so
# the loop does not depend on dyad having a default 0..n-1 index.
for parent, child in zip(dyad['parent_id'], dyad['child_id']):
  child_to_parent[child] = parent
  parent_to_children[parent].append(child)


In [53]:
# Identify leaf nodes: comments that are a child of something but never
# appear as a parent themselves
leaves = [node for node in child_to_parent if node not in parent_to_children]

In [54]:
# METHOD 1 for identifying root nodes:
# a root is a parent that is never itself recorded as a child
roots = [node for node in parent_to_children if node not in child_to_parent]

In [55]:
# METHOD 2 for identifying root nodes:
# a root is a parent whose id is listed in `parents`, i.e. a comment that
# replies directly to a post
# Build the id set once; membership tests are then O(1) instead of rebuilding
# and scanning a list for every parent.
post_comment_ids = set(parents['id'])
roots2 = [parent for parent in parent_to_children if parent in post_comment_ids]

In [56]:
# Roots found by METHOD 1 that METHOD 2 does not report
set(roots).difference(roots2)

{'fcozrto',
 'fcp1let',
 'fcp4ccg',
 'fcp5gtl',
 'fcpz9b6',
 'fcrin3p',
 'fcsow5b',
 'fcsycxv',
 'fctbdyi',
 'fcvgxm2',
 'fcw869v',
 'fd0zzga',
 'fd6dv4l',
 'fd6g76c',
 'fd7p1gz',
 'fd968uu',
 'fdcabgh',
 'fdcl0dj',
 'fdcnx37',
 'fdcol0r',
 'fdegncz',
 'fdfzrch',
 'fdgaghb',
 'fdhwhvx',
 'fdhziqc',
 'fdi7rp9',
 'fdj0th3',
 'fdje1ng',
 'fdk858e',
 'fdlhqgq'}

In [57]:
# These METHOD 1-only roots have no entry in child_to_parent (they are never
# recorded as a child), so looking one up only yields the empty default.
# Therefore the Bottom Up approach (below) has to make use of METHOD 1 roots.
# .get() is used instead of [] because indexing a defaultdict inserts the
# missing key; the node would then no longer be detected as a root if the
# METHOD 1 cell were re-run.
child_to_parent.get('fdlhqgq', '')

''

# Bottom Up Approach


In [58]:
# Walk from every leaf up to its root. Each conversation is ordered
# leaf first, root last.
rootIds = set(roots)  # O(1) membership instead of scanning the list each step
conversationsBottomUp = []
for leaf in leaves:
  conversation = [leaf]
  currentNode = leaf
  while currentNode not in rootIds:
    # .get() does not insert missing keys into the defaultdict.
    parent = child_to_parent.get(currentNode)
    if not parent:
      # No recorded parent: stop here rather than following the '' default
      # forever (possible when `roots` is stale relative to child_to_parent).
      break
    conversation.append(parent)
    currentNode = parent
  conversationsBottomUp.append(conversation)

len(conversationsBottomUp)

286

In [71]:
# Example: one bottom-up conversation (leaf id first, root id last)
conversationsBottomUp[10]

['fcsed46', 'fcsdbfg', 'fcpusy9']

# Top Down Approach

In [59]:
# Find all paths given root node
def findPath(currentNode, conversation, rootConvos):
  """Depth-first walk that records every path ending at a leaf.

  currentNode  -- id of the node being visited
  conversation -- list of ids on the path from the start node to the parent
                  of currentNode; mutated during the walk and restored
                  before returning
  rootConvos   -- output list; a copy of the current path is appended each
                  time the walk reaches a node listed in `leaves`

  Reads the module-level `leaves` and `parent_to_children`. Returns None.
  """
  conversation.append(currentNode)
  if currentNode in leaves:
    # Store a copy: `conversation` keeps changing as the recursion unwinds.
    rootConvos.append(list(conversation))
  # .get() rather than [] so that visiting a childless node does not insert
  # an empty entry into the parent_to_children defaultdict.
  for child in parent_to_children.get(currentNode, ()):
    findPath(child, conversation, rootConvos)
  conversation.pop()

def allPaths(root):
  """Return the list of paths that findPath collects when started at `root`."""
  collected = []
  findPath(root, [], collected)
  return collected

In [60]:
# Gather the paths of every METHOD 2 root into one flat list
conversationsTopDown = []
for root in roots2:
  conversationsTopDown.extend(allPaths(root))
len(conversationsTopDown)

253

In [69]:
# Example: one top-down conversation (root id first, leaf id last)
conversationsTopDown[8]

['fcsljw3', 'fcslqeb', 'fcslu28']

# Printing Conversations

In [61]:
def getIndex(df,col,val):
  """Return the index label of the first row of `df` where df[col] == val.

  Raises IndexError when no row matches; the message names the column and
  the value that was looked up.
  """
  matches = df.index[df[col]==val]
  if len(matches) == 0:
    raise IndexError("no row where column {!r} equals {!r}".format(col, val))
  return matches[0]

In [62]:
# This function assumes that a TOP DOWN approach was used.
# If a bottom up approach was used to create list, reverse it before calling function
def list_to_conversation(lst):
  """Print a conversation given as a list of comment ids, root first.

  Prints the post link of the root comment (looked up in `parents`), then
  the root comment's text as entry 1, then each following comment's text
  numbered from 2 (both looked up in `dyad`). An empty list prints nothing.
  A root that is not listed in `parents` gets a placeholder instead of a
  link, so conversations rooted at such comments can still be printed.

  Raises IndexError (from getIndex) if a comment id cannot be found in `dyad`.
  """
  if not lst:
    return
  rootId = lst[0]
  # Print POST LINK
  try:
    postIndex = getIndex(parents,'id',rootId)
  except IndexError:
    # Root is not a direct reply to a post known to `parents`.
    postIndex = None
  if postIndex is None:
    print('POST: (no link found for ' + str(rootId) + ')')
  else:
    print('POST: reddit.com' + parents['link'][postIndex])
  # Print ROOT
  parentIndex = getIndex(dyad,'parent_id',rootId)
  print(1, ':',dyad['parent_content'][parentIndex])
  # Print CHILDREN
  for count, childId in enumerate(lst[1:], start=2):
    childIndex = getIndex(dyad,'child_id',childId)
    print('\n',count, ':',dyad['child_content'][childIndex])

In [72]:
# Example: print the longest top-down conversation
longestConvo = ['fcv5o9b', 'fcvgios', 'fcxouul', 'fcy6tlq', 'fcygjem', 'fd5d4ul', 'fd7g09v', 'fd7tcok', 'fd875d2', 'fd8nrcd', 'fd90ywd', 'fdb8elw', 'fdc09db', 'fdde1to', 'fddg1iw', 'fdetitc', 'fdf0r96']
# ^ Length is 17. Found by computing max(len(x) for x in conversationsTopDown) and then
#   iterating through conversationsTopDown until a conversation of that length is found.
#   The ids are hard-coded, so this list does not follow changes in the input data.
list_to_conversation(longestConvo)

POST: reddit.com/r/UCSD/comments/ej2zjv/does_this_happen_to_anyone_else/fcv5o9b/
1 : USD is the Virgin 

UCSD is the CHAD.

SDSU Is the T H A D of all San Diego Universities.

 2 : In what world is UCSD not the virgin?

 3 : It's not the virgin because we're not literally comparing the sex lives of students.... If reputation and relevance in the world is the consideration, then UCSD is definitely not the "virgin".

 4 : So SDSU has a better reputation?

 5 : I don't believe so. The person that called it the thad seems to think it's better though

 6 : I mean isn’t that what literally everyone in this post is saying? Like the point is that UCSD lacks a reputation.

 7 : It definitely does not lack a reputation in the academic/scientific world and it is certainly NOT lesser known than USD. Nobody outside of SoCal, or even outside of SD, knows about USD. Thus, the designation of it as the virgin by the poster.

SDSU provides more of a college experience than UCSD but UCSD has better acade