# Complete Reddit Selenium & BeautifulSoup Scraping



## Preparation

In [8]:
# Install chromium, driver, and selenium
!pip install selenium
!pip install pandas
!pip install anytree
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

# Adding to path, unnecessary if files are moved to /usr/bin
# import sys
# sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

Collecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |▍                               | 10kB 29.9MB/s eta 0:00:01[K     |▊                               | 20kB 5.7MB/s eta 0:00:01[K     |█                               | 30kB 8.1MB/s eta 0:00:01[K     |█▌                              | 40kB 5.4MB/s eta 0:00:01[K     |█▉                              | 51kB 6.6MB/s eta 0:00:01[K     |██▏                             | 61kB 7.8MB/s eta 0:00:01[K     |██▌                             | 71kB 8.1MB/s eta 0:00:01[K     |███                             | 81kB 9.1MB/s eta 0:00:01[K     |███▎                            | 92kB 10.1MB/s eta 0:00:01[K     |███▋                            | 102kB 8.8MB/s eta 0:00:01[K     |████                            | 112kB 8.8MB/s eta 0:00:01[K     |████▍                           | 122kB 8.8MB

## Initialization

In [0]:
# Scraper settings
HTTPS = "https://old.reddit.com/"   # Base URL that every listing link is built from
PAGES = 1                           # Number of listing pages to fetch per subreddit

# Subreddits to scrape, written as "r/<name>"
SUBREDDIT = ["r/all", "r/funny", "r/jokes"]

# Sorting type: hot, new, rising, controversial, top, or gilded
SORT = "top"
# Sorting timespan: hour, week, month, year, or all (no effect for hot, new, and rising)
TIME = "week"

In [0]:
# Import libraries
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup as soup

import pandas as pd
import anytree
from anytree import NodeMixin, Node, RenderTree
from pprint import pprint

# Configure Chrome to run headless
options = webdriver.ChromeOptions()
for flag in ("-no-sandbox", "-headless", "-disable-dev-shm-usage"):
  options.add_argument(flag)

# Start the driver; pages are fetched later with d.get(url) and read via d.page_source
d = webdriver.Chrome("chromedriver", options=options)

In [0]:
# Tree node used to link a post with its nested comments
class NodeCom(NodeMixin):
  """anytree node that carries the scraped data of a post or comment.

  name: label stored on the node.
  attrs: the Reddit data attached to the node.
  parent: node to attach this one under, or None.
  children: optional child nodes; only assigned when truthy.
  """

  def __init__(self, name, attrs=None, parent=None, children=None):
    super().__init__()
    self.name, self.attrs = name, attrs
    self.parent = parent
    if children:
      self.children = children


# Get every data from a comment HTML
def get_comment(htcom):
  """Collect a comment's attributes, body text, and score into one dict.

  htcom: parsed comment element exposing .find() and .attrs
         (a BeautifulSoup Tag when called from recursive()).

  Returns a new dict holding every HTML attribute of the comment plus:
    "text": the text of its "usertext-body" div, or "" when that div is absent.
    "vote": the score as an int, or None when there is no "unvoted" span or
            its title attribute is missing or not a whole number.
  """
  body = htcom.find("div", class_="usertext-body")
  text = body.text if body is not None else ""

  # find() yields an element or None, never '', so test for None explicitly
  score = htcom.find("span", class_="unvoted")
  vote = None
  if score is not None:
    try:
      vote = int(score.attrs.get("title"))
    except (TypeError, ValueError):
      vote = None

  # Copy so the parsed element keeps its own attributes untouched
  attrs = dict(htcom.attrs)
  attrs.update({"text": text, "vote": vote})
  return attrs


# Walk a comment listing depth-first, linking one node per comment under its parent
def recursive(soup, parent):
  """Attach every comment found in a listing to `parent`, then descend into replies.

  soup: result list of a select() call; only its first element is searched.
        An empty list ends the recursion.
  parent: the NodeCom that the comments of this level are linked to.

  Returns None; the tree is built through the parent links of the new nodes.
  """
  if soup:
    # Only direct children, so replies are picked up by the nested calls instead
    for comment_tag in soup[0].find_all("div", class_="comment", recursive=False):
      # Every node is named "a": deleted comments have no data-fullname to use
      child = NodeCom("a", get_comment(comment_tag), parent)
      recursive(comment_tag.select("div.listing"), child)  # Go deeper

  # Nothing to return, since this procedure just links the comment nodes

# Scrape

In [43]:
# Get all posts: one dict of HTML attributes (plus title text) per post
posts = []

for sub in SUBREDDIT:
  # Iterate through every subreddit listed
  link = HTTPS + sub + "/" + SORT + "/?t=" + TIME
  print(link)
  d.get(link)

  for i in range(PAGES):
    wdposts = d.find_elements_by_class_name("thing")

    # Iterate through every post found
    for wdpost in wdposts:
      html = wdpost.get_attribute("outerHTML")
      htpost = soup(html, "html.parser").div

      # Get post title; skip elements that carry no title link
      title = htpost.find("a", class_="title")
      if title is None:
        continue
      text = title.text
      attrs = htpost.attrs # Get everything else (in attribute)

      # Append all & create room for the comment tree
      attrs.update({"text":text, "comments":None})
      posts.append(attrs)

    # Go to next page, cancel if there isn't any.
    # Stored as next_html so the builtin next() is not shadowed in the kernel.
    try:
      next_html = d.find_element_by_class_name("next-button").get_attribute("outerHTML")
    except NoSuchElementException:
      break # End loop

    page = soup(next_html, "html.parser").a.attrs["href"]
    print(i+1, page)
    d.get(page)


print("Total posts:", len(posts))
# Create table using pandas DataFrame, just for preview
pd.DataFrame(posts)[0:4]

https://old.reddit.com/r/all/top/?t=week
1 https://old.reddit.com/r/all/top/?t=week&count=25&after=t3_dy6ujn
https://old.reddit.com/r/funny/top/?t=week
1 https://old.reddit.com/r/funny/top/?t=week&count=25&after=t3_dxbjil
https://old.reddit.com/r/jokes/top/?t=week
1 https://old.reddit.com/r/Jokes/top/?t=week&count=25&after=t3_dxusqc
Total posts: 75


Unnamed: 0,class,id,onclick,data-fullname,data-type,data-gildings,data-whitelist-status,data-author,data-author-fullname,data-subreddit,data-subreddit-prefixed,data-subreddit-fullname,data-subreddit-type,data-timestamp,data-url,data-permalink,data-domain,data-rank,data-comments-count,data-score,data-promoted,data-nsfw,data-spoiler,data-oc,data-num-crossposts,data-context,text,vote,comments,data-kind
0,"[, thing, id-t3_dwdllq, odd, gilded, link, ]",thing_t3_dwdllq,click_thing(this),t3_dwdllq,link,18,all_ads,davidambart,t2_gdgjkg,pics,r/pics,t5_2qh0u,public,1573756037000,https://i.redd.it/0nzln43z2py31.jpg,/r/pics/comments/dwdllq/the_most_challenging_p...,i.redd.it,1,2458,194571,False,False,False,False,52,listing,The most challenging painting I've ever done t...,195000.0,,
1,"[, thing, id-t3_dybzwo, even, gilded, link, ]",thing_t3_dybzwo,click_thing(this),t3_dybzwo,link,1,all_ads,Elizabeth-II,t2_bjjwuhq,pics,r/pics,t5_2qh0u,public,1574121524000,https://i.redd.it/ia2vw09y9jz31.jpg,/r/pics/comments/dybzwo/a_powerful_shot_in_hon...,i.redd.it,2,7117,150081,False,False,False,False,27,listing,A powerful shot in Hong Kong,150000.0,,
2,"[, thing, id-t3_dy9gzl, linkflair, linkflair-f...",thing_t3_dy9gzl,click_thing(this),t3_dy9gzl,link,11,no_ads,hey-boss,t2_adpq4,HongKong,r/HongKong,t5_2rbn0,public,1574111053000,/r/HongKong/comments/dy9gzl/redditors_be_aware...,/r/HongKong/comments/dy9gzl/redditors_be_aware...,self.HongKong,3,3055,139615,False,False,False,False,20,listing,REDDITORS BE AWARE: CHINA IS PAYING FOR PEOPLE...,140000.0,,
3,"[, thing, id-t3_dybtxg, even, gilded, link, ]",thing_t3_dybtxg,click_thing(this),t3_dybtxg,link,43,all_ads,drewhead118,t2_a32uu,gaming,r/gaming,t5_2qh03,public,1574120788000,https://www.theverge.com/2019/11/18/20971514/v...,/r/gaming/comments/dybtxg/valve_announces_half...,theverge.com,4,15855,138041,False,False,False,False,19,listing,"""Valve announces Half-Life: Alyx, its first fl...",138000.0,,


In [0]:
# Get all comments from every post, using a recursive scrape-store to keep
# the parent/child relation between comments.

for post in posts:
  # Join base URL and permalink with exactly one slash between them
  d.get(HTTPS.rstrip("/") + "/" + post["data-permalink"].lstrip("/"))

  # Get the raw text page source
  html = d.page_source

  # Find the div containing the comments list; recursive() walks it level by level.
  htlist = soup(html, "html.parser").select("div.nestedlisting")

  # The post itself is the root of its comment tree, named after its own
  # data-fullname (read from this post, not from a variable left by another cell).
  parent = NodeCom(post["data-fullname"], post)
  recursive(htlist, parent)
  post["comments"] = parent

In [56]:
# Preview tree of a single post
prev = posts[0]
print(RenderTree(prev["comments"]))
print()
for pre, fill, node in RenderTree(prev["comments"]):
  # Comment nodes carry "vote". The root node holds the post dict, which has no
  # "vote" key, so fall back to its "data-score" HTML attribute (a string).
  vote = node.attrs.get("vote")
  if vote is None:
    vote = node.attrs.get("data-score")
  try:
    vote = int(vote)  # %d below needs a number
  except (TypeError, ValueError):
    vote = 0  # No usable score

  treestr = u"%s%d" % (pre, vote)
  # Preview the text cropped, and newline removed
  print(treestr.ljust(8), (node.attrs.get("text") or "")[:50].replace('\n', ' '))

<__main__.NodeCom object at 0x7f184910a748>
├── <__main__.NodeCom object at 0x7f1841f512e8>
│   └── <__main__.NodeCom object at 0x7f1841f510f0>
│       ├── <__main__.NodeCom object at 0x7f1841f51390>
│       │   ├── <__main__.NodeCom object at 0x7f1841f51438>
│       │   │   └── <__main__.NodeCom object at 0x7f1841f514e0>
│       │   │       └── <__main__.NodeCom object at 0x7f1841f51588>
│       │   ├── <__main__.NodeCom object at 0x7f1841f51518>
│       │   │   └── <__main__.NodeCom object at 0x7f1841f51550>
│       │   │       └── <__main__.NodeCom object at 0x7f1841f51668>
│       │   │           └── <__main__.NodeCom object at 0x7f1841f516d8>
│       │   │               ├── <__main__.NodeCom object at 0x7f1841f51780>
│       │   │               ├── <__main__.NodeCom object at 0x7f1841f51860>
│       │   │               │   ├── <__main__.NodeCom object at 0x7f1841f51898>
│       │   │               │   └── <__main__.NodeCom object at 0x7f1841f51978>
│       │   │               ├── 

In [46]:
# Using the data
# Show the first two posts together with all of their attributes
pd.DataFrame(posts).head(2)

Unnamed: 0,class,id,onclick,data-fullname,data-type,data-gildings,data-whitelist-status,data-author,data-author-fullname,data-subreddit,data-subreddit-prefixed,data-subreddit-fullname,data-subreddit-type,data-timestamp,data-url,data-permalink,data-domain,data-rank,data-comments-count,data-score,data-promoted,data-nsfw,data-spoiler,data-oc,data-num-crossposts,data-context,text,vote,comments,data-kind
0,"[, thing, id-t3_dwdllq, odd, gilded, link, ]",thing_t3_dwdllq,click_thing(this),t3_dwdllq,link,18,all_ads,davidambart,t2_gdgjkg,pics,r/pics,t5_2qh0u,public,1573756037000,https://i.redd.it/0nzln43z2py31.jpg,/r/pics/comments/dwdllq/the_most_challenging_p...,i.redd.it,1,2458,194571,False,False,False,False,52,listing,The most challenging painting I've ever done t...,195000.0,<__main__.NodeCom object at 0x7f184910a748>,
1,"[, thing, id-t3_dybzwo, even, gilded, link, ]",thing_t3_dybzwo,click_thing(this),t3_dybzwo,link,1,all_ads,Elizabeth-II,t2_bjjwuhq,pics,r/pics,t5_2qh0u,public,1574121524000,https://i.redd.it/ia2vw09y9jz31.jpg,/r/pics/comments/dybzwo/a_powerful_shot_in_hon...,i.redd.it,2,7117,150081,False,False,False,False,27,listing,A powerful shot in Hong Kong,150000.0,<__main__.NodeCom object at 0x7f1841f60fd0>,


In [55]:
# Pick one post and take the top-level nodes of its comment tree
t_post = posts[4]
t_comments = t_post["comments"].children

# Show the attributes of its second top-level comment
second_comment = t_comments[1]
second_comment.attrs

{'class': ['', 'thing', 'id-t1_f7yb907', 'noncollapsed', 'comment', ''],
 'data-author': 'AChero9',
 'data-author-fullname': 't2_3yjs6hnh',
 'data-fullname': 't1_f7yb907',
 'data-gildings': '0',
 'data-permalink': '/r/pics/comments/dy2vh8/a_message_to_the_world_left_by_a_student_in/f7yb907/',
 'data-replies': '25',
 'data-subreddit': 'pics',
 'data-subreddit-fullname': 't5_2qh0u',
 'data-subreddit-prefixed': 'r/pics',
 'data-subreddit-type': 'public',
 'data-type': 'comment',
 'id': 'thing_t1_f7yb907',
 'onclick': 'click_thing(this)',
 'text': 'This shit looks straight out of an apocalypse movie\n\n',
 'vote': 6459}

# Data Storage

Use pickle to serialize the scraped `posts` variable to a binary file, then copy that file to Google Drive.

In [39]:
#@title Drive Mount { form-width: "30%" }
# Name of the pickle file that the save and load cells write, read, and copy
FILE = "save.dat" #@param {type:"string"}

# Use pickle for file-variable management
import pickle

# Mount Google Drive at "/content/drive" in the Colab filesystem,
# forcing a remount if it is already mounted
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Save & Store

In [0]:
# Write the scraped posts to a binary pickle file
import shutil

with open(FILE, 'wb') as f:
    pickle.dump(posts, f)

# Copy the file into the mounted Google Drive folder. shutil.copy takes the
# paths as-is, so the space in "My Drive" needs no shell escaping or quoting.
shutil.copy(FILE, "/content/drive/My Drive/")

## Test Load

Run this section after restarting the runtime. Do not run any cell in the "Scrape" or "Save & Store" sections beforehand, so that the loaded value comes entirely from the saved file.

In [0]:
# Test load data, do it after the drive is mounted
import shutil

# Fetch the saved file from the mounted Google Drive folder. shutil.copy takes
# the paths as-is, so the space in "My Drive" needs no shell escaping or quoting.
shutil.copy("/content/drive/My Drive/" + FILE, "/content/")

# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# The NodeCom class must already be defined, because the saved posts hold NodeCom trees.
with open(FILE, 'rb') as f:
    loaded = pickle.load(f)

In [0]:
# Preview tree of loaded post.
prev = loaded[1]
print(RenderTree(prev["comments"]))
print()
for pre, fill, node in RenderTree(prev["comments"]):
  # Comment nodes carry "vote". The root node holds the post dict, which has no
  # "vote" key, so fall back to its "data-score" HTML attribute (a string).
  vote = node.attrs.get("vote")
  if vote is None:
    vote = node.attrs.get("data-score")
  try:
    vote = int(vote)  # %d below needs a number
  except (TypeError, ValueError):
    vote = 0  # No usable score

  treestr = u"%s%d" % (pre, vote)
  # Preview the text cropped, and newline removed
  print(treestr.ljust(8), (node.attrs.get("text") or "")[:50].replace('\n', ' '))