In [9]:
import csv
import os
import random
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

### Tutorials I've used
- https://towardsdatascience.com/web-scraping-using-selenium-and-beautifulsoup-99195cd70a58
- https://www.techbeamers.com/locate-elements-selenium-python/#locate-element-by-id

## Set up selenium and chrome

In [10]:
#start selenium driver
chrome_options = Options()
#disable notifications since it blocks scrolling
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
#need headless in order to make scrolling work
chrome_options.add_argument("--headless")
#pass the options with `options=`: the older `chrome_options=` keyword is
#deprecated and no longer accepted by recent Selenium releases
driver = webdriver.Chrome(options=chrome_options)
#element lookups keep retrying for up to 10 seconds before giving up
driver.implicitly_wait(10)

  


## Scrape reddit subs
I've chosen mostly text-based subs.
In some subs, for example science, the title is more descriptive and the content is mostly a link or a video, so for each post we save whichever of the title or the content is longer.

In [7]:
#subreddits to scrape; shuffled in place so each run visits them in a random order
subs = [
    'Vscode',
    'relationships',
    'Entrepreneur',
    'tipofmytongue',
    'science',
    'CasualConversation',
]
random.shuffle(subs)

In [11]:
#will write posts content in a file called posts.csv, one row per post: [sub, text]
#number of rows written across ALL subs (reported once at the end)
total = 0
#the with block closes the file even if the cell is interrupted;
#newline='' is what the csv module asks for, utf-8 so any post text can be written
with open('posts.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    #iterate through all wanted subs
    for sub in subs:
        url = "https://www.reddit.com/r/%s" % sub
        print('scrapping',url)
        #open the sub url
        driver.get(url)
        for _ in range(3):
            #scroll to bottom many times to load more posts
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            #wait for the page to load
            time.sleep(3)
        #get all post links (a set, so every post is visited only once)
        links = set()
        #search all links
        for href_element in driver.find_elements(By.TAG_NAME, 'a'):
            link = href_element.get_attribute("href")
            #anchors without an href give None, so check that before the substring test
            #if comments is in link, means it is a post
            if link and 'comments' in link:
                links.add(link)
        print('links',len(links))
        #go through all links
        for link in links:
            print('accessing',link)
            driver.get(link)
            #get post title
            title_divs = driver.find_elements(By.TAG_NAME, 'h1')
            if len(title_divs) == 0: # no title found, probably not a real post
                continue
            title = title_divs[0].text
            #get post content
            content_divs = driver.find_elements(By.CLASS_NAME, 'RichTextJSON-root')
            if len(content_divs) == 0: # no text body found, probably not a real post
                continue
            content = content_divs[0].text
            #what to save? title or content (we keep the lengthiest)
            #some post has a long descriptive text and no content (or video/image content)
            #so we keep title instead of post
            post_content = max([title, content], key=len)
            #if for some reason title and content are empty, skip
            if post_content.strip() == '':
                continue
            #write to csv, keeping each post on a single line
            post_content = post_content.replace('\n', ' ')
            csv_writer.writerow([sub,post_content])
            #flush after every row so an interrupted run keeps what was scraped so far
            csv_file.flush()
            total += 1
            #I don't want to get banned
            time.sleep(5)
print('scrapped', total)


scrapping https://www.reddit.com/r/Vscode
links 51
accessing https://www.reddit.com/r/vscode/comments/cr80v7/how_to_enable_commitpullpush_etc_for_collaborator/
accessing https://www.reddit.com/r/vscode/comments/cq97hy/vscode2077theme_cyberpunk_2077/
accessing https://www.reddit.com/r/vscode/comments/crwn30/performance_issues_when_files_start_to_grow_in/
accessing https://www.reddit.com/r/vscode/comments/crsxdg/how_to_modify_the_debug_program_to_add_linker/
accessing https://www.reddit.com/r/vscode/comments/cqxewh/exclude_some_source_files_based_on_c/
accessing https://www.reddit.com/r/vscode/comments/csl31f/how_to_revert_to_a_previous_version_of_file_with/
accessing https://www.reddit.com/r/vscode/comments/crbgy2/possible_glitch_with_vim_plugin_and_key_remappings/
accessing https://www.reddit.com/r/vscode/comments/cqqmad/format_as_type_just_like_visual_studio_or_intellij/
accessing https://www.reddit.com/r/vscode/comments/cqoiy5/node_debugger_variable_sections_always/
accessing https:/

KeyboardInterrupt: 