In [None]:
# Import packages (install any that are missing with pip first)
import time
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup    # pip install beautifulsoup4
from selenium import webdriver    # pip install selenium 
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Deal with Infinite Scrolling Page

Selenium automates a real browser, so we need a web driver (here, ChromeDriver) to control Chrome.

In [None]:
# Create the Chrome WebDriver instance.
# ChromeDriverManager().install() returns the path of the chromedriver binary.
# Selenium 4 expects that path wrapped in a Service object: the first
# positional parameter of webdriver.Chrome is `options`, so passing the path
# positionally no longer works.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

In [None]:
# Open the Reddit search page for the keyword you specified
keyword = 'spx'
base_url = "https://www.reddit.com/search/?q="
# URL-encode the keyword so that spaces and reserved characters (&, #, +, ...)
# are sent as part of the search term instead of corrupting the query string.
driver.get(f"{base_url}{quote_plus(keyword)}")

# Give the page time to load before measuring it
time.sleep(3)

# Document height before any scrolling; the scrolling loop compares against it
previous_height = driver.execute_script('return document.body.scrollHeight')

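Now, we need to scroll to the bottom. After each scroll we compare the new page height with the previous one: if the height has not changed, no more results were loaded and we have reached the end.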

In [None]:
# Keep scrolling until the page stops growing
scroll_to_bottom_js = 'window.scrollTo(0, document.body.scrollHeight);'
read_height_js = 'return document.body.scrollHeight'

reached_bottom = False
while not reached_bottom:
    # Jump to the current bottom, then pause so newly requested results can load
    driver.execute_script(scroll_to_bottom_js)
    time.sleep(3)
    # An unchanged height after scrolling means nothing more was appended
    new_height = driver.execute_script(read_height_js)
    reached_bottom = new_height == previous_height
    previous_height = new_height

The page is now scrolled to the bottom. As the page is completely loaded, we will scrape the data we want.

# Extract Data

In [None]:
# One empty list per extracted field; each name gets its own independent list
(subredditList, timestampList, titleList, upvoteList,
 numberOfCommentList, idList, urlList) = ([] for _ in range(7))

In [None]:
# Snapshot the HTML of the fully scrolled page as rendered by the browser
source = driver.page_source

# Parse the snapshot with Beautiful Soup using Python's built-in HTML parser
soup = BeautifulSoup(markup=source, features='html.parser')

In [None]:
# Collect the element that wraps each search result: a <faceplate-tracker>
# tag carrying all four of the attributes below
post_attrs = {
    "source": "search",
    "action": "view",
    "noun": "post",
    "data-testid": "search-post",
}
posts = soup.find_all("faceplate-tracker", attrs=post_attrs)

In [None]:
# Extract the information we need from every post.
#
# The result lists are rebuilt from scratch here so that re-running this cell
# does not append the same posts a second time. All fields of a post are read
# before anything is appended, so the seven lists always keep the same length
# even if a lookup raises part-way through a post.
subreddit_link_class = "flex items-center text-neutral-content-weak font-semibold"
counters_div_class = "text-neutral-content-weak text-12"
post_link_class = ("text-16 xs:text-18 line-clamp-3 text-ellipsis text-neutral-content font-semibold mb-xs "
                   "no-underline hover:no-underline")

subredditList, timestampList, titleList, upvoteList = [], [], [], []
numberOfCommentList, idList, urlList = [], [], []

for post in posts:
    # The link text is split on '/' and the second piece is kept as the subreddit
    subreddit = post.find("a", {"class": subreddit_link_class}).text.strip().split('/')[1]

    timestamp = post.find("faceplate-timeago")['ts']

    # The title is taken from the second line of the title link's text
    title = post.find('a', {"data-testid": "post-title"}).text.split('\n')[1].strip()

    # One lookup serves both counters: the first <faceplate-number> is stored
    # as the upvote count, the second as the number of comments
    counters = post.find("div", {"class": counters_div_class}).find_all('faceplate-number')
    upvotes = counters[0]['number']
    comment_count = counters[1]['number']

    # The same anchor provides both the post id (last '-'-separated piece of
    # its id attribute) and the post URL (its href)
    post_link = post.find("a", {"class": post_link_class})
    post_id = post_link['id'].split('-')[-1]
    post_url = post_link['href']

    # Append only after every field was read successfully
    subredditList.append(subreddit)
    timestampList.append(timestamp)
    titleList.append(title)
    upvoteList.append(upvotes)
    numberOfCommentList.append(comment_count)
    idList.append(post_id)
    urlList.append(post_url)

In [None]:
# Assemble the extracted lists into one table, one column per field
column_values = {
    "subreddit": subredditList,
    "timestamp": timestampList,
    "title": titleList,
    "upvote": upvoteList,
    "number_of_comments": numberOfCommentList,
    "id": idList,
    "url": urlList,
}
data = pd.DataFrame(column_values)

In [None]:
# Create a date column from the first 10 characters of the timestamp
# (presumably the YYYY-MM-DD part of an ISO-style string; confirm against the scraped values)
data['date'] = data['timestamp'].str.slice(stop=10)

In [None]:
# Show the assembled DataFrame as this cell's output
data