Commit 77ae12d

Merge pull request avinashkranjan#1116 from RohiniRG/RohiniRG-scrapereddit
Reddit scraper without API
2 parents 482d48c + 9837f7e commit 77ae12d

4 files changed: +244 -0 lines changed
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Reddit Scraper

- Using BeautifulSoup, a Python library for web scraping, this script scrapes a desired subreddit to obtain all relevant data regarding its posts (a minimal sketch of the approach follows this list).
- In `fetch_reddit.py`, we take user input for the subreddit name, the tag and the maximum number of posts to be scraped, then fetch all of this information and store it in a database file.
- In `display_reddit.py`, we display the desired results from the database to the user.

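A minimal sketch of that approach, using `requests` and BeautifulSoup the same way `fetch_reddit.py` does (the subreddit name below is only a placeholder; the real script collects more fields and follows pagination):

```python
import requests
from bs4 import BeautifulSoup

# A browser-like User-Agent, as in fetch_reddit.py, so old.reddit.com serves the full page
headers = {'User-Agent': 'Mozilla/5.0'}

subreddit = 'python'  # placeholder subreddit for illustration
url = 'https://old.reddit.com/r/' + subreddit + '/'

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

# On old.reddit.com each post is a <div class="thing"> and its title is an <a class="title">
for post in soup.find_all('div', attrs={'class': 'thing'}):
    title = post.find('a', class_='title')
    if title:
        print(title.text)
```
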
## Setup instructions

- The requirements can be installed as follows:

```shell
$ pip install -r requirements.txt
```

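Once the requirements are installed and `fetch_reddit.py` has been run at least once, the stored posts can also be inspected directly with Python's built-in `sqlite3` module. This is a minimal sketch, assuming the default `SubredditDatabase.db` file and the `posts` table created by the script:

```python
import sqlite3

# Open the database file produced by fetch_reddit.py (assumed to exist already)
con = sqlite3.connect('SubredditDatabase.db')
cur = con.cursor()

# Summarise how many posts are stored per subreddit and tag
cur.execute('SELECT SUBREDDIT, TAG, COUNT(*) FROM posts GROUP BY SUBREDDIT, TAG')
for subreddit, tag, total in cur.fetchall():
    print(f'r/{subreddit} [{tag}]: {total} posts')

con.close()
```
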
## Working screenshots

![Image](https://i.imgur.com/2jHHjCh.png)

![Image](https://i.imgur.com/XW8dkrQ.png)

## Author

[Rohini Rao](https://www.github.com/RohiniRG)
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
import sqlite3
import os


def sql_connection():
    """
    Establishes a connection to the SQL file database
    :return connection object:
    """
    path = os.path.abspath('SubredditDatabase.db')
    con = sqlite3.connect(path)
    return con


def sql_fetcher(con):
    """
    Fetches all the posts for the given subreddit from our database
    :param con:
    :return:
    """
    subreddit = input("\nEnter subreddit to search: r/")
    count = 0
    cur = con.cursor()
    cur.execute('SELECT * FROM posts')  # fetch every stored post row
    rows = cur.fetchall()

    for r in rows:
        if subreddit in r:  # matches when the requested name appears in the row (its SUBREDDIT column)
            count += 1
            print(f'\nTAG: {r[1]}\nPOST TITLE: {r[2]}\nAUTHOR: {r[3]}\n'
                  f'TIME STAMP: {r[4]}\nUPVOTES: {r[5]}\nCOMMENTS: {r[6]}'
                  f'\nURL: {r[7]}\n')

    if count:
        print(f'{count} posts fetched from database\n')
    else:
        print('\nNo posts stored for this subreddit\n')


con = sql_connection()

while True:
    sql_fetcher(con)

    ans = input('\nPress (y) to continue or any other key to exit: ').lower()
    if ans == 'y':
        continue
    else:
        print('\nExiting..\n')
        break
Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
import requests
import time
import sqlite3
from bs4 import BeautifulSoup


def sql_connection():
    """
    Establishes a connection to the SQL file database
    :return connection object:
    """
    con = sqlite3.connect('SubredditDatabase.db')
    return con


def sql_table(con):
    """
    Creates a table in the database (if it does not exist already)
    to store the post info
    :param con:
    :return:
    """
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS posts(SUBREDDIT text, TAG text, "
                " TITLE text, AUTHOR text, TIMESTAMP text, UPVOTES int, "
                " COMMENTS text, URL text)")
    con.commit()


def sql_insert_table(con, entities):
    """
    Inserts the desired data into the table to store post info
    :param con:
    :param entities:
    :return:
    """
    cur = con.cursor()
    cur.execute('INSERT INTO posts(SUBREDDIT, TAG, TITLE, AUTHOR, '
                'TIMESTAMP, UPVOTES, COMMENTS, URL) '
                'VALUES(?, ?, ?, ?, ?, ?, ?, ?)', entities)
    con.commit()


def scraper():
    """
    Scrapes the post info from the desired subreddit and stores it
    in the database.
    :return:
    """
    con = sql_connection()
    sql_table(con)

    while True:
        subreddit = input('\n\nEnter the name of the subreddit: r/').lower()
        max_count = int(input('Enter the maximum number of entries to collect: '))
        select = int(input('Select tags to add for the search: \n1. hot\n2. new'
                           '\n3. rising\n4. controversial\n5. top\nMake your choice: '))

        if select == 1:
            tag = 'hot'
            tag_url = '/'
        elif select == 2:
            tag = 'new'
            tag_url = '/new/'
        elif select == 3:
            tag = 'rising'
            tag_url = '/rising/'
        elif select == 4:
            tag = 'controversial'
            tag_url = '/controversial/'
        elif select == 5:
            tag = 'top'
            tag_url = '/top/'
        else:
            print('Invalid choice.. Try again!')
            continue

        # URL for the desired subreddit and tag
        url = 'https://old.reddit.com/r/' + subreddit + tag_url

        # Using a user-agent to mimic browser activity
        headers = {'User-Agent': 'Mozilla/5.0'}

        req = requests.get(url, headers=headers)

        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            print(f'\nCOLLECTING INFORMATION FOR r/{subreddit}....')

            attrs = {'class': 'thing'}
            counter = 1
            full = 0
            while True:
                for post in soup.find_all('div', attrs=attrs):
                    try:
                        # To obtain the post title
                        title = post.find('a', class_='title').text

                        # To get the username of the post author
                        author = post.find('a', class_='author').text

                        # To obtain the time of the post
                        time_stamp = post.time.attrs['title']

                        # To obtain the number of comments on the post
                        comments = post.find('a', class_='comments').text.split()[0]
                        if comments == 'comment':
                            comments = 0

                        # To get the number of upvotes on the post
                        upvotes = post.find('div', class_='score likes').text
                        if upvotes == '•':
                            upvotes = "None"

                        # To get the URL of the post
                        link = post.find('a', class_='title')['href']
                        link = 'www.reddit.com' + link

                        # Entering all the collected information into our database
                        entities = (subreddit, tag, title, author, time_stamp, upvotes,
                                    comments, link)
                        sql_insert_table(con, entities)

                        if counter == max_count:
                            full = 1
                            break

                        counter += 1
                    except AttributeError:
                        continue

                if full:
                    break

                try:
                    # To go to the next page
                    next_button = soup.find('span', class_='next-button')
                    next_page_link = next_button.find('a').attrs['href']

                    time.sleep(2)

                    req = requests.get(next_page_link, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                except AttributeError:
                    # No next-page button found: stop paginating
                    break

            print('DONE!\n')
            ans = input('Press (y) to continue or any other key to exit: ').lower()
            if ans == 'y':
                continue
            else:
                print('Exiting..')
                break
        else:
            print('Error fetching results.. Try again!')


if __name__ == '__main__':
    scraper()
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
idna==2.10
requests==2.25.1
soupsieve==2.2.1
urllib3==1.26.4
