Commit 77ae12d

Merge pull request avinashkranjan#1116 from RohiniRG/RohiniRG-scrapereddit
Reddit scraper without API
2 parents 482d48c + 9837f7e commit 77ae12d

4 files changed: +244 -0 lines changed
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Reddit Scraper

- Using BeautifulSoup, a Python library for web scraping, this script scrapes a desired subreddit to obtain all relevant data regarding its posts (a minimal sketch of the approach follows this list).
- In `fetch_reddit.py`, we take user input for the subreddit name, the tag and the maximum number of posts to be scraped, then fetch all of this information and store it in a database file.
- In `display_reddit.py`, we display the desired results from the database to the user.

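A minimal sketch of that approach, using `requests` and BeautifulSoup the same way `fetch_reddit.py` does (the subreddit name below is only a placeholder; the real script collects more fields and follows pagination):

```python
import requests
from bs4 import BeautifulSoup

# A browser-like User-Agent, as in fetch_reddit.py, so old.reddit.com serves the full page
headers = {'User-Agent': 'Mozilla/5.0'}

subreddit = 'python'  # placeholder subreddit for illustration
url = 'https://old.reddit.com/r/' + subreddit + '/'

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

# On old.reddit.com each post is a <div class="thing"> and its title is an <a class="title">
for post in soup.find_all('div', attrs={'class': 'thing'}):
    title = post.find('a', class_='title')
    if title:
        print(title.text)
```
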
## Setup instructions

- The requirements can be installed as follows:

```shell
$ pip install -r requirements.txt
```

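Once the requirements are installed and `fetch_reddit.py` has been run at least once, the stored posts can also be inspected directly with Python's built-in `sqlite3` module. This is a minimal sketch, assuming the default `SubredditDatabase.db` file and the `posts` table created by the script:

```python
import sqlite3

# Open the database file produced by fetch_reddit.py (assumed to exist already)
con = sqlite3.connect('SubredditDatabase.db')
cur = con.cursor()

# Summarise how many posts are stored per subreddit and tag
cur.execute('SELECT SUBREDDIT, TAG, COUNT(*) FROM posts GROUP BY SUBREDDIT, TAG')
for subreddit, tag, total in cur.fetchall():
    print(f'r/{subreddit} [{tag}]: {total} posts')

con.close()
```
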
## Working screenshots

![Image](https://i.imgur.com/2jHHjCh.png)

![Image](https://i.imgur.com/XW8dkrQ.png)

## Author

[Rohini Rao](https://www.github.com/RohiniRG)
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
import sqlite3
import os


def sql_connection():
    """
    Establishes a connection to the SQL file database
    :return connection object:
    """
    path = os.path.abspath('SubredditDatabase.db')
    con = sqlite3.connect(path)
    return con


def sql_fetcher(con):
    """
    Fetches all the posts for the given subreddit from our database
    :param con:
    :return:
    """
    subreddit = input("\nEnter subreddit to search: r/")
    count = 0
    cur = con.cursor()
    cur.execute('SELECT * FROM posts')  # fetch every stored post row
    rows = cur.fetchall()

    for r in rows:
        if subreddit in r:  # matches when the requested name appears in the row (its SUBREDDIT column)
            count += 1
            print(f'\nTAG: {r[1]}\nPOST TITLE: {r[2]}\nAUTHOR: {r[3]}\n'
                  f'TIME STAMP: {r[4]}\nUPVOTES: {r[5]}\nCOMMENTS: {r[6]}'
                  f'\nURL: {r[7]}\n')

    if count:
        print(f'{count} posts fetched from database\n')
    else:
        print('\nNo posts stored for this subreddit\n')


con = sql_connection()

while True:
    sql_fetcher(con)

    ans = input('\nPress (y) to continue or any other key to exit: ').lower()
    if ans == 'y':
        continue
    else:
        print('\nExiting..\n')
        break
Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
import requests
import time
import sqlite3
from bs4 import BeautifulSoup


def sql_connection():
    """
    Establishes a connection to the SQL file database
    :return connection object:
    """
    con = sqlite3.connect('SubredditDatabase.db')
    return con


def sql_table(con):
    """
    Creates a table in the database (if it does not exist already)
    to store the post info
    :param con:
    :return:
    """
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS posts(SUBREDDIT text, TAG text, "
                " TITLE text, AUTHOR text, TIMESTAMP text, UPVOTES int, "
                " COMMENTS text, URL text)")
    con.commit()


def sql_insert_table(con, entities):
    """
    Inserts the desired data into the table to store post info
    :param con:
    :param entities:
    :return:
    """
    cur = con.cursor()
    cur.execute('INSERT INTO posts(SUBREDDIT, TAG, TITLE, AUTHOR, '
                'TIMESTAMP, UPVOTES, COMMENTS, URL) '
                'VALUES(?, ?, ?, ?, ?, ?, ?, ?)', entities)
    con.commit()


def scraper():
    """
    Scrapes the post info from the desired subreddit and stores it
    in the database.
    :return:
    """
    con = sql_connection()
    sql_table(con)

    while True:
        subreddit = input('\n\nEnter the name of the subreddit: r/').lower()
        max_count = int(input('Enter the maximum number of entries to collect: '))
        select = int(input('Select tags to add for the search: \n1. hot\n2. new'
                           '\n3. rising\n4. controversial\n5. top\nMake your choice: '))

        if select == 1:
            tag = 'hot'
            tag_url = '/'
        elif select == 2:
            tag = 'new'
            tag_url = '/new/'
        elif select == 3:
            tag = 'rising'
            tag_url = '/rising/'
        elif select == 4:
            tag = 'controversial'
            tag_url = '/controversial/'
        elif select == 5:
            tag = 'top'
            tag_url = '/top/'
        else:
            print('Invalid choice.. Try again!')
            continue

        # URL for the desired subreddit and tag
        url = 'https://old.reddit.com/r/' + subreddit + tag_url

        # Using a user-agent to mimic browser activity
        headers = {'User-Agent': 'Mozilla/5.0'}

        req = requests.get(url, headers=headers)

        if req.status_code == 200:
            soup = BeautifulSoup(req.text, 'html.parser')
            print(f'\nCOLLECTING INFORMATION FOR r/{subreddit}....')

            attrs = {'class': 'thing'}
            counter = 1
            full = 0
            while True:
                for post in soup.find_all('div', attrs=attrs):
                    try:
                        # To obtain the post title
                        title = post.find('a', class_='title').text

                        # To get the username of the post author
                        author = post.find('a', class_='author').text

                        # To obtain the time of the post
                        time_stamp = post.time.attrs['title']

                        # To obtain the number of comments on the post
                        comments = post.find('a', class_='comments').text.split()[0]
                        if comments == 'comment':
                            comments = 0

                        # To get the number of upvotes on the post
                        upvotes = post.find('div', class_='score likes').text
                        if upvotes == '•':
                            upvotes = "None"

                        # To get the URL of the post
                        link = post.find('a', class_='title')['href']
                        link = 'www.reddit.com' + link

                        # Entering all the collected information into our database
                        entities = (subreddit, tag, title, author, time_stamp, upvotes,
                                    comments, link)
                        sql_insert_table(con, entities)

                        if counter == max_count:
                            full = 1
                            break

                        counter += 1
                    except AttributeError:
                        continue

                if full:
                    break

                try:
                    # To go to the next page
                    next_button = soup.find('span', class_='next-button')
                    next_page_link = next_button.find('a').attrs['href']

                    time.sleep(2)

                    req = requests.get(next_page_link, headers=headers)
                    soup = BeautifulSoup(req.text, 'html.parser')
                except AttributeError:
                    # No next-page button found: stop paginating
                    break

            print('DONE!\n')
            ans = input('Press (y) to continue or any other key to exit: ').lower()
            if ans == 'y':
                continue
            else:
                print('Exiting..')
                break
        else:
            print('Error fetching results.. Try again!')


if __name__ == '__main__':
    scraper()
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
idna==2.10
requests==2.25.1
soupsieve==2.2.1
urllib3==1.26.4
