In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import os
os.chdir('/content/drive/MyDrive/Projects/reddit-vote-predictor')
import sys
import pandas as pd
from datetime import datetime
# Import custom module
from scripts.RedditScrape import * # imports a class `RedditScrape`

In [3]:
# Reddit's own api:
!pip3 install psaw
!pip3 install praw
import psaw
import praw

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting psaw
  Downloading psaw-0.1.0-py3-none-any.whl (15 kB)
Installing collected packages: psaw
Successfully installed psaw-0.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting praw
  Downloading praw-7.6.1-py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 6.8 MB/s 
[?25hCollecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.4.2-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.5 MB/s 
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.6.1 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.4.2


In [None]:
# Set date range from which to extract data for two subreddits
# NOTE: ater running the data scraping and cleaning pipeline, one year of data
# resulted in around 1000 images, so we'll add time to the date range
# 10 years from the start of 2011 to the end of 2020
start_ = datetime(2011, 1, 1)
end_ = datetime(2020, 12, 31)
# Run the scraping pipeline using custom RedditScrape class imported above
cat = RedditScrape('cat', start_, end_)
cat.scrape_posts()
cat.posts.to_csv('data/tbl/cat_posts.csv', index=False)
dog = RedditScrape('dog', start_, end_)
dog.scrape_posts()
dog.posts.to_csv('data/tbl/dog_posts.csv', index=False)

In [4]:
# If the above is not run, read in the data
cat = pd.read_csv('data/tbl/cat_posts.csv', index_col = None)
dog = pd.read_csv('data/tbl/dog_posts.csv', index_col = None)

  exec(code_obj, self.user_global_ns, self.user_ns)


## Using `PRAW`
Now use `praw` to fetch additional information about each post now that it has been efficiently scraped using `psaw`. This will help us gather more information that will help more accurately filter for good posts/images, and acquire information that might be useful in modeling `score`.

Use the `id` field of the `psaw` results to get more information for each post.

In [None]:
# reddit = praw.Reddit(
#     client_id = 'client id',
#     client_secret = 'client secret',
#     username = 'username',
#     password = 'password',
#     user_agent = 'my user agent'
# )
# Fill in all fields above
# Docs: https://praw.readthedocs.io/en/stable/getting_started/quick_start.html
# Now simply lookup the scraped posts on reddit's praw using the id obtained by psaw
# Then use the following method to get more data for each post:
# post = reddit.submission(id = ID)
# where ID is the value in column "id" in the data obtained from psaw

The above arguments are saved in a secret json file.

In [5]:
import json
with open('reddit_args.json') as f:
  args = json.load(f)
  

### Now use the args (which is a dict of keyword arguments) to set up the instance of the reddit praw api Reddit class

In [None]:
reddit = praw.Reddit(**args)

Using each post in the data scraped using `psaw`, we search for the exact submission using the `reddit` object created above. Each submission has attributes:

* `submission.score` # the score upvotes - downvotes
* `submission.ups` # the number of upvotes
* `submission.downs` # the number of downvotes

Note that `score` also downloaded with he `psaw` scrape but is not as accurate as the one downloaded from `praw` and we'll use that instead.


In [None]:
cat_praw = []
dog_praw = []
for i in range(cat.shape[0]):
  id_ = cat.id[i] # first with one subreddit
  submission = reddit.submission(id_)
  cat_praw.append([submission.score, submission.ups, submission.downs])
cat_scores = pd.DataFrame(cat_praw, columns = ['total_score', 'upvotes', 'downvotes'])
for i in range(dog.shape[0]):
  id_ = dog.id[i] # then the other
  submission = reddit.submission(id_)
  dog_praw.append([submission.score, submission.ups, submission.downs])
dog_scores = pd.DataFrame(dog_praw, columns = ['total_score', 'upvotes', 'downvotes'])

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

### Now combine the variables scraped from `praw` with data from `psaw`

In [26]:
dir(test_sub)

['STR_FIELD',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_chunk',
 '_comments',
 '_comments_by_id',
 '_fetch',
 '_fetch_data',
 '_fetch_info',
 '_fetched',
 '_kind',
 '_reddit',
 '_reset_attributes',
 '_safely_add_arguments',
 '_url_parts',
 '_vote',
 'all_awardings',
 'allow_live_comments',
 'approved_at_utc',
 'approved_by',
 'archived',
 'author',
 'author_flair_background_color',
 'author_flair_css_class',
 'author_flair_richtext',
 'author_flair_template_id',
 'author_flair_text',
 'author_flair_text_color',
 'author_flair_type',
 'author_fullname',
 'author_is_blocked',
 'author_patreon_flair',
 'author_premium',
 'award',
 'awarders',
 'ban