In [1]:
import praw
import pandas as pd
import requests
import datetime
import uuid
from reddit_api_creds import *

In [2]:
# Kudos to Wikipedia https://en.wikipedia.org/wiki/Base36
def base36encode(integer: int) -> str:
    """Convert a base-10 integer to its base-36 string representation.

    Digits are ``0-9`` followed by uppercase ``A-Z``. Negative inputs are
    encoded by magnitude and prefixed with ``-``.

    Args:
        integer: The value to encode.

    Returns:
        The base-36 text for ``integer``; zero encodes as ``'0'``.
    """
    chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    # Zero never enters the division loop below, so without this guard the
    # function would return an empty string instead of a digit.
    if integer == 0:
        return chars[0]

    sign = '-' if integer < 0 else ''
    integer = abs(integer)

    # Digits come out least-significant first; collect them and reverse once
    # rather than prepending to a string on every iteration.
    digits = []
    while integer > 0:
        integer, remainder = divmod(integer, 36)
        digits.append(chars[remainder])

    return sign + ''.join(reversed(digits))

# Scraper

In [3]:
# Reddit API client. CLIENT_ID, CLIENT_SECRET and USER_AGENT are not defined
# in this notebook; presumably they arrive via the wildcard import from
# reddit_api_creds -- confirm.
scraper = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
)

In [4]:
# Despite its name, `csuf` is built from five community names joined by '+',
# not just r/csuf.
csuf = scraper.subreddit(
    'csuf+news+Coronavirus+Python+computerscience'
)

In [5]:
# Request the "top" listing, asking for up to 10000 posts.
# (Swap in csuf.hot(limit=10000) to use the "hot" listing instead.)
scraped_posts = csuf.top(
    limit=10000,
)

In [6]:
# Row buffers: `data` holds the post rows, `data1` holds the vote rows.
# Both are filled in the same order and share the same `uid` per post.
data = []
data1 = []
for post in scraped_posts:
    # Per-post identifier: a UUID rendered in base36.
    # NOTE(review): uuid1 is derived from the host's network address and the
    # current time; consider uuid4 if these IDs should be opaque.
    uid = base36encode(uuid.uuid1().int)

    # Anything that is not a Redditor object (e.g. the author is missing)
    # is recorded under the placeholder name '[deleted]'.
    author = post.author
    username = author.name if isinstance(author, praw.models.Redditor) else '[deleted]'

    community_name = post.subreddit.display_name

    # Use created_utc (the UTC epoch timestamp PRAW documents for submissions)
    # rather than `created`, because the value is later converted with a UTC
    # conversion when the `published` column is built.
    data.append([uid, username, post.title, community_name, post.url, post.selftext, post.created_utc])
    data1.append([uid, post.score, community_name])

In [7]:
# Assemble the two tables from the row buffers filled by the scraping loop.
posts = pd.DataFrame(data, columns=['uuid', 'username', 'title', 'community_name', 'url', 'description', 'published'])
votes = pd.DataFrame(data1, columns=['uuid', 'score', 'community_name'])

# Epoch seconds -> naive ISO-8601 strings expressed in UTC.
# datetime.utcfromtimestamp() is deprecated since Python 3.12; converting via
# an aware UTC datetime and then dropping tzinfo yields the same text
# (no "+00:00" suffix), so the stored format is unchanged.
posts['published'] = [
    datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc).replace(tzinfo=None).isoformat()
    for ts in posts['published']
]

# Let pandas pick its best-fitting nullable dtypes for every column.
posts = posts.convert_dtypes()

# Shuffle dataframes

In [8]:
# Shuffle every row of both frames with the same fixed seed, then renumber
# the index from zero. NOTE(review): the shared seed presumably keeps posts
# and votes aligned row-for-row (they have equal length) -- confirm if relied on.
posts, votes = (
    frame.sample(frac=1, random_state=100).reset_index(drop=True)
    for frame in (posts, votes)
)

# Count and dtypes

In [9]:
# Show the dtype of each column in the posts table.
column_dtypes = posts.dtypes
print(column_dtypes)

uuid              string
username          string
title             string
community_name    string
url               string
description       string
published         string
dtype: object


In [10]:
# Number of non-null values in each column, per community.
# DataFrame.count(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level produces the same table on old and new versions.
posts.set_index(['uuid', 'community_name']).groupby(level='community_name').count()

Unnamed: 0_level_0,username,title,url,description,published
community_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Coronavirus,979,979,979,979,979
Python,993,993,993,993,993
computerscience,982,982,982,982,982
csuf,999,999,999,999,999
news,991,991,991,991,991


# Save to JSON and CSV

In [11]:
# Write the posts table as table-oriented JSON, without the row index,
# indented by one space.
posts.to_json(
    'posts.json',
    orient='table',
    index=False,
    indent=1,
)

In [12]:
# Write the votes table as table-oriented JSON, without the row index,
# indented by one space.
votes.to_json(
    'votes.json',
    orient='table',
    index=False,
    indent=1,
)

In [13]:
# Write both tables as CSV. The index is just the 0..n-1 numbering left by
# reset_index(drop=True), so it is omitted here, matching the JSON exports
# (which already pass index=False) and avoiding an unnamed leading column.
posts.to_csv('posts.csv', index=False)
votes.to_csv('votes.csv', index=False)