In [3]:
import datetime
import uuid

import pandas as pd
import praw
import requests

from reddit_api_creds import *

In [4]:
# Kudos to Wikipedia https://en.wikipedia.org/wiki/Base36
def base36encode(integer: int) -> str:
    """Convert a base-10 integer to its Base36 string representation.

    Digits are ``0-9`` followed by uppercase ``A-Z``.

    Args:
        integer: Value to encode. A negative value is encoded by magnitude
            and prefixed with ``-``.

    Returns:
        The Base36 text, e.g. ``35 -> 'Z'``, ``36 -> '10'``, ``0 -> '0'``.
    """
    chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    sign = '-' if integer < 0 else ''
    integer = abs(integer)

    # The digit loop below never executes for 0, which used to yield an
    # empty string; zero needs its own single-digit representation.
    if integer == 0:
        return '0'

    result = ''
    while integer > 0:
        integer, remainder = divmod(integer, 36)
        # Remainders come out least-significant first, so prepend each digit.
        result = chars[remainder] + result

    return sign + result

In [6]:
import uuid
# Sample run: Base36-encode the integer value of a freshly generated uuid1.
base36encode(uuid.uuid1().int)

'9H1TQXRQQ8JAL7HE1OSWN6K5Z'

In [9]:
# Current time expressed as a POSIX timestamp (float seconds).
datetime.datetime.now().timestamp()

1588265108.166598

In [11]:
# Convert a POSIX timestamp back into a (naive) UTC datetime.
datetime.datetime.utcfromtimestamp(1588265108)

datetime.datetime(2020, 4, 30, 16, 45, 8)

# Scraper

In [39]:
# PRAW client configured with the application credentials.
# CLIENT_ID, CLIENT_SECRET and USER_AGENT are not defined in this notebook;
# presumably they come from the `reddit_api_creds` star-import.
scraper = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
)

In [40]:
# Several subreddit names joined with '+' are queried through a single handle.
# For a quick single-community run, use 'csuf' alone instead.
subreddit_names = 'csuf+news+Coronavirus+Python+computerscience+bitcoin'
subreddits = scraper.subreddit(subreddit_names)

In [42]:
# Request the "top" listing across the selected subreddits.
# Alternatives tried earlier: subreddits.hot(limit=10000), subreddits.top(limit=30).
# Note: the per-community counts shown further down in this notebook are
# about 1000 each, well below the requested limit.
post_limit = 10000
scraped_posts = subreddits.top(limit=post_limit)

In [43]:
# Walk the listing once and build two parallel row lists that share the same
# generated key: `data` holds the post fields, `data1` holds the vote fields.
data = []
data1 = []
for post in scraped_posts:
    # Fresh uuid1 per post, Base36-encoded to serve as the shared row key.
    uid = base36encode(uuid.uuid1().int)

    # Any author that is not a Redditor instance is recorded as '[deleted]'.
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    author = post.author
    username = author.name if isinstance(author, praw.models.Redditor) else '[deleted]'

    # `created_utc` is the documented Unix-time attribute of a submission.
    # The rest of this notebook interprets `published` with utcfromtimestamp,
    # so the UTC field is the one that matches that usage.
    published = post.created_utc

    community_name = post.subreddit.display_name

    data.append([uid, username, post.title, community_name, post.url, post.selftext, published])
    data1.append([uid, post.score, community_name, published])

In [44]:
# Column layouts mirror the order in which the row lists were filled.
post_columns = ['uuid', 'username', 'title', 'community_name', 'url', 'description', 'published']
vote_columns = ['uuid', 'score', 'community_name', 'published']

# Only `posts` is converted to pandas' nullable extension dtypes.
posts = pd.DataFrame(data, columns=post_columns).convert_dtypes()
votes = pd.DataFrame(data1, columns=vote_columns)

# Shuffle dataframes

In [45]:
# Shuffle every row (frac=1) of both frames using the same fixed seed, then
# discard the old index so rows are renumbered from 0.
shuffle_seed = 100
posts = (
    posts
    .sample(frac=1, random_state=shuffle_seed)
    .reset_index(drop=True)
)
votes = (
    votes
    .sample(frac=1, random_state=shuffle_seed)
    .reset_index(drop=True)
)

# Count and dtypes

In [46]:
# Show the column dtypes of `posts` after convert_dtypes().
print(posts.dtypes)

uuid              string
username          string
title             string
community_name    string
url               string
description       string
published          Int64
dtype: object


In [47]:
# Number of non-null records per community.
# `DataFrame.count(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level yields the same table on old and new versions.
posts.set_index(['uuid', 'community_name']).groupby(level='community_name').count()

Unnamed: 0_level_0,username,title,url,description,published
community_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bitcoin,990,990,990,990,990
Coronavirus,980,980,980,980,980
Python,994,994,994,994,994
computerscience,981,981,981,981,981
csuf,1000,1000,1000,1000,1000
news,991,991,991,991,991


# Save to JSON and CSV

In [48]:
# Write posts as table-oriented JSON, without the index, indented by one space.
posts.to_json(
    'data/posts.json',
    orient='table',
    index=False,
    indent=1,
)

In [49]:
# Write votes as table-oriented JSON, without the index, indented by one space.
votes.to_json(
    'data/votes.json',
    orient='table',
    index=False,
    indent=1,
)

In [50]:
# CSV copies of both frames, written with to_csv's default settings.
# NOTE(review): unlike the JSON exports, no index=False is passed here —
# confirm whether the index column is wanted in the CSV files.
for frame, csv_path in ((posts, 'data/posts.csv'), (votes, 'data/votes.csv')):
    frame.to_csv(csv_path)

### Manually check if answers are right (Ignore this)

In [51]:
#import datetime
# Spot check: decode one epoch value as a (naive) UTC datetime.
datetime.datetime.utcfromtimestamp(1588089788)

datetime.datetime(2020, 4, 28, 16, 3, 8)

In [52]:
# Display posts ordered from the largest to the smallest `published` value.
posts.sort_values(by=['published'], ascending=False)

Unnamed: 0,uuid,username,title,community_name,url,description,published
421,CAEPJIPK49FSWZ4K02JBAFYJB,Sponta7,Made an annoying Python script that sends a fr...,Python,https://i.redd.it/04iw5fezewv41.png,,1588256580
4115,CYBBQMXW3TICIOQBCND86Z0D3,itseddybruh321,Can I get an F?,csuf,https://i.redd.it/fyauh628svv41.jpg,,1588248893
2126,C59OGYZWADQVRCCOREWSOUP3R,Enclo,WEE DID IT GUYS!!!!,Bitcoin,https://i.redd.it/vstmm6h5gvv41.jpg,,1588246045
3213,ASD3C3PH204FAQ2EEHZY8IG7R,HotDamnGeoff,Judge rules Michigan stay-at-home order doesn’...,news,https://www.mlive.com/public-interest/2020/04/...,,1588242787
5808,B58TYAT4F0NHFBZUICWDPD40N,hash0t0,WATCH: Los Angeles Becomes The First Major Cit...,Coronavirus,https://laist.com/latest/post/20200429/Mayor-G...,,1588236015
...,...,...,...,...,...,...,...
1933,D78JO7OM0LR1WY6NHFWSBQDNB,RelentlessNoodle,Rainy day rage,csuf,http://imgur.com/73yOK,,1317909231
1928,DDTOCOD0UAES37EX4SCEE5F9Z,socatoa,GGG has class in McCarthy Hall,csuf,http://imgur.com/oQD0n,,1316742248
648,DCB6VGRG1W3ZVAFEGDW6BK6TZ,[deleted],ಠ_ಠ,csuf,http://i.imgur.com/014nl.jpg,,1315552265
4635,DDTMPJKPYG1X25DK3JF0AWVPZ,vashquash,Did anyone else see this in the elevators in t...,csuf,http://imgur.com/SfYd2,,1314961075
