In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, json
import pandas as pd
import subprocess
import time
from random import sample
from os import listdir
from os.path import isfile, join
import re
import praw
from supabase import create_client, Client
import random

In [3]:
creds = json.load(open(".db-creds.json"))

url = creds["SUPABASE_URL"]
key = creds["SUPABASE_KEY"]
supabase = create_client(url, key)

In [4]:
reddit_creds = json.load(open(".reddit_creds.json", "r"))
reddit = praw.Reddit(
    client_id=reddit_creds["client_id"],
    client_secret=reddit_creds["client_secret"],
    password=reddit_creds["password"],
    user_agent=reddit_creds["user_agent"],
    username=reddit_creds["username"],
)

In [5]:
r_all = reddit.subreddit('all')

In [6]:
def get_comment_data(comment, usernames, posts_result):    
    try:
        username = str(comment.author.name)
        if username[0:2] == '0x':
            username = username[2:]
        profile_pic = comment.author.icon_img
    except:
        username = "removed"
        profile_pic = "removed"

    if username not in usernames:
        user_db_data = {
            "reddit_username": username, 
            "profile_pic": profile_pic
        }
        user_data = supabase.table("users").insert(user_db_data).execute().data
        usernames.update({user_data[0]['reddit_username']: user_data[0]['id']})
        user_id = user_data[0]['id']
    else:
        user_id = usernames[username]

    comment_data = {
        "post_id": posts_result["id"],
        "user_id": user_id,
        "body": comment.body,
        "depth": comment.depth,
        "reddit_id": comment.name,
        "reddit_upvotes": comment.ups,
        "reddit_downvotes": comment.downs,
    }
    return comment_data

In [7]:
def add_comment(comment, comments, comment_reddit_ids, head_comments=None):
    if comment.name in comment_reddit_ids:
        return

    if type(comment) == praw.models.reddit.comment.Comment:
        if head_comments is not None:
            head_comments.append(comment.name)
        comment_data = get_comment_data(comment, usernames, posts_result)
        comments.append(comment_data)
#     else:
#         print(type(comment))
#         comments.extend(get_comments(comment.comments(), comment_reddit_ids, usernames, posts_result))

In [8]:
def get_comments(comment_list, comment_reddit_ids, usernames, posts_result):
    comments = []
    head_comments = []
    for comment in comment_list[:10]:
        add_comment(comment, comments, comment_reddit_ids, head_comments)
    for comment in comment_list[10:]:
        if comment.name in head_comments:
            add_comment(comment, comments)
        
    return comments

In [9]:
def get_data(table_name, select, initial_len=0):
    i = initial_len
    prev_len = -1
    data = []
    while (len(data) - initial_len) % 1000 == 0 and prev_len != len(data):
        prev_len = len(data)
        data.extend(supabase.table(table_name).select(select).range(i, i+1000).execute().data)
        i += 1000
    return data

# DB writer

In [10]:
minted_time = 0
last_block = 0


post_reddit_ids = [row['reddit_id'] for row in get_data("posts", 'reddit_id')]
comment_reddit_ids = [row['reddit_id'] for row in get_data("comments", "reddit_id")]
categories = {row['name']: row['id'] for row in get_data("categories", "id, name")}
usernames = {row['reddit_username']: row['id'] for row in get_data("users", "id, reddit_username")}

while True:
    # Mint reddit posts every hour
    if (time.time() - minted_time) / 60 > 30:
        post_reddit_ids.extend([row['reddit_id'] for row in get_data("posts", 'reddit_id', len(post_reddit_ids))])
        comment_reddit_ids.extend([row['reddit_id'] for row in get_data("comments", "reddit_id", len(comment_reddit_ids))])
        categories.update({row['name']: row['id'] for row in get_data("categories", "id, name", len(categories))})
        usernames.update({row['reddit_username']: row['id'] for row in get_data("users", "id, reddit_username", len(usernames))})
        
        posts = []
        for i, post in enumerate(r_all.top(time_filter='hour')):
            if post.name in post_reddit_ids:
                continue
                
            if i > 10:
                continue
                     
            category = post.subreddit.display_name
            if category not in categories:
                category_data = supabase.table("categories").insert({"name": category}).execute().data[0]
                category_id = category_data['id']
                categories.update({category_data['name']: category_id})
            else:
                category_id = categories[category]

            try:
                username = str(post.author.name)
                if username[0:2] == '0x':
                    username = username[2:]
                profile_pic = post.author.icon_img
            except:
                username = "removed"
                profile_pic = "removed"

            if username not in usernames:
                user_db_data = {
                    "reddit_username": username, 
                    "profile_pic": profile_pic
                }
                user_data = supabase.table("users").insert(user_db_data).execute().data[0]
                user_id = user_data['id']
                usernames.update({user_data['reddit_username']: user_id})
            else:
                user_id = usernames[username]

            post_data = {
                "category_id": category_id,
                "user_id": user_id,
                "title": post.title,
                "body": post.selftext,
                "url": post.url,
                "is_nsfw": post.over_18,
                "reddit_id": post.name,
                "reddit_upvotes": post.ups,
                "reddit_downvotes": post.downs,
            }
            posts.append(post_data)
            posts_result = supabase.table("posts").insert(post_data).execute().data[0]
            post_reddit_ids.append(posts_result['reddit_id'])

            comments = get_comments(post.comments.list(), comment_reddit_ids, usernames, posts_result)
                
            comments_data = supabase.table("comments").insert(comments).execute().data
            comment_reddit_ids.extend([d['reddit_id'] for d in comments_data])
            print(f"added {len(comments)} comments")
            
            user_ids = list(usernames.values())
            follow_ids = random.choices(user_ids, k=min(len(user_ids), 10))
            follower_ids = random.choices(user_ids, k=min(len(user_ids), 10))
            
            for i in follow_ids:
                supabase.table("follows").insert({"follower_id": user_id, "following_id": i}).execute()
            for i in follower_ids:
                supabase.table("follows").insert({"follower_id": i, "following_id": user_id}).execute()
        
        print(f"added {len(posts)} posts")
        print('')
        minted_time = time.time()
    
    time.sleep(1)

added 1 comments
added 10 comments
added 10 comments
added 10 comments
added 10 comments
added 4 comments
added 10 comments
added 7 comments
added 10 comments
added 10 comments
added 4 comments
added 11 posts

added 10 comments
added 10 comments
added 10 comments
added 5 comments
added 1 comments
added 9 comments
added 10 comments
added 10 comments
added 3 comments
added 8 comments
added 2 comments
added 11 posts

added 10 comments
added 10 comments
added 9 comments
added 4 comments
added 5 comments
added 8 comments
added 10 comments
added 10 comments
added 10 comments
added 1 comments
added 6 comments
added 11 posts

added 10 comments
added 9 comments
added 10 comments
added 10 comments
added 6 comments
added 10 comments
added 0 comments
added 10 comments
added 10 comments
added 4 comments
added 10 posts

added 10 comments
added 10 comments
added 10 comments
added 3 comments
added 10 comments
added 10 comments
added 9 comments
added 0 comments
added 10 comments
added 10 comments
added

ReadTimeout: The read operation timed out