In [16]:
# Dependencies
import os
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm
import time
from config import access_token, token_type, client_id, secret_id, user_agent, username, password


In [17]:
# Function for converting a Unix epoch timestamp into a datetime
def get_date(created):
    """Convert a Unix epoch timestamp into a ``datetime`` object.

    Args:
        created: Seconds since the epoch (int or float).

    Returns:
        A naive ``datetime.datetime`` expressed in the machine's local
        timezone, because no ``tz`` argument is supplied.
    """
    from_epoch = dt.datetime.fromtimestamp
    return from_epoch(created)

In [18]:
# function for creating reddit instance and connecting
def reddit_connection():
    """Build and return a PRAW ``Reddit`` client.

    The client id/secret, user agent, username and password are the
    module-level names imported from ``config``.

    Returns:
        The ``praw.Reddit`` instance constructed from those credentials.
    """
    credentials = {
        "client_id": client_id,
        "client_secret": secret_id,
        "user_agent": user_agent,
        "username": username,
        "password": password,
    }
    return praw.Reddit(**credentials)

In [19]:
# Function for accessing API data and storing in dataframe
def build_dataset(reddit, search_words='wallstreetbets', items_limit=1000, comments_limit=1000):
    """Collect the newest submissions and comments of a subreddit into a DataFrame.

    Args:
        reddit: Connected Reddit client exposing ``subreddit(name)``.
        search_words: Name of the subreddit to read from.
        items_limit: Value passed as ``limit`` to the subreddit's ``new()``
            listing (number of submissions requested).
        comments_limit: Value passed as ``limit`` to the subreddit's
            ``comments()`` listing. Previously hard-coded to 1000; the
            default keeps that behaviour.

    Returns:
        pandas.DataFrame with columns ``title``, ``score``, ``id``, ``url``,
        ``comms_num``, ``created``, ``body`` and ``timestamp``. Submissions
        come first, followed by comments. Comment rows use the placeholder
        title ``"Comment"``, an empty ``url`` and ``comms_num`` of 0.
    """
    subreddit = reddit.subreddit(search_words)
    topics_dict = {
        "title": [],
        "score": [],
        "id": [],
        "url": [],
        "comms_num": [],
        "created": [],
        "body": [],
    }

    def _add_row(**fields):
        # Append one record, keeping every column list the same length.
        for column, value in fields.items():
            topics_dict[column].append(value)

    print("Retreive new reddit posts")

    for submission in tqdm(subreddit.new(limit=items_limit)):
        _add_row(
            title=submission.title,
            score=submission.score,
            id=submission.id,
            url=submission.url,
            comms_num=submission.num_comments,
            created=submission.created,
            body=submission.selftext,
        )

    # Comments share the same schema; fields that do not apply get placeholders.
    for comment in tqdm(subreddit.comments(limit=comments_limit)):
        _add_row(
            title="Comment",
            score=comment.score,
            id=comment.id,
            url="",
            comms_num=0,
            created=comment.created,
            body=comment.body,
        )

    topics_df = pd.DataFrame(topics_dict)
    print(f"New reddit posts retrieved: {len(topics_df)}")
    # Human-readable datetime derived from the epoch value in "created".
    topics_df['timestamp'] = topics_df['created'].apply(get_date)

    return topics_df

In [20]:
# Function to update and save the dataset with the newest data
def update_and_save_dataset(topics_df, file_path="reddit_wsb.csv"):
    """Merge newly collected rows into the CSV archive and write it back.

    If ``file_path`` does not exist yet, ``topics_df`` is written as-is.
    Otherwise the existing file is loaded, the new rows are appended, rows
    sharing an ``id`` are de-duplicated keeping the most recent (last) one,
    and the result overwrites the file.

    Args:
        topics_df: DataFrame of freshly collected rows; must contain an
            ``id`` column when an archive already exists.
        file_path: Location of the CSV archive. Defaults to the original
            hard-coded ``"reddit_wsb.csv"``.

    Returns:
        None. The function prints row counts and writes ``file_path``.
    """
    if not os.path.exists(file_path):
        print(f"reddit posts: {topics_df.shape}")
        topics_df.to_csv(file_path, index=False)
        return

    # Read ids as text: if every stored id happens to look numeric, pandas
    # would otherwise infer an integer column and the ids would never compare
    # equal to the string ids of the new rows, defeating the de-duplication.
    topics_old_df = pd.read_csv(file_path, dtype={"id": str})
    print(f"past reddit posts: {topics_old_df.shape}")

    topics_all_df = pd.concat([topics_old_df, topics_df], axis=0)
    print(f"new reddit posts: {topics_df.shape[0]} past posts: {topics_old_df.shape[0]} all posts: {topics_all_df.shape[0]}")

    # New rows come last in the concatenation, so keep="last" prefers the
    # freshest copy of any id present in both the archive and the new data.
    topics_new_df = topics_all_df.drop_duplicates(subset=["id"], keep="last")
    print(f"all reddit posts: {topics_new_df.shape}")
    topics_new_df.to_csv(file_path, index=False)

In [21]:
# Run the whole pipeline: connect, collect, then merge into the CSV archive.
# The guard is true when this code runs as the top-level program.
if __name__ == "__main__": 
    # Client built from the credentials imported from config.
    reddit = reddit_connection()
    # Uses build_dataset's default subreddit and limits.
    topics_data_df = build_dataset(reddit)
    # Appends to / creates the CSV file and de-duplicates rows by id.
    update_and_save_dataset(topics_data_df)

0it [00:00, ?it/s]

Retreive new reddit posts


913it [00:15, 58.59it/s]
979it [00:08, 122.19it/s]

New reddit posts retrieved: 1892
reddit posts: (1892, 8)



