### Set Up

In [2]:
"""
PRAW Docs: https://praw.readthedocs.io/en/stable/tutorials/comments.html
Sentiment Analysis Using HuggingFace: https://huggingface.co/blog/sentiment-analysis-python
Cultural Analytics with Python: https://melaniewalsh.github.io/Intro-Cultural-Analytics/04-Data-Collection/14-Reddit-Data.html
"""

# Set up Pandas

import pandas as pd
# Show long text cells (story bodies, comments) without truncating them at the default width.
pd.set_option('display.max_colwidth', 500)


# Set up PRAW with authentication
#
# Credentials are read from environment variables so that secrets never live in
# the notebook or its version history. Before starting Jupyter, export:
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD
# (optionally REDDIT_USER_AGENT). A missing variable raises KeyError right here,
# which is preferable to an authentication failure later in the notebook.
# NOTE: any credentials that were previously hardcoded in this cell should be
# treated as compromised and rotated.

import os

import praw
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    username=os.environ["REDDIT_USERNAME"],
    password=os.environ["REDDIT_PASSWORD"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "Praw-test"),
)



In [8]:
# Determine Available Attributes of a Submission object
import pprint

# Handle for the single submission (ID "14828yd") used as the worked example in this notebook.
submission = reddit.submission("14828yd")
# Optional inspection -- uncomment to dump every attribute of the Submission.
# Accessing an attribute such as .title first makes the object non-lazy, so that
# vars() has the data to show.
# print(submission.title)  # to make it non-lazy
# pprint.pprint(vars(submission))

# # Determine Available Attributes of a Comment object
# # (same idea: touch .body first, then dump the attributes of the first comment)
# comment = list(submission.comments)[0]
# print(comment.body)
# pprint.pprint(vars(comment))


### Build Submissions Dataframe

In [9]:
### FUNCTIONS ###

from datetime import datetime

def pull_submissions(num_subs: int, sub_name: str, sort: str) -> list:
    """
    Gets key details about num_subs number of submissions on a particular subreddit sub_name.

    Inputs:
        num_subs [int]: the number of submissions to pull
        sub_name [str]: subreddit name without the r/, i.e., "scarystories"
        sort [str]: the way to sort the subreddit, i.e. by "controversial," "gilded," "hot," "new," "rising," or "top".

    Returns:
        [List[Dict[9 items]]]: a list of dictionaries, one for each submission in the specified subreddit.
        Keys: "title", "submission_id", "score", "url", "author", "text",
        "subreddit", "num_comments", "date_created".
          - "author" is None when the account behind the submission was deleted.
          - "subreddit" is the subreddit's display name as a plain string.
          - "date_created" is a timezone-aware datetime in UTC.

    Raises:
        KeyError: if sort is not one of the supported listing names.
    """
    # Imported locally so the function keeps working even if another cell
    # rebinds the module-level name `datetime`.
    from datetime import datetime, timezone

    valid_sorts = ("controversial", "gilded", "hot", "new", "rising", "top")
    if sort not in valid_sorts:
        raise KeyError(f"Unsupported sort {sort!r}; expected one of {valid_sorts}")

    subreddit = reddit.subreddit(sub_name)

    # Look up only the requested listing method. Building every listing up front
    # would make all sorts fail whenever a single one is unavailable.
    listing = getattr(subreddit, sort)(limit=num_subs)

    res = []
    for submission in listing:
        # author is None for submissions whose account was deleted.
        author = submission.author
        res.append({
            "title": submission.title,
            "submission_id": submission.id,
            "score": submission.score,
            "url": submission.url,
            "author": author.name if author is not None else None,
            "text": submission.selftext,
            # Store the name rather than the PRAW object so the row is plain data.
            "subreddit": submission.subreddit.display_name,
            "num_comments": submission.num_comments,
            # created_utc is a Unix timestamp in UTC; without tz= it would be
            # silently converted to the local timezone of whatever machine runs this.
            "date_created": datetime.fromtimestamp(submission.created_utc, tz=timezone.utc),
        })

    return res


In [10]:
# Write submissions to CSV and import as a DataFrame
import csv

hot_ten_stories = pull_submissions(num_subs=10, sub_name="scarystories", sort="hot")
if not hot_ten_stories:
    # Fail with a clear message instead of an IndexError on hot_ten_stories[0].
    raise ValueError("No submissions were returned; nothing to write to submissions.csv")
fieldnames = list(hot_ten_stories[0].keys())

# Write with an explicit UTF-8 encoding so it matches the encoding used to read
# the file back below; the platform default may differ and can fail on
# non-ASCII story text.
with open('submissions.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(hot_ten_stories)

# Import CSV data into Pandas as a DF

submissions_df = pd.read_csv("submissions.csv", delimiter=',', encoding="UTF-8")

### Build Comments Dataframe

In [11]:
def pull_comments(subreddit_id, amount="all"):
    """
    Pull all or top level comments from a certain reddit submission.

    Inputs:
        subreddit_id [str]: the ID of the *submission* whose comments should be pulled,
            e.g. "14828yd". (The parameter name is historical and kept for
            compatibility; it is passed to reddit.submission(), not to a subreddit.)
        amount [str]: how many comments to pull, all comments or only top level comments.
            Pass "top_level" for top level comments only; any other value pulls
            every comment. By default, this variable has value "all"

    Returns:
        [List[Dict[8 items]]]: a list of comments from a single submission with the comment details.
        Keys: "text", "author", "score", "comment_id", "is_op", "submission_id",
        "subreddit", "subreddit_id". "author" is None when the commenter's
        account was deleted.
    """

    submission = reddit.submission(subreddit_id)

    # Resolve the "load more comments" placeholders first so that only real
    # comments remain, whichever selection is made below.
    submission.comments.replace_more(limit=None)
    if amount == "top_level":
        comments = list(submission.comments)
    else:
        comments = submission.comments.list()

    # Return List of dictionaries with comment details
    res = []
    for comment in comments:
        # author is None for comments whose account was deleted.
        author = comment.author
        res.append({
            # Normalise curly apostrophes and flatten newlines so each comment
            # stays on one line.
            "text": comment.body.replace("’", "'").replace("\n", " "),
            "author": author.name if author is not None else None,
            "score": comment.score,
            "comment_id": comment.id,
            "is_op": comment.is_submitter,
            # Every comment here belongs to `submission`, so use its id directly
            # instead of reaching into the private comment._submission attribute.
            "submission_id": submission.id,
            "subreddit": comment.subreddit_name_prefixed,
            "subreddit_id": comment.subreddit_id,
        })
    return res


In [5]:
# Write newly-pulled comments from r/scarystories to comments.csv file

import csv

new_story = pull_comments(subreddit_id="14828yd")
if not new_story:
    # Fail with a clear message instead of an IndexError on new_story[0].
    raise ValueError("No comments were returned; nothing to write to comments.csv")
fieldnames = list(new_story[0].keys())

# Write with an explicit UTF-8 encoding so it matches the encoding used to read
# the file back below; comments routinely contain emoji and other non-ASCII
# characters that the platform default encoding may not handle.
with open('comments.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(new_story)

# Import CSV data into a Pandas DF

comments_df = pd.read_csv("comments.csv", delimiter=',', encoding="UTF-8")
comments_df

Unnamed: 0,text,author,score,is_op,submission_id,subreddit,subreddit_id,comment_id
0,Very cool story,letsgoabbey,5,False,14828yd,r/scarystories,t5_2rrzd,jnyiaea
1,She's using her skills.,monkner,1,False,14828yd,r/scarystories,t5_2rrzd,jnysmzi
2,Nice,Deb6691,1,False,14828yd,r/scarystories,t5_2rrzd,jnyvqfc
3,"Kiss of death, nice.",TXJOEMAMA,1,False,14828yd,r/scarystories,t5_2rrzd,jo01y4x
4,Well done. Engaging story.,Quickhidemeplease,1,False,14828yd,r/scarystories,t5_2rrzd,jo15o93
5,holy crap!! i love this sm,julesisgayasf,1,False,14828yd,r/scarystories,t5_2rrzd,jo1icjb
6,Slow clap. Well done.,Impossible_Balance11,1,False,14828yd,r/scarystories,t5_2rrzd,jo2lpzc
7,Making lemonade from lemons😉,Stranger_at_Night,1,False,14828yd,r/scarystories,t5_2rrzd,jo47rsf
8,Nice!,ThrowAway072343,0,False,14828yd,r/scarystories,t5_2rrzd,jny4dqi
9,"Thank you, I'm glad you enjoyed it.",1000andonenites,3,True,14828yd,r/scarystories,t5_2rrzd,jnz9x5y


In [None]:
# Only pull between a certain time range
#
# Both bounds are Unix timestamps, so they can be compared directly with
# submission.created_utc.
from datetime import datetime, timezone

SECONDS_PER_WEEK = 7 * 24 * 60 * 60  # 604800

current_time = datetime.now(timezone.utc).timestamp()
one_week_ago = current_time - SECONDS_PER_WEEK

# Half-open window [range_start, range_end); adjust either bound to change the range.
range_start = one_week_ago
range_end = current_time

subreddit = reddit.subreddit("scarystories")

submissions_in_range = []
for submission in subreddit.new(limit=25):
    if range_start <= submission.created_utc < range_end:
        # Placeholder: collect the matching submissions for later processing.
        submissions_in_range.append(submission)

### Master Checklist

- [ ] Pull only from a certain time range:
use Pushshift to grab the posts for the specific date range in bulk, then fetch the individual posts by their retrieved IDs with PRAW if needed.
- [ ] add work to GitHub
- [ ] filter by keywords during pull
- [ ] get access to dovetail?
