### Set Up

In [6]:
"""
PRAW Docs: https://praw.readthedocs.io/en/stable/tutorials/comments.html
Sentiment Analysis Using HuggingFace: https://huggingface.co/blog/sentiment-analysis-python
Cultural Analytics with Python: https://melaniewalsh.github.io/Intro-Cultural-Analytics/04-Data-Collection/14-Reddit-Data.html
"""

# Set up Pandas

import csv
import os
from datetime import datetime

import pandas as pd
import praw

# Show up to 500 characters per cell so long story text stays readable in output.
pd.set_option('display.max_colwidth', 500)


# Set up PRAW with authentication.
#
# Credentials are read from environment variables so that no secret is stored in
# (or leaked through) the notebook file. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET,
# REDDIT_USERNAME and REDDIT_PASSWORD before starting the kernel; a missing
# variable raises KeyError here rather than failing later on the first API call.
# SECURITY: the credentials that were previously hardcoded in this cell have been
# exposed and should be rotated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    username=os.environ["REDDIT_USERNAME"],
    password=os.environ["REDDIT_PASSWORD"],
    user_agent="Praw-test",
)

In [8]:
# Determine Available Attributes of a Submission object
# Exploration scratch cell: only the `submission` assignment below is live code;
# the commented-out lines are the inspection steps kept for reference.
import pprint

# Module-level `submission` for the hard-coded submission id "14828yd".
submission = reddit.submission("14828yd")
# print(submission.title)  # to make it non-lazy
# pprint.pprint(vars(submission))

# # Determine Available Attributes of a Comment object
# comment = list(submission.comments)[0]
# print(comment.body)  
# pprint.pprint(vars(comment))


### Build Submissions Dataframe

In [11]:
### FUNCTIONS ###

from datetime import datetime

def pull_submissions(num_subs: int, sub_name: str, sort: str="hot", keywords=None):
    """
    Gets key details about up to num_subs submissions on a particular subreddit sub_name.

    The listing is limited to num_subs submissions *before* keyword filtering, so
    fewer than num_subs items are returned when keywords exclude some of them.

    Inputs:
        - num_subs [int]: the number of submissions to pull from the listing
        - sub_name [str]: subreddit name without the r/, i.e., "scarystories"
        - sort [str]: the way to sort the subreddit, i.e. by "controversial,"
            "gilded," "hot," "new," "rising," or "top". By default, sorted by "hot."
        - keywords [List[str] or None]: list of keywords to filter by. None or an
            empty list (the default) disables filtering. Otherwise at least 1
            keyword must appear once in the submission text for the submission
            to be returned.

    Returns:
        [List[Dict[9 items]]]: a list of dictionaries, one for each submission kept,
        with keys title, submission_id, score, url, author, text, subreddit,
        num_comments and date_created (in that order).

    Raises:
        KeyError: if sort is not one of the supported listing names.
    """
    supported_sorts = ("hot", "controversial", "gilded", "top", "new", "rising")
    if sort not in supported_sorts:
        raise KeyError(
            f"Unsupported sort {sort!r}; expected one of {', '.join(supported_sorts)}"
        )

    subreddit = reddit.subreddit(sub_name)
    # Each supported sort name is also the name of the listing method on the
    # subreddit object, so only the requested listing is constructed.
    listing = getattr(subreddit, sort)(limit=num_subs)

    res = []
    for submission in listing:
        if keywords and not key_words_in_text(keywords, submission.selftext):
            continue

        # The author is None when the posting account no longer exists;
        # fall back to a placeholder instead of raising AttributeError.
        author = submission.author
        res.append({
            "title": submission.title,
            "submission_id": submission.id,
            "score": submission.score,
            "url": submission.url,
            "author": author.name if author is not None else "[deleted]",
            # Normalise typographic punctuation and flatten newlines so the text
            # stays on one CSV row.
            "text": (submission.selftext.replace("’", "'")
                     .replace("…", "...").replace("\n", " ")
                     .replace("“", "\"").replace("”", "\"")),
            # Stored as the object returned by submission.subreddit (not a plain str).
            "subreddit": submission.subreddit,
            "num_comments": submission.num_comments,
            # NOTE: fromtimestamp() without a tz converts the UTC epoch value to the
            # machine's local time, so this value depends on where the notebook runs.
            "date_created": datetime.fromtimestamp(submission.created_utc),
        })
    return res

def key_words_in_text(keywords, text):
    """
    Checks if any of the keywords are in the text, ignoring case.

    Matching is by substring, so the keyword "kill" also matches "skill".

    Inputs:
        keywords [List[str]]: a list of key words to check
        text [str]: string text to check for words (None is treated as empty)

    Returns: True if any of the keywords are in the text, False otherwise
        (including when keywords is empty).
    """
    # Lowercase the text once, and lowercase each keyword too, so a keyword
    # written with capitals (e.g. "Scary") can still match.
    haystack = (text or "").lower()
    return any(word.lower() in haystack for word in keywords)

def pull_comments(subreddit_id: str, amount: str="all"):
    """
    Pull all or top level comments from a certain reddit submission.

    Inputs:
        subreddit_id [str]: despite the name, this is the id of the *submission*
            to pull comments from (it is passed to reddit.submission)
        amount [str]: how many comments to pull: "top_level" for only top level
            comments, any other value for all comments.
            By default, this variable has value "all"

    Returns:
        [List[Dict[8 items]]]: a list of comments from a single submission, each
        with keys text, author, score, comment_id, is_op, submission_id,
        subreddit and subreddit_id (in that order).
    """
    submission = reddit.submission(subreddit_id)

    # limit=None asks PRAW to resolve every "load more comments" placeholder so
    # the loops below only see real comments. This can be slow on large threads.
    submission.comments.replace_more(limit=None)
    if amount == "top_level":
        comments = list(submission.comments)
    else:
        comments = submission.comments.list()

    # TODO (carried over from the original): filter comments by keyword.
    res = []
    for comment in comments:
        # The author is None for comments whose account was deleted; use a
        # placeholder instead of raising AttributeError.
        author = comment.author
        res.append({
            # Same normalisation as pull_submissions: straighten typographic
            # punctuation and flatten newlines so the text stays on one CSV row.
            "text": (comment.body.replace("’", "'")
                     .replace("…", "...").replace("\n", " ")
                     .replace("“", "\"").replace("”", "\"")),
            "author": author.name if author is not None else "[deleted]",
            "score": comment.score,
            "comment_id": comment.id,
            "is_op": comment.is_submitter,
            # Public attribute instead of the private comment._submission.
            "submission_id": comment.submission.id,
            "subreddit": comment.subreddit_name_prefixed,
            "subreddit_id": comment.subreddit_id,
        })
    return res


def write_to_csv(obj, file, mode="a"):
    """
    Writes info from a List[Dict[items]] object into a UTF-8 encoded csv file.

    The column order is taken from the keys of the first dictionary. The header
    row is written only when the file is empty after opening (a fresh "w" file or
    a new/empty file in "a" mode), so repeated appends do not insert extra
    header rows in the middle of the data.

    Inputs:
        mode [str]: "w" for write or "a" for append. By default, "a" for append.
        obj [List[Dict[items]]]: the object that contains the info to write
        file [str]: csv file name

    Returns:
        Nothing. If obj is empty, nothing is written and the file is left untouched.
    """
    # With no rows there are no field names to derive; skip instead of
    # raising IndexError on obj[0].
    if not obj:
        return

    fieldnames = list(obj[0].keys())
    # An explicit encoding keeps the file readable with encoding='utf-8'
    # regardless of the platform default.
    with open(file, mode, newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        # In append mode the position starts at the end of the file, so 0 means
        # the file has no content (and therefore no header) yet.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(obj)

### Get 10 stories

In [13]:
# Pull 10 submissions from r/scarystories, write them to submissions.csv
# (mode "w" replaces any existing file), then read the CSV back into a DataFrame.
# NOTE(review): the variable is named hot_ten_stories, but the call requests
# sort="top", not "hot".
hot_ten_stories = pull_submissions(num_subs=10, sub_name="scarystories", sort="top")
write_to_csv(hot_ten_stories, "submissions.csv", "w")
stories_df = pd.read_csv("submissions.csv", delimiter=',', encoding='utf-8')
# Bare last expression so the notebook renders the frame as the cell output.
stories_df

Unnamed: 0,title,submission_id,score,url,author,text,subreddit,num_comments,date_created
0,This is not a suicide note!,dcvw5l,2286,https://www.reddit.com/r/scarystories/comments/dcvw5l/this_is_not_a_suicide_note/,joshuaandrew1985,"Three months ago, I learned of my Uncle's passing. We weren't particularly close but he left me one of his houses in the will. I was touched but just wanted to sell it off. I am a fairly new father of twin girls and didn't need another thing to manage right now. To speed up the process, I decided to travel across the country to live in the house until a sale was assured. My job just requires a laptop and phone so I was approved to work remotely for the time being. Unfortunately, my wife had...",scarystories,138,2019-10-03 15:36:19
1,Run,ijjshz,2202,https://www.reddit.com/r/scarystories/comments/ijjshz/run/,Outoftune7,"Run Run. And if you can't run then walk. And if you can't walk, then crawl. But whatever you do, don't stop. It's only been an hour since everyone on earth, as far as I could tell, heard those same words. I was already walking through the park when I heard them, and after seeing what happens, I won't be stopping anytime soon. It's been two hours now, I've kept walking, I'm tired but stopping is not an option. Some people tried driving, but the act of getting in a car requires stopping,...",scarystories,51,2020-08-30 16:25:18
2,Go back to sleep,im8rmm,2092,https://i.redd.it/9gmvnmgty1l51.jpg,Danielaxmalfoy,,scarystories,254,2020-09-03 23:45:49
3,Emergency Alert. Do not look outside your windows.,jkbgv7,1970,https://www.reddit.com/r/scarystories/comments/jkbgv7/emergency_alert_do_not_look_outside_your_windows/,JoshJ444,"EMERGENCY ALERT. *Warning, this is not a test, I repeat, this is not a test. Citizens of the world. Lock all doors. Close all of your blinds. Do not go near any windows. Do not look outside. Do not leave your house. Stay in a room with no windows. Gather what supplies you can. Turn all of the lights off, do not open the door for anything.* When I first heard it, it didn't even sound real. It sounded fake, like something in a horror movie, but it was an emergency so I decided I probably sho...",scarystories,187,2020-10-29 10:53:11
4,My son was diagnosed with something far worse than cancer.,jmuc22,1639,https://www.reddit.com/r/scarystories/comments/jmuc22/my_son_was_diagnosed_with_something_far_worse/,_KILLBLADE_,"""Mommy, my privates hurt."" Was there any way for me to know how frightening those four words were at the time? Like any mother would do, I got on my knees. ""Let me see,"" My five-year-old son pulled down his pants and showed me. As I looked, a shock of fear went through me. His testicles were red and swollen. I tried not to panic and thought about what it could be. Nothing - other than the worst-case scenario - came to mind. The night before, during bath time, his privates looked norma...",scarystories,91,2020-11-02 14:28:48
5,I let my mom count down to zero.,hvhk07,1542,https://www.reddit.com/r/scarystories/comments/hvhk07/i_let_my_mom_count_down_to_zero/,Jeremy_is_neat,It was scary,scarystories,69,2020-07-21 18:35:56
6,I was 6 when i shit my pants,kpfkao,1427,https://www.reddit.com/r/scarystories/comments/kpfkao/i_was_6_when_i_shit_my_pants/,Kenzydorjee,,scarystories,53,2021-01-03 01:37:13
7,Don’t Google yourself (horror story),kn5a52,1383,https://www.reddit.com/r/scarystories/comments/kn5a52/dont_google_yourself_horror_story/,Klutzy-Chemical-1929,"Don't Google yourself. Why? Because you may not like what you find... One day, I was bored and decided to search for my own name on Google. I have quite a rare name, so I didn't expect to find many results. Imagine how surprised I was when I came across a website that had my full name in the domain. www.(my name).com When I clicked the link, it brought me to a message board. I looked at the profile of the website owner and found out that the person was the same age as me and had the...",scarystories,53,2020-12-30 11:25:37
8,My house rules,ilpflt,1233,https://www.reddit.com/r/scarystories/comments/ilpflt/my_house_rules/,flowerprince-avi,"1. Always take off your shoes when you come in, they don't like dirt. 2. Always say hello. 3. When you enter the house, go straight to your room. 4. Lock your door. 5. If you leave your room, go straight to where you need to go. Do not stop walking. 6. Walk with your head up. 7. Never block your vision. 8. If you hear laughing, ignore it. It isn't there. 9. If you turn on a light, do not turn it back off. They will take care of it. 10. Do not turn off the music, one guest in particu...",scarystories,35,2020-09-03 04:11:49
9,Horrifying Experience during quarantine,g9qegv,1115,https://www.reddit.com/r/scarystories/comments/g9qegv/horrifying_experience_during_quarantine/,Rgracesauce,"I was upstairs in my room one night playing video games with the volume really loud when I thought I heard noises coming from downstairs. I muted the TV and listened closer. Nothing. I resumed playing the game for another hour when I heard it again only louder. I was sure I heard it this time and was a little freaked out. To assure myself I went downstairs to the basement where I thought I heard the noise coming from. As soon as I opened the basement door, I was hit with a cold breeze. I ...",scarystories,41,2020-04-28 12:29:02


In [14]:
### CASE STUDY TWO ###
# if text doesn't contain a keyword, don't pull the submission
# num_subs=50 limits the listing before the keyword filter is applied, so
# subs_50 can hold fewer than 50 submissions. sort is left at its default ("hot").
subs_50 = pull_submissions(num_subs=50, sub_name="scarystories", keywords=["scary", "stab", "kill"])
# Mode "w" overwrites the submissions.csv written by any earlier cell.
write_to_csv(subs_50, 'submissions.csv', "w")

In [3]:
# TODO: Delete duplicates
# NOTE(review): the two live statements below repeat the "CASE STUDY TWO" cell
# verbatim, and this cell's execution count (In [3]) is lower than the cells
# before it, so it was last run out of order. Confirm it is still needed.

subs_50 = pull_submissions(num_subs=50, sub_name="scarystories", keywords=["scary", "stab", "kill"])
write_to_csv(subs_50, 'submissions.csv', "w")

# Draft of the de-duplication step (not executed): pull a second batch with other
# keywords, append it to the CSV, then drop duplicate rows.
# NOTE(review): DataFrame.append was removed in pandas 2.0; use pd.concat if this
# draft is revived. Confirm against the installed pandas version.
# other_50 = pull_submissions(num_subs=50, sort="hot", sub_name="scarystories", keywords=["frightened", "dark", "fear"])
# df = pd.read_csv("submissions.csv", delimiter=',', encoding='utf-8')
# write_to_csv(subs_50, 'submissions.csv', "a")
# pd.read_csv('submissions.csv').append(df).drop_duplicates().to_csv('submissions.csv')

### Master Checklist

- [ ] Pull only from a certain time range: 
use Pushshift to grab the bulk posts for the specific date range, then fetch the specific posts by their retrieved IDs with PRAW if needed.
- [ ] filter by keywords during pull
- [ ] get access to dovetail?
