In [None]:
!pip3 install praw
!pip3 install nltk

In [4]:
import os
import pickle

import nltk
import numpy as np
import praw
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from nltk.sentiment import SentimentIntensityAnalyzer

Using TensorFlow backend.


In [5]:
def get_random_submission(subreddit_name, reddit):
    """Return whatever `reddit.subreddit(subreddit_name).random()` yields.

    subreddit_name -- name of the subreddit to draw a submission from.
    reddit         -- client object exposing a `subreddit(name)` method.
    """
    return reddit.subreddit(subreddit_name).random()

def _polarity_vector(analyzer, text):
    """Return the analyzer's scores for `text` as [neg, neu, pos, compound]."""
    polarity = analyzer.polarity_scores(text)
    return [polarity["neg"], polarity["neu"], polarity["pos"], polarity["compound"]]


def get_comments_and_parents(post):
    """Collect comment texts, sentiment scores and parent/child pairs of a post.

    The comment tree of `post` is fully expanded (`replace_more(limit=None)`)
    and flattened before processing.

    Returns a pair of 3-tuples:
      (comment_bodies, comment_sentiment_scores, comment_upvotes)
          one entry per comment in the flattened tree;
      (parent_bodies, parent_sentiment_scores, child_bodies)
          one entry per comment whose parent has a `body` attribute.

    Each sentiment score is a list [neg, neu, pos, compound].
    """
    post.comments.replace_more(limit=None)
    comments = post.comments.list()
    vader_analyzer = SentimentIntensityAnalyzer()

    bodies, scores, upvotes = [], [], []
    parents, parents_scores, rtn_comments = [], [], []
    for comment in comments:
        bodies.append(comment.body)
        scores.append(_polarity_vector(vader_analyzer, comment.body))
        upvotes.append(comment.score)

        # A parent without a `body` (presumably the submission itself for
        # top-level comments -- confirm against PRAW) yields no pair. Checking
        # explicitly, rather than catching AttributeError around the whole
        # step, keeps the three pair lists aligned and does not hide
        # unrelated attribute errors.
        parent_body = getattr(comment.parent(), "body", None)
        if parent_body is None:
            continue
        parents.append(parent_body)
        parents_scores.append(_polarity_vector(vader_analyzer, parent_body))
        rtn_comments.append(comment.body)

    return (bodies, scores, upvotes), (parents, parents_scores, rtn_comments)
  

In [6]:
# The API credentials are read from the environment so that the client secret
# is never stored in the notebook. Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel.
# NOTE(review): a secret that was previously committed in this cell should be
# revoked and regenerated.
try:
    _reddit_client_id = os.environ["REDDIT_CLIENT_ID"]
    _reddit_client_secret = os.environ["REDDIT_CLIENT_SECRET"]
except KeyError as missing_variable:
    raise RuntimeError(
        "Set the REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET environment "
        "variables before creating the Reddit client."
    ) from missing_variable

reddit = praw.Reddit(client_id=_reddit_client_id,
                     client_secret=_reddit_client_secret,
                     user_agent='Python: Comment Scraper: v0.1(by /u/josmfred)')


In [None]:
# Accumulators for the score-prediction data (one entry per scraped comment).
comment_texts = []
comment_sentiment_scores = []
comment_upvotes = []

# Accumulators for the next-word data (one entry per parent/child pair).
parent_texts = []
parent_sentiment_scores = []
children_texts = []

In [None]:
# Number of random submissions to scrape and how often to checkpoint to disk.
N_SUBMISSIONS = 100
CHECKPOINT_EVERY = 9

# Output file for each accumulator list. The lists are extended in place, so
# these references always see the latest data.
CHECKPOINT_FILES = (
    ("comment_texts.pkl", comment_texts),
    ("comment_sentiment_scores.pkl", comment_sentiment_scores),
    ("comment_upvotes.pkl", comment_upvotes),
    ("parent_texts.pkl", parent_texts),
    ("parents_sentiment_scores.pkl", parent_sentiment_scores),
    ("children_texts.pkl", children_texts),
)

for i in range(N_SUBMISSIONS):
    submission = get_random_submission("ProgrammerHumor", reddit)
    # PRAW documents that random() may return None for subreddits without
    # random-post support; skip instead of crashing on the next call.
    if submission is not None:
        score_predict_data, word_predict_data = get_comments_and_parents(submission)
        comment_texts.extend(score_predict_data[0])
        comment_sentiment_scores.extend(score_predict_data[1])
        comment_upvotes.extend(score_predict_data[2])
        parent_texts.extend(word_predict_data[0])
        parent_sentiment_scores.extend(word_predict_data[1])
        children_texts.extend(word_predict_data[2])

    # Checkpoint periodically, and always after the last submission so the
    # final state is on disk regardless of the chosen constants.
    if i % CHECKPOINT_EVERY == 0 or i == N_SUBMISSIONS - 1:
        print(len(comment_texts))
        for filename, data in CHECKPOINT_FILES:
            # `with` closes each file handle once the dump completes.
            with open(filename, "wb") as checkpoint_file:
                pickle.dump(data, checkpoint_file)

Having finished acquiring and saving the data, we will now begin cleaning it, under the assumption that the data files are saved in the current directory. Apart from the imports cell, the following portion of the notebook does not require any of the previous cells to have been run; all the following cells require is that the correct data files are saved in the current directory.

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were written by this notebook.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


comment_texts = _load_pickle("comment_texts.pkl")
# Loaded under the name the data-preparation cell reads, so that this part of
# the notebook works without running the scraping cells first.
comment_sentiment_scores = _load_pickle("comment_sentiment_scores.pkl")
# This cell has also exposed the scores under this shorter name; keep it.
sentiment_scores = comment_sentiment_scores
comment_upvotes = _load_pickle("comment_upvotes.pkl")
parent_texts = _load_pickle("parent_texts.pkl")
parent_sentiment_scores = _load_pickle("parents_sentiment_scores.pkl")
children_texts = _load_pickle("children_texts.pkl")

# The tokenizer might not exist. If the tokenizer does not exist, then
# we assume that the data we are processing should be used to fit a new
# tokenizer, and then this tokenizer is saved to use in later data
# processing. If the tokenizer does exist, we use the existing tokenizer
# on the new data. The file name matches the one written by the save cell
# ("tokenizer.pkl").
try:
    tokenizer = _load_pickle("tokenizer.pkl")
except FileNotFoundError:
    tokenizer = None

In [15]:
def prepare_score_data(comments, scores, labels, max_comment_length, vocab_size=10000, tokenizer=None):
    """Tokenize and pad the comments; convert scores and labels to arrays.

    When `tokenizer` is None, a new Tokenizer limited to `vocab_size` words is
    fitted on `comments`; otherwise the given tokenizer is used as is.

    Returns (padded_comments, scores_array, labels_array, tokenizer), where
    the padded comments come from `sequence.pad_sequences` with
    `maxlen=max_comment_length`.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(comments)

    padded_comments = sequence.pad_sequences(
        tokenizer.texts_to_sequences(comments),
        maxlen=max_comment_length,
    )
    scores_array = np.array(scores)
    labels_array = np.array(labels)
    return padded_comments, scores_array, labels_array, tokenizer

def prepare_next_word_data(parents, parents_scores, comments, texts,
                           vocab_size=10000, max_comment_length=150, tokenizer=None):
    """Tokenize and pad the parents; label each with its child's first token.

    When `tokenizer` is None, a new Tokenizer limited to `vocab_size` words is
    fitted on `texts`; otherwise the given tokenizer is used as is.

    Returns (padded_parents, parents_scores_array, labels_array, tokenizer).
    Each label is a one-element list holding the first token index of the
    corresponding child comment, or 0 when the child tokenizes to nothing.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(texts)

    parent_sequences = tokenizer.texts_to_sequences(parents)
    child_sequences = tokenizer.texts_to_sequences(comments)
    padded_parents = sequence.pad_sequences(parent_sequences, maxlen=max_comment_length)

    # First token of every child, falling back to 0 for empty sequences.
    first_tokens = [[tokens[0]] if tokens else [0] for tokens in child_sequences]
    return padded_parents, np.array(parents_scores), np.array(first_tokens), tokenizer


In [20]:
# Next-word data: padded parents, their sentiment scores and the first word
# of each child. If no tokenizer was loaded, one is fitted on comment_texts.
pad_parents, parent_scores, first_word, tokenizer = prepare_next_word_data(
    parents=parent_texts,
    parents_scores=parent_sentiment_scores,
    comments=children_texts,
    texts=comment_texts,
    tokenizer=tokenizer,
)

# Score data: padded comments, their sentiment scores and their upvotes,
# using the tokenizer returned by the call above.
texts_pad, sentiment_scores, upvotes, tokenizer = prepare_score_data(
    comments=comment_texts,
    scores=comment_sentiment_scores,
    labels=comment_upvotes,
    max_comment_length=150,
    tokenizer=tokenizer,
)

In [23]:
# Save all the processed data, the word -> index dictionary of the
# tokenizer, and the tokenizer itself.
np.save("parent_texts.npy", pad_parents)
np.save("parent_sentiment_scores.npy", parent_scores)
np.save("child_first_word.npy", first_word)
np.save("comment_texts.npy", texts_pad)
np.save("comment_sentiment_scores.npy", sentiment_scores)
np.save("comment_upvotes.npy", upvotes)

# `with` closes each file once the dump completes, so the handle is not
# left open.
with open("words.pkl", "wb") as words_file:
    pickle.dump(tokenizer.word_index, words_file)
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)