#### Imports

In [1]:
import numpy as np
import pandas as pd
import json
import time
import math
import requests
import itertools
from datetime import datetime, timedelta
import praw

#### Praw Keys

In [2]:
config = {
    "username"      : "ixchelfrg",
    "client_id"     : "Toq58hLTqLD4Ow",
    "client_secret" : "hnPMxiwcVh5W9JSO8qyLLhrMTKc",
    "user_agent"    : "ixchel"
}

posts_from_reddit = []
comments_from_reddit = []

reddit = praw.Reddit(client_id = config['client_id'], \
                     client_secret = config['client_secret'], \
                     user_agent = config['user_agent'])


#### Extracting all posts and comments for subreddit `r/QuantifiedSelf`.

In [3]:
# Code from https://medium.com/@pasdan/how-to-scrap-reddit-using-pushshift-io-via-python-a3ebcc9b83f4
# https://github.com/ixchelfrg/sack_lunch/blob/master/Notebooks/Bots/Reddit.ipynb

# Code creates intervals by iterating through code to make multiple requests to Reddit, and extract posts going back to a specific amount of days.

# Method for building time period searches, in days.
def give_me_intervals(start_at, number_of_days_per_interval = 3):
    
    end_at = math.ceil(datetime.utcnow().timestamp())
        
    # 1 day = 86400,
    # Seconds times number of days specified.
    period = (86400 * number_of_days_per_interval)
    
    # Create end of while loop from "start_at" argument given and "period" calculated above.
    end = start_at + period
    yield (int(start_at), int(end))

    # Move the needle of time after each iteration.
    padding = 1
    while end <= end_at:
        start_at = end + padding
        end = (start_at - padding) + period
        yield int(start_at), int(end)

# Method that takes in a URI and handles the HTTP request/response. 
def make_request(uri, max_retries = 5):
    def fire_away(uri):
        response = requests.get(uri)
        assert response.status_code == 200
        return json.loads(response.content)

# If request fails, wait specified time and retry.
# 5 tries before code stops. 
    current_tries = 1
    while current_tries < max_retries:
        try:
            response = fire_away(uri)
            return response
        except:
            time.sleep(.150)
            current_tries += 1

    return fire_away(uri)
 
# Extract posts.
# If it extracts 500 posts, code checks to see if more exist.
def pull_posts_for(subreddit, start_at, end_at):
    
    def map_posts(posts):
        return list(map(lambda post: {
            'id': post['id'],
            'created_utc': post['created_utc'],
            'prefix': 't4_'
        }, posts))
    
    SIZE = 500
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'
    
    post_collections = map_posts( \
        make_request( \
            URI_TEMPLATE.format(subreddit, start_at, end_at, SIZE))['data'])

    n = len(post_collections)
    while n == SIZE:
        last = post_collections[-1]
        new_start_at = last['created_utc'] - 10
        
        more_posts = map_posts( \
            make_request( \
                URI_TEMPLATE.format(subreddit, new_start_at, end_at, SIZE))['data'])
        
        n = len(more_posts)
        # Posts added to "post_collections"
        post_collections.extend(more_posts)

    return post_collections

In [4]:
# Use of above functions to extract posts from QuantifiedSelf subreddit for 3,000 days.
# https://github.com/ixchelfrg/sack_lunch/blob/master/Notebooks/Bots/Reddit.ipynb
subreddit = 'QuantifiedSelf'

start_at = math.floor(\
    (datetime.utcnow() - timedelta(days=3000)).timestamp())

posts = []
for interval in give_me_intervals(start_at, 7):
    pulled_posts = pull_posts_for(
        subreddit, interval[0], interval[1])
    
    posts.extend(pulled_posts)

print('Posts Found:', len(posts))
print('Number of Unique Posts:', len(np.unique([ post['id'] for post in posts ])))

Posts Found: 1403
Number of Unique Posts: 1403


In [5]:
# Extract submissions and comments from posts.
# https://github.com/ixchelfrg/sack_lunch/blob/master/Notebooks/Bots/Reddit.ipynb

TIMEOUT_AFTER_COMMENT_IN_SECS = .250

for submission_id in np.unique([ post['id'] for post in posts ]):
    submission = reddit.submission(id=submission_id)

    posts_from_reddit.append(submission)

    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comments_from_reddit.append(comment)
        
        if TIMEOUT_AFTER_COMMENT_IN_SECS > 0:
            time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)

print('Length of "posts_from_reddit" list: {}'.format(len(posts_from_reddit)))
print('Length of "comments_from_reddit" list: {}'.format(len(comments_from_reddit)))

Length of "posts_from_reddit" list: 1403
Length of "comments_from_reddit" list: 3483


In [6]:
# To check range of time for posts pulled.
epoch_times = []
for i in range(len(posts_from_reddit)):
    epoch_times.append(vars(posts_from_reddit[i])['created_utc'])
    
print('Length of list: {}'.format(len(epoch_times)))
epoch_times.sort()

print('{} (index 0) to {} (index -1)'.format(epoch_times[0], epoch_times[-1]))

Length of list: 1403
1306692867.0 (index 0) to 1557962482.0 (index -1)


***Using [EpochConverter](https://www.epochconverter.com/) :***

`Posts are from Sunday May 29, 2011 to Wednesday May 15, 2019.`

#### Extracting more granular data from submissions stored in `posts_from_reddit`.

In [7]:
# Lists containing data from "posts_from_reddit"
authors = []
sub_comments = []
created_utcs = []
ids = []
is_self_bools = []
names = []
num_cmts = []
permalinks = []
scores = []
selftexts = []
subreddits = []
titles = []
num_ups = []
upvote_ratios = []
urls = []
domains = []
comments_by_id = []
comments_ids = []

# Subtracting information from "posts_from_reddit" to store in dataframe.
for post in posts_from_reddit:
    authors.append(post.author)
    sub_comments.append(post.comments)
    created_utcs.append(post.created_utc)
    ids.append(post.id)
    is_self_bools.append(post.is_self)
    names.append(post.name)
    num_cmts.append(post.num_comments)
    permalinks.append(post.permalink)
    scores.append(post.score)
    selftexts.append(post.selftext)
    subreddits.append(post.subreddit)
    titles.append(post.title)
    num_ups.append(post.ups)
    upvote_ratios.append(post.upvote_ratio)
    urls.append(post.url)
    domains.append(post.domain)
    comments_by_id.append(post._comments_by_id)
    comments_ids.append(post._comments)

#### Storing submission data in a dataframe.

In [8]:
lsts = [
    authors,
    created_utcs,
    ids,
    is_self_bools,
    names,
    num_cmts,
    permalinks,
    scores,
    selftexts,
    subreddits,
    titles,
    num_ups,
    upvote_ratios,
    urls,
    domains,
    comments_by_id,
]

new_lsts = []
for lst in lsts:
    new_lsts.append([str(i) for i in lst])

subreddit_df = pd.DataFrame(new_lsts).T
print('Dataframe of subreddit submissions has {} rows and {} columns.'.format(subreddit_df.shape[0], subreddit_df.shape[1]))
subreddit_df.head(2)

Dataframe of subreddit submissions has 1403 rows and 16 columns.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,rossmoody,1347918772.0,101m5w,False,t3_101m5w,0,/r/QuantifiedSelf/comments/101m5w/moodpanda_mo...,1,,QuantifiedSelf,Moodpanda mood logger blog,1,1.0,http://moodpanda.tumblr.com/,moodpanda.tumblr.com,{}
1,robweedn,1348770055.0,10kp8t,False,t3_10kp8t,2,/r/QuantifiedSelf/comments/10kp8t/notch_launch...,4,,QuantifiedSelf,Notch launches a cool new way to visualize you...,4,0.76,http://techcrunch.com/2012/09/27/with-fitbit-a...,techcrunch.com,"{'t1_c6eaf17': Comment(id='c6eaf17'), 't1_c7um..."


#### Formatting dataframe of subreddit submissions.

In [9]:
# Renaming columns.
subreddit_df.rename(columns = {
    0  : 'author',
    1  : 'created_utc',
    2  : 'id',
    3  : 'is_self',
    4  : 'name',
    5  : 'number_comments',
    6  : 'permalink',
    7  : 'score',
    8  : 'text',
    9  : 'subreddit',
    10 : 'title',
    11 : 'number_ups',
    12 : 'upvote_ratio',
    13 : 'url',
    14 : 'domain',
    15 : 'comments_by_id' 
    
}, inplace = True)
subreddit_df.head(2)

Unnamed: 0,author,created_utc,id,is_self,name,number_comments,permalink,score,text,subreddit,title,number_ups,upvote_ratio,url,domain,comments_by_id
0,rossmoody,1347918772.0,101m5w,False,t3_101m5w,0,/r/QuantifiedSelf/comments/101m5w/moodpanda_mo...,1,,QuantifiedSelf,Moodpanda mood logger blog,1,1.0,http://moodpanda.tumblr.com/,moodpanda.tumblr.com,{}
1,robweedn,1348770055.0,10kp8t,False,t3_10kp8t,2,/r/QuantifiedSelf/comments/10kp8t/notch_launch...,4,,QuantifiedSelf,Notch launches a cool new way to visualize you...,4,0.76,http://techcrunch.com/2012/09/27/with-fitbit-a...,techcrunch.com,"{'t1_c6eaf17': Comment(id='c6eaf17'), 't1_c7um..."


In [10]:
# Changing specified columns to numeric datatype.
to_num = [
    'created_utc',
    'number_comments',
    'score',
    'number_ups',
    'upvote_ratio'
]
for column in to_num:
    subreddit_df[column] = pd.to_numeric(subreddit_df[column])
    
subreddit_df.dtypes

author              object
created_utc        float64
id                  object
is_self             object
name                object
number_comments      int64
permalink           object
score                int64
text                object
subreddit           object
title               object
number_ups           int64
upvote_ratio       float64
url                 object
domain              object
comments_by_id      object
dtype: object

In [11]:
# Sorting dataframe by time post was created.
subreddit_df = subreddit_df.sort_values('created_utc', ascending = False)
subreddit_df.head(2)

Unnamed: 0,author,created_utc,id,is_self,name,number_comments,permalink,score,text,subreddit,title,number_ups,upvote_ratio,url,domain,comments_by_id
1392,fergienz,1557962000.0,bp560o,True,t3_bp560o,1,/r/QuantifiedSelf/comments/bp560o/40_deep_slee...,3,I'm trying to find someone who is using Qualia...,QuantifiedSelf,40% Deep sleep score - Is it just me? Any Qual...,3,0.81,https://www.reddit.com/r/QuantifiedSelf/commen...,self.QuantifiedSelf,{'t1_enpejss': Comment(id='enpejss')}
1391,ran88dom99,1557941000.0,bp0prn,True,t3_bp0prn,2,/r/QuantifiedSelf/comments/bp0prn/best_nutriti...,5,Most food tracking apps have the ability to re...,QuantifiedSelf,Best nutrition tracking apps for advanced QS,5,1.0,https://www.reddit.com/r/QuantifiedSelf/commen...,self.QuantifiedSelf,"{'t1_enpg8qm': Comment(id='enpg8qm'), 't1_enpj..."


In [12]:
# Resetting index.
# https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.DataFrame.reset_index.html
subreddit_df.reset_index(inplace = True)
subreddit_df.head(2)

Unnamed: 0,index,author,created_utc,id,is_self,name,number_comments,permalink,score,text,subreddit,title,number_ups,upvote_ratio,url,domain,comments_by_id
0,1392,fergienz,1557962000.0,bp560o,True,t3_bp560o,1,/r/QuantifiedSelf/comments/bp560o/40_deep_slee...,3,I'm trying to find someone who is using Qualia...,QuantifiedSelf,40% Deep sleep score - Is it just me? Any Qual...,3,0.81,https://www.reddit.com/r/QuantifiedSelf/commen...,self.QuantifiedSelf,{'t1_enpejss': Comment(id='enpejss')}
1,1391,ran88dom99,1557941000.0,bp0prn,True,t3_bp0prn,2,/r/QuantifiedSelf/comments/bp0prn/best_nutriti...,5,Most food tracking apps have the ability to re...,QuantifiedSelf,Best nutrition tracking apps for advanced QS,5,1.0,https://www.reddit.com/r/QuantifiedSelf/commen...,self.QuantifiedSelf,"{'t1_enpg8qm': Comment(id='enpg8qm'), 't1_enpj..."


In [13]:
# Checking for duplicates.
print("There are {} duplicate submissions.".format(subreddit_df['id'].duplicated().sum()))

There are 0 duplicate submissions.


#### Storing submissions dataframe.

In [14]:
subreddit_df.to_csv("./data/raw_subreddit_submissions.csv", index = False)

#### Extracting more granular data from submissions stored in `comments_from_reddit`.

In [15]:
# Lists containing data from "comments_from_reddit"
cmt_authors = []
cmt_bodies = []
cmt_created_utcs = []
cmt_ids = []
cmt_is_submitter_bools = []
cmt_link_ids = []
cmt_parent_ids = []
cmt_permalinks = []
cmt_scores = []
cmt_submissions = []
cmt_subreddits = []
cmt_subreddit_ids = []

# Subtracting information from "posts_from_reddit" to store in dataframe.
for comment in comments_from_reddit:
    cmt_authors.append(comment.author)
    cmt_bodies.append(comment.body)
    cmt_created_utcs.append(comment.created_utc)
    cmt_ids.append(comment.id)
    cmt_is_submitter_bools.append(comment.is_submitter)
    cmt_link_ids.append(comment.link_id)
    cmt_parent_ids.append(comment.parent_id)
    cmt_permalinks.append(comment.permalink)
    cmt_scores.append(comment.score)
    cmt_submissions.append(comment.submission)
    cmt_subreddits.append(comment.subreddit)
    cmt_subreddit_ids.append(comment.subreddit_id)

#### Storing comments data in a dataframe.

In [16]:
cmt_lsts = [
    cmt_authors,
    cmt_bodies,
    cmt_created_utcs,
    cmt_ids,
    cmt_is_submitter_bools,
    cmt_link_ids,
    cmt_parent_ids,
    cmt_permalinks,
    cmt_scores,
    cmt_submissions,
    cmt_subreddits,
    cmt_subreddit_ids,
]

new_cmt_lsts = []
for cmt_lst in cmt_lsts :
    new_cmt_lsts.append([str(i) for i in cmt_lst])

comments_df = pd.DataFrame(new_cmt_lsts).T
print('Dataframe has {} rows and {} columns.'.format(comments_df.shape[0], comments_df.shape[1]))
comments_df.head(2)

Dataframe has 3483 rows and 12 columns.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,robweedn,ADFA,1348770091.0,c6eaf17,True,t3_10kp8t,t3_10kp8t,/r/QuantifiedSelf/comments/10kp8t/notch_launch...,1,10kp8t,QuantifiedSelf,t5_2sivw
1,,"Personally, I find these smart pedometers pret...",1357849157.0,c7um88v,False,t3_10kp8t,t3_10kp8t,/r/QuantifiedSelf/comments/10kp8t/notch_launch...,1,10kp8t,QuantifiedSelf,t5_2sivw


#### Formatting dataframe of subreddit comments.

In [17]:
# Renaming columns.
comments_df.rename(columns = {
    0  : 'comment_author',
    1  : 'comment_text',
    2  : 'comment_created_utc',
    3  : 'comment_id',
    4  : 'comment_is_submitter',
    5  : 'comment_link_id',
    6  : 'comment_parent_id',
    7  : 'comment_permalink',
    8  : 'comment_score',
    9  : 'id',
    10 : 'comment_subreddit',
    11 : 'comment_subreddit_id',
    
}, inplace = True)
comments_df.head(2)

Unnamed: 0,comment_author,comment_text,comment_created_utc,comment_id,comment_is_submitter,comment_link_id,comment_parent_id,comment_permalink,comment_score,id,comment_subreddit,comment_subreddit_id
0,robweedn,ADFA,1348770091.0,c6eaf17,True,t3_10kp8t,t3_10kp8t,/r/QuantifiedSelf/comments/10kp8t/notch_launch...,1,10kp8t,QuantifiedSelf,t5_2sivw
1,,"Personally, I find these smart pedometers pret...",1357849157.0,c7um88v,False,t3_10kp8t,t3_10kp8t,/r/QuantifiedSelf/comments/10kp8t/notch_launch...,1,10kp8t,QuantifiedSelf,t5_2sivw


In [18]:
# Checking for duplicates.
print("There are {} duplicate comments.".format(comments_df['comment_id'].duplicated().sum()))

There are 0 duplicate comments.


In [19]:
# Changing specified columns to numeric datatype.
to_num_cmt = [
    'comment_created_utc',
    'comment_score',
]
for column in to_num_cmt:
    comments_df[column] = pd.to_numeric(comments_df[column])
    
comments_df.dtypes

comment_author           object
comment_text             object
comment_created_utc     float64
comment_id               object
comment_is_submitter     object
comment_link_id          object
comment_parent_id        object
comment_permalink        object
comment_score             int64
id                       object
comment_subreddit        object
comment_subreddit_id     object
dtype: object

#### Storing comments dataframe.

In [20]:
subreddit_df.to_csv("./data/raw_subreddit_comments.csv", index = False)

#### Checking size of both dataframes.

In [21]:
print('"subreddit_df" has {} rows and {} columns.'.format(subreddit_df.shape[0], subreddit_df.shape[1]))
print('"comments_df" has {} rows and {} columns.'.format(comments_df.shape[0], comments_df.shape[1]))

"subreddit_df" has 1403 rows and 17 columns.
"comments_df" has 3483 rows and 12 columns.


In [22]:
# Check null values in "subreddit_df" text column.
print('Checking how many empty string values in text column of "subreddit_df" ...\n{}'.format(subreddit_df[['text']][subreddit_df['text'] == ''].shape[0]))

Checking how many empty string values in text column of "subreddit_df" ...
752


In [25]:
# Check null values in "comments_df" text column.
print('Checking how many empty string values in text column of "comments_df" ...\n{}'.format(comments_df[['comment_text']][comments_df['comment_text'] == ''].shape[0]))

Checking how many empty string values in text column of "comments_df" ...
0


#### Adding text from `comments_df` to text from `subreddit_df`.

In [26]:
id_value = None

for index, row in subreddit_df.iterrows():
    comment_string = ''
    for c_index, c_row in comments_df[comments_df.id == row.id].iterrows():
        comment_string += "\n"
        comment_string += c_row['comment_text']
        comment_string += "\n"
    subreddit_df.loc[index,'text'] += comment_string.strip()
subreddit_df.head(2)

Unnamed: 0,index,author,created_utc,id,is_self,name,number_comments,permalink,score,text,subreddit,title,number_ups,upvote_ratio,url,domain,comments_by_id
0,1392,fergienz,1557962000.0,bp560o,True,t3_bp560o,1,/r/QuantifiedSelf/comments/bp560o/40_deep_slee...,3,I'm trying to find someone who is using Qualia...,QuantifiedSelf,40% Deep sleep score - Is it just me? Any Qual...,3,0.81,https://www.reddit.com/r/QuantifiedSelf/commen...,self.QuantifiedSelf,{'t1_enpejss': Comment(id='enpejss')}
1,1391,ran88dom99,1557941000.0,bp0prn,True,t3_bp0prn,2,/r/QuantifiedSelf/comments/bp0prn/best_nutriti...,5,Most food tracking apps have the ability to re...,QuantifiedSelf,Best nutrition tracking apps for advanced QS,5,1.0,https://www.reddit.com/r/QuantifiedSelf/commen...,self.QuantifiedSelf,"{'t1_enpg8qm': Comment(id='enpg8qm'), 't1_enpj..."


#### Adding text from `title` column in `subreddit_df` to `text` column of same dataframe.

In [27]:
# Check null values in "subreddit_df" title column.
print('Checking how many empty string values in title column of "subreddit_df" ...\n{}'.format(subreddit_df[['title']][subreddit_df['text'] == ''].shape[0]))

Checking how many empty string values in title column of "subreddit_df" ...
430


In [28]:
for index, row in subreddit_df.iterrows():
    title_to_add = ''
    title_to_add += " "
    title_to_add += row['title']
    title_to_add += " "
    subreddit_df.loc[index,'text'] += title_to_add   

#### Storing changed `subreddit_df`.

In [30]:
# Check null values in "subreddit_df" after adding comments and titles.
print('Checking how many empty string values in text column of subreddit_df ...\n{}'.format(subreddit_df[['text']][subreddit_df['text'] == ''].shape[0]))

Checking how many empty string values in text column of subreddit_df ...
0


In [31]:
subreddit_df.to_csv('./data/raw_combined_posts.csv')