In [1]:
# Project helpers used by this cell and by later cells in the notebook.
# Kept together at the top so the notebook's dependencies are visible at a glance.
from data.iterators import format_each, list_date_strings
from data.paths import LOCAL_COMMENTS_FMT_STR, LOCAL_THREADS_FMT_STR
from data.preprocess import records_df, td_matrix, td_matrix2

# Define the time period of interest (inclusive of ends)
start_year = 2005
start_month = 12

end_year = 2006
end_month = 1

# Create a list of nicely formatted date strings, one per month in the range
date_strings = list_date_strings(start_year=start_year,
                                 start_month=start_month,
                                 end_year=end_year,
                                 end_month=end_month)

# Expand the comment / thread path templates for every date string.
# The unpacked positional values are written before the keyword argument:
# Python binds them first regardless of where they appear in the call, so this
# ordering shows what actually happens.
# NOTE(review): presumably each result holds one local file path per month --
# confirm against data.iterators.format_each.
comments_paths = format_each(*date_strings, fmt_str=LOCAL_COMMENTS_FMT_STR)
threads_paths = format_each(*date_strings, fmt_str=LOCAL_THREADS_FMT_STR)

# Show the selected months, one per line
print("\n".join(date_strings))

2005-12
2006-01


# Download some data from pushshift.io

In [2]:
from data.download import download_comments_and_threads

# Fetch the comment and thread archives for every month in date_strings.
# verbose=True is passed so progress is reported while the downloads run.
download_comments_and_threads(
    verbose=True,
    date_strings=date_strings,
)

downloading http://files.pushshift.io/reddit/comments/RC_2007-01.bz2
downloading http://files.pushshift.io/reddit/submissions/RS_2007-01.bz2
downloading http://files.pushshift.io/reddit/comments/RC_2007-02.bz2
downloading http://files.pushshift.io/reddit/submissions/RS_2007-02.bz2


# Read comment metadata into dataframe

In [16]:
# Build a dataframe from every comment file (no column filter),
# then list the column names it exposes, one per line.
comments_full_df = records_df(paths=comments_paths)
print("\n".join(comments_full_df.columns))

author
author_flair_css_class
author_flair_text
body
controversiality
created_utc
distinguished
edited
gilded
id
link_id
parent_id
retrieved_on
score
stickied
subreddit
subreddit_id
ups


# Read thread metadata into dataframe

In [17]:
# Build a dataframe from every thread file (no column filter),
# then list the column names it exposes, one per line.
threads_full_df = records_df(paths=threads_paths)
print("\n".join(threads_full_df.columns))

archived
author
author_flair_css_class
author_flair_text
created
created_utc
distinguished
domain
downs
edited
from
from_id
from_kind
gilded
hide_score
id
is_self
link_flair_css_class
link_flair_text
media
media_embed
name
num_comments
over_18
permalink
quarantine
retrieved_on
saved
score
secure_media
secure_media_embed
selftext
stickied
subreddit
subreddit_id
thumbnail
title
ups
url


# Read a subset of comment metadata into dataframe

In [10]:
# records_df takes an optional list of column names through `keys` and
# builds a dataframe restricted to those columns. This is usually quicker
# than constructing the full dataframe.
comments_partial_df = records_df(
    keys=["author", "subreddit"],
    paths=comments_paths,
)

# Preview the first few rows
print(comments_partial_df.head())

      author   subreddit
0       frjo  reddit.com
1    zse7zse  reddit.com
2  [deleted]  reddit.com
3  [deleted]  reddit.com
4    rjoseph  reddit.com


# Get lists of unique users and subreddits

In [12]:
%%timeit
author_subreddit_df = records_df(paths=comments_paths, keys=["author", "subreddit"])

# # Each entry in the resulting series is the number of comments associated with a particular user
# author_comment_counts = author_subreddit_df["author"].value_counts()

# # Each entry in the resulting series is the number of comments on the associated subreddit
# subreddit_comment_counts = author_subreddit_df["subreddit"].value_counts()
# print(author_comment_counts)

22.5 ms ± 811 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Get dicts of unique users and subreddits more efficiently

In [14]:
%%timeit
td_mat1 = td_matrix(paths=comments_paths, term_key="author", doc_key="subreddit")

70.7 ms ± 507 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
%%timeit
td_mat2 = td_matrix2(paths=comments_paths, term_key="author", doc_key="subreddit")

28.4 ms ± 823 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
