In [1]:
# Analysis window given as (year, month) pairs; both endpoints are included.

start_year, start_month = 2005, 12
end_year, end_month = 2006, 1

# Turn the window above into a list of date strings.
# NOTE(review): the exact string format is decided by list_date_strings;
# presumably one "YYYY-MM" entry per month -- confirm against data.iterators.

from data.iterators import list_date_strings

date_strings = list_date_strings(
    start_year=start_year,
    start_month=start_month,
    end_year=end_year,
    end_month=end_month,
)

from data.paths import LOCAL_COMMENTS_FMT_STR, LOCAL_COMMENTS_DIR
from data.paths import LOCAL_THREADS_FMT_STR, LOCAL_THREADS_DIR
from data.iterators import format_each
from data.preprocess import records_df, extract_terms_by_document
import pandas as pd

# Build the comment and thread paths from the two format strings.
# The unpacked date strings are written first, matching the order in which
# Python binds them (positionals before keywords).
# NOTE(review): presumably format_each yields one path per date string -- confirm.
comments_paths = format_each(*date_strings, fmt_str=LOCAL_COMMENTS_FMT_STR)
threads_paths = format_each(*date_strings, fmt_str=LOCAL_THREADS_FMT_STR)

# Comment body text, keyed by subreddit.
# NOTE(review): this exact call is repeated in the "Extract some data per
# subreddit into files" cell, and here it runs before the download cell --
# confirm whether it is meant to live in this setup cell.
extract_terms_by_document(
    name="comment_bodies_per_sr",
    dir_path=LOCAL_COMMENTS_DIR,
    term_key="body",
    doc_key="subreddit",
)

skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2005-12.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-01.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-02.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-03.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-04.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-05.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-06.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-07.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-08.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-09.json already extracted
skipping /home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-10.json already extracted

# Download some data from pushshift.io

In [2]:
from data.download import download_comments_and_threads

# Fetch the comment and thread files for every date string in the window.
# NOTE(review): verbose=True presumably enables the per-file "downloading ..."
# progress lines -- confirm against data.download.
download_comments_and_threads(date_strings=date_strings, verbose=True)

downloading http://files.pushshift.io/reddit/comments/RC_2005-12.bz2
downloading http://files.pushshift.io/reddit/submissions/RS_2005-12.bz2
Remote file not found. Writing empty file.
downloading http://files.pushshift.io/reddit/comments/RC_2006-01.bz2
downloading http://files.pushshift.io/reddit/submissions/RS_2006-01.bz2


# Read comment metadata into dataframe

In [16]:
# Load all the wonderful comments into a dataframe and print the available columns
# No `keys` argument is passed here, so records_df presumably keeps every
# field found in the comment records (compare the `keys=[...]` call used for
# the partial dataframe) -- TODO confirm against data.preprocess.
comments_full_df = records_df(paths=comments_paths)
# Assuming a pandas DataFrame: info() prints its summary itself and returns
# None, so nothing extra is echoed as the cell result.
comments_full_df.info()

author
author_flair_css_class
author_flair_text
body
controversiality
created_utc
distinguished
edited
gilded
id
link_id
parent_id
retrieved_on
score
stickied
subreddit
subreddit_id
ups


# Read thread metadata into dataframe

In [17]:
# Load all the wonderful threads into a dataframe and print the available columns
# No `keys` argument is passed here, so records_df presumably keeps every
# field found in the thread records -- TODO confirm against data.preprocess.
threads_full_df = records_df(paths=threads_paths)
# Assuming a pandas DataFrame: info() prints its summary itself and returns
# None, so nothing extra is echoed as the cell result.
threads_full_df.info()

archived
author
author_flair_css_class
author_flair_text
created
created_utc
distinguished
domain
downs
edited
from
from_id
from_kind
gilded
hide_score
id
is_self
link_flair_css_class
link_flair_text
media
media_embed
name
num_comments
over_18
permalink
quarantine
retrieved_on
saved
score
secure_media
secure_media_embed
selftext
stickied
subreddit
subreddit_id
thumbnail
title
ups
url


# Read a subset of comment metadata into dataframe

In [10]:
# Pass any subset of column names as a list via `keys` and records_df
# will construct a dataframe with only those columns.
# This will usually be faster than constructing the full dataframe.
comments_partial_df = records_df(paths=comments_paths, keys=["author", "subreddit"])

# Keep the preview as the cell's final expression so the notebook renders it
# with the dataframe's rich repr rather than print()'s plain-text dump.
comments_partial_df.head()

      author   subreddit
0       frjo  reddit.com
1    zse7zse  reddit.com
2  [deleted]  reddit.com
3  [deleted]  reddit.com
4    rjoseph  reddit.com


# Get lists of unique users and subreddits

In [4]:
# Only the two columns needed for counting are requested.
author_subreddit_df = records_df(
    paths=comments_paths,
    keys=["author", "subreddit"],
)

# Comments per user: one entry per distinct author, valued by how many
# comment rows carry that author.
author_comment_counts = author_subreddit_df["author"].value_counts()
print(author_comment_counts)

# Comments per subreddit, tallied the same way over the "subreddit" column.
subreddit_comment_counts = author_subreddit_df["subreddit"].value_counts()
print(subreddit_comment_counts)


subreddit           arxiv  askcaterina  ca  de  eo  es  eu  features  fr  \
author                                                                     
--k                     0            0   0   0   0   0   0         0   0   
-Buzza-                 0            0   0   0   0   0   0         0   0   
-Yh-                    0            0   0   0   0   0   0         0   0   
000                     0            0   0   0   0   0   0         0   0   
007                     0            0   0   0   0   0   0         0   0   
00helix                 0            0   0   0   0   0   0         0   0   
0123456789abcdef        0            0   0   0   0   0   0         0   0   
033                     0            0   0   0   0   0   0         0   0   
0904542020              0            0   0   0   0   0   0         0   0   
09weed09                0            0   0   0   0   0   0         0   0   
0at                     0            0   0   0   0   0   0         0   0   
0bvious     

# Extract some data per subreddit into files

In [3]:
# One row per extraction: (output name, source directory, term field).
# Every extraction uses "subreddit" as the document key, and the rows run
# in the order listed.
for extract_name, source_dir, term_field in (
    ("comment_authors_per_sr", LOCAL_COMMENTS_DIR, "author"),  # usernames from comments
    ("thread_authors_per_sr", LOCAL_THREADS_DIR, "author"),    # usernames from threads
    ("comment_bodies_per_sr", LOCAL_COMMENTS_DIR, "body"),     # body text from comments
):
    extract_terms_by_document(
        name=extract_name,
        dir_path=source_dir,
        term_key=term_field,
        doc_key="subreddit",
    )

/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2005-12.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-01.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-02.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-03.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-04.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-05.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-06.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-07.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-08.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-09.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-10.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-11.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2006-12.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2007-01.json
/home/jdmoorman/Git/reddit/reddit/data/comments/RC_2007-02.json
/home/jdmoorman/Git/reddit/reddit/data/c