In [1]:
import pandas as pd
from pmaw import PushshiftAPI

## Download dataset
### Pulls Pushshift data using the PMAW API; see https://github.com/mattpodolak/pmaw for docs and details
### Running these cells sends requests to Pushshift, which is unnecessary and takes a very long time, so only uncomment/run them if you need to build a new dataset. Otherwise, work from the saved CSV instead.

### Example of pulling a specified number of posts (aka submissions) from a subreddit

In [2]:
# Create the PMAW Pushshift client that the following cells query through
api = PushshiftAPI()

In [58]:
# Pull a fixed number of submissions (posts) from a single subreddit.
#   subreddit: name of the subreddit to query (case-sensitive)
#   limit:     number of submissions to pull; limit=None pulls every submission,
#              which costs a lot of time, memory and CPU
submissions = api.search_submissions(
    subreddit="disney",
    limit=2000,
)

# Load the returned submissions into a DataFrame
sub_df = pd.DataFrame(data=submissions)

# To save this pull, uncomment the line below and replace the path; the other parameters can stay as they are
# sub_df.to_csv('./data/disney_500_subs.csv', header=True, index=False, columns=list(sub_df.axes[1]))

INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 20 - Batches: 2 - Items Remaining: 0


### Example of pulling all posts (aka submissions) from a subreddit
##### Beware, pulling all posts will take time, mem, cpu

In [7]:
# Pull every submission in the subreddit (limit=None removes the cap)
all_submissions = api.search_submissions(
    subreddit="DisneyPlus",
    limit=None,
)

# Load the returned submissions into a DataFrame
all_sub_df = pd.DataFrame(data=all_submissions)

# Write every column to CSV without the index; replace the path as needed
all_sub_df.to_csv(
    './data/disneyplus_all_subs.csv',
    header=True,
    index=False,
    columns=all_sub_df.columns.tolist(),
)

INFO:pmaw.PushshiftAPIBase:27963 result(s) available in Pushshift
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 100 - Batches: 10 - Items Remaining: 18412
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 133 - Batches: 14 - Items Remaining: 16980
INFO:pmaw.PushshiftAPIBase:10 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 193 - Batches: 20 - Items Remaining: 10981
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 258 - Batches: 27 - Items Remaining: 6608
INFO:pmaw.PushshiftAPIBase:4 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 282 - Batches: 30 - Items Remaining: 4755
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 340 - Batches: 37 - Items Remaining: 13
INFO:pmaw.PushshiftAPIBase:3 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Reques

## Comments

### Download comments based on submissions pulled in above query. 
#### This code block pulls only the comments of the submissions queried above and saves them to a CSV. It takes much longer than simply pulling submissions.

###### Running this cell sends requests to Pushshift, which is unnecessary and takes a very long time, so only uncomment/run it if you need to build a new dataset. Otherwise, work from the saved CSV instead.

In [None]:
# Load the previously saved submissions (replace path)
subs_df = pd.read_csv('./data/disney_500_subs.csv', header=0)

# Collect the submission ids from the 'id' column
sub_ids = subs_df['id'].tolist()

# Look up the ids of the comments attached to those submissions
comment_ids = list(api.search_submission_comment_ids(ids=sub_ids))

# Fetch the comments themselves by id and load them into a DataFrame
comments = api.search_comments(ids=comment_ids)
comments_df = pd.DataFrame(data=comments)

# Write every column to CSV without the index (replace path)
comments_df.to_csv(
    './data/disney_500_comments.csv',
    header=True,
    index=False,
    columns=comments_df.columns.tolist(),
)

### How to pull a given number of comments rather than just those from samples already queried (FASTER)
#### This method is much faster and should be used for pulling a large number of comments or all comments

In [None]:
# Fresh PMAW client for this query
api = PushshiftAPI()

# Pull comments directly from a subreddit instead of going through submission ids.
#   subreddit: name of the subreddit to query (case-sensitive)
#   limit:     number of comments to pull
comments = api.search_comments(
    subreddit="wallstreetbets",
    limit=300000,
)

# Load the returned comments into a DataFrame
comments_df = pd.DataFrame(data=comments)

# Write every column to CSV without the index (replace path)
comments_df.to_csv(
    './data/wallstreetbets_comments.csv',
    header=True,
    index=False,
    columns=comments_df.columns.tolist(),
)

### Working with submissions data, mainly adding a new date column

In [1]:
# Show the column holding each submission's creation time.
# Values are Unix timestamps in the UTC timezone.
all_sub_df['created_utc']

NameError: name 'sub_df' is not defined

In [None]:
# Single example: convert one Unix timestamp into a readable date string

from datetime import datetime, timezone

# Creation time of the submission at index label 0 (Unix timestamp, UTC)
ts = sub_df.created_utc[0]
print(ts)
# datetime.utcfromtimestamp() is deprecated since Python 3.12; the timezone-aware
# fromtimestamp(..., tz=timezone.utc) produces the same 'MM-DD-YYYY' string.
print(datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%m-%d-%Y'))

In [None]:
# Build a list with every submission's creation time as a readable 'MM-DD-YYYY' string.
# created_utc holds Unix timestamps (UTC); unit='s' tells pandas to read them as seconds.
# The conversion is vectorized, so no Python-level loop over the rows is needed.
sub_dates = (
    pd.to_datetime(sub_df['created_utc'], unit='s')
    .dt.strftime('%m-%d-%Y')
    .tolist()
)

# Append the readable date strings to the DataFrame as a new column
sub_df['creation_date'] = sub_dates

In [None]:
# Write the submissions DataFrame, every column and no index, to CSV.
# Replace the path as needed.
sub_df.to_csv(
    './data/disney_all_subs_w_dates.csv',
    header=True,
    index=False,
    columns=sub_df.columns.tolist(),
)

In [None]:
# Display the submissions DataFrame to inspect the result
sub_df

### Working with comments data, mainly adding a new dates column

In [None]:
# List the column names available in the comments DataFrame
comments_df.columns

In [None]:
# Build a list with every comment's creation time as a readable 'MM-DD-YYYY' string.
# created_utc is read as Unix seconds (unit='s'); the conversion is vectorized, which
# avoids a per-row Python loop and avoids rebinding IPython's special `_` variable.
com_dates = (
    pd.to_datetime(comments_df['created_utc'], unit='s')
    .dt.strftime('%m-%d-%Y')
    .tolist()
)

In [None]:
# Append the readable date strings to the comments DataFrame as a new column
comments_df['creation_date'] = com_dates

In [None]:
# Display the comments DataFrame to inspect the result
comments_df