In [134]:
import pandas as pd
import time
import requests
import numpy as np

In [135]:
url = "https://api.pushshift.io/reddit/search/submission"

In [136]:
# Probe request: ask for up to 100 r/coffee submissions to check that the endpoint responds.
params = {
    'subreddit' : 'coffee', 
    'size' : 100,
}
# Without a timeout, requests waits indefinitely when the API never answers.
res_c = requests.get(url, params=params, timeout=30)
res_c.status_code #200 indicates a successful response

200

In [137]:
# Decode the JSON body of the probe response; the submission records are read from its 'data' key.
data = res_c.json()
# NOTE(review): `post_anxiety` holds the r/coffee submissions requested above - the name
# looks like a leftover from a different analysis; confirm before renaming.
post_anxiety = data['data']
# Build a frame from the submission records so the available fields can be inspected.
df = pd.DataFrame(post_anxiety)
df.head() # preview the first rows to check that the expected columns are present

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,secure_media_embed,author_flair_template_id,author_flair_text_color,gallery_data,is_gallery,media_metadata,author_cakeday,author_flair_background_color,suggested_sort,distinguished
0,[],False,VincentVanGoober,,[],,text,t2_efiom35q,False,False,...,,,,,,,,,,
1,[],False,Future_South_3834,,[],,text,t2_jr1qlfwh,False,False,...,,,,,,,,,,
2,[],False,OwnExcitement2414,,[],,text,t2_mq4zuik6,False,False,...,,,,,,,,,,
3,[],False,GloopTown,,[],,text,t2_e8gx6,False,False,...,,,,,,,,,,
4,[],False,WooPig45,,[],,text,t2_zv3e4,False,False,...,,,,,,,,,,


Next, we define a function to pull the posts from each of the subreddits chosen. Since Pushshift limits every request to 100 posts, we will run multiple iterations to achieve the desired amount.

In [138]:
def get_submissions(subreddit, n_iter, before=1653022843, size=100, pause=3):
    """Page backwards through the Pushshift submissions of one subreddit.

    Each iteration requests up to `size` non-stickied submissions created
    before the current cutoff, then moves the cutoff to the oldest
    `created_utc` of the page just received, so that the next request
    continues further back in time.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query (e.g. 'coffee').
    n_iter : int
        Maximum number of requests (pages) to make.
    before : int, optional
        Epoch timestamp used as the initial cutoff. The default,
        1653022843, selects all posts before 20 May 2022.
    size : int, optional
        Number of submissions requested per page (default 100).
    pause : float, optional
        Seconds to wait between consecutive requests (default 3).

    Returns
    -------
    pandas.DataFrame
        The pages concatenated in request order with the columns
        subreddit, selftext, title, created_utc and is_robot_indexable.
        Index labels restart at 0 for every page. An empty frame with
        those columns is returned when nothing was fetched.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ["subreddit", "selftext", "title", "created_utc", "is_robot_indexable"]  # "removed_by_category"
    df_list = []
    current_time = before

    for i in range(n_iter):
        res = requests.get(
            url,
            params={
                "subreddit": subreddit,
                "size": size,
                "before": current_time,
                "stickied": False,
            },
            timeout=30,  # do not hang forever on an unresponsive API
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()

        posts = res.json().get("data", [])
        if not posts:
            # No older submissions are available; further requests would be pointless.
            break

        # reindex keeps the column layout stable even when a field is absent
        # from every record of a page (the missing column is filled with NaN).
        df = pd.DataFrame(posts).reindex(columns=columns)
        df_list.append(df)

        oldest = df["created_utc"].min()
        if pd.isna(oldest):
            # Without a timestamp the cutoff cannot be advanced.
            break
        current_time = int(oldest)

        # Pause only between requests; there is nothing to wait for after the last one.
        if i < n_iter - 1:
            time.sleep(pause)

    if not df_list:
        # pd.concat raises on an empty list, so return an empty, correctly shaped frame.
        return pd.DataFrame(columns=columns)

    return pd.concat(df_list, axis=0)

Multiple reruns with readjusted `n_iter` were necessary to obtain the results seen below - considerable time was spent on finding the right parameters, cleaning the data and ensuring that the difference in the number of datapoints between the two datasets was kept below 500. Deleted submissions were removed using the <b>`is_robot_indexable`</b> field (a post is kept only if it is `True`), which is more accurate than the <b>`removed_by_category`</b> method. Stickied posts were also excluded because they include weekly discussion threads, which would likely introduce repeated occurrences of the same words.

In [139]:
df_coffee = get_submissions('coffee', 20)

In [140]:
# Preview the rows whose title has not appeared earlier in the frame.
# Display only: the result is not assigned, so df_coffee itself is unchanged.
df_coffee.loc[~df_coffee['title'].duplicated()]

Unnamed: 0,subreddit,selftext,title,created_utc,is_robot_indexable
0,Coffee,,Joe app - rewards for buying coffee,1653022579,True
1,Coffee,"I have a Saeco Incanto Carafe, and ran out of ...",I'm out of milk and I'm curious. Creamer in a ...,1653019779,True
2,Coffee,My office job doesn't provide any means of get...,"Best &amp; simple way to have ""decent/good"" co...",1653017482,True
3,Coffee,,Brewing espresso in morning,1653015337,True
4,Coffee,,I love my Dead Inside But Caffeinated Shirt! F...,1653015237,True
...,...,...,...,...,...
95,Coffee,,I enjoy more the “latte pillow” (almost over p...,1649580316,True
96,Coffee,,Is this just a Melbourne thing?,1649579835,True
97,Coffee,I just finished a glass of cold latte and ther...,Burn after taste,1649573901,True
98,Coffee,so i moved to a new city for college and I can...,"first time going to a coffee shop, what do i a...",1649570857,True


In [141]:
df_coffee

Unnamed: 0,subreddit,selftext,title,created_utc,is_robot_indexable
0,Coffee,,Joe app - rewards for buying coffee,1653022579,True
1,Coffee,"I have a Saeco Incanto Carafe, and ran out of ...",I'm out of milk and I'm curious. Creamer in a ...,1653019779,True
2,Coffee,My office job doesn't provide any means of get...,"Best &amp; simple way to have ""decent/good"" co...",1653017482,True
3,Coffee,,Brewing espresso in morning,1653015337,True
4,Coffee,,I love my Dead Inside But Caffeinated Shirt! F...,1653015237,True
...,...,...,...,...,...
95,Coffee,,I enjoy more the “latte pillow” (almost over p...,1649580316,True
96,Coffee,,Is this just a Melbourne thing?,1649579835,True
97,Coffee,I just finished a glass of cold latte and ther...,Burn after taste,1649573901,True
98,Coffee,so i moved to a new city for college and I can...,"first time going to a coffee shop, what do i a...",1649570857,True


In [142]:
# The concatenated pages repeat their index labels, so renumber the rows 0..n-1.
df_coffee = df_coffee.reset_index(drop=True)

In [143]:
df_coffee

Unnamed: 0,subreddit,selftext,title,created_utc,is_robot_indexable
0,Coffee,,Joe app - rewards for buying coffee,1653022579,True
1,Coffee,"I have a Saeco Incanto Carafe, and ran out of ...",I'm out of milk and I'm curious. Creamer in a ...,1653019779,True
2,Coffee,My office job doesn't provide any means of get...,"Best &amp; simple way to have ""decent/good"" co...",1653017482,True
3,Coffee,,Brewing espresso in morning,1653015337,True
4,Coffee,,I love my Dead Inside But Caffeinated Shirt! F...,1653015237,True
...,...,...,...,...,...
1995,Coffee,,I enjoy more the “latte pillow” (almost over p...,1649580316,True
1996,Coffee,,Is this just a Melbourne thing?,1649579835,True
1997,Coffee,I just finished a glass of cold latte and ther...,Burn after taste,1649573901,True
1998,Coffee,so i moved to a new city for college and I can...,"first time going to a coffee shop, what do i a...",1649570857,True


In [144]:
# Save the r/coffee submissions. The path is relative to the working directory
# (the Downloads/ folder must already exist), and by default to_csv also
# writes the row index as the first, unnamed column.
df_coffee.to_csv("Downloads/coffee.csv")

Create another dataframe for submissions from r/tea.

In [145]:
df_tea = get_submissions('tea',20)

In [146]:
df_tea

Unnamed: 0,subreddit,selftext,title,created_utc,is_robot_indexable
0,tea,,Are There Any Changes to the Tea Making Rules ...,1653016724,False
1,tea,,Went to an apothecary today and got Herbs to m...,1653016469,True
2,tea,,Tonight's cup of tea,1653013214,True
3,tea,,I am the Lorax and I speak for the Whites,1653011251,False
4,tea,[removed],Thyme tea black when honey added,1653005283,False
...,...,...,...,...,...
94,tea,,Thoughts on Rose Hip tea?,1648030037,True
95,tea,[removed],tea coz why not,1648023608,False
96,tea,[removed],Does anyone else get dopesick when they don't ...,1648016165,False
97,tea,Hello everyone. I'm generally a coffee person ...,Need recepies,1648012727,True


In [None]:
# Preview the rows whose selftext has not appeared earlier in the frame.
# Display only: the result is not assigned, so df_tea itself is unchanged.
df_tea.loc[~df_tea['selftext'].duplicated()]

In [None]:
# The concatenated pages repeat their index labels, so renumber the rows 0..n-1.
df_tea = df_tea.reset_index(drop=True)

In [None]:
df_tea['is_robot_indexable'].value_counts()

In [None]:
df_coffee.selftext

In [None]:
# Collapse the spelling variants of pu'er tea ("puerh", "pu-erh", "pu'erh", "pu'erhh", ...)
# into the single token "pu'er" so that they are counted as one word.
# The pattern is anchored on the "pu" prefix at a word boundary: a blanket "erh" -> "er"
# replacement would also corrupt unrelated words such as "perhaps" or "overheat".
# Group 1 keeps the original case of the leading "p"; missing (NaN) texts stay NaN.
puerh_variants = r"(?i)\b(p)u['’-]?erh+"
df_tea['selftext'] = df_tea['selftext'].str.replace(puerh_variants, r"\1u'er", regex=True)
df_tea['title'] = df_tea['title'].str.replace(puerh_variants, r"\1u'er", regex=True)

In [None]:
df_tea

In [None]:
# Save the r/tea submissions. The path is relative to the working directory
# (the Downloads/ folder must already exist), and by default to_csv also
# writes the row index as the first, unnamed column.
df_tea.to_csv("Downloads/tea.csv")