In [36]:
import datetime as dt
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [37]:
def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 5):
    """Collect posts for one subreddit from the Pushshift search API.

    Sends ``n`` GET requests. Request ``i`` (1-based) asks for up to 500
    items created after ``day_window * i`` days ago, and the function pauses
    for 2 seconds after every request.

    Parameters
    ----------
    subreddit : str
        Subreddit name inserted into the query string.
    kind : str, default 'submission'
        Endpoint suffix appended to the search URL. Only ``'submission'``
        results are trimmed to the fixed column list and de-duplicated.
    day_window : int, default 30
        Size, in days, of the step between consecutive ``after`` offsets.
    n : int, default 5
        Number of requests to send.

    Returns
    -------
    pandas.DataFrame
        The concatenated results with an added ``timestamp`` column holding
        the UTC calendar date (``datetime.date``) of ``created_utc``. Index
        labels come from the per-request frames and are not reset.

    Raises
    ------
    requests.HTTPError
        If any response has a status code other than 200.
    """
    SUBFIELDS = ['title', 'subreddit', 'created_utc', 'author', 'num_comments', 'score','url']
    # Upper bound (seconds) on each request so a stalled connection cannot
    # hang the notebook forever.
    REQUEST_TIMEOUT = 30
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # request 500 items per call
    # one DataFrame per request, concatenated once at the end
    posts = []
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"Pushshift returned HTTP {response.status_code} for {URL}",
                response=response,
            )
        posts.append(pd.DataFrame(response.json()['data']))
        # pause between requests to stay gentle on the API
        time.sleep(2)
    full = pd.concat(posts, sort=False)
    if kind == "submission":
        # Keep only the desired columns and drop repeated rows. The
        # non-inplace call returns a fresh frame, so the column assignment
        # below never writes into a slice of `full`.
        full = full[SUBFIELDS].drop_duplicates()

    # `created_utc` is a UTC epoch, so derive the date in UTC rather than in
    # the machine's local timezone. `.to_numpy()` assigns positionally, which
    # is safe even though index labels repeat across the concatenated chunks.
    full = full.assign(
        timestamp=pd.to_datetime(full["created_utc"], unit="s", utc=True).dt.date.to_numpy()
    )
    print("Query Complete!")
    return full

Note: the `is_self == True` filter has been removed, because posts in these subreddits link to external articles that were not written by the creator of the post.

In [38]:
# Pull both news subreddits using the function's default kind, window and request count.
results1 = query_pushshift(subreddit='worldnews')
results2 = query_pushshift(subreddit='news')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=worldnews&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=worldnews&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=worldnews&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=worldnews&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=worldnews&size=500&after=150d
Query Complete!
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=news&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=news&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=news&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=news&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?

In [39]:
# Print the dimensions of the r/worldnews pull, then preview its first rows.
print(results1.shape)
results1.head(n=5)

(2500, 8)


Unnamed: 0,title,subreddit,created_utc,author,num_comments,score,url,timestamp
0,"@akanmaja : Hey twitz_end ✍ 👽(@twitz_end), tha...",worldnews,1577917026,micladceo,0,1,https://mobile.twitter.com/akanmaja/status/121...,2020-01-01
1,Former NBA Commissioner David Stern Dead at 77,worldnews,1577917066,PinheadLarry2323,2,1,https://www.tmz.com/2020/01/01/nba-commissione...,2020-01-01
2,إسقاط طائرة تجسسية في محور جيزان تعتبر الخامسة...,worldnews,1577917176,almshhadalyemeni,0,1,http://www.almshhadalyemeni.net/132646/,2020-01-01
3,Is Uber Doomed to a Point of No Return?,worldnews,1577917185,Open_c_Source,2,1,https://medium.com/@waseemezzie/uber-the-next-...,2020-01-01
4,Zelensky's New Year address: Everyone should a...,worldnews,1577917681,Fanrific,3,1,https://www.unian.info/society/10816211-zelens...,2020-01-01


In [40]:
# Print the dimensions of the r/news pull, then preview its first rows.
print(results2.shape)
results2.head(n=5)

(2500, 8)


Unnamed: 0,title,subreddit,created_utc,author,num_comments,score,url,timestamp
0,Ask HN: Who Is Hiring January 2020,news,1577917156,121kiwi,0,1,https://121private.home.blog/2020/01/01/ask-hn...,2020-01-01
1,"Touting 5 Million Individual Contributions, Sa...",news,1577917236,acouplewavylines,0,1,https://www.commondreams.org/news/2020/01/01/t...,2020-01-01
2,I Was Trained To Destroy My Opponents – El-Ruf...,news,1577917336,Gistmania,0,1,"https://www.gistmania.com/talk/topic,417722.0....",2020-01-01
3,@Afriupdate_com: Bank Staff Reveals How He Hel...,news,1577917413,Afriupdatenews247,0,1,https://mobile.twitter.com/Afriupdate_com/stat...,2020-01-01
4,Black guest at a Portland Marriott hotel claim...,news,1577917483,AlohaWorld18,527,1,https://www.oregonlive.com/news/2019/12/black-...,2020-01-01


In [41]:
# Stack the two subreddit frames into a single table with a fresh 0..N-1 index.
data = pd.concat(objs=[results1, results2], ignore_index=True)

In [42]:
# Row and column counts of the combined frame.
data.shape

(5000, 8)

In [43]:
# Create the output folder first so the export also works on a fresh checkout
# where ./datasets does not exist yet, then write the combined frame.
Path('./datasets').mkdir(parents=True, exist_ok=True)
data.to_csv('./datasets/subredditdata.csv')

In [None]:
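# NOTE: disabled draft of an article-text scraper, kept for reference only.
# Before re-enabling: `i` is a URL, so `results1['selftext'][i]` does not address
# the row being scraped, and `articletext` is unset (or left over from the previous
# URL) whenever the request does not return 200.
# for i in results1['url']: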
#     url=i
#     res=requests.get(url)
#     print(url)
#     if res.status_code == 200:
#         soup=BeautifulSoup(res.content,'lxml')
        
#         x=soup.find_all('p')
#         articletext=''
#         for j in x:
            
#             articletext+=j.text
    
#     results1['selftext'][i]=articletext
#     time.sleep(2)
