In [2]:
import pandas as pd
import datetime as dt
import time
import requests

## Data Collection

In [3]:
# here's an example query url
#url = "https://api.pushshift.io/reddit/search/submission?subreddit=boardgames"

In [4]:
# Subreddit being queried: https://www.reddit.com/r/Cricket/
# Pushshift submission-search endpoint with only the `subreddit` query parameter set.
url = "https://api.pushshift.io/reddit/search/submission?subreddit=Cricket"

In [5]:
# Fetch one page of results. The explicit timeout keeps a stalled connection
# from blocking the notebook forever (requests applies no timeout by default).
res = requests.get(url, timeout=30)

In [7]:
# Stop here unless the request succeeded (HTTP 200).
assert res.status_code == 200

In [8]:
# checking res type: displays the class of the object returned by requests.get
type(res)

requests.models.Response

In [11]:
# Dig into "data"
# Parse the response body first: `json_data` is not assigned in any earlier
# cell, so without this line a fresh kernel run stops with a NameError.
json_data = res.json()
len(json_data['data'])

25

In [12]:
### Converting to a pandas DataFrame

In [13]:
# Build a DataFrame from the records stored under the "data" key,
# then preview the first rows.
results_df = pd.DataFrame(json_data["data"])
results_df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,author_flair_background_color,media,media_embed,post_hint,preview,secure_media,secure_media_embed,thumbnail_height,thumbnail_width,removed_by_category
0,[],False,mashac,,[],,text,t2_fohvs,False,False,...,,,,,,,,,,
1,[],False,mashac,,[],,text,t2_fohvs,False,False,...,,,,,,,,,,
2,[],False,mashac,,[],,text,t2_fohvs,False,False,...,,,,,,,,,,
3,[],False,mashac,,[],,text,t2_fohvs,False,False,...,,,,,,,,,,
4,[],False,LMilto,qld,"[{'a': ':Queensland:', 'e': 'emoji', 'u': 'htt...",:Queensland: Queensland Bulls,richtext,t2_38tvby8a,False,False,...,,,,,,,,,,


In [6]:
def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 5):
    """Download posts from one subreddit through the Pushshift search API.

    Sends `n` GET requests. Request `i` (1-based) asks for up to 500 items
    with `after=<day_window * i>d`, so each request reaches further back in
    time than the previous one. All pages are concatenated into one frame.

    NOTE(review): every request only sets a lower time bound (`after`), so
    consecutive pages may overlap depending on the API's sort order; the
    de-duplication below only runs for kind == "submission" -- confirm this
    is the intended sampling.

    Parameters
    ----------
    subreddit : str
        Subreddit name; inserted into the query string as-is.
    kind : str, default 'submission'
        Endpoint suffix (`.../reddit/search/{kind}`). For 'submission' the
        result is reduced to the SUBFIELDS columns, exact duplicate rows are
        dropped, and only rows whose `is_self` is True are kept.
    day_window : int, default 30
        Step, in days, between the `after` values of successive requests.
    n : int, default 5
        Number of requests to send; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a `timestamp` column holding the UTC calendar
        date of `created_utc`. Row labels are unique across pages.

    Raises
    ------
    ValueError
        If `n` is smaller than 1.
    AssertionError
        If any response has a status code other than 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever
    PAUSE_SECONDS = 2     # pause between consecutive requests

    if n < 1:
        # pd.concat would otherwise fail later with "No objects to concatenate"
        raise ValueError("n must be at least 1")

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500

    # one DataFrame per downloaded page
    posts = []

    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        assert response.status_code == 200, (
            "Pushshift returned HTTP {} for {}".format(response.status_code, URL)
        )
        mine = response.json()['data']
        posts.append(pd.DataFrame.from_dict(mine))
        # only pause when another request follows
        if i < n:
            time.sleep(PAUSE_SECONDS)

    # ignore_index gives every row its own label instead of repeating each
    # page's 0..499 labels in the combined frame
    full = pd.concat(posts, sort=False, ignore_index=True)

    # if submission
    if kind == "submission":
        # select desired columns; reindex (rather than []) also copes with
        # the case where every page came back empty and no columns exist
        full = full.reindex(columns=SUBFIELDS)
        # drop duplicates (returns a new frame, no in-place edit of a slice)
        full = full.drop_duplicates()
        # select `is_self` == True; copy so the column added below is written
        # to an independent frame rather than to a slice
        full = full.loc[full['is_self'] == True].copy()

    # create `timestamp` column: interpret the epoch seconds in UTC so the
    # resulting date does not depend on the local timezone of the machine
    full['timestamp'] = full["created_utc"].map(
        lambda ts: dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).date()
    )

    print("Query Complete!")
    return full

In [18]:
# 15 requests against r/cricket, with `after` stepping from 15d to 225d in 15-day increments.
results = query_pushshift("cricket", n=15, day_window = 15)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=45d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=75d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=105d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cricket&size=500&after=135d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=cric

In [19]:
# (rows, columns) of the collected cricket posts
results.shape

(2458, 9)

In [20]:
# Save the cricket posts; with to_csv defaults the DataFrame index is written as the first column.
results.to_csv("../data/cricket.csv")

In [21]:
# 25 requests against r/soccer, with `after` stepping from 10d to 250d in 10-day increments.
results_soccer = query_pushshift("soccer", n=25, day_window = 10)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=10d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=20d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=40d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=50d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=70d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=80d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=soccer&size=500&

In [22]:
# (rows, columns) of the collected soccer posts
results_soccer.shape

(2059, 9)

In [23]:
# Save the soccer posts; with to_csv defaults the DataFrame index is written as the first column.
results_soccer.to_csv("../data/soccer.csv")