# Web Scraping Using Pushshift API

In [1]:
import pandas as pd
import datetime as dt
import time
import requests

### Scraping Function (Written by Mahdi)

In [2]:
def _utc_date(epoch_seconds):
    """Convert a Unix epoch (seconds) to a UTC calendar date; missing values become NaT."""
    if pd.isna(epoch_seconds):
        return pd.NaT
    # An explicit tz keeps the result independent of the machine's local timezone.
    return dt.datetime.fromtimestamp(epoch_seconds, tz=dt.timezone.utc).date()


def query_pushshift(subreddit, kind='submission', hour_window=24, n=5, pause=6):
    """Scrape posts from one subreddit through the Pushshift search endpoint.

    Issues `n` requests. Request `i` (1-based) asks for up to 500 items created
    after `hour_window * i` hours ago, and the pages are stacked into one frame.

    Parameters
    ----------
    subreddit : str
        Subreddit name, inserted into the query string as-is.
    kind : str, default 'submission'
        Endpoint suffix placed after `/reddit/search/`.
    hour_window : int, default 24
        Number of hours the lookback grows by on each successive request.
    n : int, default 5
        Number of requests to make. Values below 1 make no requests.
    pause : float, default 6
        Seconds to wait between consecutive requests.

    Returns
    -------
    pandas.DataFrame
        For `kind == "submission"`: the SUBFIELDS columns, exact duplicate rows
        removed, only rows with `is_self == True`, plus a `timestamp` column.
        For any other kind: every returned field plus `timestamp`.
        `timestamp` is the UTC calendar date of `created_utc`. Rows are
        labelled 0..len-1. An empty frame (with the expected columns) is
        returned when nothing was fetched.

    Raises
    ------
    requests.HTTPError
        If a request comes back with a 4xx/5xx status.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}"  # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500"  # always pulling max of 500

    # NOTE(review): each request only sets a lower time bound, so successive
    # pages presumably overlap; the drop_duplicates below is what removes the
    # repeats for submissions — confirm against the Pushshift API docs.
    posts = []
    for i in range(1, n + 1):
        URL = "{}&after={}h".format(stem, hour_window * i)
        print("Querying from: " + URL)
        # A timeout keeps a stalled connection from hanging the whole scrape.
        response = requests.get(URL, timeout=30)
        # Explicit HTTP error instead of `assert`, which is stripped under -O.
        response.raise_for_status()
        page = response.json()['data']
        if page:  # skip empty pages so they cannot disturb the concat below
            posts.append(pd.DataFrame.from_dict(page))
        if i < n:  # no need to wait after the final request
            time.sleep(pause)

    # ignore_index avoids repeating each page's 0..499 labels in the result
    full = pd.concat(posts, sort=False, ignore_index=True) if posts else pd.DataFrame()

    if kind == "submission":
        # reindex (rather than full[SUBFIELDS]) tolerates an empty result or a
        # field missing from the response; absent columns are filled with NaN
        full = full.reindex(columns=SUBFIELDS).drop_duplicates()
        # keep self (text) posts only
        full = full.loc[full['is_self'].eq(True)]
    elif 'created_utc' not in full.columns:
        full = full.reindex(columns=[*full.columns, 'created_utc'])

    # assign() builds a new frame, so nothing is written onto a filtered slice
    full = full.assign(timestamp=full['created_utc'].map(_utc_date)).reset_index(drop=True)

    print("Query Complete!")
    return full

## Subreddit 1: Life Pro Tips

In [3]:
# 30 requests, each reaching 12 hours further back: 15 days worth of data
life_pro_tips = query_pushshift(
    'LifeProTips',
    kind='submission',
    hour_window=12,
    n=30,
)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=12h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=24h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=36h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=48h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=60h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=72h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=84h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=96h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=LifeProTips&size=500&after=108h
Querying from: https://api.pushshift.io/reddi

In [6]:
# (rows, columns) of the scraped LifeProTips frame
life_pro_tips.shape

(2677, 9)

## Subreddit 2: Unethical Life Pro Tips

In [16]:
# Deliberately over-collect Unethical LPT posts so that 'Request' posts can be removed later.
# 40 requests, each reaching one more week (168 hours) further back.
ue_life_pro_tips = query_pushshift(
    'UnethicalLifeProTips',
    kind='submission',
    hour_window=168,
    n=40,
)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=168h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=336h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=504h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=672h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=840h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=1008h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=1176h
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=UnethicalLifeProTips&size=500&after=1344h
Querying from: https://api.pushshift.io/reddit/search/submission?subr

In [17]:
# (rows, columns) of the scraped UnethicalLifeProTips frame
ue_life_pro_tips.shape

(3961, 9)

## Concatenate Subreddit Data Frames

In [18]:
# Stack both scrapes into a single frame. ignore_index=True relabels the rows
# 0..len-1 so that no index label is shared between the two source frames.
both_subreddits = pd.concat([life_pro_tips, ue_life_pro_tips], ignore_index=True)

In [19]:
# Share of rows per subreddit (normalize=True returns proportions instead of counts)
both_subreddits['subreddit'].value_counts(normalize = True)

UnethicalLifeProTips    0.596716
LifeProTips             0.403284
Name: subreddit, dtype: float64

Although the two classes are fairly imbalanced right now, the Unethical Life Pro Tips subreddit contains a lot of posts that are requests for advice rather than tips. I will remove these posts from the data frame, which will reduce the Unethical Life Pro Tips share.

## Export to csv

In [20]:
# Persist the combined scrape; the path is relative to the notebook's working directory.
# NOTE(review): to_csv writes the frame's index as the first CSV column by default —
# confirm that downstream readers expect that extra column.
both_subreddits.to_csv('../data/subreddits.csv')