# Data Collection Notebook

We will be using Pushshift, a free and open source API that allows users to access data from Reddit.

## Imports

In [33]:
import time
from pathlib import Path

import pandas as pd
import requests

## Scraping Function

In [34]:
# NOTE: I've had no issues with the number of requests/sec, but if you do, you can
# throttle your requests by passing a positive `pause` (seconds slept after each request)

def get_posts(subreddit: str, n: int, params: dict, pause: float = 0.0) -> pd.DataFrame:
    """Download posts from a subreddit using the Pushshift API.

    After each batch, the smallest ``created_utc`` in that batch is sent as the
    ``before`` parameter of the next request, so successive requests walk
    backwards in time.

    Args:
        subreddit (str): subreddit to scrape
        n (int): maximum number of requests to make (up to 500 posts per request)
        params (dict): extra query parameters to pass to the API, excluding
            ``subreddit`` and ``size``. The dict is left unmodified.
        pause (float): seconds to sleep after each request; 0 (default) disables
            throttling.

    Returns:
        pd.DataFrame: every downloaded post, de-duplicated on ``title`` and
        re-indexed from 0. An empty frame if the API returned no posts.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
    """
    base_url = 'https://api.pushshift.io/reddit/search/submission'
    # Build the query on a private copy so the caller's dict never receives
    # the `before` cursor that pagination adds below.
    query = {**params, 'subreddit': subreddit, 'size': 500}

    dfs = []
    for _ in range(n):
        # let requests assemble (and URL-encode) the query string
        response = requests.get(base_url, params=query)
        # surface HTTP failures here rather than as a confusing JSON/KeyError later
        response.raise_for_status()
        batch = pd.DataFrame(response.json()['data'])

        # stop as soon as the API has nothing more to return
        if batch.empty:
            break
        dfs.append(batch)

        # the oldest post of this batch becomes the upper time bound of the next one
        # NOTE(review): this cursor only covers everything if each batch holds the
        # newest remaining posts; with another sort order (e.g. via `sort_type`)
        # batches could skip posts -- confirm against the API behaviour.
        query['before'] = batch['created_utc'].min()

        # optional throttling between requests
        if pause > 0:
            time.sleep(pause)

    # pd.concat raises on an empty list, so handle "no posts at all" explicitly
    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs).drop_duplicates(subset='title').reset_index(drop=True)

### Getting r/Conspiracy posts

In [35]:
# Query settings for the r/conspiracy download
subreddit, n = 'conspiracy', 50
params = dict(sort_type='score', metadata=False, after='365d')

In [36]:
# Fetch the r/conspiracy posts using the settings defined in the previous cell
consp = get_posts(subreddit=subreddit, n=n, params=params)

In [37]:
# Sanity-check the download: frame dimensions first, then a one-row preview
print(consp.shape)
consp.head(1)

(23894, 97)


Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,...,utc_datetime_str,url_overridden_by_dest,media_metadata,author_cakeday,is_gallery,gallery_data,link_flair_template_id,edited_on,crosspost_parent_list,crosspost_parent
0,conspiracy,Video is here: [https://youtu.be/tCuIxIJBfCY]...,t2_pa2n77y,0,Veteran's Advocate and 2A supporter Jon Stewar...,[],r/conspiracy,False,0.0,,...,2023-03-05 21:16:11,,,,,,,,,


In [38]:
# Column names of the downloaded frame
consp.columns

Index(['subreddit', 'selftext', 'author_fullname', 'gilded', 'title',
       'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls',
       'link_flair_css_class', 'thumbnail_height', 'top_awarded_type',
       'hide_score', 'quarantine', 'link_flair_text_color', 'upvote_ratio',
       'author_flair_background_color', 'subreddit_type',
       'total_awards_received', 'media_embed', 'thumbnail_width',
       'author_flair_template_id', 'is_original_content', 'secure_media',
       'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed',
       'link_flair_text', 'score', 'is_created_from_ads_ui', 'author_premium',
       'thumbnail', 'edited', 'author_flair_css_class',
       'author_flair_richtext', 'gildings', 'post_hint', 'content_categories',
       'is_self', 'link_flair_type', 'wls', 'removed_by_category',
       'author_flair_type', 'domain', 'allow_live_comments', 'suggested_sort',
       'view_count', 'archived', 'no_follow', 'is_crosspostable', 'pinne

In [39]:
# Keep only the text fields and label every row as a conspiracy post.
# `.assign` returns a new frame, so the label is never written onto a
# column subset (which is what triggers SettingWithCopyWarning).
consp = consp[['selftext', 'title']].assign(conspiracy=1)

In [40]:
# Count missing values per column
consp.isna().sum()

selftext      0
title         0
conspiracy    0
dtype: int64

In [41]:
# Column dtypes after selecting and labelling
consp.dtypes

selftext      object
title         object
conspiracy     int64
dtype: object

In [42]:
# drop rows whose body is empty or is the '[removed]' placeholder
blank = consp.selftext.eq('')
removed = consp.selftext.eq('[removed]')
consp = consp.loc[~blank & ~removed]
consp.shape

(8006, 3)

### Getting r/AskPhilosophy posts

In [43]:
# Query settings for the r/askphilosophy download.
# NOTE(review): 'fields' is passed here but not in the r/conspiracy settings -- confirm it is intended.
subreddit, n = 'askphilosophy', 50
params = dict(sort_type='score', metadata=False, after='365d', fields='selftext')

In [44]:
# Fetch the r/askphilosophy posts using the settings defined in the previous cell
phil = get_posts(subreddit=subreddit, n=n, params=params)

In [45]:
# Sanity-check the download: frame dimensions first, then a one-row preview
print(phil.shape)
phil.head(1)

(5533, 91)


Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,...,media,is_video,retrieved_utc,updated_utc,utc_datetime_str,post_hint,preview,edited_on,link_flair_template_id,author_cakeday
0,askphilosophy,I've structed my life on a 'live and let live'...,t2_jrurtu4e,0,philosophical theory that addresses the idea t...,[],r/askphilosophy,False,6,,...,,False,1678047314,1678047315,2023-03-05 20:14:59,,,,,


In [46]:
# Column names of the downloaded frame
phil.columns

Index(['subreddit', 'selftext', 'author_fullname', 'gilded', 'title',
       'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls',
       'link_flair_css_class', 'thumbnail_height', 'top_awarded_type',
       'hide_score', 'quarantine', 'link_flair_text_color', 'upvote_ratio',
       'author_flair_background_color', 'subreddit_type',
       'total_awards_received', 'media_embed', 'thumbnail_width',
       'author_flair_template_id', 'is_original_content', 'secure_media',
       'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed',
       'link_flair_text', 'score', 'is_created_from_ads_ui', 'author_premium',
       'thumbnail', 'edited', 'author_flair_css_class',
       'author_flair_richtext', 'gildings', 'content_categories', 'is_self',
       'link_flair_type', 'wls', 'removed_by_category', 'author_flair_type',
       'domain', 'allow_live_comments', 'suggested_sort', 'view_count',
       'archived', 'no_follow', 'is_crosspostable', 'pinned', 'over_18'

In [47]:
# Keep only the text fields and label every row as a non-conspiracy post.
# `.assign` returns a new frame, so the label is never written onto a
# column subset (which is what triggers SettingWithCopyWarning).
phil = phil[['selftext', 'title']].assign(conspiracy=0)

In [48]:
# Count missing values per column
phil.isna().sum()

selftext      0
title         0
conspiracy    0
dtype: int64

In [49]:
# drop rows whose body is empty or is the '[removed]' placeholder
blank = phil.selftext.eq('')
removed = phil.selftext.eq('[removed]')
phil = phil.loc[~blank & ~removed]
phil.shape

(4450, 3)

In [50]:
# Stack both labelled frames and renumber the rows from 0
out = pd.concat([consp, phil], ignore_index=True)
out.shape

(12456, 3)

In [51]:
# Preview the first rows of the combined frame
out.head()

Unnamed: 0,selftext,title,conspiracy
0,Video is here: [https://youtu.be/tCuIxIJBfCY]...,Veteran's Advocate and 2A supporter Jon Stewar...,1
1,I've stumbled on a Youtube channel I think is ...,I think I found a propaganda Youtube Channel,1
2,"If you’re in the UK, you’ve probably seen the ...",Matt Hancock (UK senior politician turned show...,1
3,Anyone else get a 7 day ban from responding to...,Banned - Deleted topic,1
4,The FDA can 100% legally start genetically alt...,I learned the FDA is completely fine with comp...,1


### Some slight cleaning 

In [52]:
# Flatten newlines to spaces and lowercase the post body.
# regex=False makes the newline a literal match, so the result does not rely on
# the default of `regex`, which differs between pandas versions.
# Bracket assignment is used because attribute-style assignment only works for
# columns that already exist.
out['selftext'] = out['selftext'].str.replace('\n', ' ', regex=False).str.lower()

### Create a couple of features 

In [53]:
# character count and whitespace-delimited word count of each post body
out['post_length'] = out.selftext.map(len)
out['post_words'] = out.selftext.map(lambda body: len(body.split()))

In [54]:
# character count and whitespace-delimited word count of each title
out['title_length'] = out.title.map(len)
out['title_words'] = out.title.map(lambda heading: len(heading.split()))

## Save data

In [55]:
# Create the target folder first so this cell also works on a fresh checkout
# where `data/` does not exist yet (to_csv does not create directories).
out_path = Path('data/raw_data.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(out_path, index=False)