# Reddit Data
### Step 1. Read in Data

In [1]:
# standard imports
import pandas as pd
import requests # Pushshift accesses Reddit via a url so this is needed
import json # JSON manipulation

In [69]:
# Pushshift submission-search endpoint, shared by both requests below
url = 'https://api.pushshift.io/reddit/search/submission/'

# One query per subreddit being compared
params_sr1 = {'subreddit': 'C_S_T'}
params_sr2 = {'subreddit': 'conspiracy'}

# A timeout keeps a stalled connection from hanging the kernel indefinitely
response_sr1 = requests.get(url, params=params_sr1, timeout=30)
response_sr2 = requests.get(url, params=params_sr2, timeout=30)

# Fail loudly on a 4xx/5xx reply instead of relying on a manual status_code peek
response_sr1.raise_for_status()
response_sr2.raise_for_status()

In [70]:
# r/C_S_T: decode the response body, then tabulate the records under its 'data' key
data_sr1 = response_sr1.json()
posts_sr1 = pd.DataFrame(data=data_sr1['data'])

# r/conspiracy: same two steps
data_sr2 = response_sr2.json()
posts_sr2 = pd.DataFrame(data=data_sr2['data'])

# Column overview of the r/conspiracy frame
posts_sr2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 69 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   all_awardings                25 non-null     object 
 1   allow_live_comments          25 non-null     bool   
 2   author                       25 non-null     object 
 3   author_flair_css_class       0 non-null      object 
 4   author_flair_richtext        25 non-null     object 
 5   author_flair_text            0 non-null      object 
 6   author_flair_type            25 non-null     object 
 7   author_fullname              25 non-null     object 
 8   author_is_blocked            25 non-null     bool   
 9   author_patreon_flair         25 non-null     bool   
 10  author_premium               25 non-null     bool   
 11  awarders                     25 non-null     object 
 12  can_mod_post                 25 non-null     bool   
 13  contest_mode          

In [71]:
# Wrap the request-and-parse steps above in a reusable function

# Function is a variation on Katie Sylvia's BreakFast Hour Instruction
def get_pushshift_data(year, subreddit, size=100, timeout=30):
    """Fetch one page of submissions from `subreddit` for a calendar year.

    Sends a single GET request to the Pushshift submission-search endpoint
    with `after` set to January 1 of `year` and `before` set to December 31
    of `year` (October 8 when `year` is 2021), then loads the records found
    under the response's 'data' key into a DataFrame.

    Parameters
    ----------
    year : int or str
        Calendar year to query, e.g. 2019 or '2019'.
    subreddit : str
        Subreddit name passed through as the `subreddit` query parameter.
    size : int, default 100
        Value sent as the `size` query parameter (posts requested per call).
    timeout : float, default 30
        Seconds to wait for the server before `requests` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per record in the response's 'data' list. The request asks
        the API to limit fields to subreddit, title, selftext, num_comments
        and score via the `filter` parameter.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    KeyError
        If the decoded JSON has no 'data' key.
    """
    year = str(year)

    # 2021 is capped at October 8 (presumably the collection date -- confirm)
    start = year + '-01-01'
    end = year + ('-10-08' if year == '2021' else '-12-31')

    url = 'https://api.pushshift.io/reddit/search/submission/'
    params = {
        'subreddit': subreddit,
        'size': size,
        'after': start,
        'before': end,
        'filter': ['subreddit', 'title', 'selftext', 'num_comments', 'score'],
        # NOTE(review): requests serialises this as selftext=True; confirm the
        # API reads that as "has selftext" and not as a search for the word "True".
        'selftext': True,
    }

    res = requests.get(url, params=params, timeout=timeout)
    # Surface rate-limit / server errors here rather than as a confusing
    # JSON-decode or KeyError failure further down.
    res.raise_for_status()

    posts = res.json()['data']
    return pd.DataFrame(posts)

In [74]:
# Fetch one page of r/C_S_T posts per year, newest year first
cst_years = (2021, 2020, 2019, 2018, 2017)
cst_2021, cst_2020, cst_2019, cst_2018, cst_2017 = [
    get_pushshift_data(yr, 'C_S_T') for yr in cst_years
]

# Stack the yearly frames oldest-to-newest; concat keeps each frame's own row labels
cst_df = pd.concat([cst_2017, cst_2018, cst_2019, cst_2020, cst_2021])
cst_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 99
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   num_comments  500 non-null    int64 
 1   score         500 non-null    int64 
 2   selftext      500 non-null    object
 3   subreddit     500 non-null    object
 4   title         500 non-null    object
dtypes: int64(2), object(3)
memory usage: 23.4+ KB


In [76]:
# Fetch one page of r/conspiracy posts per year, newest year first
con_years = (2021, 2020, 2019, 2018, 2017)
con_2021, con_2020, con_2019, con_2018, con_2017 = [
    get_pushshift_data(yr, 'conspiracy') for yr in con_years
]

# Stack the yearly frames oldest-to-newest; concat keeps each frame's own row labels
con_df = pd.concat([con_2017, con_2018, con_2019, con_2020, con_2021])
con_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 99
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   num_comments  500 non-null    int64 
 1   score         500 non-null    int64 
 2   selftext      500 non-null    object
 3   subreddit     500 non-null    object
 4   title         500 non-null    object
dtypes: int64(2), object(3)
memory usage: 23.4+ KB


In [77]:
# Stack both subreddits' posts into a single frame
subreddits = pd.concat(objs=[cst_df, con_df])

# Write the combined posts to disk without the row labels
subreddits.to_csv(path_or_buf='./data/subreddits.csv', index=False)
