# Data Collection

In [1]:
import json # JSON manipulation
import time
from pathlib import Path # Used to create the output folder before writing the CSV

import pandas as pd
import requests # Pushshift accesses Reddit via a url so this is needed

## I. Creating Function

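Define a function that pulls submissions from the Pushshift API for a given subreddit and date range. The remaining query parameters (result size, returned fields, and minimum comment count) are set inside the function.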

In [2]:
# Function to send submission-search requests to the Pushshift API. Code courtesy breakfast hour.
def get_pushshift_data(start, end, subreddit, size=100, min_comments=25, timeout=30):
    """Fetch submissions for one subreddit and time window from Pushshift.

    Parameters
    ----------
    start : str or int
        Lower bound of the window; sent as the ``after`` query parameter.
    end : str or int
        Upper bound of the window; sent as the ``before`` query parameter.
    subreddit : str
        Subreddit name; sent as the ``subreddit`` query parameter.
    size : int, default 100
        Number of submissions requested; sent as the ``size`` query parameter.
        A single call returns whatever the server sends for that size, so a
        busy window may be truncated.
    min_comments : int, default 25
        Only submissions with more than this many comments are requested
        (sent as ``num_comments='>N'``).
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per entry in the response's ``data`` list.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. rate limiting).
    KeyError
        If the JSON payload has no ``data`` key.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'

    # Query parameters; 'filter' restricts the fields returned for each post.
    params = {'subreddit' : subreddit,
              'size' : size,
              'after' : start,
              'before' : end,
              'filter' : ['created_utc', 'subreddit', 'title', 'num_comments', 'score'],
              'num_comments' : f'>{min_comments}'}

    res = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    res.raise_for_status()

    # The submissions live under the top-level 'data' key of the JSON payload.
    posts = res.json()['data']
    return pd.DataFrame(posts)

## II. Creating Multiple DataFrames and Combining to Get Around the Per-Request Result Limit

In [3]:
# r/NFL: one pull per day for Oct 1-3, 2021. The remaining days are fetched in the
# following cells so the requests are spread out rather than fired from a single cell.
nfl1 = get_pushshift_data(start='2021-10-01', end='2021-10-02', subreddit='NFL')
nfl2 = get_pushshift_data(start='2021-10-02', end='2021-10-03', subreddit='NFL')
nfl3 = get_pushshift_data(start='2021-10-03', end='2021-10-04', subreddit='NFL')

In [4]:
# r/NFL: one pull per day for Oct 4-6, 2021
nfl4 = get_pushshift_data(start='2021-10-04', end='2021-10-05', subreddit='NFL')
nfl5 = get_pushshift_data(start='2021-10-05', end='2021-10-06', subreddit='NFL')
nfl6 = get_pushshift_data(start='2021-10-06', end='2021-10-07', subreddit='NFL')

In [5]:
# r/NFL: one pull per day for Oct 7-10, 2021
nfl7 = get_pushshift_data(start='2021-10-07', end='2021-10-08', subreddit='NFL')
nfl8 = get_pushshift_data(start='2021-10-08', end='2021-10-09', subreddit='NFL')
nfl9 = get_pushshift_data(start='2021-10-09', end='2021-10-10', subreddit='NFL')
nfl10 = get_pushshift_data(start='2021-10-10', end='2021-10-11', subreddit='NFL')

In [7]:
# r/fantasyfootball: one pull per day for Oct 1-3, 2021. The remaining days are fetched in
# the following cells so the requests are spread out rather than fired from a single cell.
ff1 = get_pushshift_data(start='2021-10-01', end='2021-10-02', subreddit='fantasyfootball')
ff2 = get_pushshift_data(start='2021-10-02', end='2021-10-03', subreddit='fantasyfootball')
ff3 = get_pushshift_data(start='2021-10-03', end='2021-10-04', subreddit='fantasyfootball')

In [8]:
# r/fantasyfootball: one pull per day for Oct 4-6, 2021
ff4 = get_pushshift_data(start='2021-10-04', end='2021-10-05', subreddit='fantasyfootball')
ff5 = get_pushshift_data(start='2021-10-05', end='2021-10-06', subreddit='fantasyfootball')
ff6 = get_pushshift_data(start='2021-10-06', end='2021-10-07', subreddit='fantasyfootball')

In [9]:
# r/fantasyfootball: one pull per day for Oct 7-10, 2021
ff7 = get_pushshift_data(start='2021-10-07', end='2021-10-08', subreddit='fantasyfootball')
ff8 = get_pushshift_data(start='2021-10-08', end='2021-10-09', subreddit='fantasyfootball')
ff9 = get_pushshift_data(start='2021-10-09', end='2021-10-10', subreddit='fantasyfootball')
ff10 = get_pushshift_data(start='2021-10-10', end='2021-10-11', subreddit='fantasyfootball')

In [10]:
# Spot check: display the first r/NFL pull (2021-10-01 to 2021-10-02) to confirm the
# request came back as a populated DataFrame with the expected columns
nfl1

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1633046573,61,1,nfl,Lamar Jackson Hate
1,1633046634,103,1,nfl,[Lombardi] 49ers continue advocating for a rul...
2,1633049017,37,1,nfl,Is it just a coincidence that Golden Tate and ...
3,1633049545,62,1,nfl,[Highlight] James Robinson 6-YD TD Run. Jags I...
4,1633049571,30,1,nfl,Qb vs oline
...,...,...,...,...,...
95,1633118382,88,1,nfl,Penei Sewell: My play has not been good enough
96,1633118779,32,1,nfl,Sports books: Brady-Belichick showdown most he...
97,1633119243,58,1,nfl,[Schefter] Bucs officially listing TE Rob Gron...
98,1633119413,53,1,nfl,[ProFootballTalk] Josh Gordon won't play this ...


In [11]:
# Spot check: display the first r/fantasyfootball pull (2021-10-01 to 2021-10-02) to confirm
# the request came back as a populated DataFrame with the expected columns
ff1

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1633046415,8990,1,fantasyfootball,THURSDAY NIGHT GAMETHREAD - WEEK 4
1,1633047722,44,1,fantasyfootball,Who Did You Bench For Rondale Moore in Week 3?
2,1633047821,32,1,fantasyfootball,Where did you buy your championship ring/troph...
3,1633048036,134,1,fantasyfootball,D.J. Chark is down for the #Jaguars and it doe...
4,1633048054,48,1,fantasyfootball,Jaguars WR D.J. Chark being carted off after g...
...,...,...,...,...,...
95,1633112630,130,1,fantasyfootball,Matt Nagy says the starting QB will be a game ...
96,1633112795,54,1,fantasyfootball,Reid says #Chiefs Josh Gordon won't play this ...
97,1633113128,32,1,fantasyfootball,Sean McVay says that he anticipates RB Darrell...
98,1633113198,107,1,fantasyfootball,[Schefter] Rams’ HC Sean McVay said RB Darrell...


In [12]:
# Concat the daily pulls for each subreddit into one combined DataFrame per subreddit.
# ignore_index=True renumbers the rows 0..n-1; without it each daily frame keeps its own
# 0-based labels, so the combined frame would contain duplicate index labels.
full_nfl_df = pd.concat([nfl1, nfl2, nfl3, nfl4, nfl5, nfl6, nfl7, nfl8, nfl9, nfl10], ignore_index=True)
full_ff_df = pd.concat([ff1, ff2, ff3, ff4, ff5, ff6, ff7, ff8, ff9, ff10], ignore_index=True)

In [13]:
# Concat both subreddits into the full DataFrame. ignore_index=True gives every row a
# unique label instead of repeating the labels carried over from the two input frames.
reddit_df = pd.concat([full_nfl_df, full_ff_df], ignore_index=True)
reddit_df.head(2)

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1633046573,61,1,nfl,Lamar Jackson Hate
1,1633046634,103,1,nfl,[Lombardi] 49ers continue advocating for a rul...


In [14]:
# Binary target column: 1 where the row's subreddit is exactly 'nfl', 0 for every other row
reddit_df['is_nfl'] = reddit_df['subreddit'].eq('nfl').astype('int64')

In [15]:
# Checking that the is_nfl target column was added: show the first two rows
reddit_df.head(2)

Unnamed: 0,created_utc,num_comments,score,subreddit,title,is_nfl
0,1633046573,61,1,nfl,Lamar Jackson Hate,1
1,1633046634,103,1,nfl,[Lombardi] 49ers continue advocating for a rul...,1


In [16]:
# Create CSV from final DF. to_csv does not create missing folders, so make sure
# ./data exists first; otherwise this cell fails on a fresh checkout after all the pulls.
output_path = Path('./data/reddit.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
reddit_df.to_csv(output_path, index = False)