# Data Collection

In [130]:
import time
import pandas as pd
import requests # Pushshift accesses Reddit via a url so this is needed
import json # JSON manipulation

## I. Creating Function

Creating a function to pull data from pushshift and define additional parameters.

In [191]:
def get_pushshift_data(start, end, subreddit):


    # URL
    url = 'https://api.pushshift.io/reddit/search/submission'
    # Add params
    params = {'subreddit' : subreddit, 
              'size' : 100, 
              'after' : start, 
              'before' : end,
              'filter' : ['created_utc', 'subreddit', 'title', 'num_comments', 'score'], 
              'num_comments' : '>25'}
     
    # Get the data
    res = requests.get(url, params)
    # Convert the request into a list of dict objects
    data = res.json()
    # Go deeper
    posts = data['data']
    # Turn into DataFrame
    df = pd.DataFrame(posts)
    return df

## II. Creating Multiple DataFrames and Combining to Get Around Pull Request Limit

In [192]:
# Create DataFrames for months in 2020
nfl_aug = get_pushshift_data('2020-08-01', '2020-08-30', 'NFL')
nfl_sep = get_pushshift_data('2020-09-01', '2020-09-30', 'NFL')
nfl_oct = get_pushshift_data('2020-10-01', '2020-10-30', 'NFL')

In [193]:
nfl_nov = get_pushshift_data('2020-11-01', '2020-11-30', 'NFL')
nfl_dec = get_pushshift_data('2020-12-01', '2020-12-30', 'NFL')

In [194]:
# Pick another subreddit and do the same!
ff_aug = get_pushshift_data('2020-08-01', '2020-08-30', 'fantasyfootball')
ff_sep = get_pushshift_data('2020-09-01', '2020-09-30', 'fantasyfootball')
ff_oct = get_pushshift_data('2020-10-01', '2020-10-30', 'fantasyfootball')

In [195]:
ff_nov = get_pushshift_data('2020-11-01', '2020-11-30', 'fantasyfootball')
ff_dec = get_pushshift_data('2020-12-01', '2020-12-30', 'fantasyfootball')

In [196]:
nfl_nov

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1604191395,108,1,nfl,What do you call a lineman that can play all 5...
1,1604191916,67,1,nfl,Who’s in your dream O-Line?
2,1604193001,331,1,nfl,[Le'Veon Bell] excited for tomorrow. first hom...
3,1604195555,111,1,nfl,Bit of running backs matter content: The Detro...
4,1604197702,186,1,nfl,"Jon Bostic fined only $12,000 for hit that kno..."
...,...,...,...,...,...
95,1604264105,28,1,nfl,This is the first 4 game losing streak for the...
96,1604264113,5851,1,nfl,Game Thread: San Francisco 49ers (4-3) at Seat...
97,1604264264,38,1,nfl,[Highlight] Cam Newton loses the football on t...
98,1604264272,37,1,nfl,Comparing Trevor Lawrence and Justin Fields to...


In [197]:
ff_dec

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1606780931,63,1,fantasyfootball,How would you handle the provisional players g...
1,1606780955,40,1,fantasyfootball,"Official: [Add/Drop] - Mon Evening, 11/30/2020"
2,1606780964,41,1,fantasyfootball,"Official: [Monday Miracle] - Mon Evening, 11/3..."
3,1606780970,29,1,fantasyfootball,Official: [Simple Questions and League Issues]...
4,1606780974,55,1,fantasyfootball,"Official: [Trade] - Mon Evening, 11/30/2020"
...,...,...,...,...,...
95,1606862270,27,1,fantasyfootball,Did the Ravens already get there tests back fo...
96,1606862663,50,1,fantasyfootball,Hopkins ROS?
97,1606863148,103,1,fantasyfootball,Kittle 'very optimistic' about returning in 2020
98,1606863286,46,1,fantasyfootball,Great news for the Ravens Steelers game tomorr...


In [198]:
ff_nov

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1604188819,93,1,fantasyfootball,"Lutz owners, are you dropping or holding him w..."
1,1604190127,52,1,fantasyfootball,Anyone else here starting Samaje Perine with c...
2,1604190847,147,1,fantasyfootball,Best Landing Spot for Will Fuller?
3,1604192479,117,1,fantasyfootball,Week 8 Fantasy Football Live Q&amp;A - Start/S...
4,1604194853,244,1,fantasyfootball,"Drop Your Kicker Week 8: Add Denzel Mims, you ..."
...,...,...,...,...,...
95,1604270779,30,1,fantasyfootball,Jonathan Taylor ROS - possible RBBC in Indy?
96,1604271325,32,1,fantasyfootball,The #49ers RB Tevin Coleman (knee) is question...
97,1604271455,96,1,fantasyfootball,Frank Reich Postgame interview on the RB situa...
98,1604271891,228,1,fantasyfootball,Who are your RB1’s


In [199]:
# Concat
full_nfl_df = pd.concat([nfl_aug, nfl_sep, nfl_oct, nfl_nov, nfl_nov])
full_ff_df = pd.concat([ff_aug, ff_sep, ff_oct, ff_nov, ff_nov])
full_ff_df

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1596243790,31,1,fantasyfootball,Opinion on Ertz
1,1596252321,29,1,fantasyfootball,August 4th is the opt out deadline. What big n...
2,1596279753,65,1,fantasyfootball,"Official: [Keeper] - Sat , 08/01/2020"
3,1596299348,52,1,fantasyfootball,Deshaun Watson says that his chemistry with Br...
4,1596302551,35,1,fantasyfootball,Idea: Can We Mock Draft together?
...,...,...,...,...,...
95,1604270779,30,1,fantasyfootball,Jonathan Taylor ROS - possible RBBC in Indy?
96,1604271325,32,1,fantasyfootball,The #49ers RB Tevin Coleman (knee) is question...
97,1604271455,96,1,fantasyfootball,Frank Reich Postgame interview on the RB situa...
98,1604271891,228,1,fantasyfootball,Who are your RB1’s


In [200]:
# Concat again
reddit_df = pd.concat([full_nfl_df, full_ff_df])
reddit_df.head(2)

Unnamed: 0,created_utc,num_comments,score,subreddit,title
0,1596241942,31,1,nfl,Why so many people say that Rice is the greate...
1,1596242800,129,1,nfl,"[Undisputed] ""I believe it's the greatest run ..."


In [201]:
# Create target column
reddit_df['is_nfl'] = [1 if i == 'nfl' else 0 for i in reddit_df['subreddit']]

In [202]:
reddit_df

Unnamed: 0,created_utc,num_comments,score,subreddit,title,is_nfl
0,1596241942,31,1,nfl,Why so many people say that Rice is the greate...,1
1,1596242800,129,1,nfl,"[Undisputed] ""I believe it's the greatest run ...",1
2,1596243198,164,1,nfl,"In history of NFL, only 9 players have ever wo...",1
3,1596247069,42,1,nfl,"Most valuable QBs in 2019, per PFF WAR ➤ Russe...",1
4,1596248203,74,1,nfl,"J.K. Dobbins shocked by draft fall, but thinks...",1
...,...,...,...,...,...,...
95,1604270779,30,1,fantasyfootball,Jonathan Taylor ROS - possible RBBC in Indy?,0
96,1604271325,32,1,fantasyfootball,The #49ers RB Tevin Coleman (knee) is question...,0
97,1604271455,96,1,fantasyfootball,Frank Reich Postgame interview on the RB situa...,0
98,1604271891,228,1,fantasyfootball,Who are your RB1’s,0


In [203]:
# Create CSV from final DF
reddit_df.to_csv('./data/reddit.csv', index = False)