## 01 DATA ACQUISITION

In [1]:
# imports and libraries

import pandas as pd

import requests
import time
import pickle

### function to make df of subreddit data

In [11]:
def extract_reddit(res): # only make one df at a time
    """Build a DataFrame of post fields from one API response.

    Parameters
    ----------
    res : response object
        Must expose ``status_code`` and ``json()``; the JSON payload is
        expected to hold the list of posts under the ``'data'`` key.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns subreddit, title, selftext, score,
        upvote_ratio, num_comments and created_utc (in that order).
        The frame is empty (but still has these columns) when the status code
        is not 200 or when the response contains no posts. A field missing
        from every post in the page shows up as an all-NaN column.
    """
    columns = ['subreddit', 'title', 'selftext', 'score', 'upvote_ratio', 'num_comments',
               'created_utc']

    # Previously a non-200 response fell through to `return df` with `df` never
    # assigned (UnboundLocalError). Return an empty frame so callers can test `.empty`.
    if res.status_code != 200:
        return pd.DataFrame(columns=columns)

    posts = res.json()['data'] # posts is a list

    # Passing `columns=` selects the wanted fields in one step and, unlike
    # `pd.DataFrame(posts)[[...]]`, does not raise KeyError when the list is
    # empty or a field is absent from the page.
    return pd.DataFrame(posts, columns=columns)

### subreddit 1: witcher3

In [27]:
# Request one page with metadata enabled so the response reports how many
# posts the subreddit holds in total.
subreddit_1 = 'witcher3'
url = 'https://api.pushshift.io/reddit/search/submission/'
params = dict(subreddit=subreddit_1, size=100, metadata='true')

res = requests.get(url, params=params)

# total_results lives under the 'metadata' key of the JSON payload
posts_total = res.json()['metadata']['total_results']
print(f' total number of posts in {subreddit_1} subreddit is: {posts_total}')

 total number of posts in witcher3 subreddit is: 44073


In [29]:
# Download up to 5000 posts from the first subreddit, 100 per request.
subreddit_1 = 'witcher3'

url = 'https://api.pushshift.io/reddit/search/submission/'
params = {'subreddit': subreddit_1, 'size': 100} # params

# Collect every page in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies all previously collected rows on every iteration.
sub_1_pages = []

for page in range(50): # at most 50 requests * 100 posts = 5000 posts
    time.sleep(20) # wait 20 seconds before each request

    res = requests.get(url, params)
    if res.status_code != 200:
        # No usable payload; the next iteration retries with the same params.
        print(f'request {page} for {subreddit_1} returned status {res.status_code}; skipping')
        continue

    func_df = extract_reddit(res) # df made for this page
    if func_df.empty:
        break # nothing more to page through

    # Use the created_utc of the last row as the 'before' cursor for the next
    # request (presumably the oldest post of the page -- verify sort order).
    params['before'] = func_df.iloc[-1]['created_utc']
    sub_1_pages.append(func_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
sub_1_all_posts = pd.concat(sub_1_pages, ignore_index=True) if sub_1_pages else pd.DataFrame()

sub_1_all_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   subreddit     5000 non-null   object 
 1   title         5000 non-null   object 
 2   selftext      5000 non-null   object 
 3   score         5000 non-null   int64  
 4   upvote_ratio  5000 non-null   float64
 5   num_comments  5000 non-null   int64  
 6   created_utc   5000 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 273.6+ KB


In [31]:
# Show every post whose selftext is the '[removed]' or '[deleted]' placeholder.
# The recorded outputs go from 5000 rows to 4966 once these are dropped, i.e. 34 posts.
sub_1_all_posts[sub_1_all_posts['selftext'].isin(['[removed]', '[deleted]'])]

Unnamed: 0,subreddit,title,selftext,score,upvote_ratio,num_comments,created_utc
279,Witcher3,weapon repair kits,[deleted],1,1.0,0,1641784983
456,Witcher3,20 extra crowns don't hurt,[deleted],1,1.0,0,1641671355
606,Witcher3,I didn't know Red Bull existed in the Middle Age,[removed],1,1.0,0,1641587931
607,Witcher3,I didn't know Red Bull existed in the Middle Age,[removed],1,1.0,0,1641587900
1480,Witcher3,Is there any witcher 3 mod which can improve t...,[removed],1,1.0,0,1640890072
1552,Witcher3,Nice,[deleted],1,1.0,0,1640832986
1760,Witcher3,Can't find diagram for Griffin Silver Sword.,[deleted],1,1.0,0,1640663380
2402,Witcher3,Witcher 3 Spooky Mansion,[deleted],1,1.0,0,1640036465
2801,Witcher3,any news on witcher 4?,[removed],1,1.0,0,1639482129
3117,Witcher3,This sub makes me want to play Witcher 3 again,[deleted],3,1.0,0,1638986260


In [32]:
# Remove the posts whose selftext is '[removed]' or '[deleted]', then re-check the frame.
sub_1_gone = sub_1_all_posts['selftext'].isin(['[removed]', '[deleted]'])
sub_1_all_posts = sub_1_all_posts.drop(index=sub_1_all_posts.loc[sub_1_gone].index)

sub_1_all_posts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4966 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   subreddit     4966 non-null   object 
 1   title         4966 non-null   object 
 2   selftext      4966 non-null   object 
 3   score         4966 non-null   int64  
 4   upvote_ratio  4966 non-null   float64
 5   num_comments  4966 non-null   int64  
 6   created_utc   4966 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 310.4+ KB


In [33]:
# List rows with a missing selftext; the recorded output is empty, so there are none.
sub_1_all_posts.loc[sub_1_all_posts['selftext'].isna()]

Unnamed: 0,subreddit,title,selftext,score,upvote_ratio,num_comments,created_utc


In [77]:
# drop all null rows

# sub_1_all_posts.dropna(inplace= True)
# sub_1_all_posts.info()

In [34]:
# Preview the first rows of the witcher3 frame after the deleted/removed posts were dropped.
sub_1_all_posts.head()

Unnamed: 0,subreddit,title,selftext,score,upvote_ratio,num_comments,created_utc
0,Witcher3,Why did CD never patch the Wolf set bug?,This bug has always frustrated me. Why couldn'...,1,1.0,0,1642038723
1,Witcher3,The heart of the woods quest,"As you may remember/ know, the quest has two c...",1,1.0,0,1642037974
2,Witcher3,Bought the complete edition on PS4,I had this in Xbox but ended up moving and lef...,1,1.0,0,1642037613
3,Witcher3,This merchant will give you 10 crowns minimum ...,,1,1.0,0,1642035331
4,Witcher3,When will my winter berry’s grow back,Ice tried everything saving and quitting skipp...,1,1.0,0,1642032600


In [76]:
# Serialize the cleaned witcher3 frame into the dataframes folder.
witcher3_pickle_path = 'dataframes/witcher3_5000.pkl'

with open(witcher3_pickle_path, 'wb') as pickle_out:
    pickle.dump(sub_1_all_posts, pickle_out)

### subreddit 2: netflixwitcher

In [36]:
# Request one page with metadata enabled so the response reports how many
# posts the subreddit holds in total.
subreddit_2 = 'netflixwitcher'
url = 'https://api.pushshift.io/reddit/search/submission/'
params = dict(subreddit=subreddit_2, size=100, metadata='true')

res = requests.get(url, params=params)

# total_results lives under the 'metadata' key of the JSON payload
posts_total = res.json()['metadata']['total_results']
print(f' total number of posts in {subreddit_2} subreddit is: {posts_total}')

 total number of posts in netflixwitcher subreddit is: 13566


In [37]:
# Download up to 5000 posts from the second subreddit, 100 per request.
subreddit_2 = 'netflixwitcher'

url = 'https://api.pushshift.io/reddit/search/submission/'
params = {'subreddit': subreddit_2, 'size': 100} # params

# Collect every page in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies all previously collected rows on every iteration.
sub_2_pages = []

for page in range(50): # at most 50 requests * 100 posts = 5000 posts
    time.sleep(20) # wait 20 seconds before each request

    res = requests.get(url, params)
    if res.status_code != 200:
        # No usable payload; the next iteration retries with the same params.
        print(f'request {page} for {subreddit_2} returned status {res.status_code}; skipping')
        continue

    func_df = extract_reddit(res) # df made for this page
    if func_df.empty:
        break # nothing more to page through

    # Use the created_utc of the last row as the 'before' cursor for the next
    # request (presumably the oldest post of the page -- verify sort order).
    params['before'] = func_df.iloc[-1]['created_utc']
    sub_2_pages.append(func_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
sub_2_all_posts = pd.concat(sub_2_pages, ignore_index=True) if sub_2_pages else pd.DataFrame()

sub_2_all_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4995 entries, 0 to 4994
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   subreddit     4995 non-null   object 
 1   title         4995 non-null   object 
 2   selftext      4991 non-null   object 
 3   score         4995 non-null   int64  
 4   upvote_ratio  4995 non-null   float64
 5   num_comments  4995 non-null   int64  
 6   created_utc   4995 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 273.3+ KB


In [39]:
# Show every post whose selftext is the '[removed]' or '[deleted]' placeholder
# (176 posts: the recorded outputs go from 4995 rows to 4819 once they are dropped).
sub_2_all_posts[sub_2_all_posts['selftext'].isin(['[removed]', '[deleted]'])]

Unnamed: 0,subreddit,title,selftext,score,upvote_ratio,num_comments,created_utc
17,netflixwitcher,Guys how many seasons do you think they will r...,[removed],1,1.00,1,1641935999
59,netflixwitcher,So some problems with s2,[removed],1,1.00,0,1641765253
96,netflixwitcher,Just finished the show and I'm really conflicted.,[removed],1,1.00,0,1641596447
217,netflixwitcher,A rewrite of season 2 - What could've been a g...,[removed],1,1.00,0,1641170501
221,netflixwitcher,"The characters in this show don't act, behave,...",[removed],1,1.00,0,1641167105
...,...,...,...,...,...,...,...
4908,netflixwitcher,Would you have been okay with a Black or Asian...,[removed],0,0.17,3,1590959324
4923,netflixwitcher,Henry Cavill,[deleted],1,1.00,0,1590768757
4926,netflixwitcher,Hey friends don't be bored watch Netflix [[free]],[removed],1,1.00,1,1590688710
4980,netflixwitcher,Filavandrel meme lord,[deleted],1,1.00,2,1589492028


In [40]:
# Remove the posts whose selftext is '[removed]' or '[deleted]', then re-check the frame.
sub_2_gone = sub_2_all_posts['selftext'].isin(['[removed]', '[deleted]'])
sub_2_all_posts = sub_2_all_posts.drop(index=sub_2_all_posts.loc[sub_2_gone].index)

sub_2_all_posts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4819 entries, 0 to 4994
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   subreddit     4819 non-null   object 
 1   title         4819 non-null   object 
 2   selftext      4815 non-null   object 
 3   score         4819 non-null   int64  
 4   upvote_ratio  4819 non-null   float64
 5   num_comments  4819 non-null   int64  
 6   created_utc   4819 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 301.2+ KB


In [41]:
# List the rows whose selftext is missing.
sub_2_all_posts.loc[sub_2_all_posts['selftext'].isna()]

Unnamed: 0,subreddit,title,selftext,score,upvote_ratio,num_comments,created_utc
1717,netflixwitcher,She looks great,,1,1.0,0,1639072007
1741,netflixwitcher,The show isn't slavic enough,,0,0.25,2,1638916457
2296,netflixwitcher,"""USA"" FREE 1 YEAR NETFLIX ACCOUNT",,1,1.0,2,1633560282
2681,netflixwitcher,"Hi guys, iam big fan of the witcher only iam a...",,1,1.0,2,1628779571


In [42]:
# Discard every row that has a null in any column; in the recorded run this
# removes the 4 posts with a missing selftext (4819 -> 4815 rows).
sub_2_all_posts = sub_2_all_posts.dropna(axis=0, how='any')
sub_2_all_posts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4815 entries, 0 to 4994
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   subreddit     4815 non-null   object 
 1   title         4815 non-null   object 
 2   selftext      4815 non-null   object 
 3   score         4815 non-null   int64  
 4   upvote_ratio  4815 non-null   float64
 5   num_comments  4815 non-null   int64  
 6   created_utc   4815 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 300.9+ KB


In [78]:
# Serialize the cleaned netflixwitcher frame into the dataframes folder.
netflixwitcher_pickle_path = 'dataframes/netflixwitcher_5000.pkl'

with open(netflixwitcher_pickle_path, 'wb') as pickle_out:
    pickle.dump(sub_2_all_posts, pickle_out)

### combine (concatenate) dataframes

In [45]:
# Stack the two cleaned subreddit frames row-wise into a single frame.
# Each frame keeps its own row labels, so labels repeat across the two halves.
all_witcher_5000 = pd.concat(objs=[sub_1_all_posts, sub_2_all_posts], axis=0)

In [46]:
# Summarize the combined frame: row count, columns, non-null counts and dtypes.
all_witcher_5000.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9781 entries, 0 to 4994
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   subreddit     9781 non-null   object 
 1   title         9781 non-null   object 
 2   selftext      9781 non-null   object 
 3   score         9781 non-null   int64  
 4   upvote_ratio  9781 non-null   float64
 5   num_comments  9781 non-null   int64  
 6   created_utc   9781 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 611.3+ KB


In [47]:
# created_utc was only used as the paging cursor while downloading, so drop it now.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
all_witcher_5000 = all_witcher_5000.drop(columns=['created_utc'], errors='ignore')

In [73]:
# Display the combined frame (posts from both subreddits).
all_witcher_5000

Unnamed: 0,subreddit,title,selftext,score,upvote_ratio,num_comments
0,Witcher3,Why did CD never patch the Wolf set bug?,This bug has always frustrated me. Why couldn'...,1,1.0,0
1,Witcher3,The heart of the woods quest,"As you may remember/ know, the quest has two c...",1,1.0,0
2,Witcher3,Bought the complete edition on PS4,I had this in Xbox but ended up moving and lef...,1,1.0,0
3,Witcher3,This merchant will give you 10 crowns minimum ...,,1,1.0,0
4,Witcher3,When will my winter berry’s grow back,Ice tried everything saving and quitting skipp...,1,1.0,0
...,...,...,...,...,...,...
4990,netflixwitcher,Ciri waiting for her skype partner to show up ...,,1,1.0,0
4991,netflixwitcher,Freya Allan,Has anyone seen Freya Allan's Instagram? I'm a...,1,1.0,13
4992,netflixwitcher,Some will say the artistic direction of the sh...,,1,1.0,69
4993,netflixwitcher,Some will say the artistic direction of the sh...,&amp;#x200B;\n\nhttps://preview.redd.it/25979h...,1,1.0,2


In [75]:
# Serialize the combined frame into the dataframes folder.
all_witcher_pickle_path = 'dataframes/all_witcher_5000.pkl'

with open(all_witcher_pickle_path, 'wb') as pickle_out:
    pickle.dump(all_witcher_5000, pickle_out)