In [7]:
import requests
import time
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
import seaborn as sns 

import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_colwidth', 999)


In [27]:
def get_data(subreddit, epoch_time, timeout=30):
    """Fetch one page of submissions for a subreddit from the Pushshift API.

    The query asks for up to 500 submissions flagged ``is_self=true`` that
    were created before ``epoch_time``.

    Parameters
    ----------
    subreddit : str
        Subreddit name, interpolated into the ``subreddit`` query parameter.
    epoch_time : int
        Unix timestamp, interpolated into the ``before`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        Whatever the JSON response stores under its ``'data'`` key.

    Raises
    ------
    requests.HTTPError
        If the server replies with a 4xx/5xx status code.
    KeyError
        If the decoded JSON has no ``'data'`` key.
    """
    url =f'https://api.pushshift.io/reddit/search/submission?subreddit={subreddit}&author!=[deleted]&size=500&is_self=true&before={epoch_time}'
    # Without a timeout a stalled connection would block the notebook forever.
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    res.raise_for_status()
    data = res.json()
    return data['data']

In [24]:
def get_posts(subreddit, post_count):
    """Collect submissions from a subreddit by paging backwards in time.

    Repeatedly calls ``get_data`` starting from the current time, each time
    moving the cursor to the ``created_utc`` of the last post received, until
    at least ``post_count`` posts have been gathered or no further progress
    can be made.

    Parameters
    ----------
    subreddit : str
        Subreddit name passed through to ``get_data``.
    post_count : int
        Minimum number of posts to try to collect. The result is not
        truncated, so it may hold more than this (pages are appended whole)
        or fewer (if the API runs out of posts).

    Returns
    -------
    list
        The accumulated post dictionaries, in the order received.
    """
    all_posts = []
    epoch_time = int(time.time())

    while len(all_posts) < post_count:
        batch = get_data(subreddit, epoch_time)
        # An empty page means there is nothing older to fetch. Checking this
        # explicitly also avoids indexing into an empty list when the very
        # first request returns no posts.
        if not batch:
            break
        all_posts.extend(batch)

        oldest = all_posts[-1]['created_utc']
        # If the cursor did not move, another request would return the same
        # page again; stop instead of looping forever.
        if oldest == epoch_time:
            break
        epoch_time = oldest
    return all_posts

In [25]:
def filter_posts(all_posts):
    """Keep only posts that have a real author and usable body text.

    A post is kept when all of the following hold:

    * it has both an ``'author'`` and a ``'selftext'`` key;
    * the author is not a deleted/removed placeholder;
    * the selftext is non-empty and not a deleted/removed placeholder;
    * the selftext contains no ``http://`` or ``https://`` link.

    Parameters
    ----------
    all_posts : list of dict
        Post dictionaries as returned by ``get_posts``.

    Returns
    -------
    list of dict
        The posts that passed every check, in their original order.
    """
    # Placeholder values seen with and without square brackets.
    placeholders = ('deleted', 'removed', '[deleted]', '[removed]')

    final_posts = []
    for post in all_posts:
        # Both keys must be present. (`"author" and "selftext" in post` would
        # only test for "selftext", because a non-empty string is truthy.)
        if "author" not in post or "selftext" not in post:
            continue

        author = post['author']
        selftext = post['selftext']

        if author in placeholders:
            continue
        # `not selftext` covers the empty string and a missing (None) body.
        if not selftext or selftext in placeholders:
            continue
        if "http://" in selftext or "https://" in selftext:
            continue

        final_posts.append(post)

    return final_posts

### Obtaining Data:

#### Obtaining data from the r/mexicanfood subreddit:

Note that even after increasing the number of posts I request from the API for the r/mexicanfood subreddit, filtering for posts that have useful text leaves at most 1189 usable posts. Let's see whether this is enough data to draw meaningful insights.

In [12]:
real_mex = get_posts("mexicanfood", 3000)

In [13]:
len(real_mex)

1568

Filtering from the data we pulled:

In [14]:
vetted_mex = filter_posts(real_mex)

In [15]:
len(vetted_mex)

1189

In [16]:
mexican_food = pd.DataFrame(vetted_mex)

In [17]:
mexican_food.shape

(1189, 84)

#### Obtaining data from the r/tacobell subreddit:

To avoid unbalanced data, I make a smaller initial pull of posts from the r/tacobell subreddit and then keep about the same number of posts as I have for the r/mexicanfood subreddit.

In [28]:
tacobell_posts = get_posts("tacobell", 2500)

Filtering the data we pulled:

In [29]:
vetted_tacobell = filter_posts(tacobell_posts)

In [30]:
len(vetted_tacobell)

1931

In [31]:
# Keep only the first 1190 vetted posts so the tacobell sample is about the
# same size as the 1189 vetted mexicanfood posts (avoids unbalanced classes).
snip_tacobell = vetted_tacobell[0:1190]

In [32]:
taco_bell = pd.DataFrame(snip_tacobell)

In [33]:
taco_bell.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,whitelist_status,wls,link_flair_css_class,link_flair_template_id,link_flair_text,author_flair_background_color,author_flair_template_id,author_flair_text_color,author_cakeday,edited
0,[],False,mooshakez,,[],,text,t2_lw6ry694,False,False,...,all_ads,6,,,,,,,,
1,[],False,kevinmattress,,[],,text,t2_7og4ebf4,False,False,...,all_ads,6,one,97b37a16-d95b-11e3-a816-12313b0a74a7,Discussion,,,,,
2,[],False,improvius,,[],,text,t2_1107vz,False,False,...,all_ads,6,,7a19695c-13bc-11e9-a8c6-0e11312a4076,TB App/Website,,,,,
3,[],False,lostsurfer24t,,[],,text,t2_jyfu21b0,False,False,...,all_ads,6,,,,,,,,
4,[],False,bannanmonk,,[],,text,t2_2bpvfqav,False,False,...,all_ads,6,,,,,,,,


#### Create a column specifying which subreddit the text comes from

In [34]:
taco_bell['subreddit'] = 'tacobell'

In [35]:
mexican_food['subreddit'] = 'mexicanfood'

#### Check for duplicates

In [36]:
taco_bell['selftext'].nunique()

1187

In [37]:
# Drop rows whose selftext repeats an earlier row. Rebinding the result
# (rather than inplace=True) keeps the step explicit and chainable.
taco_bell = taco_bell.drop_duplicates('selftext')

In [38]:
mexican_food['selftext'].nunique()

1185

In [39]:
# Drop rows whose selftext repeats an earlier row. Rebinding the result
# (rather than inplace=True) keeps the step explicit and chainable.
mexican_food = mexican_food.drop_duplicates('selftext')

#### Export my dataframes

In [40]:
taco_bell.to_csv('./Data/tacobell.csv', index=False)

In [41]:
mexican_food.to_csv('./Data/mex_food.csv', index=False)