# Reddit Project - 01: Pushshift Requests
***

## Import Packages

In [139]:
import requests
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import re
import regex
import time
import json

### Filename format log - function

In [152]:
def filename_format_log(file_path,
                        logfile='../data/file_log.txt',
                        now=None,
                        file_description=None):
    """Insert a timestamp into a file name and append an entry to a log file.

    The timestamp is inserted immediately before the first
    ``<lowercase>_<lowercase>`` run that is directly followed by a dot
    (e.g. ``../data/raw_posts.json`` -> ``../data/<now>_raw_posts.json``).

    Parameters
    ----------
    file_path : str
        Relative path that includes a file extension.
    logfile : str
        Path of the text log that receives one appended line per call.
    now : int, optional
        Epoch seconds to use as the stamp. Defaults to the current time,
        resolved on every call (a ``round(time.time())`` default in the
        signature would be frozen at function-definition time).
    file_description : str, optional
        Text written to the log next to the file name. Defaults to
        ``'Pull: <UTC time of now>'``.

    Returns
    -------
    tuple
        ``(formatted_name, now, file_description)``.

    Raises
    ------
    NameError
        If ``file_path`` has no extension, or contains no
        ``<lowercase>_<lowercase>.`` segment to anchor the timestamp.
    """
    if now is None:
        now = round(time.time())

    # A dot that is not the first character and is not part of a '..' run
    # is treated as the start of a file extension.
    if re.search(r'(?<!^)(?<!\.)\.(?!\.)', file_path) is None:
        raise NameError('Please enter a relative path with a file extension.')

    # Position where the timestamp is inserted.
    stamp_match = re.search(r'(?<!^)(?<!\.)[a-z]+_[a-z]+(?=\.)', file_path)
    if stamp_match is None:
        raise NameError(
            'File name must contain a lowercase "<word>_<word>" segment '
            'directly before the extension.'
        )
    stamp = stamp_match.start()

    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'
    if not file_description:
        file_description = f'Pull: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

### Reddit Query Function

In [196]:
# Module-level cursor shared by every reddit_query() call: it is sent as the
# `before` request parameter and overwritten with the `created_utc` of the
# last submission pulled. None means no `before` bound has been set yet.
before = None

In [199]:
# Function written in collaboration with Hovanes Gasparian and Rene Wilkening. May contain artifacts.
def reddit_query(subreddits, n_samples=1000, after=None):
    """Pull submissions from the Pushshift API, save them as JSON and log the file.

    Pages are requested until ``n_samples`` submissions are collected or the
    API returns an empty page. After each page the ``before`` cursor moves to
    the ``created_utc`` of the last submission received, so consecutive
    requests do not fetch the same page again.

    Parameters
    ----------
    subreddits : str
        Value sent as the ``subreddit`` request parameter; also used to build
        the output file name ``../data/raw_{subreddits}.json``.
    n_samples : int
        Number of submissions to collect.
    after : optional
        Value sent unchanged as the ``after`` request parameter.

    Returns
    -------
    None or str
        ``None`` after a successful save (progress is printed). An error
        string if a request fails, the response cannot be parsed, or no
        submissions were collected.

    Notes
    -----
    Reads the module-level ``before`` as the starting cursor and, on success,
    overwrites it with the last ``created_utc`` pulled. The cursor is shared
    by all calls, whichever subreddit they query.
    """
    global before
    url = 'https://api.pushshift.io/reddit/search/submission'
    comment_list = []
    cursor = before
    timestamp = None

    while len(comment_list) < n_samples:
        params = {
            'subreddit': subreddits,
            'sort': 'desc',
            'size': n_samples - len(comment_list),
            'before': cursor,
            'after': after,
        }
        # Reset per iteration so the handler never inspects a stale response.
        response = None
        try:
            response = requests.get(url, params=params, timeout=30)
            posts = response.json()['data']
            page_end = posts[-1]['created_utc'] if posts else None
        except (requests.RequestException, ValueError, KeyError, TypeError):
            if response is not None and response.status_code != 200:
                return f'Check status. Error code: {response.status_code}'
            return 'Error. Pull not completed.'

        if not posts:
            # Nothing more to fetch; stop instead of re-requesting forever.
            break

        comment_list.extend(posts)
        timestamp = cursor = page_end
        time.sleep(1)  # pause between consecutive requests

    if timestamp is None:
        # No submissions collected (empty result or n_samples <= 0).
        return 'Error. Pull not completed.'

    formatted_name, _now, _description = filename_format_log(
        file_path=f'../data/raw_{subreddits}.json', now=timestamp)
    with open(formatted_name, 'w+') as f:
        json.dump(comment_list, f)

    before = timestamp

    print(f'Saved and completed query and returned {len(comment_list)} submissions.')
    print('Reddit text is ready for processing.')
    print(f'Last timestamp was {timestamp}.')
    return None

### Query 10 sets of 1000 rows from the 'spirituality' Subreddit

In [198]:
# Ten consecutive pulls. reddit_query() stores its last timestamp in the
# module-level `before`, so each call continues where the previous one ended.
for pull_number in range(10):
    print("Starting query", pull_number)
    reddit_query('spirituality')

Starting query 0
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1578716144.
Starting query 1
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1577188855.
Starting query 2
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1575495943.
Starting query 3
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1573755045.
Starting query 4
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1571674913.
Starting query 5
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1569697881.
Starting query 6
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1567584046.
Starti

### Query 10 sets of 1000 rows from the 'meditation' Subreddit

In [200]:
# Ten consecutive pulls. NOTE(review): the module-level `before` cursor is not
# reset in this cell, so the first pull starts from whatever value the
# previous reddit_query() run left behind - confirm this is intended.
for pull_number in range(10):
    print("Starting query", pull_number)
    reddit_query('meditation')

Starting query 0
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1559216363.
Starting query 1
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1557115758.
Starting query 2
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1554908037.
Starting query 3
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1552846578.
Starting query 4
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1550974135.
Starting query 5
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1549381649.
Starting query 6
Saved and completed query and returned 1000 submissions.
Reddit text is ready for processing.
Last timestamp was 1547931148.
Starti