# ![](https://ga-dash.s3.amazonaws.com/production/assets/logo-9f88ae6c9c3871690e33280fcf557f33.png) Project 3: Web APIs & NLP
### Problem Statement 

As an outside consultant to Reddit, I have been asked to build a machine-learning solution that evaluates whether a post was unintentionally submitted to the wrong subreddit. As an initial proof of concept, I have chosen the subreddits r/science and r/space and framed the task as a binary prediction of which of the two an input post best applies to, in the hope of contributing to a broader classifier in the future.

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
import os

In [2]:
class Reddit_Source:
    """Incrementally download submissions of one subreddit from Pushshift.

    Each call to ``request_data`` fetches one batch and stores it as a
    DataFrame in ``all_posts``. After a non-empty batch the ``before``
    query parameter is set to the oldest ``created_utc`` seen, so the
    next request continues further back in time.

    Attributes:
        base_link: Pushshift submission-search endpoint.
        params: Query parameters sent with every request.
        all_posts: List of non-empty DataFrames, one per successful pull.
        subreddit: Name of the subreddit being scraped.
        last_pull: True when the most recent request returned posts.
        last_pull_status: HTTP status of the most recent request, or None
            when no request has completed (not yet called / network error).
        last_error: Exception raised by the most recent request, if any.
        raw_data: Combined DataFrame; only exists after ``consolidate``.
    """

    # Seconds to wait for the server before giving up on one request.
    REQUEST_TIMEOUT = 30

    def __init__(self,sub):
        self.base_link = 'https://api.pushshift.io/reddit/search/submission'
        self.params = {
            'subreddit': sub,
            # requests omits parameters whose value is None, so no size is
            # sent until request_data() fills this in.
            'size': None
        }
        self.all_posts = []
        self.subreddit = sub
        # Defined up front so they can be read before the first request.
        self.last_pull = False
        self.last_pull_status = None
        self.last_error = None

    def get_post_count(self):
        """Return the total number of posts collected across all batches."""
        return sum(len(batch) for batch in self.all_posts)

    def request_data(self, request_size):
        """Fetch one batch of posts and record the outcome.

        Args:
            request_size: Number of posts to ask for; must be positive.

        Raises:
            ValueError: If ``request_size`` is not positive.

        Network errors and undecodable responses are not raised; they are
        recorded in ``last_error`` and the pull is marked as failed so a
        long scrape is not aborted by one bad request.
        """
        if request_size <= 0:
            raise ValueError('request_size must be a positive number')

        self.params['size'] = request_size
        self.last_pull = False
        self.last_error = None

        try:
            res = requests.get(
                self.base_link,
                params=self.params,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as err:
            # No response at all, so there is no status code to report.
            self.last_pull_status = None
            self.last_error = err
            return

        self.last_pull_status = res.status_code
        if res.status_code != 200:
            return

        try:
            payload = res.json()
        except ValueError as err:
            # A 200 response whose body is not valid JSON.
            self.last_error = err
            return

        posts = pd.DataFrame(payload.get('data', []))
        if len(posts) == 0:
            # Nothing new; leave 'before' untouched and report a failed pull.
            return

        self.all_posts.append(posts)
        # Page backwards: the next request only returns older posts.
        self.params['before'] = int(posts['created_utc'].min())
        self.last_pull = True

    def scrape(self,iterations,amount_per_pull,pause_on_failure=1.0):
        """Issue ``iterations`` requests of ``amount_per_pull`` posts each.

        Every request counts as one iteration whether or not it succeeds.

        Args:
            iterations: Number of requests to make.
            amount_per_pull: Posts to ask for in each request.
            pause_on_failure: Seconds to sleep after a pull that returned
                no posts (e.g. a 504) before the next request; 0 disables.
        """
        for attempt in range(1, iterations + 1):
            print('currently on interation:' + str(attempt))
            self.request_data(amount_per_pull)
            print(self.last_pull_status)
            # Give the server a moment before hitting it again after a miss.
            if not self.last_pull and pause_on_failure > 0 and attempt < iterations:
                time.sleep(pause_on_failure)

    def consolidate(self):
        """Combine all collected batches into ``self.raw_data``.

        With no batches collected, ``raw_data`` becomes an empty DataFrame
        (``pd.concat`` would otherwise raise on an empty list).
        """
        if not self.all_posts:
            self.raw_data = pd.DataFrame()
        else:
            self.raw_data = pd.concat(self.all_posts,axis=0)


In [3]:
# Collector for r/science submissions
science = Reddit_Source(sub='science')

In [4]:
# 300 requests of up to 500 posts each; the status code of every request is printed
science.scrape(iterations=300, amount_per_pull=500)

currently on interation:1
200
currently on interation:2
200
currently on interation:3
200
currently on interation:4
200
currently on interation:5
504
currently on interation:6
200
currently on interation:7
504
currently on interation:8
200
currently on interation:9
200
currently on interation:10
200
currently on interation:11
200
currently on interation:12
200
currently on interation:13
200
currently on interation:14
200
currently on interation:15
200
currently on interation:16
200
currently on interation:17
200
currently on interation:18
200
currently on interation:19
504
currently on interation:20
200
currently on interation:21
200
currently on interation:22
200
currently on interation:23
504
currently on interation:24
200
currently on interation:25
200
currently on interation:26
200
currently on interation:27
200
currently on interation:28
200
currently on interation:29
200
currently on interation:30
200
currently on interation:31
200
currently on interation:32
504
currently on inte

In [5]:
# Collector for r/physics submissions
physics = Reddit_Source(sub='physics')

In [6]:
# 300 requests of up to 500 posts each; the status code of every request is printed
physics.scrape(iterations=300, amount_per_pull=500)

currently on interation:1
200
currently on interation:2
504
currently on interation:3
200
currently on interation:4
200
currently on interation:5
200
currently on interation:6
200
currently on interation:7
200
currently on interation:8
200
currently on interation:9
200
currently on interation:10
200
currently on interation:11
200
currently on interation:12
200
currently on interation:13
200
currently on interation:14
200
currently on interation:15
504
currently on interation:16
200
currently on interation:17
200
currently on interation:18
200
currently on interation:19
200
currently on interation:20
504
currently on interation:21
200
currently on interation:22
200
currently on interation:23
200
currently on interation:24
200
currently on interation:25
200
currently on interation:26
200
currently on interation:27
200
currently on interation:28
200
currently on interation:29
200
currently on interation:30
200
currently on interation:31
200
currently on interation:32
200
currently on inte

In [12]:
# Concatenate every scraped batch into physics.raw_data
physics.consolidate()

In [13]:
# Rebuild the combined frame from the scraped batches so this cell does not
# depend on a consolidate() call that is not visible earlier in the notebook.
science.consolidate()
# to_csv does not create missing directories, so make sure ./Data exists.
os.makedirs('./Data', exist_ok=True)
science.raw_data.to_csv('./Data/Science.csv')

In [14]:
# to_csv does not create missing directories, so make sure ./Data exists.
os.makedirs('./Data', exist_ok=True)
physics.raw_data.to_csv('./Data/Physics.csv')

In [18]:
# Collector for r/space submissions
space = Reddit_Source(sub='space')

In [19]:
# Create the output directory before the long scrape so the final write
# cannot fail on a missing folder (to_csv does not create directories).
os.makedirs('./Data', exist_ok=True)

# 400 requests of up to 500 posts each
space.scrape(400,500)

# Concatenate every scraped batch into space.raw_data
space.consolidate()

space.raw_data.to_csv('./Data/Space.csv')

currently on interation:1
200
currently on interation:2
504
currently on interation:3
200
currently on interation:4
200
currently on interation:5
200
currently on interation:6
200
currently on interation:7
504
currently on interation:8
200
currently on interation:9
200
currently on interation:10
200
currently on interation:11
200
currently on interation:12
200
currently on interation:13
504
currently on interation:14
200
currently on interation:15
200
currently on interation:16
200
currently on interation:17
200
currently on interation:18
200
currently on interation:19
200
currently on interation:20
200
currently on interation:21
200
currently on interation:22
200
currently on interation:23
200
currently on interation:24
200
currently on interation:25
200
currently on interation:26
504
currently on interation:27
504
currently on interation:28
200
currently on interation:29
200
currently on interation:30
200
currently on interation:31
200
currently on interation:32
200
currently on inte