# Reddit API and Classification

**Previous:** [Data Collection and Wrangling](./01_data_collection_and_wrangling.ipynb)

## Preprocessing and Modeling

**Next:** [Preprocessing and Modeling](./03_preprocessing_and_modeling.ipynb)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

from bs4 import BeautifulSoup
import regex as re
import requests
import time
import random

In [4]:
#defining url to access
url_android = 'https://www.reddit.com/r/Android.json'
url_apple = 'https://www.reddit.com/r/apple.json'

In [5]:
#
def obtain_posts(url, file_path):
    
    posts = []
    after = None

    for n in range(5):
        if after == None:
            current_url = url
        else:
            current_url = url + '?after=' + after
        #print url to track request
        print(current_url)
        res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'})

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        current_dict = res.json()
        current_posts = [p['data'] for p in current_dict['data']['children']]
        posts.extend(current_posts)
        after = current_dict['data']['after']

        if n > 0:
            prev_posts = pd.read_csv(file_path)
            current_df = pd.DataFrame(prev_posts)

        else:
            pd.DataFrame(posts).to_csv(file_path, index = False)

        # generate a random sleep duration to look more 'natural'
        sleep_duration = random.randint(2,6)
        print(sleep_duration)
        time.sleep(sleep_duration)

In [7]:
obtain_posts(url_android, '../datasets/test1.csv')

https://www.reddit.com/r/Android.json
2
https://www.reddit.com/r/Android.json?after=t3_iz1dj7
6
https://www.reddit.com/r/Android.json?after=t3_iy5q3e
3
https://www.reddit.com/r/Android.json?after=t3_ixg1a9
3
https://www.reddit.com/r/Android.json?after=t3_iwe1y1
2


In [8]:
df = pd.read_csv('../datasets/test1.csv')

In [10]:
df.shape

(26, 108)

In [11]:
def obtain_posts1(url, file_path):
    posts = []
    after = None

    for a in range(4):
        if after == None:
            current_url = url
        else:
            current_url = url + '?after=' + after
        print(current_url)
        res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'})

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        current_dict = res.json()
        current_posts = [p['data'] for p in current_dict['data']['children']]
        posts.extend(current_posts)
        after = current_dict['data']['after']

        if a > 0:
            prev_posts = pd.read_csv(file_path)
            current_df = pd.DataFrame()

        else:
            pd.DataFrame(posts).to_csv(file_path, index = False)

        # generate a random sleep duration to look more 'natural'
        sleep_duration = random.randint(2,6)
        print(sleep_duration)
        time.sleep(sleep_duration)

In [12]:
obtain_posts1(url_android, '../datasets/test2.csv')

https://www.reddit.com/r/Android.json
3
https://www.reddit.com/r/Android.json?after=t3_izq9xs
4
https://www.reddit.com/r/Android.json?after=t3_iy5q3e
4
https://www.reddit.com/r/Android.json?after=t3_ixg1a9
2


In [18]:
posts = []
after = None

for a in range(10):
    if after == None:
        current_url = url_apple
    else:
        current_url = url_apple + '?after=' + after
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'})
    
    if res.status_code != 200:
        print('Status error', res.status_code)
        break
    
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']
    
    if a > 0:
        prev_posts = pd.read_csv('test3.csv')
        current_df = pd.DataFrame()
        
    else:
        pd.DataFrame(posts).to_csv('test3.csv', index = False)

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/apple.json
5
https://www.reddit.com/r/apple.json?after=t3_j05uh7
6
https://www.reddit.com/r/apple.json?after=t3_izh06i
4
https://www.reddit.com/r/apple.json?after=t3_j04tcj
3
https://www.reddit.com/r/apple.json?after=t3_iyvj6c
6
https://www.reddit.com/r/apple.json?after=t3_iyc4bn
5
https://www.reddit.com/r/apple.json?after=t3_ixv210
4
https://www.reddit.com/r/apple.json?after=t3_iy04j7
2
https://www.reddit.com/r/apple.json?after=t3_iw2156
2
https://www.reddit.com/r/apple.json?after=t3_ivt5mi
3


In [19]:
test3 = pd.read_csv('test3.csv')

In [20]:
test3.shape

(27, 108)

In [21]:

test3.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,url_overridden_by_dest,link_flair_template_id
0,,apple,\n\nWelcome to the daily Tech Support thread f...,t2_6l4z3,False,,0,False,Daily Tech Support Thread - [September 26],"[{'e': 'text', 't': 'Official Megathread'}]",...,all_ads,True,https://www.reddit.com/r/apple/comments/j07nhs...,1797890,1601133000.0,0,,False,,
1,,apple,"## Hello, /r/Apple, and welcome to Wallpaper W...",t2_6l4z3,False,,0,False,Wallpaper Wednesday - [September 23],"[{'e': 'text', 't': 'Official Megathread'}]",...,all_ads,True,https://www.reddit.com/r/apple/comments/iy9tar...,1797890,1600867000.0,0,,False,,
2,,apple,,t2_jp69e,False,,0,False,"Hey, I made a 3D minesweeper game that's free ...","[{'e': 'text', 't': 'Promo Saturday'}]",...,all_ads,False,https://apps.apple.com/us/app/id1529127991,1797890,1601120000.0,0,,False,https://apps.apple.com/us/app/id1529127991,854c34e2-5702-11e9-bf73-0e73ef6cdf98
3,,apple,,t2_khozz,False,,0,False,"In memo to employees, Tim Cook once again give...","[{'e': 'text', 't': 'Discussion'}]",...,all_ads,False,https://twitter.com/markgurman/status/13096129...,1797890,1601071000.0,0,{'oembed': {'provider_url': 'https://twitter.c...,False,https://twitter.com/markgurman/status/13096129...,86b258de-5702-11e9-98ce-0eebcac587ec
4,,apple,,t2_30vizfh0,False,,0,False,Your Apple Watch measures your heart rate ever...,"[{'e': 'text', 't': 'Promo Saturday'}]",...,all_ads,False,https://apps.apple.com/us/app/cardiobot-heart-...,1797890,1601124000.0,0,,False,https://apps.apple.com/us/app/cardiobot-heart-...,854c34e2-5702-11e9-bf73-0e73ef6cdf98


In [22]:
posts = []
after = None

for a in range(4):
    if after == None:
        current_url = url_apple
    else:
        current_url = url_apple + '?after=' + after
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'})
    
    if res.status_code != 200:
        print('Status error', res.status_code)
        break
    
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']
    
    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,60)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/apple.json
32
https://www.reddit.com/r/apple.json?after=t3_j02mpv
17
https://www.reddit.com/r/apple.json?after=t3_j04j1w
43
https://www.reddit.com/r/apple.json?after=t3_izn5lt
11


In [23]:
len(posts)

102

In [24]:
df = pd.DataFrame(posts)

In [25]:
df.shape

(102, 108)

In [None]:
pd.DataFrame(posts).to_csv('test4.csv', index = False)