## Data Collection 

In [1]:
# imports
import requests
import pandas as pd
import time
import random

## Scraping

In [12]:
# fetch a subreddit's JSON listing page by page and return the posts as a dataframe
def reddit_scrape(url, pages=40, sleep_range=(2, 5), session=None, timeout=30):
    """Scrape up to `pages` listing pages from a Reddit JSON endpoint.

    Parameters
    ----------
    url : str
        Listing URL, e.g. 'https://www.reddit.com/r/history.json'.
    pages : int, default 40
        Maximum number of listing pages to request.
    sleep_range : tuple of (int, int), default (2, 5)
        Inclusive bounds, in seconds, of the random pause taken between
        two consecutive requests.
    session : object with a ``get`` method, optional
        HTTP client used for the requests (for example a
        ``requests.Session``). Defaults to the ``requests`` module itself.
    timeout : float, default 30
        Seconds to wait for each response before the request is abandoned.

    Returns
    -------
    pandas.DataFrame
        One row per post, built from the ``'data'`` dict of every child in
        the listing. Empty if no page could be fetched.

    Notes
    -----
    A non-200 response prints the status code and stops the scrape; the
    posts collected so far are still returned.
    """
    http = requests if session is None else session
    posts = []
    after = None

    for page in range(pages):
        # pause only *between* requests, so nothing is wasted after the last page;
        # the random duration makes the traffic look more 'natural'
        if page > 0:
            time.sleep(random.randint(*sleep_range))

        # let the HTTP client build the query string, so a url that already
        # carries query parameters is handled correctly
        params = {'after': after} if after else None
        res = http.get(
            url,
            params=params,
            headers={'User-agent': 'Russ1337'},
            timeout=timeout,
        )

        if res.status_code != 200:
            print('Status error', res.status_code)
            break

        listing = res.json()['data']
        posts.extend(child['data'] for child in listing['children'])
        after = listing['after']

        # no cursor means there is no further page; requesting again would
        # restart from the first page and only collect duplicates
        if not after:
            break

    return pd.DataFrame(posts)

## Subreddit Thread: History

In [13]:
# JSON listing endpoint of the r/history subreddit
url_hist = 'https://www.reddit.com/r/history.json'

In [16]:
# scraping r/history: one HTTP request per listing page with a random pause
# between requests, so this cell can take a few minutes to finish
df_hist = reddit_scrape(url_hist)

In [20]:
# keep only the first occurrence of every post title
df_hist = df_hist[~df_hist.duplicated(subset='title')]

In [32]:
# narrow the frame down to the post title, its body text and the subreddit label
df_hist = df_hist.loc[:, ['title', 'selftext', 'subreddit']]

In [57]:
# (rows, columns) of the r/history dataframe
df_hist.shape

(1000, 3)

In [48]:
# previewing the first rows of the dataframe after the column filter
df_hist.head()

Unnamed: 0,title,selftext,subreddit
0,"Silly Questions Saturday, January 25, 2020",Do you have a question about history and have ...,history
1,75 years since Auschwitz: Survivors share stor...,,history
2,"Forty five years ago, eight Soviet women climb...",,history
3,Why is it so uncommon for U.S. Presidents to c...,It's really quite surprising to me that only 3...,history
4,According to a study done by the United States...,[https://encyclopedia.ushmm.org/content/en/ar...,history


In [46]:
# count of missing values in each column
df_hist.isna().sum()

title        0
selftext     0
subreddit    0
dtype: int64

## Subreddit Thread: No such thing as stupid questions

In [15]:
# JSON listing endpoint of the r/NoStupidQuestions subreddit
url_nsq = 'https://www.reddit.com/r/NoStupidQuestions.json' 

In [17]:
# scraping r/NoStupidQuestions: one HTTP request per listing page with a random
# pause between requests, so this cell can take a few minutes to finish
df_nsq = reddit_scrape(url_nsq)

In [34]:
# keep only the first occurrence of every post title
df_nsq = df_nsq[~df_nsq.duplicated(subset='title')]

In [37]:
# narrow the frame down to the post title, its body text and the subreddit label
df_nsq = df_nsq.loc[:, ['title', 'selftext', 'subreddit']]

In [56]:
# (rows, columns) of the r/NoStupidQuestions dataframe
df_nsq.shape

(947, 3)

In [49]:
# previewing the first rows of the dataframe after the column filter
df_nsq.head()

Unnamed: 0,title,selftext,subreddit
0,Impeachment Megathread,"So, when the ""Why hasn't Trump been impeached ...",NoStupidQuestions
1,The Best of No Stupid Questions 2019,Thanks to everyone who participated and contri...,NoStupidQuestions
2,Why can't I sensor political content from my R...,I come to this website to relax and laugh. I ...,NoStupidQuestions
3,What's the correct way to have a through shower?,I grew up neglected and abused. I have never t...,NoStupidQuestions
4,Is it normal to check the faces of friends and...,Ever since I was a kid I always found myself t...,NoStupidQuestions


In [50]:
# count of missing values in each column
df_nsq.isna().sum()

title        0
selftext     0
subreddit    0
dtype: int64

## Merging the Datasets

In [51]:
# concatenating the 2 datasets row-wise; ignore_index=True discards the
# per-subreddit row labels and numbers the combined rows from 0
df_reddit = pd.concat([df_hist, df_nsq], ignore_index=True)

In [53]:
# verifying the concatenation: r/history rows should come first, followed by
# the r/NoStupidQuestions rows (the notebook shows a truncated view)
df_reddit

Unnamed: 0,title,selftext,subreddit
0,"Silly Questions Saturday, January 25, 2020",Do you have a question about history and have ...,history
1,75 years since Auschwitz: Survivors share stor...,,history
2,"Forty five years ago, eight Soviet women climb...",,history
3,Why is it so uncommon for U.S. Presidents to c...,It's really quite surprising to me that only 3...,history
4,According to a study done by the United States...,[https://encyclopedia.ushmm.org/content/en/ar...,history
...,...,...,...
1942,Why was Columbus such a dumbass?,,NoStupidQuestions
1943,Why arent there any younger children coming ou...,,NoStupidQuestions
1944,Of the last hundred days with a 1% chance of r...,Am I mangling both meteorology and math here...,NoStupidQuestions
1945,Too satisfying = nervous?,"So occationally, when I'm seeing something bee...",NoStupidQuestions


## Exporting Combined Dataset

In [55]:
# write the combined dataset to disk; the row index is kept and lands in the
# file as a leading unnamed column
df_reddit.to_csv('subreddit_combined.csv', index=True)