# **<span style="font-size:larger;"> 01: <span style="color:blue">Data Collection & Cleaning</span></span>** #

In [21]:
# Import Libraries
import requests
import json
import time
import pandas as pd

# **1.1 Data Collection**

In [25]:
def pushshift_query(full_df_path, subreddit, epochs=None, pause_seconds=5, timeout=30):
    """Pull submissions for ``subreddit`` from Pushshift and append them to a CSV.

    One request is made per epoch, asking for up to 100 submissions created
    before that timestamp. The ``title``, ``author``, ``created_utc`` and
    ``subreddit`` fields of each post are appended to the rows already stored
    in ``full_df_path``, and the file is rewritten after every successful pull
    so an interrupted run keeps what it has collected so far.

    Parameters
    ----------
    full_df_path : str
        Path of an existing CSV (it must already contain a header row). It is
        read once up front and overwritten after each successful pull.
    subreddit : str
        Subreddit name sent to the API as the ``subreddit`` query parameter.
    epochs : iterable of str or int, optional
        ``before`` timestamps to query, one request each. Defaults to the
        built-in monthly list below.
    pause_seconds : float, optional
        Seconds to wait after every request, successful or not. Default 5.
    timeout : float, optional
        Seconds to wait for the server before abandoning a request. Default 30.

    Returns
    -------
    None
        Results are written to ``full_df_path``; progress is printed.
    """
    columns = ['title', 'author', 'created_utc', 'subreddit']
    endpoint = 'https://api.pushshift.io/reddit/search/submission/'

    if epochs is None:
        # These are pulled from the beginning of each month from January 2016 to June 2021
        epochs = [
            '1451624400', '1454302800', '1456808400', '1459483200', '1462075200', '1464753600',
            '1467345600', '1470024000', '1472702400', '1475294400', '1477972800', '1480568400',
            '1483246800', '1485925200', '1488344400', '1491019200', '1493611200', '1496289600',
            '1498881600', '1501560000', '1504238400', '1506830400', '1509508800', '1512104400',
            '1514782800', '1517461200', '1519880400', '1522555200', '1525147200', '1527825600',
            '1530417600', '1533096000', '1535774400', '1538366400', '1541044800', '1543640400',
            '1546300800', '1548979200', '1551398400', '1554076800', '1556668800', '1559347200',
            '1561939200', '1564617600', '1567296000', '1569888000', '1572566400', '1575158400',
            '1577898000', '1580576400', '1583082000', '1585756800', '1588348800', '1591027200',
            '1593619200', '1596297600', '1598976000', '1601568000', '1604250000', '1606842000',
            '1609520400', '1612198800', '1614618000', '1617292800', '1619884800', '1622563200',
        ]

    # This function is the only writer of the file, so reading it once is
    # equivalent to re-reading it on every iteration.
    full_df = pd.read_csv(full_df_path)

    for epoch in epochs:
        query = {'subreddit': subreddit, 'before': epoch, 'size': 100}

        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            res = requests.get(endpoint, params=query, timeout=timeout)
        except requests.RequestException as err:
            print(f"Request for epoch {epoch} failed ({err}); skipping.")
            time.sleep(pause_seconds)
            continue

        if res.status_code != 200:
            # Still pause before the next request so a throttled or failing
            # API is not hit again immediately.
            print(f"Epoch {epoch} returned status {res.status_code}; skipping.")
            time.sleep(pause_seconds)
            continue

        try:
            post_list = res.json().get('data', [])
        except ValueError:
            print(f"Epoch {epoch} returned a body that is not valid JSON; skipping.")
            time.sleep(pause_seconds)
            continue

        # reindex tolerates an empty result and posts lacking a field
        # (missing values become NaN) instead of raising KeyError.
        temp_df = pd.DataFrame(post_list).reindex(columns=columns)

        if not temp_df.empty:
            full_df = pd.concat([full_df, temp_df], ignore_index=True)
            # Checkpoint after every successful pull.
            full_df.to_csv(full_df_path, index=False)

        print(full_df.shape)
        print(full_df['title'].nunique())
        print("Waiting until next pull...")

        time.sleep(pause_seconds)

### Collecting Data From r/AnimalsBeingBros

In [23]:
# Write a header-only CSV for r/AnimalsBeingBros; pushshift_query reads this
# file and appends to it. Re-running this cell overwrites the file.
bros_columns = ['title', 'author', 'created_utc', 'subreddit']
full_df = pd.DataFrame(columns=bros_columns)
full_df.to_csv(path_or_buf='./data/animalsbeingbros.csv', index=False)

In [24]:
# Run the collection for r/AnimalsBeingBros; rows are appended to the CSV created above.
pushshift_query('./data/animalsbeingbros.csv', 'animalsbeingbros')

### Collecting Data From r/AnimalsBeingJerks

In [17]:
# Write a header-only CSV for r/AnimalsBeingJerks; pushshift_query reads this
# file and appends to it. Re-running this cell overwrites the file.
jerks_columns = ['title', 'author', 'created_utc', 'subreddit']
full_df2 = pd.DataFrame(columns=jerks_columns)
full_df2.to_csv(path_or_buf='./data/animalsbeingjerks.csv', index=False)

In [18]:
# Run the collection for r/AnimalsBeingJerks; rows are appended to the CSV created above.
pushshift_query('./data/animalsbeingjerks.csv', 'animalsbeingjerks')

# **1.2 Data Cleaning**

In [19]:
# Load the two per-subreddit CSVs written during collection:
# df1 holds r/AnimalsBeingBros, df2 holds r/AnimalsBeingJerks.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/animalsbeingbros.csv', './data/animalsbeingjerks.csv')
)

**Concatenating DataFrames**

In [26]:
# Stack both subreddits' rows into one frame. The index is not reset, so
# each input's original row labels are carried over.
df = pd.concat(objs=(df1, df2), axis=0)

**Dropping Duplicates**

In [None]:
# Keep only the first row for each distinct title; later repeats are dropped.
df = df.drop_duplicates(subset=['title'], keep='first')

**Formatting Titles**

In [None]:
# Lower-case every title so later text processing is case-insensitive.
df = df.assign(title=df['title'].str.lower())

**Creating Target**

In [None]:
# Engineer a feature to turn subreddit into a 1/0 column, where 1 indicates r/AnimalsBeingJerks.
# The comparison ignores case so the target does not silently become all zeros
# if the stored subreddit name is cased differently (e.g. 'animalsbeingjerks').
# Missing subreddit values map to 0.
df['is_AnimalsBeingJerks'] = (
    df['subreddit']
    .astype(str)
    .str.lower()
    .eq('animalsbeingjerks')
    .astype('int64')
)

**Saving Work**

In [None]:
# Persist the cleaned frame for the next notebook. The row index is written
# as the first CSV column (pandas' default), stated explicitly here.
df.to_csv(path_or_buf='./data/original_df.csv', index=True)

# ***Next Notebook***

## [02: EDA & Pre-Processing](https://github.com/ksylvia16/Subreddit-Classification/blob/cd3150a33a8d98eb171840af34465468e11431c8/code/02_EDA_%26_Pre-Processing.ipynb)