# Data Gathering and Cleaning


## Pushshift API Wrapper

In [5]:
import pandas as pd
import time
import requests
import datetime as dt

In [7]:
def get_date(created):
    return dt.datetime.fromtimestamp(created)

def query_pushshift(subreddit, 
                    kind='submission', 
                    skip=5, 
                    times=50, 
                    subfield = ['title', 
                                'selftext', 
                                'subreddit', 
                                'created_utc', 
                                'author', 
                                'num_comments',
                                'score', 
                                'is_self']):
    
    stem = f"https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size=500" 
    mylist = []
    
    # Create the loop to scrape Reddit x times
    for x in range(1, times + 1):
        URL = f"{stem}&after={skip * x}d"
        print(URL)

        response = requests.get(URL)
        assert response.status_code == 200
        the_json = response.json()
        df = pd.DataFrame.from_dict(the_json['data'])
        mylist.append(df)
        time.sleep(3)
        
    full = pd.concat(mylist, sort=False)
    if kind == "submission":
        full = full[subfield]
        full = full.drop_duplicates()
        full = full.loc[full['is_self'] == True]
          
    _timestamp = full["created_utc"].apply(get_date)
    full['timestamp'] = _timestamp
    print(full.shape)
    return full

In [8]:
# Scrape environment data - increase dates to get more data
df_env = query_pushshift(subreddit='environment', skip=5, times=70)

https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=5d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=10d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=15d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=20d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=25d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=35d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=40d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=45d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500&after=50d
https://api.pushshift.io/reddit/search/submission/?subreddit=environment&size=500

In [4]:
# Scrape technology data
df_tech = query_pushshift(subreddit='technology', skip=5, times=50)

https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=5d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=10d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=15d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=20d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=25d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=35d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=40d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=45d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=50d
https://api.pushshift.io/reddit/search/submission/?subreddit=technology&size=500&after=55d


In [9]:
# Randomly sample rows in tech df to have balanced classes with env data
df_tech = df_tech.sample(n=4024)

# Confirm shapes are the same
df_tech.shape, df_env.shape

((4024, 9), (4024, 9))

In [10]:
# Combine env and tech dataframes
df = pd.concat([df_env, df_tech], ignore_index=True)
df.shape

(8048, 9)

In [11]:
# save raw file
df.to_csv('../data/posts_raw.csv', index=False)

# Data Cleaning
In this section I combine the "selftext" and "titles" from each reddit post into one column for analysis. I made this decision because there were many posts without text in the body. I also drop posts made by the Automoderator and remove non-letter characters.

In [12]:
# Import for cleaning
posts = pd.read_csv('../data/posts_raw.csv')

In [13]:
posts['subreddit'].value_counts()

environment    4024
technology     4024
Name: subreddit, dtype: int64

In [15]:
# Confirm how many posts do not have selftext
len(posts[(posts['selftext']=='[removed]')])

2853

In [16]:
# Fill empty self text values with empty string
posts['selftext'] = posts['selftext'].replace('[removed]',' ')

In [17]:
# Drop posts by the AutoModerator
posts.drop(posts[(posts['author']=='AutoModerator')].index, inplace=True)

# Confirm none left
posts[(posts['author']=='AutoModerator')]

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp


In [19]:
# combine selftext and title columns into one text column
posts['text'] = posts['selftext'] + posts['title']

# Drop all columns except subreddit identifier and text
posts.drop(columns=['title',
                    'selftext',
                    'created_utc',
                    'author',
                    'num_comments',
                    'score',
                    'is_self',
                   'timestamp'], inplace=True)
posts.head()

Unnamed: 0,subreddit,text
0,environment,"Hey all,\n\nI'm posting this. . . Well, bc the..."
1,environment,I’m trying to find a responsible way to get ri...
2,environment,This documentary is breaking my heart.Planet o...
3,environment,Destroying a natural simbiotic system (nature)...
4,environment,2 million chickens will be killed in Delaware ...


In [20]:
# Convert environment and tech into binary labels
posts['subreddit'] = posts['subreddit'].map({'environment': 0, 'technology': 1})

In [21]:
posts['subreddit'].value_counts()

0    4024
1    4014
Name: subreddit, dtype: int64

In [22]:
# check for null values
posts.isnull().sum()

subreddit      0
text         467
dtype: int64

In [23]:
# drop rows with null values
posts.dropna(inplace=True)

## Save to csv

In [24]:
posts.to_csv('../data/posts_clean.csv', index=False)