In [3]:
import pandas as pd
import datetime as dt
import time
import requests

In [12]:
# thank you to the pushshift demo code we covered in class
def query_pushshift(subreddit, kind = 'submission', day_window = 7, n = 100):
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint" 
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500" # always pulling max of 500
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(2)`
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        time.sleep(2)
    
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "submission":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full

In [14]:
# get tales from retail data
results_retail = query_pushshift('TalesFromRetail')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=7d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=14d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=21d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=28d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=35d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=42d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=49d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=56d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=63d
Querying fr

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=525d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=532d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=539d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=546d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=553d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=560d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=567d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=574d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TalesFromRetail&size=500&after=581d
Q

In [15]:
# results_retail with n=100 searches over a 7-day time frame
print(results_retail.shape)
print(results_retail[results_retail['selftext'] != '[removed]'].shape)

(8163, 9)
(2925, 9)


In [16]:
# filter out the removed posts
retail = results_retail[~results_retail['selftext'].isin(['[removed]'])].copy()

In [17]:
retail.shape

(2925, 9)

In [18]:
# add indicator column for classification
retail['is_tech'] = 0

In [7]:
# get tales from tech data
results_tech = query_pushshift('talesfromtechsupport')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=7d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=14d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=21d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=28d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=35d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=42d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=49d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=56d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesf

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=504d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=511d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=518d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=525d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=532d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=539d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=546d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=talesfromtechsupport&size=500&after=553d
Querying from: https://api.pushshift.io/reddit/search/submission?subredd

In [8]:
# results_tech with n=80 searches over a 7-day time frame
print(results_tech.shape)
print(results_tech[results_tech['selftext'] != '[removed]'].shape)

(4140, 9)
(2503, 9)


In [9]:
# filter out the removed posts
tech = results_tech[~results_tech['selftext'].isin(['[removed]'])].copy()

In [10]:
tech.shape

(2503, 9)

In [11]:
# create indicator column
tech['is_tech'] = 1

In [19]:
tech.shape

(2503, 10)

In [20]:
tech.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp,is_tech
1,This is urgent! Fix this right now! But s week...,Got a call from a vip working at home. He comp...,talesfromtechsupport,1631096858,TheQuarantinian,69,1,True,2021-09-08,1
2,Every time I'm here the network sucks! Fix it!...,"Martin was a manager at humungocorp, overseein...",talesfromtechsupport,1631112720,TheQuarantinian,2,1,True,2021-09-08,1
3,"""Please stop asking me to do that.""",I have a person in my organization who just RE...,talesfromtechsupport,1631117739,jaxmagicman,347,1,True,2021-09-08,1
4,"How do I fix this? ""Support has ended the chat...","Support: ""Can you provide more details in rega...",talesfromtechsupport,1631117913,jggunbeliever,9,1,True,2021-09-08,1
5,Dear director. You get getting a new hard driv...,User complains that her laptop is slow. Unders...,talesfromtechsupport,1631127660,TheQuarantinian,107,1,True,2021-09-08,1


In [23]:
tech['is_tech'].value_counts()

1    2503
Name: is_tech, dtype: int64

In [21]:
retail.shape

(2925, 10)

In [22]:
retail.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp,is_tech
7,Local Homeless Threatened Me and Smoked M*th B...,Disclaimer: this is my first time writing a st...,TalesFromRetail,1631145547,zhalfface01z,3,1,True,2021-09-08,0
8,Customers that lack common sense or common cou...,I deal with customers that lack common sense a...,TalesFromRetail,1631146701,petalsrose,1,1,True,2021-09-08,0
12,Funny story. Accidentally Looked Up NSFW Relat...,\nA very funny and short story. Where I use to...,TalesFromRetail,1631195356,TylPlas26,0,1,True,2021-09-09,0
13,Customer doesn't understand the passage of time,"This happened to me a few years ago now, I've ...",TalesFromRetail,1631197247,Notiser,91,1,True,2021-09-09,0
17,Humans are descended from apes. Example:,Tl;dr at the end\n\nSo this story happened a f...,TalesFromRetail,1631204279,Admiral_Ced,0,1,True,2021-09-09,0


In [24]:
retail['is_tech'].value_counts()

0    2925
Name: is_tech, dtype: int64

In [25]:
# combine retail and tech
tales = retail.append(tech)

In [27]:
tales['is_tech'].value_counts()

0    2925
1    2503
Name: is_tech, dtype: int64

In [28]:
tales.shape

(5428, 10)

In [26]:
# save as csv
tales.to_csv('../data/tales.csv', index = False)