# Part I: Reddit Data

Retrieving data from Reddit via the Pushshift API.

Approach is based on O'Brien, 2020:

https://github.com/iterative/aita_dataset

DOI: 10.5281/zenodo.3677563

In [1]:
# Step 1: Loading required modules
import requests
import json
import pandas as pd
import time
import math
import datetime

In [2]:
# Step 2: Defining the function to retrieve data from the API
def getPushshiftData(after, before):
    """Fetch one page of r/Bitcoin submissions from the Pushshift API.

    Builds a submission-search query for the ``Bitcoin`` subreddit, sorted by
    ``created_utc`` in ascending order and limited to 100 results, prints the
    request URL and returns the ``data`` entry of the decoded JSON response.

    Args:
        after: Value sent as the ``after`` query parameter (the calls in this
            file pass Unix epoch seconds).
        before: Value sent as the ``before`` query parameter.

    Returns:
        Whatever the JSON response stores under its ``data`` key.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
        KeyError: If the decoded response has no ``data`` key.
    """
    url = 'https://api.pushshift.io/reddit/submission/search/'
    sort = '?sort_type=created_utc&sort=asc'
    subr = '&subreddit=Bitcoin'
    # Use separate names for the query fragments so the arguments stay intact.
    after_param = '&after=' + str(after)
    before_param = '&before=' + str(before)
    size = '&size=100'
    full_url = url + sort + subr + after_param + before_param + size
    print(full_url)
    # Without a timeout a stalled connection would block the scrape forever.
    r = requests.get(full_url, timeout=30)
    # Surface rate-limit / server errors directly instead of trying to parse
    # an error page as JSON.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

In [3]:
# Step 3: Defining the function to put raw data into a dataset
def scraping(first_epoch, last_epoch, filename):
    """Collect all submissions in a time window and write them to a CSV file.

    Pages through ``getPushshiftData`` starting at ``first_epoch``. After each
    page, the newest timestamp collected so far becomes the next ``after``
    value. Paging stops when that value reaches ``last_epoch``, when a page
    comes back empty, or when the newest timestamp of a page has already been
    collected. The function waits three seconds between requests.

    Args:
        first_epoch: Start of the window, passed as the first ``after`` value
            (the calls in this file pass Unix epoch seconds).
        last_epoch: End of the window, passed as ``before`` on every request.
        filename: Path of the CSV file to write.

    Returns:
        None. The result is written to ``filename`` with the columns
        ``id``, ``timestamp``, ``author``, ``score``, ``comments``,
        ``title`` and ``text``.
    """
    timestamps = []
    authors = []
    scores = []
    comments = []
    ids = []
    titles = []
    texts = []

    after = first_epoch
    while int(after) < last_epoch:
        data = getPushshiftData(after, last_epoch)
        if not data:
            # An empty page means there is nothing left in the window.
            break

        tmp_times = [post['created_utc'] for post in data]
        if max(tmp_times) in timestamps:
            # The newest post of this page was already collected, so the
            # API is no longer advancing; stop instead of looping forever.
            break

        timestamps.extend(tmp_times)
        authors.extend(post['author'] for post in data)
        scores.extend(post['score'] for post in data)
        comments.extend(post['num_comments'] for post in data)
        ids.extend(post['id'] for post in data)
        titles.extend(post['title'] for post in data)
        # 'selftext' is not present on every post; record NaN when missing.
        texts.extend(post.get('selftext', math.nan) for post in data)

        # Continue from the newest post seen so far.
        after = max(timestamps)

        print([str(len(ids)) + " posts collected so far."])
        # Pause between requests so the API is not hammered.
        time.sleep(3)

    # Write to a csv file
    d = {
        'id': ids,
        'timestamp': timestamps,
        'author': authors,
        'score': scores,
        'comments': comments,
        'title': titles,
        'text': texts,
    }
    df = pd.DataFrame(d)
    df.to_csv(filename, index=False)

In [4]:
# Step 4: Calling the function to retrieve the data and save.
# Note: In order not to overuse the API, the entire time period
# is split up in different subsets
# Note 2: The resulting subfiles are not in the GitHub repo because
# they are excluded via the .gitignore.

# Each entry is (start epoch, end epoch, output CSV); the trailing comment
# is the subset number.
time_windows = [
    (1451606400, 1467331200, "redditjanjun2016.csv"),    # 1
    (1467331200, 1483228800, "redditjuldec2016.csv"),    # 2
    (1483228800, 1498867200, "redditjanjun2017.csv"),    # 3
    (1498867200, 1514764800, "redditjuldec2017.csv"),    # 4
    (1514764800, 1528963814, "redditjanjun2018a.csv"),   # 5a
    (1528963814, 1530403200, "redditjanjun2018b.csv"),   # 5b
    (1530403200, 1546300800, "redditjuldec2018.csv"),    # 6
    (1546300800, 1561939200, "redditjanjun2019.csv"),    # 7
    (1561939200, 1577836800, "redditjuldec2019.csv"),    # 8
    (1577836800, 1593561600, "redditjanjun2020.csv"),    # 9
    (1593561600, 1609459200, "redditjuldec2020.csv"),    # 10
]

# Run the subsets one after another, in the order listed.
for window_start, window_end, csv_name in time_windows:
    scraping(window_start, window_end, filename=csv_name)

In [5]:
# Step 5: Concatenating and writing to a single file
# Base names (without the ".csv" suffix) of the subset files, in the order
# in which they are concatenated.
datasets = [
    'redditjanjun2016', 'redditjuldec2016',
    'redditjanjun2017', 'redditjuldec2017',
    'redditjanjun2018a', 'redditjanjun2018b', 'redditjuldec2018',
    'redditjanjun2019', 'redditjuldec2019',
    'redditjanjun2020', 'redditjuldec2020',
]

# Read every subset first and concatenate once: growing a DataFrame inside
# a loop re-copies all previously collected rows on each iteration.
final_df = pd.concat([pd.read_csv(name + '.csv') for name in datasets])

# The row index is written as the first column (pandas default).
final_df.to_csv('intermediate_dataset.csv')

In [6]:
# Step 6: Edit variables
# Reload the concatenated data, using the first CSV column as the index.
df = pd.read_csv('intermediate_dataset.csv', index_col=0)

# Cast the identifier to text and the epoch timestamp to an integer.
for column, dtype in (('id', str), ('timestamp', int)):
    df[column] = df[column].astype(dtype)

def epoch_to_time(elem):
    """Convert a Unix epoch timestamp to a naive ``datetime`` in UTC.

    Args:
        elem: Seconds since the Unix epoch.

    Returns:
        A ``datetime.datetime`` without ``tzinfo`` holding the UTC wall-clock
        time for ``elem``.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12. Convert
    # with an explicit UTC zone and drop the tzinfo again so the result is
    # the same naive value as before.
    aware = datetime.datetime.fromtimestamp(elem, tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)

def time_to_date(elem):
    """Return the calendar date of ``elem`` by calling its ``date()`` method."""
    return elem.date()

# Derive a datetime column from the epoch timestamps, then the calendar day.
df['Time'] = df['timestamp'].apply(epoch_to_time)
df['Day'] = df['Time'].apply(time_to_date)

# Cast the remaining columns to their final types, one column at a time.
for column, dtype in (('author', str), ('score', int),
                      ('comments', int), ('text', str)):
    df[column] = df[column].astype(dtype)

# Save the cleaned dataset without the row index.
df.to_csv('df_final.csv', index=False)

In [7]:
# Step 7: Zipping the file (done manually)

# Part II: Coindesk data
Powered by Coindesk (https://www.coindesk.com/price/bitcoin).

In [8]:
# Step 1: Import Modules
import requests
import json
import pandas as pd

In [9]:
# Step 2: Define the Function 
# Pass the start and end dates as strings in YYYY-MM-DD format
def get_bitcoin_prices(start, end):
    """Fetch historical Bitcoin closing prices from the Coindesk API.

    Requests the ``bpi/historical/close.json`` endpoint for the given range,
    prints the request URL and returns the ``bpi`` entry of the decoded JSON
    response.

    Args:
        start: Start date as a string; the call in this file uses the
            ``YYYY-MM-DD`` format.
        end: End date as a string in the same format.

    Returns:
        Whatever the JSON response stores under its ``bpi`` key.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
        KeyError: If the decoded response has no ``bpi`` key.
    """
    url = 'https://api.coindesk.com/v1/bpi/historical/close.json'
    first = '?start=' + start
    last = '&end=' + end
    full_url = url + first + last
    print(full_url)
    # Without a timeout a stalled connection would block indefinitely.
    r = requests.get(full_url, timeout=30)
    # Surface HTTP errors directly instead of trying to parse an error page
    # as JSON.
    r.raise_for_status()
    data = json.loads(r.text)
    prices = data['bpi']
    return prices

In [10]:
# Step 3: Call the function and write the result to a .csv file
prices = get_bitcoin_prices('2016-01-01', '2020-12-31')

# One row per (date, price) pair, indexed by the date string.
price_rows = list(prices.items())
df = pd.DataFrame(price_rows, columns=['Date', 'BPI']).set_index('Date')
df.to_csv('bpi.csv', index=True)

https://api.coindesk.com/v1/bpi/historical/close.json?start=2016-01-01&end=2020-12-31


# Done!