# Getting Reddit images and comments
The Pushshift API is the easiest way to get data from Reddit. I used it to scrape images and comments from the r/curlyhair subreddit. These images were flaired either "before and after" or "hair victory." I got images from as far back as images were flaired.

One of the rules of the subreddit is that if you post a picture, you must post a comment describing your hair routine. Using the same API, I grabbed all comment data associated with each image, so that I could later pull out those comments.

In [None]:
import pandas as pd
import requests
import json
import csv
import datetime
import numpy as np

In [None]:
# Names of the subreddits to query (without the "r/" prefix)
subreddits = [
    "curlyhair",
]

# Get posts and image urls

In [None]:
# Query window, given as Unix epoch seconds
# (converted with https://www.unixtimestamp.com/index.php)
#   lower bound: 5/29/2016 - earliest post with flair data, that I can tell
#   upper bound: 9/13/2020
after, before = "1464480000", "1600000000"

def getPushshiftData(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        query: Currently unused; kept so that existing callers keep working.
        after: Lower bound of the time window, in Unix epoch seconds.
        before: Upper bound of the time window, in Unix epoch seconds.
        sub: Name of the subreddit to search.

    Returns:
        The value stored under the 'data' key of the JSON response.

    Raises:
        requests.RequestException: If the request times out, cannot be
            completed, or the server answers with an HTTP error status.
    """
    url = (
        'https://api.pushshift.io/reddit/submission/search/'
        f'?after={after}&before={before}&subreddit={sub}'
    )
    print(url)

    # Without a timeout a stalled connection would block the scrape forever.
    r = requests.get(url, timeout=60)
    # Fail loudly on rate-limit / server errors instead of trying to parse
    # an error page as JSON.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

def collectSubData(subm):
    """Store the fields of interest from one submission in the global subStats.

    The entry is keyed by the submission id and holds a one-element list
    whose single item is the tuple
    (id, url, permalink, title, author, subreddit, created, num_comments, flair).
    """
    post_title = subm['title']
    poster = subm['author']
    post_id = subm['id']
    image_url = subm['url']
    subreddit_name = subm['subreddit']
    # fromtimestamp() without a tz argument yields a naive local-time datetime
    posted_at = datetime.datetime.fromtimestamp(subm['created_utc'])
    comment_count = subm['num_comments']
    # If the picture doesn't have a flair, supply a NaN instead
    flair_text = subm.get('link_flair_text', np.nan)
    permalink = subm['permalink']

    record = (
        post_id,
        image_url,
        permalink,
        post_title,
        poster,
        subreddit_name,
        posted_at,
        comment_count,
        flair_text,
    )
    subStats[post_id] = [record]

import time

# Number of attempts made for one page before paging stops
MAX_RETRIES = 5
# Base pause in seconds between attempts; multiplied by the attempt number
RETRY_DELAY = 5


def _fetchWithRetries(query, after, before, sub):
    """Call getPushshiftData up to MAX_RETRIES times.

    Returns the fetched page, or None when every attempt raised.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return getPushshiftData(query, after, before, sub)
        except Exception as e:
            print(f"ERROR - request failed (attempt {attempt} of {MAX_RETRIES}) - {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY * attempt)
    return None


def updateSubs_file(filename):
    """Write every submission collected in subStats to ./<filename>.csv.

    The file is opened in 'w' mode so that re-running the cell replaces the
    previous export instead of appending a second header row and duplicate
    submissions to it.
    """
    upload_count = 0
    location = "./"
    csv_path = location + filename + '.csv'
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        a = csv.writer(csv_file, delimiter=',')
        headers = ['sub_id','image_url','permalink','text','author','subreddit','created','n_comments','flair']
        a.writerow(headers)
        for sub_id in subStats:
            # Each entry is a one-element list holding the row tuple
            a.writerow(subStats[sub_id][0])
            upload_count += 1

        print(str(upload_count) + " submissions have been uploaded")


# Here, I am only querying one subreddit, so this only executes one loop
for sub in subreddits:

    print(sub)
    query = ""
    subCount = 0
    subStats = {}
    # Page with a cursor of its own so that 'after' keeps its original value
    # and every subreddit starts from the same date.
    cursor = after

    data = getPushshiftData(query, cursor, before, sub)
    # Will run until all posts have been gathered
    # from the 'after' date up until before date
    while data:
        for submission in data:
            collectSubData(submission)
            subCount += 1
        cursor = data[-1]['created_utc']
        # A failing request is retried a bounded number of times instead of
        # being swallowed and repeated forever.
        data = _fetchWithRetries(query, cursor, before, sub)

    if data is None:
        print("WARNING - gave up after repeated request failures; saving the submissions gathered so far")

    updateSubs_file(sub)

# Get comments

In [None]:
# Query window, given as Unix epoch seconds
# (converted with https://www.unixtimestamp.com/index.php)
#   lower bound: 5/29/2016 - earliest post with flair data, that I can tell
#   upper bound: 9/13/2020
after, before = "1464480000", "1600000000"

def getPushshiftData(query, after, before, sub):
    """Fetch one page (size=1000) of comments from the Pushshift search endpoint.

    Args:
        query: Currently unused; kept so that existing callers keep working.
        after: Lower bound of the time window, in Unix epoch seconds.
        before: Upper bound of the time window, in Unix epoch seconds.
        sub: Name of the subreddit to search.

    Returns:
        The value stored under the 'data' key of the JSON response.

    Raises:
        requests.RequestException: If the request times out, cannot be
            completed, or the server answers with an HTTP error status.
    """
    url = (
        'https://api.pushshift.io/reddit/search/comment/'
        f'?size=1000&after={after}&before={before}&subreddit={sub}'
    )
    print(url)

    # Without a timeout a stalled connection would block the scrape forever.
    r = requests.get(url, timeout=60)
    # Fail loudly on rate-limit / server errors instead of trying to parse
    # an error page as JSON.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

def collectSubData(subm):
    """Store the fields of interest from one comment in the global subStats.

    The entry is keyed by the comment id and holds a one-element list whose
    single item is the tuple
    (id, link_id, parent_id, body, author, subreddit, created, is_submitter, permalink).
    """
    comment_text = subm['body']
    commenter = subm['author']
    comment_id = subm['id']
    parent = subm['parent_id']
    subreddit_name = subm['subreddit']
    # fromtimestamp() without a tz argument yields a naive local-time datetime
    posted_at = datetime.datetime.fromtimestamp(subm['created_utc'])
    thread_id = subm['link_id']

    # Both fields are optional in the payload; fall back to NaN when absent
    by_submitter = subm.get('is_submitter', np.nan)
    permalink = subm.get('permalink', np.nan)

    record = (
        comment_id,
        thread_id,
        parent,
        comment_text,
        commenter,
        subreddit_name,
        posted_at,
        by_submitter,
        permalink,
    )
    subStats[comment_id] = [record]

import time

# Number of attempts made for one page before paging stops
MAX_RETRIES = 5
# Base pause in seconds between attempts; multiplied by the attempt number
RETRY_DELAY = 5


def _fetchWithRetries(query, after, before, sub):
    """Call getPushshiftData up to MAX_RETRIES times.

    Returns the fetched page, or None when every attempt raised.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return getPushshiftData(query, after, before, sub)
        except Exception as e:
            print(f"ERROR - request failed (attempt {attempt} of {MAX_RETRIES}) - {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY * attempt)
    return None


def updateSubs_file(filename):
    """Write every comment collected in subStats to ./comments_<filename>.csv.

    An existing file of the same name is overwritten.
    """
    upload_count = 0
    location = "./"
    csv_path = location + 'comments_' + filename + '.csv'
    with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
        a = csv.writer(csv_file, delimiter=',')
        headers = ['sub_id','link_id','parent_id','text','author','subreddit','created','is_subm','permalink']
        a.writerow(headers)
        for comment_id in subStats:
            # Each entry is a one-element list holding the row tuple
            a.writerow(subStats[comment_id][0])
            upload_count += 1

        print(str(upload_count) + " submissions have been downloaded")


for sub in subreddits:

    print(sub)

    query = ""
    subCount = 0
    subStats = {}
    # Page with a cursor of its own so that 'after' keeps its original value
    # and every subreddit starts from the same date.
    cursor = after

    data = getPushshiftData(query, cursor, before, sub)
    # Will run until all posts have been gathered
    # from the 'after' date up until before date
    while data:
        for submission in data:
            collectSubData(submission)
            subCount += 1
        cursor = data[-1]['created_utc']
        # One failed request no longer aborts the run before anything has
        # been written; it is retried a bounded number of times.
        data = _fetchWithRetries(query, cursor, before, sub)

    if data is None:
        print("WARNING - gave up after repeated request failures; saving the comments gathered so far")

    updateSubs_file(sub)

# Get images
Now that I have the image URLs, I have to go grab the actual images. Note that some images, such as those hosted on Imgur.com rather than uploaded directly to Reddit, could not be scraped with this code. I chose to skip those images.

In [None]:
from PIL import Image 
import os, io, hashlib

# Define a function to grab the images
def persist_image(folder_path:str,url:str):
    """Download the image at url and store it in folder_path as a JPEG.

    The file name is the first 10 hex characters of the SHA-1 digest of the
    downloaded bytes plus '.jpg'. A saved file smaller than 5000 bytes is
    deleted again.

    Args:
        folder_path: Existing directory the image is written to.
        url: Address of the image to download.

    Returns:
        The path of the saved file, or None when the download failed, the
        content could not be saved as an image, or the saved file was
        discarded for being too small.
    """
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save; stopping here
        # avoids a misleading second "Could not save" error.
        return None

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        if os.path.getsize(file_path) < 5000:
            os.remove(file_path)
        else:
            print(f"SUCCESS - saved {url} - as {file_path}")
            return file_path
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
    return None

### I needed a way to connect the image filenames with the URLS so that I could keep track of which image went with which comments.
Note that this chunk of code was added later than the code above, and can only be executed once the "curlyhair.csv" file has been created (which happens in another notebook). That CSV file is the master file with the comments, image URLs, poster information, etc.

In [None]:
import os

curly_df = pd.read_csv('curlyhair.csv')

# Only the posts flaired "hair victory" are downloaded
curly_df = curly_df[curly_df['flair'] == 'hair victory']
curly_urls = curly_df['image_url']

# Make sure the output folder exists before opening a file inside it
os.makedirs('hair_images', exist_ok=True)

# Create a file that has a column for the image URL and the path to the downloaded image.
# The with-block closes the file even if a download raises an unexpected error.
with open('hair_images/image_urls.dat', 'w', encoding='utf-8') as f:
    f.write('#           image_url                         file_path\n')

    for url in curly_urls:
        file_path = persist_image('./hair_images/', url)
        if file_path is not None:
            f.write(f'{url}    {file_path}\n')
        else:
            f.write(f'{url}    not downloaded\n')

# This doesn't work with imgur hosted files, and sometimes there are "comment" posts which 
# don't have images anyway, so big deal.