This notebook scrapes posts and comments from Reddit. I followed [this video](https://www.youtube.com/watch?v=gIZJQmX-55U) to obtain the Reddit client ID and client secret.

In [1]:
from google.colab import userdata
import os

# Read the Reddit API credentials through google.colab's `userdata` so they
# are never written into the notebook itself.
REDDIT_ID, REDDIT_SECRET = userdata.get('REDDIT_ID'), userdata.get('REDDIT_SECRET')

In [None]:
from google.colab import drive
# Output directory for the scraped TSV files; it lives inside the Drive mount
# created on the next line.
save_path = '/content/drive/MyDrive'
drive.mount('/content/drive')

In [None]:
!pip install praw

In [4]:
from datetime import datetime
import numpy as np, pandas as pd
import praw
from praw.models import MoreComments

# Process post
We split a full post into paragraphs, and then split each paragraph into chunks that are at most 10 sentences long. We keep the title, and keep only the first 99 such chunks of a post.

In [5]:
def process(id, title, text, max_text_split=99, max_sentences=10):
    """Turn one post into tab-separated rows: the title, then text chunks.

    The text is split into paragraphs on newlines and each paragraph into
    sentences on '.'. Consecutive sentences of a paragraph are grouped into
    chunks of at most ``max_sentences``; a chunk never spans two paragraphs.
    Sentences inside a chunk are re-joined with '. ', so the chunk carries
    no trailing period.

    Args:
        id: Post identifier, used as the prefix of every row id.
        title: Post title; always emitted as the ``{id}_00`` row.
        text: Post body.
        max_text_split: Maximum number of text chunks to keep (the title row
            is not counted).
        max_sentences: Maximum number of sentences per chunk.

    Returns:
        A list of newline-terminated strings, each made of a row id
        (``{id}_NN``, NN zero-padded to two digits), a tab, and the content.
    """
    # The rows are tab-separated, so a tab inside the content would create an
    # extra column. Replace tabs with spaces before building any row.
    title = title.replace('\t', ' ')
    text = text.replace('\t', ' ')

    result_list = [f'{id}_00\t{title.strip()}\n']
    split_id = 1  # 00 is reserved for the title
    for paragraph in text.split('\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        if split_id > max_text_split:
            break
        sentences = [part.strip() for part in paragraph.split('.') if part.strip()]
        for start in range(0, len(sentences), max_sentences):
            if split_id > max_text_split:
                break
            split_id_str = str(split_id).zfill(2)
            split_text = '. '.join(sentences[start:start + max_sentences]) + '\n'
            result_list.append(f'{id}_{split_id_str}\t{split_text}')
            split_id += 1
    return result_list

We split a comment into paragraphs, and then split each paragraph into chunks that are at most 10 sentences long. Comments have no title, so we keep only the first 100 such chunks of a comment.

In [6]:
def process_comment(id, body, max_text_split=100, max_sentences=10):
    """Turn one comment into tab-separated rows of text chunks.

    The body is split into paragraphs on newlines and each paragraph into
    sentences on '.'. Consecutive sentences of a paragraph are grouped into
    chunks of at most ``max_sentences``; a chunk never spans two paragraphs.
    Sentences inside a chunk are re-joined with '. ', so the chunk carries
    no trailing period.

    Args:
        id: Comment identifier, used as the prefix of every row id.
        body: Comment text.
        max_text_split: Maximum number of chunks to keep.
        max_sentences: Maximum number of sentences per chunk.

    Returns:
        A list of newline-terminated strings, each made of a row id
        (``{id}_NN``, NN zero-padded to two digits and starting at 00), a tab,
        and the chunk. The list is empty for comments that start with the
        r/abusiverelationships "Thank you for posting" boilerplate.
    """
    result_list = []
    # Skip the boilerplate reply (presumably posted automatically by the
    # subreddit -- confirm); it carries no user-written content.
    if body.startswith("Thank you for posting in r/abusiverelationships."):
        return result_list

    # The rows are tab-separated, so a tab inside the content would create an
    # extra column. Replace tabs with spaces before building any row.
    body = body.replace('\t', ' ')

    split_id = 0
    for paragraph in body.split('\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        # Ids start at 0, so the limit is reached once split_id equals
        # max_text_split. Using '>' here would keep one chunk too many.
        if split_id >= max_text_split:
            break
        sentences = [part.strip() for part in paragraph.split('.') if part.strip()]
        for start in range(0, len(sentences), max_sentences):
            if split_id >= max_text_split:
                break
            split_id_str = str(split_id).zfill(2)
            split_text = '. '.join(sentences[start:start + max_sentences]) + '\n'
            result_list.append(f'{id}_{split_id_str}\t{split_text}')
            split_id += 1
    return result_list

In [7]:
# Reddit API client used by the scraping functions defined below.
reddit = praw.Reddit(
    client_id=REDDIT_ID,
    client_secret=REDDIT_SECRET,
    user_agent='Reddit Scraper',
    check_for_async=False,
)

In [8]:
def scrape(subreddit_name, version, num_of_posts=900):
    """Write the newest posts of a subreddit to a TSV file under ``save_path``.

    Iterates ``subreddit.new(limit=num_of_posts)`` and writes the rows that
    ``process`` returns for each submission (title plus text chunks) below an
    ``id<TAB>text`` header. Uses the module-level ``reddit`` client and
    ``save_path`` directory.

    Also prints the creation time of the first submission yielded (labelled
    "last post") and of the final submission yielded (labelled "first post"),
    formatted in the runtime's local timezone.

    Args:
        subreddit_name: Name of the subreddit, without the ``r/`` prefix.
        version: Free-form tag that becomes part of the output file name.
        num_of_posts: Maximum number of submissions to request.
    """
    print(f'Subreddit: {subreddit_name}')
    subreddit = reddit.subreddit(subreddit_name)
    out_path = f'{save_path}/subreddit_{subreddit_name}_{num_of_posts}_{version}.tsv'
    final_utc = None  # creation time of the most recently yielded submission
    # Explicit UTF-8: post text can hold characters that a platform-default
    # encoding cannot represent.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('id\ttext\n')
        for submission in subreddit.new(limit=num_of_posts):
            utc = submission.created_utc
            if final_utc is None:
                print(f'\tTime of last post: {datetime.fromtimestamp(utc):%Y-%m-%d %H:%M:%S}')
            final_utc = utc
            for result in process(submission.id, submission.title, submission.selftext):
                f.write(result)
    # Report after the loop so the line is still printed when the listing
    # yields fewer than num_of_posts submissions.
    if final_utc is not None:
        print(f'\tTime of first post: {datetime.fromtimestamp(final_utc):%Y-%m-%d %H:%M:%S}')

In [9]:
def scrape_comments(subreddit_name, version, num_of_posts=600):
    """Write the comments of a subreddit's newest posts to a TSV file.

    Iterates ``subreddit.new(limit=num_of_posts)`` and, for every comment in
    ``submission.comments.list()``, writes the rows that ``process_comment``
    returns below an ``id<TAB>text`` header. ``MoreComments`` placeholders are
    skipped rather than expanded. Uses the module-level ``reddit`` client and
    ``save_path`` directory.

    Also prints the creation time of the first submission yielded (labelled
    "last post") and of the final submission yielded (labelled "first post"),
    formatted in the runtime's local timezone.

    Args:
        subreddit_name: Name of the subreddit, without the ``r/`` prefix.
        version: Free-form tag that becomes part of the output file name.
        num_of_posts: Maximum number of submissions to request.
    """
    print(f'Subreddit: {subreddit_name}')
    subreddit = reddit.subreddit(subreddit_name)
    out_path = f'{save_path}/subreddit_{subreddit_name}_{num_of_posts}_{version}_comments.tsv'
    final_utc = None  # creation time of the most recently yielded submission
    # Explicit UTF-8: comment text can hold characters that a platform-default
    # encoding cannot represent.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('id\ttext\n')
        for submission in subreddit.new(limit=num_of_posts):
            utc = submission.created_utc
            if final_utc is None:
                print(f'\tTime of last post: {datetime.fromtimestamp(utc):%Y-%m-%d %H:%M:%S}')
            final_utc = utc
            for comment in submission.comments.list():
                if isinstance(comment, MoreComments):
                    continue
                for result in process_comment(comment.id, comment.body):
                    f.write(result)
    # Report after the loop so the line is still printed when the listing
    # yields fewer than num_of_posts submissions.
    if final_utc is not None:
        print(f'\tTime of first post: {datetime.fromtimestamp(final_utc):%Y-%m-%d %H:%M:%S}')

# Scrape r/SuicideWatch and r/abusiverelationships

In [None]:
# Scrape the newest posts of both subreddits into "v2" TSV files.
for _subreddit_name in ('SuicideWatch', 'abusiverelationships'):
    scrape(_subreddit_name, 'v2')

Subreddit: SuicideWatch
	Time of last post: 2024-09-20 12:53:18
	Time of first post: 2024-09-17 21:21:40
Subreddit: abusiverelationships
	Time of last post: 2024-09-20 12:12:02
	Time of first post: 2024-08-31 08:01:33


In [10]:
# Scrape the comments of the newest r/abusiverelationships posts ("v2" run).
scrape_comments(subreddit_name='abusiverelationships', version='v2')

Subreddit: abusiverelationships
	Time of last post: 2024-09-21 09:46:33
	Time of first post: 2024-09-09 00:35:04
