# Import required libraries

In [5]:
import math
import praw
import json
import requests
import itertools
from tqdm import tqdm
import numpy as np
import time
from datetime import datetime, timedelta
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

Text clearing function

* Lowercase the text
* Remove unicode characters
* Remove stop words
* Remove mentions
* Remove URL
* Remove Hashtags
* Remove ticks and the next character
* Remove punctuations
* Remove numbers
* Replace the over spaces

In [118]:
def clear_text(x):
    x = x.lower()
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

Data request function

In [3]:
def make_request(uri, max_retries = 5):
    def fire_away(uri):
        response = requests.get(uri)
        assert response.status_code == 200
        return json.loads(response.content)
    current_tries = 1
    while current_tries < max_retries:
        try:
            time.sleep(1)
            response = fire_away(uri)
            return response
        except:
            time.sleep(1)
            current_tries += 1
    return fire_away(uri)

Collecting the required data
* **title**: The title of the submission
* **score**: The number of upvotes for the submission
* **id**: ID of the submission
* **url**: The URL the submission links to
* **num_comments**: The number of comments on the submission
* **created_utc**: Time the submission was created
* **selftext**: The submissions’ selftext        
* **author**: Provides an instance of Redditor
* **is_self**: Whether or not the submission is a selfpost (text-only)
* **subreddit**: Provides an instance of Subreddit
* **cleared_text**: Cleared selftext,
* **link_flair_text**: The link flair’s text content

In [5]:
def map_posts(posts):
    return list(map(lambda post: {
        'title': post['title'],
        'score': post['score'],
        'id': post['id'],
        'url': post['url'],
        'num_comments': post['num_comments'],
        'created_utc': post['created_utc'],
        'selftext': post.get('selftext', ''),        
        'author': post['author'],
        'is_self': post['is_self'],
        'subreddit': post['subreddit'],
        'cleared_text': '',
        'link_flair_text': post.get('link_flair_text', '')
    }, posts))

Function for selecting documents with a number of characters greater than the threshold

In [6]:
def get_post(post_collections):
    post_collections_cleared = []
    for post in post_collections:
        if post['is_self'] and len(post['selftext'])>=2000: 
            cleared_text = clear_text(post['selftext'])
            if len(cleared_text)>=2000:
                post['cleared_text'] = cleared_text
                post_collections_cleared.append(post)
    return post_collections_cleared

A function that returns all the documents of a certain community for a certain period

In [7]:
def pull_posts_for(subreddit, start_at, end_at):
    SIZE = 100
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'
    it = 0
    post_collections = map_posts(make_request(URI_TEMPLATE.format(subreddit, start_at, end_at, SIZE))['data'])
    n_1 = len(post_collections)
    last = post_collections[-1]
    post_collections = get_post(post_collections)
    n_2 = len(post_collections)
    
    while n_1 == SIZE and n_2 <= 10000:
        new_start_at = last['created_utc'] + (5)
        more_posts = map_posts(make_request(URI_TEMPLATE.format(subreddit, new_start_at, end_at, SIZE))['data'])
        n_1 = len(more_posts)
        last = more_posts[-1]
        more_posts = get_post(more_posts)        
        post_collections.extend(more_posts)
        n_2 = len(post_collections)
        it+=1
    print(it, n_1, n_2)
    return post_collections

Selected communities, because they are dominated by documents with a large number of characters

In [8]:
set_subreddit = [
    'relationships',
    'love',
    'family',
    'Marriage',
    'Parenting',
    'askwomenadvice',
    'DecidingToBeBetter',
    'depression',
    'SuicideWatch',
    'TwoXChromosomes'
]

# Data collection
The request returns no more than 100 documents, we make requests until we get the required number of documents

In [10]:
%%time

post_collections = []
end_at = math.ceil(datetime.utcnow().timestamp())
start_at = math.floor((datetime.utcnow() - timedelta(days=730)).timestamp())
URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'
SIZE = 100
for subreddit in set_subreddit:
    it = 0
    n_3 = 0
    more_posts = map_posts(make_request(URI_TEMPLATE.format(subreddit, start_at, end_at, SIZE))['data'])
    n_1 = len(more_posts)
    last = more_posts[-1]
    n_temp = len(post_collections)
    post_collections.extend(get_post(more_posts))
    n_2 = len(post_collections)
    n_3+=(n_2-n_temp)
    while n_1 == SIZE and n_3<=10000:
        new_start_at = last['created_utc'] + (5)
        more_posts = map_posts(make_request(URI_TEMPLATE.format(subreddit, new_start_at, end_at, SIZE))['data'])
        n_1 = len(more_posts)
        last = more_posts[-1]
        n_temp = len(post_collections)
        post_collections.extend(get_post(more_posts))
        n_2 = len(post_collections)
        n_3+=(n_2-n_temp)
        it+=1
    print(subreddit, it*SIZE+n_1, n_2, n_3)

relationships 19200 10026 10026
love 32500 20035 10009
family 17500 30037 10002
Marriage 25400 40039 10002
Parenting 21900 50043 10004
askwomenadvice 27600 60080 10037
DecidingToBeBetter 26100 70108 10028
depression 29600 80143 10035
SuicideWatch 31300 90159 10016
TwoXChromosomes 31100 100180 10021
Wall time: 2h 25min 15s


In [115]:
posts_data = pd.DataFrame(post_collections)

# Export data to csv

In [119]:
posts_data['title_cleared'] = posts_data['title'].apply(clear_text)
posts_data.to_csv("posts.csv")