# Average words per sentence of different subreddits

### Background
An analysis of the average word count per sentence of comments from the most popular Subreddits. 

wps = words per sentence

### Subreddits analyzed (top 20 most subscribed as of April 2019):

### Contents
1. [Setup](#1.-Setup)
2. [Average Words Per Sentence for Comments](#2-Average-Words-Per-Sentence-for-Comments)
3. [Average wps for Submission](#3.-Average-wps-for-submission)
4. [Average wps for Subreddit](#4.-Average-wps-for-subreddit)
5. [Data Visualization](#5.-Data-Visualization)

## 1. Setup

In [1]:
import praw
import pandas as pd
import datetime
import json
import time
import string
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import defaultdict

In [2]:
# Load credfile and display when last updated
credfile = 'credfile.json'
credfile_prefix = ''

# Read credentials to a dictionary
with open(credfile) as fh:
    creds = json.loads(fh.read())

print(f"[{datetime.datetime.now()}]" + f"{credfile} {'.' * 10} is being used as credfile")

reddit = praw.Reddit(client_id=creds['client_id'],
                     client_secret=creds['client_secret'],
                     user_agent=creds['user_agent']
                    )

print(reddit.read_only)  # Output: True

[2019-04-13 14:32:17.493945]credfile.json .......... is being used as credfile
True


In [3]:
subreddit_list = [#'AskReddit', 
#                   'funny', 
#                   'gaming', 
#                   'pics', 
#                   'science', 
#                   'worldnews', 
#                   'todayilearned', 
#                   'movies', 
#                   'soccer', 
#                   'videos', 
#                   'Music', 
#                   'IAmA', 
#                   'gifs', 
#                   'news', 
#                   'EarthPorn', 
#                   'askscience', 
                  'blog', 
                  'Showerthoughts', 
                  'explainlikeimfive', 
                  'books']

## 2. Average Words Per Sentence for Comments

In [4]:
def standardize_words(word_list):
    """
    Remove stopwords, one-caracter words, and convert words to lowercase.
    Given a list of tokenized words)
    """
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in word_list]
    
    # convert to lower case
    stripped_lower = [word.lower() for word in stripped]

    # Remove one character words
    text_tokenized = [word for word in stripped_lower if len(word) > 2]

    # remove stop words
    stop_words = set(stopwords.words('english'))
    text_stopped = [w for w in text_tokenized if not w in stop_words]
    
    return text_stopped

In [5]:
def avg_sentence_len_comment(comment):
    
    sentences = sent_tokenize(comment)
    words = word_tokenize(comment)   

    cleaned_list = [ x for x in words if len(x) > 2 ]
    
    try:
        avg_sent_len = round(len(cleaned_list) / len(sentences), 2)
    except ZeroDivisionError:
        avg_sent_len = 0
        
    return avg_sent_len

In [16]:
submission = reddit.submission(id='ba4dyn')

# iterate over top comments in the submission and\= create list of sentences
submission.comments.replace_more(limit=None)

total_submission_len = 0
for top_level_comment in submission.comments[1:]: # Skip AutoMod comment
    avg_sent_len = avg_sentence_len_comment(top_level_comment.body.replace('“', '').replace('”', ''))         
    total_submission_len += avg_sent_len

avg_submission_len = total_submission_len / len(submission.comments[1:])

print("Submission comments average wps: ", avg_submission_len)
print("Submission upvote ratio: ", submission.upvote_ratio)

Submission comments average wps:  11.932023809523804
Submission upvote ratio:  0.88


## 3. Average wps for Submission

#### Test on one submission using `id`

In [8]:
def avg_wps_submission(comment_list):
    """
    Iterate over list of comments in a submission and calculate the average words
    per sentence of all top-level comments
    """
    total_submission_sentence_len = 0
    for comment in comment_list:
        avg_sent_len = avg_sentence_len_comment(comment.body.replace('“', '').replace('”', ''))                 
        total_submission_sentence_len += avg_sent_len

    try:
        avg_wps_submission = total_submission_sentence_len / len(comment_list)
    except ZeroDivisionError:
        print("Submission has 0 comments, continuing...")
        avg_wps_submission = 0
        
    return avg_wps_submission

## 4. Average wps for Subreddit

In [9]:
def avg_wps_subreddit(submission_list):
    """
    Iterate over all x number of submissions and get the average words 
    per sentence for the entire subreddit
    """
    total_wps_subreddit = 0
    for submission in submission_list:
        # iterate over top comments in the submission and create list of sentences
        submission.comments.replace_more(limit=None)
        avg_wps_per_submission = avg_wps_submission(list(submission.comments))

        # keep running total of totals for entire subreddit
        total_wps_subreddit += avg_wps_per_submission
    
    avg_wps_subreddit = total_wps_subreddit / len(submission_list)
    
    return avg_wps_subreddit

In [29]:
def get_submission_data(submission_list):
    nested_dict = lambda: defaultdict(nested_dict)
    submission_data_dict = nested_dict()
    for submission in submission_list:
        # iterate over top comments in the submission and create list of sentences
        submission.comments.replace_more(limit=None)
        submission_data_dict['id']['upvotes'] = submission.ups
        submission_data_dict['id']['upvotes_ratio'] = submission.upvote_ratio
        submission_data_dict['id']['date'] = datetime.datetime.fromtimestamp(int(submission.created_utc)).strftime("%m/%d/%y %H:%M:%S")
        submission_data_dict['id']['comments'] = len(list(submission.comments))
        submission_data_dict['id']['avg_comment_wps'] = round(avg_wps_submission(list(submission.comments)), 2)
        
    return submission_data_dict

#### Iterate over the list of Subreddits
`lt` is the number of submissions returned per subreddit

In [36]:
def iterate_over_subs(subreddit_list, method, limit):
    
    lt = limit
    nested_dict = lambda: defaultdict(nested_dict)
    submission_metadata = nested_dict()
    start = time.time()
    for sub in subreddit_list:
        subreddit = reddit.subreddit(sub)
        if method == "hot":
            submission_list = list(subreddit.hot(limit=limit))
        elif method == "controversial":
            submission_list = list(subreddit.controversial(limit=limit))
        elif method == "top":
            submission_list = list(subreddit.top(limit=limit))
        else:
            raise ValueError
            print("Please select a valid type from: top, hot, controversial")
        submission_metadata[sub] = get_submission_data(submission_list)

    end = time.time()
    net = net = end-start

    print(f"Runtime: {round(net, 2)}s")
    
    return submission_metadata

In [None]:
submission_data = iterate_over_subs(subreddit_list, "hot", 1)

In [25]:
lt = 1

nested_dict = lambda: defaultdict(nested_dict)
submission_metadata = nested_dict()
start = time.time()
for sub in subreddit_list:
    subreddit = reddit.subreddit(sub)
    submission_list = list(subreddit.hot(limit=lt))
    submission_metadata[sub] = get_submission_data(submission_list)
    
end = time.time()
net = net = end-start

print(f"Runtime: {round(net, 2)}s")

Runtime: 55.31s


In [27]:
def default_to_regular(d):
    if isinstance(d, defaultdict):
        d = {k: default_to_regular(v) for k, v in d.items()}
    return d

In [28]:
submission_metadata_dict = default_to_regular(submission_metadata)
submission_metadata_dict

{'blog': {'id': {'upvotes': 37278,
   'upvotes_ratio': 0.84,
   'date': '04/08/19 10:34:10',
   'comments': 306,
   'avg_comment_wps': 9.536797385620917}},
 'Showerthoughts': {'id': {'upvotes': 4910,
   'upvotes_ratio': 0.99,
   'date': '02/13/18 17:46:38',
   'comments': 131,
   'avg_comment_wps': 8.59442748091603}},
 'explainlikeimfive': {'id': {'upvotes': 6211,
   'upvotes_ratio': 0.94,
   'date': '04/13/19 02:25:24',
   'comments': 14,
   'avg_comment_wps': 16.917142857142856}},
 'books': {'id': {'upvotes': 108,
   'upvotes_ratio': 0.99,
   'date': '03/30/19 09:10:33',
   'comments': 11,
   'avg_comment_wps': 10.083636363636364}}}

In [33]:
submission_metadata_dict['books']

{'id': {'upvotes': 108,
  'upvotes_ratio': 0.99,
  'date': '03/30/19 09:10:33',
  'comments': 11,
  'avg_comment_wps': 10.083636363636364}}

In [14]:
avg_sentence_length_df.to_csv('avg_wps_hottest_20_n1.csv', index=False)

## 5. Data Visualization