# Average words per sentence of different subreddits

### Background
An analysis of the average word count per sentence of comments from the most popular Subreddits.

### Subreddits analyzed (top 20 most subscribed as of April 2019):

* r/mildlyinteresting (listed here, but not present in the `subreddit_list` used by the code below)
* r/funny
* r/AskReddit
* r/gaming
* r/pics
* r/science
* r/worldnews
* r/todayilearned
* r/movies
* r/aww
* r/videos
* r/Music
* r/IAmA
* r/gifs
* r/news
* r/EarthPorn
* r/askscience
* r/blog
* r/Showerthoughts
* r/explainlikeimfive
* r/books

In [1]:
import praw
import pandas as pd
import datetime
import json
import string
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Read the Reddit API credentials from a JSON file and build the PRAW client
credfile = 'credfile.json'
credfile_prefix = ''  # not referenced by any of the visible cells

# Parse the credentials file into a dictionary
with open(credfile) as fh:
    creds = json.load(fh)

# Log which credentials file is in use, prefixed with the current timestamp
print(f"[{datetime.datetime.now()}]" + f"{credfile} {'.' * 10} is being used as credfile")

reddit = praw.Reddit(
    client_id=creds['client_id'],
    client_secret=creds['client_secret'],
    user_agent=creds['user_agent'],
)

print(reddit.read_only)  # Output: True

[2019-04-08 07:15:03.085713]credfile.json .......... is being used as credfile
True


In [3]:
# Subreddits processed by the aggregation loop further down.
# The commented-out entries are skipped on this run — presumably they were
# already covered by an earlier run; confirm before combining result files.
subreddit_list = [#'AskReddit', 
#                   'funny', 
#                   'gaming', 
#                   'pics', 
#                   'science', 
#                   'worldnews', 
#                   'todayilearned', 
#                   'movies', 
#                   'aww', 
#                   'videos', 
#                   'Music', 
                  'IAmA', 
                  'gifs', 
                  'news', 
                  'EarthPorn', 
                  'askscience', 
                  'blog', 
                  'Showerthoughts', 
                  'explainlikeimfive', 
                  'books']

In [4]:
# Handle for r/AskReddit; the aggregation loop below rebinds `subreddit` per entry of subreddit_list.
subreddit = reddit.subreddit('AskReddit')

In [5]:
def standardize_words(word_list):
    """
    Normalise a list of tokenized words.

    Every word has the characters in ``string.punctuation`` removed and is
    converted to lowercase. Words that are two characters or shorter after
    that, and words found in NLTK's English stop-word list, are dropped.

    Returns a new list with the surviving words in their original order.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    english_stop_words = set(stopwords.words('english'))

    kept_words = []
    for raw_word in word_list:
        # strip punctuation first, then lowercase
        word = raw_word.translate(punctuation_remover).lower()

        # skip words of length 0, 1 or 2
        if len(word) <= 2:
            continue

        # skip stop words
        if word in english_stop_words:
            continue

        kept_words.append(word)

    return kept_words

#### Test: iterate over the comments of a single submission

In [6]:
def avg_sentence_len_comment(comment):
    """
    Return the average number of words per sentence in a comment string.

    Sentences are found with ``sent_tokenize`` and words with
    ``word_tokenize``; only tokens longer than two characters are counted
    as words. The result is rounded to two decimals. When no sentence is
    found, 0 is returned.
    """
    sentence_count = len(sent_tokenize(comment))
    counted_tokens = [token for token in word_tokenize(comment) if len(token) > 2]

    # No sentences: report 0 instead of dividing by zero
    if sentence_count == 0:
        return 0

    return round(len(counted_tokens) / sentence_count, 2)

In [7]:
# Fetch one submission by id to try the per-comment metric on it
submission = reddit.submission(id='ba4dyn')

# Resolve the "more comments" placeholders before walking the top-level comments
submission.comments.replace_more(limit=None)

# Skip AutoMod comment (the first top-level comment)
comments_after_first = submission.comments[1:]

total_submission_len = 0
for top_level_comment in comments_after_first:
    # Drop curly double quotes before tokenizing
    unquoted_body = top_level_comment.body.replace('“', '').replace('”', '')
    total_submission_len += avg_sentence_len_comment(unquoted_body)

avg_submission_len = total_submission_len / len(comments_after_first)

print(avg_submission_len)

11.960803571428567


In [9]:
import time

In [None]:
# Number of submissions requested per subreddit through `subreddit.top(limit=lt)`
lt = 20
print("Start: ", datetime.datetime.now())

# One record per subreddit; the DataFrame is built once after the loop
# instead of being grown row by row.
subreddit_records = []

for sub in subreddit_list:
    subreddit = reddit.subreddit(sub)
    total_subreddit_len = 0
    scored_submissions = 0  # submissions that actually contributed to the total
    start = time.time()
    for submission in subreddit.top(limit=lt):
        # Resolve the "more comments" placeholders so only real comments are iterated
        submission.comments.replace_more(limit=None)
        top_level_comments = list(submission.comments)
        if not top_level_comments:
            print("Submission has 0 comments, continuing...")
            continue

        total_submission_len = 0
        for top_level_comment in top_level_comments:
            avg_sent_len = avg_sentence_len_comment(top_level_comment.body.replace('“', '').replace('”', ''))
            total_submission_len += avg_sent_len

        # Divide by the number of comments that were summed. The previous
        # divisor, len(submission.comments[1:]), was one short of the comments
        # iterated and therefore inflated every per-submission mean.
        avg_submission_len = total_submission_len / len(top_level_comments)

        # keep running total of per-submission means for the entire subreddit
        total_subreddit_len += avg_submission_len
        scored_submissions += 1

    # Average over the submissions that were scored, not over `lt`: skipped
    # (comment-less) submissions or a listing shorter than `lt` would
    # otherwise drag the mean down.
    if scored_submissions:
        avg_subreddit_len = total_subreddit_len / scored_submissions
    else:
        avg_subreddit_len = 0  # nothing scored; same fallback as the old 0 / lt
    end = time.time()
    net = end-start
    # Report the subreddit-level mean (previously the last submission's mean was printed)
    print(f"Average submission comment length for {sub}: ", round(avg_subreddit_len, 2))
    print(f"                           Time | {sub}: ", net)

    subreddit_records.append({'subreddit': sub, 'avg_comment_sent_length': avg_subreddit_len})

avg_sentence_length_df = pd.DataFrame(subreddit_records,
                                      columns=['subreddit', 'avg_comment_sent_length'])
print("End:   ", datetime.datetime.now())

Start:  2019-04-08 18:20:47.528881


In [None]:
# Display the per-subreddit results table (one row per entry written by the loop above)
avg_sentence_length_df

In [None]:
# Write the results to CSV in the working directory, without the index column
avg_sentence_length_df.to_csv('avg_sentence_length_df_2.csv', index=False)