# Average words per sentence of different subreddits

### Background
An analysis of the average word count per sentence of comments from the most popular Subreddits.

### Subreddits analyzed (top 20 most subscribed as of April 2019):

* r/mildlyinteresting
* r/funny
* r/AskReddit
* r/gaming
* r/pics
* r/science
* r/worldnews
* r/todayilearned
* r/movies
* r/aww
* r/videos
* r/Music
* r/IAmA
* r/gifs
* r/news
* r/EarthPorn
* r/askscience
* r/blog
* r/Showerthoughts
* r/explainlikeimfive
* r/books

In [1]:
import praw
import pandas as pd
import datetime
import json
import string
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Load credfile and display when last updated
credfile = 'credfile.json'
credfile_prefix = ''

# Read credentials to a dictionary
with open(credfile) as fh:
    creds = json.loads(fh.read())

print(f"[{datetime.datetime.now()}]" + f"{credfile} {'.' * 10} is being used as credfile")

reddit = praw.Reddit(client_id=creds['client_id'],
                     client_secret=creds['client_secret'],
                     user_agent=creds['user_agent']
                    )

print(reddit.read_only)  # Output: True

[2019-04-06 19:06:07.738456]credfile.json .......... is being used as credfile
True


In [11]:
subreddit_list = [#'AskReddit', 
                  'funny', 
                  'gaming', 
                  'pics', 
                  'science', 
                  'worldnews', 
                  'todayilearned', 
                  'movies', 
                  'aww', 
                  'videos', 
                  'Music', 
                  'IAmA', 
                  'gifs', 
                  'news', 
                  'EarthPorn', 
                  'askscience', 
                  'blog', 
                  'Showerthoughts', 
                  'explainlikeimfive', 
                  'books']

In [4]:
subreddit = reddit.subreddit('AskReddit')

In [6]:
def standardize_words(word_list):
    """
    Remove stopwords, one-caracter words, and convert words to lowercase.
    Given a list of tokenized words)
    """
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in word_list]
    
    # convert to lower case
    stripped_lower = [word.lower() for word in stripped]

    # Remove one character words
    text_tokenized = [word for word in stripped_lower if len(word) > 2]

    # remove stop words
    stop_words = set(stopwords.words('english'))
    text_stopped = [w for w in text_tokenized if not w in stop_words]
    
    return text_stopped

#### Test iterate over one submission

In [96]:
def avg_sentence_len_comment(comment):
    
    sentences = sent_tokenize(comment)
    words = word_tokenize(comment)   

    cleaned_list = [ x for x in words if len(x) > 2 ]
    
    avg_sent_len = round(len(cleaned_list) / len(sentences), 2)
    
    return avg_sent_len

In [None]:
submission = reddit.submission(id='ba4dyn')

# iterate over top comments in the submission and\= create list of sentences
submission.comments.replace_more(limit=None)

total_submission_len = 0
for top_level_comment in submission.comments[1:]: # Skip AutoMod comment
    avg_sent_len = avg_sentence_len_comment(top_level_comment.body.replace('“', '').replace('”', ''))         
    total_submission_len += avg_sent_len

avg_submission_len = total_submission_len / len(submission.comments[1:])

print(avg_submission_len)

In [117]:
lt = 10 # take top 'lt' number of hottest submissions
print("Start: ", datetime.datetime.now())

avg_sentence_length_df = pd.DataFrame(columns=['subreddit', 'avg_comment_sent_length'])
counter=0

for sub in subreddit_list:
    subreddit = reddit.subreddit(sub)
    total_subreddit_len = 0
    for submission in subreddit.hot(limit=lt):
        # iterate over top comments in the submission and\= create list of sentences
        submission.comments.replace_more(limit=None)
        total_submission_len = 0
        for top_level_comment in submission.comments:
            avg_sent_len = avg_sentence_len_comment(top_level_comment.body.replace('“', '').replace('”', ''))         
            total_submission_len += avg_sent_len
        try:
            avg_submission_len = total_submission_len / len(submission.comments[1:])
        except ZeroDivisionError:
            print("Submission has 0 comments, continuing...")
            continue
            
        # keep running total of totals for entire subreddit
        total_subreddit_len += avg_submission_len
        
    # take total / limit
    avg_subreddit_len = total_subreddit_len / lt
    print(f"Average submission length for {sub}: ", round(avg_submission_len, 2))
    print(f"                   Time after {sub}: ", datetime.datetime.now())
    
    avg_sentence_length_df.loc[counter, :] = [sub, avg_subreddit_len]
    counter+=1
print("End:   ", datetime.datetime.now())

Start:  2019-04-06 23:41:04.605157
Average submission length for funny:  5.54
                   Time after funny:  2019-04-06 23:41:37.928244
Average submission length for gaming:  8.52
                   Time after gaming:  2019-04-06 23:42:07.165828




Average submission length for pics:  3.92
                   Time after pics:  2019-04-06 23:42:52.001499




Submission has 0 comments, continuing...
Submission has 0 comments, continuing...
Average submission length for science:  21.75
                   Time after science:  2019-04-06 23:43:38.387803




Average submission length for worldnews:  11.59
                   Time after worldnews:  2019-04-06 23:47:24.093312
Average submission length for todayilearned:  11.64
                   Time after todayilearned:  2019-04-06 23:48:50.837310
Submission has 0 comments, continuing...
Average submission length for movies:  8.81
                   Time after movies:  2019-04-06 23:49:46.902955
Average submission length for aww:  3.7
                   Time after aww:  2019-04-06 23:50:21.906987
Average submission length for videos:  8.42
                   Time after videos:  2019-04-06 23:50:49.589948
Submission has 0 comments, continuing...
Average submission length for Music:  56.2
                   Time after Music:  2019-04-06 23:50:55.047474
Submission has 0 comments, continuing...
Average submission length for IAmA:  9.43
                   Time after IAmA:  2019-04-06 23:52:39.627297
Average submission length for gifs:  5.54
                   Time after gifs:  2019-04-06 23:53:06



Average submission length for news:  8.17
                   Time after news:  2019-04-06 23:59:41.170705
Average submission length for EarthPorn:  5.68
                   Time after EarthPorn:  2019-04-06 23:59:45.043248
Submission has 0 comments, continuing...
Average submission length for askscience:  23.48
                   Time after askscience:  2019-04-06 23:59:52.505319




Average submission length for blog:  5.78
                   Time after blog:  2019-04-07 00:01:56.701576
Average submission length for Showerthoughts:  5.51
                   Time after Showerthoughts:  2019-04-07 00:02:10.460114
Submission has 0 comments, continuing...
Submission has 0 comments, continuing...
Average submission length for explainlikeimfive:  22.49
                   Time after explainlikeimfive:  2019-04-07 00:02:31.289383
Average submission length for books:  14.62
                   Time after books:  2019-04-07 00:02:39.908727
End:    2019-04-07 00:02:39.910908


In [118]:
avg_sentence_length_df

Unnamed: 0,subreddit,avg_comment_sent_length
0,funny,6.06328
1,gaming,7.27696
2,pics,9.72608
3,science,12.1036
4,worldnews,9.49657
5,todayilearned,9.55195
6,movies,7.80287
7,aww,5.28305
8,videos,7.60352
9,Music,11.5134


In [119]:
avg_sentence_length_df.to_csv('avg_sentence_length_df.csv', index=False)