# Average words per sentence of different subreddits

### Background
An analysis of the average number of words per sentence in comments from the most popular subreddits.

### Subreddits analyzed (top 20 most subscribed as of April 2019):

* r/mildlyinteresting
* r/funny
* r/AskReddit
* r/gaming
* r/pics
* r/science
* r/worldnews
* r/todayilearned
* r/movies
* r/soccer
* r/videos
* r/Music
* r/IAmA
* r/gifs
* r/news
* r/EarthPorn
* r/askscience
* r/blog
* r/Showerthoughts
* r/explainlikeimfive
* r/books

In [1]:
import praw
import pandas as pd
import datetime
import json
import string
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Location of the JSON file holding the Reddit API credentials
credfile = 'credfile.json'
credfile_prefix = ''

# Parse the credential file into a dictionary
with open(credfile) as fh:
    creds = json.load(fh)

# Log, with the current timestamp, which credential file is in use
print(f"[{datetime.datetime.now()}]" + f"{credfile} {'.' * 10} is being used as credfile")

# Build the Reddit client from the client id, secret and user agent in the file
reddit = praw.Reddit(
    client_id=creds['client_id'],
    client_secret=creds['client_secret'],
    user_agent=creds['user_agent'],
)

# Show whether the client is operating in read-only mode
print(reddit.read_only)  # Output: True

[2019-04-08 22:38:37.720977]credfile.json .......... is being used as credfile
True


In [3]:
# Names of the subreddits to analyze, in processing order
subreddit_list = [
    'AskReddit', 'funny', 'gaming', 'pics', 'science',
    'worldnews', 'todayilearned', 'movies', 'soccer', 'videos',
    'Music', 'IAmA', 'gifs', 'news', 'EarthPorn',
    'askscience', 'blog', 'Showerthoughts', 'explainlikeimfive', 'books',
]

In [4]:
# Handle for r/AskReddit. Note that `subreddit` is reassigned for every entry
# of subreddit_list inside the analysis loop later in this notebook.
subreddit = reddit.subreddit('AskReddit')

In [5]:
def standardize_words(word_list, stop_words=None, min_length=3):
    """
    Normalize a list of tokenized words.

    Every token has its ASCII punctuation removed and is converted to
    lowercase. Tokens that end up shorter than ``min_length`` characters
    and tokens found in ``stop_words`` are then discarded.

    Parameters
    ----------
    word_list : iterable of str
        Tokenized words to clean.
    stop_words : iterable of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is used.
    min_length : int, optional
        Minimum number of characters a cleaned token needs in order to be
        kept. The default of 3 drops one- and two-character tokens as well
        as tokens left empty once their punctuation is removed.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks in the filter below
    stop_words = set(stop_words)

    # string.punctuation only covers ASCII punctuation characters
    table = str.maketrans('', '', string.punctuation)

    # Strip punctuation first, then lowercase (lazily, one token at a time)
    cleaned = (w.translate(table).lower() for w in word_list)

    # Stop words are compared against the punctuation-stripped token, so a
    # stop-word entry that itself contains punctuation will never match.
    return [w for w in cleaned if len(w) >= min_length and w not in stop_words]

#### Test: iterate over a single submission

In [6]:
def avg_sentence_len_comment(comment):
    """
    Return the average number of words per sentence in a comment.

    The text is split into sentences and into word tokens with NLTK. Only
    tokens longer than two characters count as words, which discards
    punctuation tokens but also any one- or two-character words.

    Returns the average rounded to two decimals, or 0 when the text
    contains no sentence.
    """
    sentence_count = len(sent_tokenize(comment))
    long_tokens = [token for token in word_tokenize(comment) if len(token) > 2]

    # No sentences found: report 0 rather than dividing by zero
    if sentence_count == 0:
        return 0

    return round(len(long_tokens) / sentence_count, 2)

In [7]:
submission = reddit.submission(id='ba4dyn')

# Resolve the comment tree before iterating over the top-level comments
submission.comments.replace_more(limit=None)

top_level_comments = submission.comments[1:]  # Skip AutoMod comment

# Accumulate the per-comment average sentence length
total_submission_len = 0
for top_level_comment in top_level_comments:
    # Drop curly double quotes before tokenizing
    comment_text = top_level_comment.body.replace('“', '').replace('”', '')
    avg_sent_len = avg_sentence_len_comment(comment_text)
    total_submission_len += avg_sent_len

# Mean of the per-comment averages for this submission
avg_submission_len = total_submission_len / len(top_level_comments)

print(avg_submission_len)

11.939806547619044


In [8]:
import time

In [9]:
# Display the upvote ratio of the submission fetched earlier (exploratory
# check; the value is not used by the analysis cells shown in this notebook).
submission.upvote_ratio

0.88

In [14]:
def avg_wps_submission(comment_list):
    """
    Average the words-per-sentence score over the comments of one submission.

    Curly double quotes are stripped from each comment body before it is
    scored with avg_sentence_len_comment. For an empty comment list a
    notice is printed and 0 is returned.
    """
    running_total = 0
    for entry in comment_list:
        body_text = entry.body.replace('“', '').replace('”', '')
        running_total += avg_sentence_len_comment(body_text)

    n_comments = len(comment_list)
    if n_comments == 0:
        # Nothing to average; report it and fall back to 0
        print("Submission has 0 comments, continuing...")
        return 0

    return running_total / n_comments

In [15]:
def avg_wps_subreddit(submission_list):
    """
    Average the words-per-sentence score over a list of submissions.

    For every submission the comment tree is resolved with
    ``replace_more(limit=None)`` and its top-level comments are scored with
    avg_wps_submission. The result is the mean of those per-submission
    scores; a submission without comments contributes a score of 0.

    Parameters
    ----------
    submission_list : list
        Submissions to analyze (must support ``len``).

    Returns
    -------
    float or int
        Mean per-submission score, or 0 when ``submission_list`` is empty.
    """
    submission_count = len(submission_list)
    if submission_count == 0:
        # Mirror avg_wps_submission: report the empty case instead of
        # failing with ZeroDivisionError.
        print("Subreddit has 0 submissions, continuing...")
        return 0

    total_wps_subreddit = 0
    for submission in submission_list:
        # Resolve the comment tree, then score the top-level comments
        submission.comments.replace_more(limit=None)
        total_wps_subreddit += avg_wps_submission(list(submission.comments))

    return total_wps_subreddit / submission_count

In [16]:
lt = 1 # take top 'lt' number of hottest submissions
print("Start: ", datetime.datetime.now())

# Collect one record per subreddit and build the DataFrame once at the end.
# Enlarging a frame row by row is slow and leaves the columns as object dtype.
records = []

try:
    for sub in subreddit_list:
        subreddit = reddit.subreddit(sub)

        # perf_counter is monotonic, so the elapsed time cannot go negative
        start = time.perf_counter()
        avg_wps_per_subreddit = avg_wps_subreddit(list(subreddit.hot(limit=lt)))
        net = time.perf_counter() - start

        print(f"{sub} | Comment Length Avg: ", round(avg_wps_per_subreddit, 2))
        print(f"{sub} | Time              : {round(net, 2)}s")

        records.append({'subreddit': sub,
                        'avg_comment_sent_length': avg_wps_per_subreddit})
    print("End:   ", datetime.datetime.now())
finally:
    # Build the frame even if the run is interrupted, so the subreddits
    # processed so far are not lost.
    avg_sentence_length_df = pd.DataFrame(
        records, columns=['subreddit', 'avg_comment_sent_length'])

Start:  2019-04-08 22:47:35.038030
AskReddit | Comment Length Avg:  10.21
AskReddit | Time              : 112.15s
funny | Comment Length Avg:  7.62
funny | Time              : 2.16s
gaming | Comment Length Avg:  10.29
gaming | Time              : 0.41s
pics | Comment Length Avg:  18.8
pics | Time              : 0.29s
science | Comment Length Avg:  9.6
science | Time              : 1.87s
worldnews | Comment Length Avg:  7.25
worldnews | Time              : 52.38s
todayilearned | Comment Length Avg:  8.92
todayilearned | Time              : 22.43s
movies | Comment Length Avg:  1.0
movies | Time              : 0.21s
Submission has 0 comments, continuing...
aww | Comment Length Avg:  0.0
aww | Time              : 0.15s
videos | Comment Length Avg:  8.03
videos | Time              : 44.27s
Music | Comment Length Avg:  8.67
Music | Time              : 0.43s
IAmA | Comment Length Avg:  7.8
IAmA | Time              : 0.54s
gifs | Comment Length Avg:  4.83
gifs | Time              : 0.25s
news 

In [17]:
# Display the per-subreddit results table
avg_sentence_length_df

Unnamed: 0,subreddit,avg_comment_sent_length
0,AskReddit,10.2082
1,funny,7.61798
2,gaming,10.2892
3,pics,18.8
4,science,9.59855
5,worldnews,7.24705
6,todayilearned,8.91853
7,movies,1.0
8,aww,0.0
9,videos,8.02785


In [11]:
# Save the results table to CSV without the index column.
# NOTE(review): this cell's execution count (11) is lower than that of the
# analysis loop (16), so the file on disk may predate the table shown in
# this notebook - re-run this cell after the loop to be sure.
avg_sentence_length_df.to_csv('avg_sentence_length_hottest_.csv', index=False)