# Average words per sentence of different subreddits

### Background
An analysis of the average word count per sentence of comments from the most popular Subreddits.

### Subreddits analyzed (top 20 most subscribed as of April 2019):

### Contents
1. Setup
2. Standardize words
3. Calculate average words per sentence for each submission
4. Calculate average words per sentence for the entire subreddit
5. View Data
6. Data Visualization

In [1]:
import praw
import pandas as pd
import datetime
import json
import time
import string
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Path of the JSON file that holds the Reddit API credentials
credfile = 'credfile.json'
credfile_prefix = ''

# Parse the credential file into a dictionary
with open(credfile) as fh:
    creds = json.load(fh)

# Timestamped notice of which credential file is in use
print(f"[{datetime.datetime.now()}]{credfile} {'.' * 10} is being used as credfile")

# Build the Reddit client from the three credential fields read above.
# No username/password is passed; the flag printed below reports whether
# the client is read-only.
reddit = praw.Reddit(
    client_id=creds['client_id'],
    client_secret=creds['client_secret'],
    user_agent=creds['user_agent'],
)

print(reddit.read_only)  # Output: True

[2019-04-11 20:45:58.012862]credfile.json .......... is being used as credfile
True


In [12]:
# Names of the subreddits to analyze (iterated by the analysis loop further down).
# NOTE(review): the first 16 names are commented out, so only the four active
# entries ('blog', 'Showerthoughts', 'explainlikeimfive', 'books') are processed.
# Presumably this was done to shorten test runs -- confirm, and re-enable the
# commented entries for the full top-20 analysis.
subreddit_list = [#'AskReddit', 
#                   'funny', 
#                   'gaming', 
#                   'pics', 
#                   'science', 
#                   'worldnews', 
#                   'todayilearned', 
#                   'movies', 
#                   'soccer', 
#                   'videos', 
#                   'Music', 
#                   'IAmA', 
#                   'gifs', 
#                   'news', 
#                   'EarthPorn', 
#                   'askscience', 
                  'blog', 
                  'Showerthoughts', 
                  'explainlikeimfive', 
                  'books']

In [13]:
# Handle for r/AskReddit. The analysis loop further down rebinds `subreddit`
# for every entry of `subreddit_list`.
subreddit = reddit.subreddit('AskReddit')

In [14]:
def standardize_words(word_list):
    """
    Normalize a list of tokenized words.

    Every token has its punctuation characters removed and is lowercased.
    Tokens that are then two characters or shorter, or that appear in NLTK's
    English stopword list, are dropped. The surviving tokens are returned in
    their original order.
    """
    # Translation table that deletes every character in string.punctuation
    punct_table = str.maketrans('', '', string.punctuation)
    normalized = [token.translate(punct_table).lower() for token in word_list]

    english_stopwords = set(stopwords.words('english'))

    # Keep only tokens longer than two characters that are not stopwords
    return [
        token
        for token in normalized
        if len(token) > 2 and token not in english_stopwords
    ]

#### Test: iterate over one submission

In [15]:
def avg_sentence_len_comment(comment):
    """
    Return the average number of words per sentence in a comment string.

    Sentences come from ``sent_tokenize`` and words from ``word_tokenize``.
    Only tokens longer than two characters count as words. The result is
    rounded to two decimals; 0 is returned when no sentence is found.
    """
    sentence_count = len(sent_tokenize(comment))
    # Tokens of one or two characters are not counted as words
    word_count = sum(1 for token in word_tokenize(comment) if len(token) > 2)

    # No sentences means there is nothing to average over
    if sentence_count == 0:
        return 0

    return round(word_count / sentence_count, 2)

In [16]:
submission = reddit.submission(id='ba4dyn')

# Resolve every "load more comments" placeholder in the comment forest
# before iterating over it
submission.comments.replace_more(limit=None)

# Top-level comments only; the first one is skipped as the AutoMod comment
top_level_comments = submission.comments[1:]

total_submission_len = 0
for top_level_comment in top_level_comments:
    # Curly double quotes are removed before the comment is scored
    avg_sent_len = avg_sentence_len_comment(top_level_comment.body.replace('“', '').replace('”', ''))
    total_submission_len += avg_sent_len

# Guard against a thread that has nothing besides the skipped first comment,
# which would otherwise raise ZeroDivisionError
if top_level_comments:
    avg_submission_len = total_submission_len / len(top_level_comments)
else:
    avg_submission_len = 0

print(avg_submission_len)

11.932023809523804


In [17]:
# Inspect the upvote ratio of the test submission fetched above
submission.upvote_ratio

0.88

In [18]:
def avg_wps_submission(comment_list):
    """
    Average the per-comment words-per-sentence scores of one submission.

    Every item in ``comment_list`` must expose a ``body`` string. Curly
    double quotes are removed from the body before it is scored with
    ``avg_sentence_len_comment``. Returns the mean of those scores, or 0
    (after printing a notice) when the list is empty.
    """
    running_total = 0
    for item in comment_list:
        text = item.body.replace('“', '').replace('”', '')
        running_total += avg_sentence_len_comment(text)

    n_comments = len(comment_list)
    if n_comments == 0:
        print("Submission has 0 comments, continuing...")
        return 0

    return running_total / n_comments

In [19]:
def avg_wps_subreddit(submission_list):
    """
    Average the per-submission words-per-sentence scores across a list of
    submissions sampled from one subreddit.

    Parameters
    ----------
    submission_list : list
        Submissions to score. Each must expose a ``comments`` object that
        has a ``replace_more`` method and iterates over its comments.

    Returns
    -------
    float or int
        Mean of ``avg_wps_submission`` over the submissions, or 0 (after
        printing a notice) when ``submission_list`` is empty.
    """
    # An empty sample would otherwise raise ZeroDivisionError in the final
    # division; report 0 instead, the same way avg_wps_submission does for a
    # submission without comments.
    if len(submission_list) == 0:
        print("Subreddit has 0 submissions, continuing...")
        return 0

    total_wps_subreddit = 0
    for submission in submission_list:
        # Resolve all "load more comments" placeholders so that iterating the
        # forest yields comment objects only
        submission.comments.replace_more(limit=None)

        # keep running total of the per-submission averages
        total_wps_subreddit += avg_wps_submission(list(submission.comments))

    return total_wps_subreddit / len(submission_list)

In [20]:
def get_submission_data(submission_list):
    """
    Collect per-submission metadata into a DataFrame, one row per submission.

    Parameters
    ----------
    submission_list : iterable
        Submissions exposing ``ups``, ``upvote_ratio``, ``created_utc`` and a
        ``comments`` object with a ``replace_more`` method.

    Returns
    -------
    pandas.DataFrame
        Columns ``upvotes``, ``upvotes_ratio``, ``date`` (formatted as
        ``%m/%d/%y %H:%M:%S``) and ``comments`` (number of items yielded by
        iterating ``submission.comments``). Empty input yields an empty frame
        with the same columns.
    """
    columns = ['upvotes', 'upvotes_ratio', 'date', 'comments']

    # Rows are gathered as dicts and the frame is built once at the end, so
    # every submission gets its own row instead of overwriting row 0.
    records = []
    for submission in submission_list:
        # Resolve "load more comments" placeholders before counting
        submission.comments.replace_more(limit=None)

        # NOTE(review): fromtimestamp() without a tz argument renders the
        # epoch value in the machine's local timezone -- confirm that is the
        # intended meaning of 'date'.
        created = datetime.datetime.fromtimestamp(int(submission.created_utc))

        records.append({
            'upvotes': submission.ups,
            'upvotes_ratio': submission.upvote_ratio,
            'date': created.strftime("%m/%d/%y %H:%M:%S"),
            'comments': len(list(submission.comments)),
        })

    return pd.DataFrame(records, columns=columns)

In [11]:
lt = 1 # take top 'lt' number of hottest submissions
print("Start: ", datetime.datetime.now())

# One row per subreddit: its name and its average words-per-sentence score
avg_sentence_length_df = pd.DataFrame(columns=['subreddit', 'avg_comment_sent_length'])
counter = 0
start = time.time()
for sub in subreddit_list:
    subreddit = reddit.subreddit(sub)
    submission_list = list(subreddit.hot(limit=lt))
    avg_wps_per_subreddit = avg_wps_subreddit(submission_list)
    # Rows are written one at a time, so results gathered so far remain in the
    # frame if the run is interrupted
    avg_sentence_length_df.loc[counter, :] = [sub, avg_wps_per_subreddit]
    # record metadata (NOTE(review): the return value is not captured here)
    get_submission_data(submission_list)
    counter += 1

end = time.time()
net = end - start

# The f prefix is required for the placeholder to be interpolated
print(f"Runtime: {round(net, 2)}s")

Start:  2019-04-11 20:48:55.346786


KeyboardInterrupt: 

In [None]:
# Display the per-subreddit averages collected by the analysis loop
avg_sentence_length_df

In [None]:
# Write the results table to a CSV file, leaving out the index column
avg_sentence_length_df.to_csv('avg_wps_hottest_20_n1.csv', index=False)