# Average words per sentence of different subreddits

### Background
An analysis of the average word count per sentence of comments from the most popular Subreddits. 

wps = words per sentence

### Subreddits analyzed (top 20 most subscribed as of April 2019):

### Contents
1. [Setup](#1.-Setup)
2. [Average Words Per Sentence for Comments](#2-Average-Words-Per-Sentence-for-Comments)
3. [Average wps for Submission](#3.-Average-wps-for-submission)
4. [Average wps for Subreddit](#4.-Average-wps-for-subreddit)
5. [Data Visualization](#5.-Data-Visualization)

## 1. Setup

In [1]:
import praw
import pandas as pd
import datetime
import json
import time
import string
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import defaultdict

In [2]:
# Load credfile and display when last updated
credfile = 'credfile.json'
credfile_prefix = ''

# Read credentials to a dictionary
with open(credfile) as fh:
    creds = json.loads(fh.read())

print(f"[{datetime.datetime.now()}]" + f"{credfile} {'.' * 10} is being used as credfile")

reddit = praw.Reddit(client_id=creds['client_id'],
                     client_secret=creds['client_secret'],
                     user_agent=creds['user_agent']
                    )

print(reddit.read_only)  # Output: True

[2019-04-17 21:45:41.067576]credfile.json .......... is being used as credfile
True


In [3]:
subreddit_list = ['soccer', 
                  'nfl', 
                  'cfb',
                  'nba', 
                  'hockey',
                  'baseball',
                  'mma', 
                  'tennis', 
                  'golf',
                  'cricket', 
                  'rugbyunion']

## 2. Average Words Per Sentence for Comments

In [4]:
def standardize_words(word_list):
    """
    Remove stopwords, one-caracter words, and convert words to lowercase.
    Given a list of tokenized words)
    """
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in word_list]
    
    # convert to lower case
    stripped_lower = [word.lower() for word in stripped]

    # Remove one character words
    text_tokenized = [word for word in stripped_lower if len(word) > 2]

    # remove stop words
    stop_words = set(stopwords.words('english'))
    text_stopped = [w for w in text_tokenized if not w in stop_words]
    
    return text_stopped

In [5]:
def avg_sentence_len_comment(comment):
    
    sentences = sent_tokenize(comment)
    words = word_tokenize(comment)   

    cleaned_list = [ x for x in words if len(x) > 2 ]
    
    try:
        avg_sent_len = round(len(cleaned_list) / len(sentences), 2)
    except ZeroDivisionError:
        avg_sent_len = 0
        
    return avg_sent_len

In [6]:
submission = reddit.submission(id='ba4dyn')

# iterate over top comments in the submission and\= create list of sentences
submission.comments.replace_more(limit=None)

total_submission_len = 0
for top_level_comment in submission.comments[1:]: # Skip AutoMod comment
    avg_sent_len = avg_sentence_len_comment(top_level_comment.body.replace('“', '').replace('”', ''))         
    total_submission_len += avg_sent_len

avg_submission_len = total_submission_len / len(submission.comments[1:])

print("Submission comments average wps: ", avg_submission_len)
print("Submission upvote ratio: ", submission.upvote_ratio)

Submission comments average wps:  11.942354694485836
Submission upvote ratio:  0.88


## 3. Average wps for Submission

#### Test on one submission using `id`

In [7]:
def avg_wps_submission(comment_list):
    """
    Iterate over list of comments in a submission and calculate the average words
    per sentence of all top-level comments
    """
    total_submission_sentence_len = 0
    for comment in comment_list:
        avg_sent_len = avg_sentence_len_comment(comment.body.replace('“', '').replace('”', ''))                 
        total_submission_sentence_len += avg_sent_len

    try:
        avg_wps_submission = total_submission_sentence_len / len(comment_list)
    except ZeroDivisionError:
        print("Submission has 0 comments, continuing...")
        avg_wps_submission = 0
        
    return avg_wps_submission

## 4. Average wps for Subreddit

In [8]:
def avg_wps_subreddit(submission_list):
    """
    Iterate over all x number of submissions and get the average words 
    per sentence for the entire subreddit
    """
    total_wps_subreddit = 0
    for submission in submission_list:
        # iterate over top comments in the submission and create list of sentences
        submission.comments.replace_more(limit=None)
        avg_wps_per_submission = avg_wps_submission(list(submission.comments))

        # keep running total of totals for entire subreddit
        total_wps_subreddit += avg_wps_per_submission
    
    avg_wps_subreddit = total_wps_subreddit / len(submission_list)
    
    return avg_wps_subreddit

In [9]:
def default_to_regular(d):
    if isinstance(d, defaultdict):
        d = {k: default_to_regular(v) for k, v in d.items()}
    return d

In [10]:
def get_submission_data(submission_list):
    nested_dict = lambda: defaultdict(nested_dict)
    submission_data = nested_dict()
    for submission in submission_list:
        # iterate over top comments in the submission and create list of sentences
        submission.comments.replace_more(limit=None)
        submission_id = submission.id
        submission_data[submission_id]['upvotes'] = submission.ups
        submission_data[submission_id]['upvotes_ratio'] = submission.upvote_ratio
        submission_data[submission_id]['date'] = datetime.datetime.fromtimestamp(int(submission.created_utc)).strftime("%m/%d/%y %H:%M:%S")
        submission_data[submission_id]['comments'] = len(list(submission.comments))
        submission_data[submission_id]['avg_comment_wps'] = round(avg_wps_submission(list(submission.comments)), 2)

    submission_data_dict = default_to_regular(submission_data)
    return submission_data_dict

#### Iterate over the list of Subreddits
`lt` is the number of submissions returned per subreddit

In [11]:
def iterate_over_subs(subreddit_list, method, limit):
    """
    Iterates over a list of subreddit names
    """
    lt = limit
    nested_dict = lambda: defaultdict(nested_dict)
    submission_data = nested_dict()
    start = time.time()
    for sub in subreddit_list:
        subreddit = reddit.subreddit(sub)
        if method == "hot":
            submission_list = list(subreddit.hot('day', limit=limit))
        elif method == "controversial":
            submission_list = list(subreddit.controversial('day', limit=limit))
        elif method == "top":
            submission_list = list(subreddit.top('day', limit=limit))
        elif method == "new":
            submission_list = list(subreddit.new(limit=limit))
        else:
            raise ValueError
            print("Please select a valid type from: hot, controversial, top, new")
        submission_data[sub] = get_submission_data(submission_list)
    
    # Convert defaultdict to regular
    submission_data_dict = default_to_regular(submission_data)

    end = time.time()
    net = net = end-start
    print(f"Runtime: {round(net, 2)}s")
    
    return submission_data_dict

In [12]:
submission_data = iterate_over_subs(subreddit_list, "top", 10)

Runtime: 711.66s


In [13]:
submission_data['soccer']

{'be6l7h': {'upvotes': 13617,
  'upvotes_ratio': 0.96,
  'date': '04/17/19 07:21:10',
  'comments': 81,
  'avg_comment_wps': 6.37},
 'becx3a': {'upvotes': 12695,
  'upvotes_ratio': 0.92,
  'date': '04/17/19 16:54:51',
  'comments': 1762,
  'avg_comment_wps': 7.11},
 'be7ccu': {'upvotes': 11800,
  'upvotes_ratio': 0.96,
  'date': '04/17/19 08:43:36',
  'comments': 225,
  'avg_comment_wps': 8.77},
 'be607a': {'upvotes': 8688,
  'upvotes_ratio': 0.95,
  'date': '04/17/19 06:05:43',
  'comments': 51,
  'avg_comment_wps': 7.42},
 'becwtj': {'upvotes': 7042,
  'upvotes_ratio': 0.95,
  'date': '04/17/19 16:54:09',
  'comments': 438,
  'avg_comment_wps': 5.58},
 'bebqyw': {'upvotes': 5962,
  'upvotes_ratio': 0.94,
  'date': '04/17/19 15:10:47',
  'comments': 439,
  'avg_comment_wps': 3.56},
 'be59h7': {'upvotes': 4861,
  'upvotes_ratio': 0.98,
  'date': '04/17/19 04:14:10',
  'comments': 24,
  'avg_comment_wps': 10.14},
 'beco60': {'upvotes': 4222,
  'upvotes_ratio': 0.97,
  'date': '04/17/19 

#### Convert to dataframe for analysis

In [15]:
def convert_to_df(submission_dict):
    submission_df = pd.DataFrame()
    
    for key, value in submission_data.items():
        for submission_id, data in value.items():
            submission_df.loc[submission_id, 'upvotes'] = data['upvotes']
            submission_df.loc[submission_id, 'upvotes_ratio'] = data['upvotes_ratio'] 
            submission_df.loc[submission_id, 'date'] = data['date'] 
            submission_df.loc[submission_id, 'comments'] = data['comments'] 
            submission_df.loc[submission_id, 'avg_comment_wps'] = data['avg_comment_wps'] 
            submission_df.loc[submission_id, 'subreddit'] = key 
            
    return submission_df

In [16]:
submission_df = convert_to_df(submission_data)

In [17]:
submission_df.head()

Unnamed: 0,upvotes,upvotes_ratio,date,comments,avg_comment_wps,subreddit
be6l7h,13617.0,0.96,04/17/19 07:21:10,81.0,6.37,soccer
becx3a,12695.0,0.92,04/17/19 16:54:51,1762.0,7.11,soccer
be7ccu,11800.0,0.96,04/17/19 08:43:36,225.0,8.77,soccer
be607a,8688.0,0.95,04/17/19 06:05:43,51.0,7.42,soccer
becwtj,7042.0,0.95,04/17/19 16:54:09,438.0,5.58,soccer


## 5. Data Visualization

### By subreddit
Take average of all fields for each subreddit. Then, lets examine the average words per sentance for our list of subreddits

In [18]:
submission_df_pivot = pd.pivot_table(submission_df, index='subreddit', aggfunc='mean')
submission_df_pivot.reset_index(inplace=True)
submission_df_pivot.sort_values(by='avg_comment_wps', inplace=True)
submission_df_pivot

Unnamed: 0,subreddit,avg_comment_wps,comments,upvotes,upvotes_ratio
10,tennis,6.266,9.8,150.3,0.955
9,soccer,6.404,363.9,7619.0,0.96
3,golf,6.608,22.7,370.3,0.963
6,nba,7.1,125.4,5768.8,0.957
0,baseball,7.145,54.3,1486.6,0.971
8,rugbyunion,7.553,10.3,246.2,0.893
4,hockey,7.581,432.9,7802.1,0.968
1,cfb,7.908,41.1,329.8,0.905
2,cricket,8.07,223.8,331.4,0.965
5,mma,8.627,54.4,1295.7,0.958


#### Using Seaborn

In [19]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

sns.set(style="whitegrid", color_codes=True)
pal = sns.color_palette("Greens_d", len(submission_df_pivot))
ax = sns.barplot(x="subreddit", 
                 y="avg_comment_wps", 
                 data=submission_df_pivot,
                 palette=np.array(pal[::-1]))
locs, labels = plt.xticks()
plt.setp(labels, rotation=45)
plt.title("Avg Words per Sentence in Comments per Subreddit")
plt.show()

<Figure size 640x480 with 1 Axes>

#### Using Bokeh

In [20]:
from bokeh.io import show, output_file
from bokeh.plotting import figure

output_file("subreddit_wps.html")

subreddit = list(submission_df_pivot['subreddit'])
words_per_sentence = list(submission_df_pivot['avg_comment_wps'])

p = figure(x_range=subreddit, plot_height=250, title="Fruit words_per_sentence",
           toolbar_location=None, tools="")

p.vbar(x=subreddit, top=words_per_sentence, width=0.9)

p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)