In [None]:
# # Sentiment Analysis on Reddit News Headlines with Python’s Natural Language Toolkit (NLTK)

In [4]:
from IPython import display
# from dotenv import load_dotenv
from os import environ, path
import datetime
import math
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', context='talk', palette='Dark2')

def load_env():
    # https://stackoverflow.com/a/54028874/1426788 
    %load_ext dotenv
    %dotenv

load_env()
version = str(datetime.datetime.now()).replace(' ', '-')
print('version:', version)


version: 2020-02-23-15:47:02.494470


In [1]:
import praw
from os import environ
reddit = praw.Reddit(client_id=environ.get('CLIENT_ID'),
                     client_secret=environ.get('CLIENT_SECRET'),
                     user_agent='sentimine')
headlines = set()
for submission in reddit.subreddit(environ.get('SUBREDDIT')).new(limit=None):
    headlines.add(submission.title)
    display.clear_output()
    print(len(headlines))

ClientException: Required configuration setting 'client_id' missing. 
This setting can be provided in a praw.ini file, as a keyword argument to the `Reddit` class constructor, or as an environment variable.

In [None]:


from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

sia = SIA()
results = []

for line in headlines:
    pol_score = sia.polarity_scores(line)
    pol_score['headline'] = line
    results.append(pol_score)

pprint(results[:3], width=100)

In [None]:


df = pd.DataFrame.from_records(results)
df.head()

In [None]:


df['label'] = 0
df.loc[df['compound'] > 0.2, 'label'] = 1
df.loc[df['compound'] < -0.2, 'label'] = -1
df.head()

In [None]:


df2 = df[['headline', 'label']]

In [None]:


df2.to_csv('./tmp/reddit_headlines_labels.csv', mode='a', encoding='utf-8', index=False)

In [None]:


df.label.value_counts()

In [None]:


print("Positive headlines:\n")
pprint(list(df[df['label'] == 1].headline)[:5], width=200)

print("\nNegative headlines:\n")
pprint(list(df[df['label'] == -1].headline)[:5], width=200)

In [None]:


df.label.value_counts(normalize=True) * 100

In [None]:


fig, ax = plt.subplots(figsize=(8, 8))

counts = df.label.value_counts(normalize=True) * 100

sns.barplot(x=counts.index, y=counts, ax=ax)

ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
ax.set_ylabel("Percentage")

plt.show()

In [None]:


from nltk.tokenize import word_tokenize, RegexpTokenizer

example = "This is an example sentence! However, it isn't a very informative one"

print(word_tokenize(example, language='english'))

In [None]:


tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize(example))

In [None]:


from nltk.corpus import stopwords

stop_words = stopwords.words('english')
print(stop_words[:20])

In [None]:


def process_text(headlines):
    tokens = []
    for line in headlines:
        line = line.lower()
        toks = tokenizer.tokenize(line)
        toks = [t for t in toks if t not in stop_words]
        tokens.extend(toks)
    
    return tokens

In [None]:


pos_lines = list(df[df.label == 1].headline)

pos_tokens = process_text(pos_lines)
pos_freq = nltk.FreqDist(pos_tokens)

pos_freq.most_common(20)

In [None]:


y_val = [x[1] for x in pos_freq.most_common()]

fig = plt.figure(figsize=(10,5))
plt.plot(y_val)

plt.xlabel("Words")
plt.ylabel("Frequency")
plt.title("Word Frequency Distribution (Positive)")
plt.show()

In [None]:


y_final = []
for i, k, z, t in zip(y_val[0::4], y_val[1::4], y_val[2::4], y_val[3::4]):
    y_final.append(math.log(i + k + z + t))

x_val = [math.log(i + 1) for i in range(len(y_final))]

fig = plt.figure(figsize=(10,5))

plt.xlabel("Words (Log)")
plt.ylabel("Frequency (Log)")
plt.title("Word Frequency Distribution (Positive)")
plt.plot(x_val, y_final)
plt.show()

In [None]:


neg_lines = list(df2[df2.label == -1].headline)

neg_tokens = process_text(neg_lines)
neg_freq = nltk.FreqDist(neg_tokens)

neg_freq.most_common(20)

In [None]:


y_val = [x[1] for x in neg_freq.most_common()]

fig = plt.figure(figsize=(10,5))
plt.plot(y_val)

plt.xlabel("Words")
plt.ylabel("Frequency")
plt.title("Word Frequency Distribution (Negative)")
plt.show()

In [None]:


y_final = []
for i, k, z in zip(y_val[0::3], y_val[1::3], y_val[2::3]):
    if i + k + z == 0:
        break
    y_final.append(math.log(i + k + z))

x_val = [math.log(i+1) for i in range(len(y_final))]

fig = plt.figure(figsize=(10,5))

plt.xlabel("Words (Log)")
plt.ylabel("Frequency (Log)")
plt.title("Word Frequency Distribution (Negative)")
plt.plot(x_val, y_final)
plt.show()