This is (most of) the code required to obtain the results in the Sentiment Analysis chapter. Code to produce word shifts and word shift plots is given in `Word Shift Graphs.ipynb`.

Import relevant packages

In [2]:
#packages
import sys
sys.path.append("/Users/a1765262/opt/anaconda3/lib/python3.9/site-packages") # (for VS code sklearn)
import numpy as np
import pandas as pd
from pprint import pprint # to print json files nicely -- cf. print()
# from spacytextblob.spacytextblob import SpacyTextBlob # need to pip install spacytextblob
import os
import json
import datetime
from datetime import datetime as dt
import time
import pickle
import matplotlib.pyplot as plt
from nltk import ngrams, FreqDist
import shifterator as sh
from collections import Counter
import csv
from sklearn.feature_extraction.text import CountVectorizer
from mittens import Mittens


# Work from the parent directory: every relative path used below (pickles, GloVe file,
# "./Other") is resolved against it. The guard makes the cell safe to re-run -- an
# unguarded os.chdir("..") would climb one more level on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
    os.chdir("..")

# Register the helper-module folder once, as an absolute path so it stays valid
# even if the working directory changes later.
_other_dir = os.path.abspath("Other")
if _other_dir not in sys.path:
    sys.path.append(_other_dir)
import irulan # useful things I've done

# Channels covered by the analysis. The position of a channel in this list is the
# index used for the per-channel text / dates / sentiment lists in the cells below.
channel_list = [
    'ABC1',
    'Ch7',
    'Ch9',
    'Ch10',
    'SBS',
    'ABC24',
]

# Code string attached to each channel name
# (presumably the channel id used by the data source -- TODO confirm).
channel_codes = {
    'ABC1': "561",
    'Ch10': "1589",
    'Ch9': "1072",
    'Ch7': "1328",
    'SBS': "785",
    'ABC24': "560",
}

Define functions for sentiment analysis

In [None]:
def get_sentiment(text, lexicon):
    """Compute a lexicon-based sentiment score for every document of every channel.

    Parameters
    ----------
    text : iterable of iterables of str
        Outer level is one entry per channel, inner level is one string per document.
        Documents are tokenised with ``str.split()`` (whitespace).
    lexicon : mapping
        Maps a word to its numeric sentiment value.

    Returns
    -------
    list of list
        ``result[c][d]`` is the mean lexicon value over the words of document ``d`` in
        channel ``c`` that appear in ``lexicon``; it is 0 when no word matches.
    """
    all_channel_scores = []

    for channel_docs in text:

        channel_scores = []

        for document in channel_docs:

            total = 0
            matched = 0

            for token in document.split():
                if token in lexicon:
                    total += lexicon[token]
                    matched += 1

            # Divide by at least 1 so documents without any lexicon word score 0
            # instead of raising a division-by-zero error.
            denominator = np.max([matched, 1])
            channel_scores.append(total / denominator)

        all_channel_scores.append(channel_scores)

    return all_channel_scores

Preliminary sentiment analysis: score all text with the NRC lexicon and plot a moving average of sentiment for each channel.

In [None]:
# load data (per-channel lists: index i belongs to channel_list[i])
# NOTE: pickle.load can execute arbitrary code -- only open files produced by this project.
# `with` closes each file handle as soon as it has been read.
with open("all_text_clean_channel_split.pkl", "rb") as f:
    text = pickle.load(f)
with open('nrc_lexicon.pkl', 'rb') as f:
    nrc_lexicon = pickle.load(f)
with open("all_dates_clean_channel_split.pkl", "rb") as f:
    dates = pickle.load(f)

# get sentiment scores from text
sentiment_scores = get_sentiment(text, nrc_lexicon)

# make plot
a = 20000  # window passed to irulan.moving_average (number of consecutive document scores)
half_window = a // 2

for i, ch in enumerate(channel_list):
    # Drop half a window of dates at each end so the x-values line up with the smoothed
    # series (assumes moving_average returns len(scores) - a + 1 values -- confirm in irulan).
    plt.plot(dates[i][half_window:-half_window+1], irulan.moving_average(sentiment_scores[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title("Sentiment of each channel over time")
plt.xlabel("Date")
plt.ylabel("Sentiment")
plt.show()  # render the figure without echoing the Text repr of the last call

News sentiment analysis

In [None]:
# load data (news text only; per-channel lists: index i belongs to channel_list[i])
# NOTE: pickle.load can execute arbitrary code -- only open files produced by this project.
# `with` closes each file handle as soon as it has been read.
with open("news_text_channel_split.pkl", "rb") as f:
    text = pickle.load(f)
with open('nrc_lexicon.pkl', 'rb') as f:
    nrc_lexicon = pickle.load(f)
with open("news_dates_channel_split.pkl", "rb") as f:
    dates = pickle.load(f)

# get sentiment scores from text
sentiment_scores = get_sentiment(text, nrc_lexicon)

# make plot
a = 20000  # window passed to irulan.moving_average (number of consecutive document scores)
half_window = a // 2

for i, ch in enumerate(channel_list):
    # Drop half a window of dates at each end so the x-values line up with the smoothed
    # series (assumes moving_average returns len(scores) - a + 1 values -- confirm in irulan).
    plt.plot(dates[i][half_window:-half_window+1], irulan.moving_average(sentiment_scores[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title("Sentiment of each channel over time")
plt.xlabel("Date")
plt.ylabel("Sentiment")
plt.show()  # render the figure without echoing the Text repr of the last call

Fine-tune GloVe embeddings with Mittens. Some code adapted from 'https://github.com/roamanalytics/mittens'.

In [None]:
# load text
# NOTE: pickle.load can execute arbitrary code -- only open files produced by this project.
with open("all_news_text.pkl", "rb") as f:
    text = pickle.load(f)

# get vocabulary: the 30,000 most frequent whitespace-separated tokens, most frequent first
words = " ".join(text).split()
counter = Counter(words)
vocab = [word for word, _ in counter.most_common(30000)]

del words, counter

# get pre-trained model: each line of the file is a token followed by its vector components
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=' ',quoting=csv.QUOTE_NONE)
    pre_glove = {line[0]: np.array(list(map(float, line[1:])))
            for line in reader}

# create matrix for mittens
# The fixed vocabulary pins column j of X to vocab[j].
# NOTE(review): CountVectorizer applies its own default tokenisation, which may not match
# the whitespace split used to build `vocab` -- confirm no vocabulary words end up all-zero.
cv = CountVectorizer(ngram_range=(1,1), vocabulary=vocab)
X = cv.fit_transform(text)

del text

# Document-level co-occurrence counts: entry (i, j) sums count_i * count_j over documents.
# `@` is always the matrix product; `*` is only a matrix product for scipy's sparse *matrix*
# classes and would be element-wise for sparse arrays.
Xc = X.T @ X
Xc.setdiag(0)  # a word co-occurring with itself carries no information
coocc_ar = Xc.toarray()  # dense len(vocab) x len(vocab) array -- memory hungry

del Xc, X

# create mittens model
mittens_model = Mittens(n=300, max_iter=2000)
new_embeddings = mittens_model.fit(
    coocc_ar,
    vocab=vocab,
    initial_embedding_dict=pre_glove)

del pre_glove, coocc_ar, mittens_model

# row k of new_embeddings is paired with vocab[k]
new_glove = dict(zip(vocab, new_embeddings))
del vocab, new_embeddings

# `with` guarantees the pickle is flushed and the handle closed before the next cell reads it
with open('mittens_model.pkl', 'wb') as f:
    pickle.dump(new_glove, f)

Calculate sentiment scores to create lexicon.

In [None]:
# Load the fine-tuned embeddings. The file must be opened in *read* mode ('rb'):
# opening it with 'wb' truncates the saved model and makes pickle.load fail.
# NOTE: pickle.load can execute arbitrary code -- only open files produced by this project.
with open('mittens_model.pkl', 'rb') as f:
    new_glove = pickle.load(f)

# calculate a sentiment score for each word that has an embedding
# NOTE(review): glove_sentiment only receives the word, not its vector -- confirm that
# irulan looks the word up in the fine-tuned (Mittens) embeddings rather than another model.
mittens_lexicon = {word: irulan.glove_sentiment(word) for word in new_glove}

# Saved as 'mittens_lexicon.pkl', the name the later sentiment-analysis cell loads.
with open('mittens_lexicon.pkl', 'wb') as f:
    pickle.dump(mittens_lexicon, f)

Check the robustness of the Mittens lexicon. Calculate the Pearson correlation of sentiment values for 100 runs of Mittens embeddings.

Calculate the Pearson correlation for subsampled models. 

Compare the Mittens lexicon with the NRC-VAD lexicon. 

Another sentiment analysis (news text with Mittens lexicon).

In [None]:
# load data (news text scored with the Mittens lexicon; index i belongs to channel_list[i])
# NOTE: pickle.load can execute arbitrary code -- only open files produced by this project.
# `with` closes each file handle as soon as it has been read.
with open("news_text_channel_split.pkl", "rb") as f:
    text = pickle.load(f)
with open('mittens_lexicon.pkl', 'rb') as f:
    mittens_lexicon = pickle.load(f)
with open("news_dates_channel_split.pkl", "rb") as f:
    dates = pickle.load(f)

# get sentiment scores from text
sentiment_scores = get_sentiment(text, mittens_lexicon)

# make plot
a = 20000  # window passed to irulan.moving_average (number of consecutive document scores)
half_window = a // 2

for i, ch in enumerate(channel_list):
    # Drop half a window of dates at each end so the x-values line up with the smoothed
    # series (assumes moving_average returns len(scores) - a + 1 values -- confirm in irulan).
    plt.plot(dates[i][half_window:-half_window+1], irulan.moving_average(sentiment_scores[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title("Sentiment of each channel over time")
plt.xlabel("Date")
plt.ylabel("Sentiment")
plt.show()  # render the figure without echoing the Text repr of the last call

If it bleeds, it leads: investigate the sentiment of 5-minute intervals of news text. 

Political sentiment analysis.

Daily sentiment scores.

Weighted sentiment scores.

Sentiment bias: mean sentiment score.

Sentiment bias: word embeddings.

Add sentiment to the bias measure.

Compare with polling and election data.