This notebook contains (most of) the code required to reproduce the results in the Sentiment Analysis chapter. The code that produces word shifts and word shift plots is in `Word Shift Graphs.ipynb`.

Import relevant packages

In [2]:
#packages
import sys
import numpy as np
import pandas as pd
import os
import json
import datetime
import time
import pickle
import matplotlib.pyplot as plt
from nltk import FreqDist
import shifterator as sh
from collections import Counter
import csv
from sklearn.feature_extraction.text import CountVectorizer
from mittens import Mittens
import scipy
import math
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot
import irulan

# list of channels
channel_list = ['ABC1', 'Ch7', 'Ch9', 'Ch10', 'SBS', 'ABC24']

2024-01-05 08:48:00.685137: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Define functions for sentiment analysis

In [None]:
def get_sentiment(text, lexicon):
    """Score every document as the mean lexicon value of its known words.

    Parameters
    ----------
    text : iterable of iterables of str
        Outer level is one entry per channel, inner level one string per
        document. Documents are tokenised with ``str.split()``.
    lexicon : mapping
        Maps a word to its numeric sentiment value.

    Returns
    -------
    list of list
        ``result[ch][i]`` is the average lexicon value over the words of
        document ``i`` of channel ``ch`` that appear in ``lexicon``; a
        document with no known words scores 0.
    """
    sentiment_scores = []

    for ch_text in text:

        channel_scores = []

        for doc in ch_text:
            # values of the words in this document that the lexicon knows
            matched = [lexicon[word] for word in doc.split() if word in lexicon.keys()]

            # divide by at least 1 so documents without matches score 0
            channel_scores.append(sum(matched)/np.max([len(matched), 1]))

        sentiment_scores.append(channel_scores)

    return sentiment_scores

Preliminary sentiment analysis.

In [None]:
# load data (context managers close each file handle once it has been read)
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files
with open("all_text_split.pkl", "rb") as f:
    text = pickle.load(f)
with open('nrc_lexicon.pkl', 'rb') as f:
    nrc_lexicon = pickle.load(f)
with open("all_dates_split.pkl", "rb") as f:
    dates = pickle.load(f)

# get sentiment scores from text
sentiment_scores = get_sentiment(text, nrc_lexicon)

# make plot
a = 20000       # second argument of irulan.moving_average (presumably the window size)
half = a // 2

for i, ch in enumerate(channel_list):
    # NOTE(review): the date slice drops a - 1 entries, so it assumes
    # irulan.moving_average returns len(x) - a + 1 values - confirm
    plt.plot(dates[i][half:-half + 1], irulan.moving_average(sentiment_scores[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title("Sentiment of each channel over time")
plt.xlabel("Date")
plt.ylabel("Sentiment")

News sentiment analysis

In [None]:
# load the news-only data (context managers close each file handle after reading)
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files
with open("all_news_text_split.pkl", "rb") as f:
    text = pickle.load(f)
with open('nrc_lexicon.pkl', 'rb') as f:
    nrc_lexicon = pickle.load(f)
with open("all_news_dates_split.pkl", "rb") as f:
    dates = pickle.load(f)

# get sentiment scores from text
sentiment_scores = get_sentiment(text, nrc_lexicon)

# make plot
a = 20000       # second argument of irulan.moving_average (presumably the window size)
half = a // 2

for i, ch in enumerate(channel_list):
    # NOTE(review): the date slice drops a - 1 entries, so it assumes
    # irulan.moving_average returns len(x) - a + 1 values - confirm
    plt.plot(dates[i][half:-half + 1], irulan.moving_average(sentiment_scores[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title("Sentiment of each channel over time")
plt.xlabel("Date")
plt.ylabel("Sentiment")

Fine-tune GloVe embeddings with Mittens.

In [None]:
# load text (not split by channel); the context manager closes the file after reading
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files
with open("all_news_text.pkl", "rb") as f:
    text = pickle.load(f)
glove = 'glove.6B.300d.txt'

# fine-tune GloVe embeddings with Mittens
new_glove = irulan.train_mittens(text, glove)

# write inside a context manager so the model is flushed and the handle closed
with open('mittens_model.pkl', 'wb') as f:
    pickle.dump(new_glove, f)

Calculate sentiment scores to create lexicon.

In [None]:
# load the fine-tuned embeddings saved by the Mittens training cell
with open('mittens_model.pkl', 'rb') as f:
    new_glove = pickle.load(f)

# calculate a sentiment score for each word that has an embedding
# NOTE(review): glove_sentiment is not passed new_glove - confirm it scores
# the word against the fine-tuned model rather than some other embedding
mittens_lexicon = {word: irulan.glove_sentiment(word) for word in new_glove.keys()}

# save under the '.pkl' name that every later cell loads ('mittens_lexicon.pkl');
# previously this wrote to 'mittens_lexicon', which no cell reads
with open('mittens_lexicon.pkl', 'wb') as f:
    pickle.dump(mittens_lexicon, f)

Check the robustness of the Mittens lexicon. Calculate the Pearson correlation of sentiment values for 100 runs of Mittens embeddings.

In [None]:
# set the number of iterations at 100
iterations = 100

# load the original lexicon to compare with
with open('mittens_lexicon.pkl', 'rb') as f:
    original_lexicon = pickle.load(f)
with open("all_news_text.pkl", "rb") as f:
    text = pickle.load(f)
glove = 'glove.6B.300d.txt'

# one Pearson correlation per retrained model
pearsons = np.zeros(iterations)

for i in range(iterations):

    # train a new Mittens model each iteration
    new_glove = irulan.train_mittens(text, glove, seed = i)

    # (original score, new score) pairs for this run only; starting from an
    # empty list each iteration keeps runs independent and avoids relying on a
    # `sentiments` array left over from another cell
    pairs = []

    for word in new_glove.keys():

        # words missing from the original lexicon cannot be compared
        if word in original_lexicon:
            # NOTE(review): glove_sentiment is not passed new_glove - confirm it
            # scores the word against the model trained in this iteration
            pairs.append((original_lexicon[word], irulan.glove_sentiment(word)))

    sentiments = np.array(pairs, dtype = float).reshape(-1, 2)

    # calculate the pearson correlation of the original and new sentiments
    pearsons[i] = scipy.stats.pearsonr(sentiments[:, 0], sentiments[:, 1])[0]

Calculate the Pearson correlation for subsampled models. 

In [None]:
# set the number of iterations at 30
iterations = 30

# load the original lexicon to compare with
with open('mittens_lexicon.pkl', 'rb') as f:
    original_lexicon = pickle.load(f)
with open("all_news_text.pkl", "rb") as f:
    text = pickle.load(f)
glove = 'glove.6B.300d.txt'

# one correlation per (repeat, proportion) pair: the loop below runs
# iterations*9 times, so the result array needs that many slots
# (pearsons[i] belongs to proportion ((i % 9) + 1) / 10)
pearsons = np.zeros(iterations*9)

# loop through each value of p 30 times
for i in range(iterations*9):

    # proportions from 0.1 to 0.9
    p = ((i % 9) + 1) / 10
    np.random.seed(seed = i)

    # choose a proportion of the text to train the model on
    # (np.random.randint draws indices with replacement)
    samples = np.random.randint(0, len(text), int(len(text)*p))

    # keep the full corpus in `text`; overwriting it with the sample would make
    # every later iteration draw from an ever smaller corpus
    sample_text = [text[k] for k in samples]

    # train a new Mittens model each iteration
    new_glove = irulan.train_mittens(sample_text, glove, seed = i)

    # (original score, new score) pairs for this run only
    pairs = []

    for word in new_glove.keys():

        # words missing from the original lexicon cannot be compared
        if word in original_lexicon:
            # NOTE(review): glove_sentiment is not passed new_glove - confirm it
            # scores the word against the model trained in this iteration
            pairs.append((original_lexicon[word], irulan.glove_sentiment(word)))

    sentiments = np.array(pairs, dtype = float).reshape(-1, 2)

    # calculate the pearson correlation of the original and new sentiments
    pearsons[i] = scipy.stats.pearsonr(sentiments[:, 0], sentiments[:, 1])[0]

Compare the Mittens lexicon with the NRC-VAD lexicon. Make an interactive plot and histogram.

In [None]:
# load the lexicons
with open('nrc_lexicon.pkl', 'rb') as f:
    nrc_lexicon = pickle.load(f)
with open('mittens_lexicon.pkl', 'rb') as f:
    mittens_lexicon = pickle.load(f)

# words that appear in both lexicons, in NRC order (also used as hover text)
word_list = [word for word in nrc_lexicon.keys() if word in mittens_lexicon]

# create lexicons of the words in common
nrc_filtered = {word: nrc_lexicon[word] for word in word_list}
mittens_filtered = {word: mittens_lexicon[word] for word in word_list}

# one row per shared word: column 0 is the NRC score, column 1 the Mittens score.
# Built in one go instead of growing an array with np.append (which copies the
# whole array on every call); reshape keeps the (0, 2) shape when nothing is shared
sentiments = np.array(
    [[nrc_filtered[word], mittens_filtered[word]] for word in word_list],
    dtype = float).reshape(-1, 2)

with open('nrc_lexicon_filtered.pkl', 'wb') as f:
    pickle.dump(nrc_filtered, f)
with open('mittens_lexicon_filtered.pkl', 'wb') as f:
    pickle.dump(mittens_filtered, f)

# not used anywhere else in this cell
x = np.linspace(-1.5,1.5,100)

trace = go.Scatter(
    x=sentiments[:, 0],
    y=sentiments[:, 1],
    hovertext = word_list,
    name="Sentiment Score",
    mode = 'markers')

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(trace)
fig['layout'].update(xaxis_title = 'NRC lexicon', yaxis_title = 'Mittens lexicon', width = 800)
# lock the aspect ratio so one unit is the same length on both axes
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1)
iplot(fig)

# calculate pearson correlation
pearson = scipy.stats.pearsonr(sentiments[:, 0], sentiments[:, 1])[0]

In [None]:
# histogram of the words in common between nrc and mittens lexicon

# load lexicons that include only the words in common
with open('nrc_lexicon_filtered.pkl', 'rb') as f:
    nrc_filtered = pickle.load(f)
with open('mittens_lexicon_filtered.pkl', 'rb') as f:
    mittens_filtered = pickle.load(f)

# plot histograms of the sentiments; the filtered lexicons are dicts
# (word -> score), so pass the scores explicitly rather than the dict itself
plt.hist(list(mittens_filtered.values()), bins = 50, alpha = 0.5, label = "Mittens")
plt.hist(list(nrc_filtered.values()), bins = 20, alpha = 0.5, label = "NRC")
plt.legend()
plt.xlabel("Sentiment Score")
plt.ylabel("Frequency")

Another sentiment analysis (news text with Mittens lexicon).

In [None]:
# load the news-only data and the Mittens lexicon
# (context managers close each file handle after reading)
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files
with open("all_news_text_split.pkl", "rb") as f:
    text = pickle.load(f)
with open('mittens_lexicon.pkl', 'rb') as f:
    mittens_lexicon = pickle.load(f)
with open("all_news_dates_split.pkl", "rb") as f:
    dates = pickle.load(f)

# get sentiment scores from text
sentiment_scores = get_sentiment(text, mittens_lexicon)

# make plot
a = 20000       # second argument of irulan.moving_average (presumably the window size)
half = a // 2

for i, ch in enumerate(channel_list):
    # NOTE(review): the date slice drops a - 1 entries, so it assumes
    # irulan.moving_average returns len(x) - a + 1 values - confirm
    plt.plot(dates[i][half:-half + 1], irulan.moving_average(sentiment_scores[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title("Sentiment of each channel over time")
plt.xlabel("Date")
plt.ylabel("Sentiment")

If it bleeds, it leads: investigate the sentiment of 5-minute intervals of news text. 

Political sentiment analysis.

In [None]:
# get political text

# read in original data
with open("all_text_split_60.pkl", "rb") as f:
    text = pickle.load(f)
with open("all_dates_split_60.pkl", "rb") as f:
    dates = pickle.load(f)
with open('political_probabilities.pkl', 'rb') as f:
    pyx = pickle.load(f) # from topic modelling

# initialise: one sub-list per channel is appended below
liberal_text = []
liberal_dates = []
labor_text = []
labor_dates = []

# filter text and dates to only those with probability of the respective political
# topic greater than or equal to 0.5 (column 0 is used for Liberal, column 1 for Labor)
for t, d, p in zip(text, dates, pyx):

    liberal_idx = [k for k in range(len(t)) if p[k, 0]>=0.5]
    labor_idx = [k for k in range(len(t)) if p[k, 1]>=0.5]

    # append per channel; assigning to the result names here would overwrite
    # the previous channels and leave only the last one
    liberal_text.append([t[k] for k in liberal_idx])
    liberal_dates.append([d[k] for k in liberal_idx])
    labor_text.append([t[k] for k in labor_idx])
    labor_dates.append([d[k] for k in labor_idx])

# save
with open('liberal_text.pkl', 'wb') as f:
    pickle.dump(liberal_text, f)
with open('liberal_dates.pkl', 'wb') as f:
    pickle.dump(liberal_dates, f)
with open('labor_text.pkl', 'wb') as f:
    pickle.dump(labor_text, f)
with open('labor_dates.pkl', 'wb') as f:
    pickle.dump(labor_dates, f)

In [None]:
# calculate the sentiment of political text and plot

# read in data
with open('liberal_text.pkl', 'rb') as f:
    liberal_text = pickle.load(f)
with open('liberal_dates.pkl', 'rb') as f:
    liberal_dates = pickle.load(f)
with open('labor_text.pkl', 'rb') as f:
    labor_text = pickle.load(f)
with open('labor_dates.pkl', 'rb') as f:
    labor_dates = pickle.load(f)
with open('mittens_lexicon.pkl', 'rb') as f:
    mittens_lexicon = pickle.load(f)

# calculate sentiment (get_sentiment requires the lexicon as its second argument)
liberal_sentiment = get_sentiment(liberal_text, mittens_lexicon)
labor_sentiment = get_sentiment(labor_text, mittens_lexicon)
with open('liberal_sentiment.pkl', 'wb') as f:
    pickle.dump(liberal_sentiment, f)
with open('labor_sentiment.pkl', 'wb') as f:
    pickle.dump(labor_sentiment, f)

a = 20000       # second argument of irulan.moving_average (presumably the window size)
half = a // 2

# liberal plot (own figure so the labor curves are not drawn on top of it)
plt.figure()
for i, ch in enumerate(channel_list):
    plt.plot(liberal_dates[i][half:-half + 1], irulan.moving_average(liberal_sentiment[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title('Liberal Sentiment')
plt.xlabel("Date")
plt.ylabel("Sentiment")

# labor plot
plt.figure()
for i, ch in enumerate(channel_list):
    plt.plot(labor_dates[i][half:-half + 1], irulan.moving_average(labor_sentiment[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title('Labor Sentiment')
plt.xlabel("Date")
plt.ylabel("Sentiment")

Daily sentiment scores.

In [None]:
# get daily values

# load data (the text is loaded once here instead of once per channel)
with open('mittens_lexicon.pkl', 'rb') as f:
    mittens_lexicon = pickle.load(f)
with open('political_probabilities.pkl', 'rb') as f:
    pyx = pickle.load(f)
with open("all_dates_split_60.pkl", "rb") as f:
    dates = pickle.load(f)
with open("all_text_split_60.pkl", "rb") as f:
    text = pickle.load(f)

# initialise: [channel][day] -> list of document sentiments, and [channel] -> day dates
# (one entry per channel; no trailing empty channel is appended any more)
liberal_daily_sentiments = []
labor_daily_sentiments = []
daily_dates = []

# loop through the channels (zipping with channel_list limits this to the known channels)
for ch, channel_dates, channel_text, channel_pyx in zip(channel_list, dates, text, pyx):

    # the first date opens the first day bucket
    d0 = channel_dates[0]
    channel_days = [d0]
    liberal_days = [[]]
    labor_days = [[]]

    # loop through documents in this channel
    for d, t, p in zip(channel_dates, channel_text, channel_pyx):

        # if it is a new day, open a new bucket; compare the full calendar date,
        # since the day of the month alone repeats every month
        if (d.year, d.month, d.day) != (d0.year, d0.month, d0.day):
            liberal_days.append([])
            labor_days.append([])
            channel_days.append(d)

        # update the current day
        d0 = d

        # add to liberal or labor sentiment if p(y|x) is greater than or equal to 0.5
        if p[0] >= 0.5:
            liberal_days[-1].append(irulan.doc_sentiment(t, mittens_lexicon))

        if p[1] >= 0.5:
            labor_days[-1].append(irulan.doc_sentiment(t, mittens_lexicon))

    daily_dates.append(channel_days)
    liberal_daily_sentiments.append(liberal_days)
    labor_daily_sentiments.append(labor_days)

# calculate daily averages; a day without matching documents becomes NaN
# (same value np.mean([]) gives, without the runtime warning)
lib_daily_averages = [[np.mean(day) if len(day) > 0 else np.nan for day in channel]
                      for channel in liberal_daily_sentiments]
lab_daily_averages = [[np.mean(day) if len(day) > 0 else np.nan for day in channel]
                      for channel in labor_daily_sentiments]

# save
with open('daily_average_liberal_sentiment.pkl', 'wb') as f:
    pickle.dump(lib_daily_averages, f)
with open('daily_average_labor_sentiment.pkl', 'wb') as f:
    pickle.dump(lab_daily_averages, f)
with open('daily_dates.pkl', 'wb') as f:
    pickle.dump(daily_dates, f)

a = 50          # second argument of irulan.moving_average (presumably the window size)
half = a // 2

# plot liberal daily averages (own figure so the two plots do not overlap)
plt.figure()
for i, ch in enumerate(channel_list):

    # remove some nans
    lib_non_nan = [l for l in lib_daily_averages[i] if not np.isnan(l)]
    daily_dates_non_nan = [d for d, l in zip(daily_dates[i], lib_daily_averages[i]) if not np.isnan(l)]

    # plot
    plt.plot(daily_dates_non_nan[half:-half + 1], irulan.moving_average(lib_non_nan, a), label = ch)

plt.title("The average daily sentiment of Liberal text")
plt.xlabel("Date")
plt.ylabel("Sentiment")
plt.legend(bbox_to_anchor = (1, 1))

# plot labor daily averages
plt.figure()
for i, ch in enumerate(channel_list):

    # remove some nans
    lab_non_nan = [l for l in lab_daily_averages[i] if not np.isnan(l)]
    daily_dates_non_nan = [d for d, l in zip(daily_dates[i], lab_daily_averages[i]) if not np.isnan(l)]

    # plot
    plt.plot(daily_dates_non_nan[half:-half + 1], irulan.moving_average(lab_non_nan, a), label = ch)

plt.title("The average daily sentiment of Labor text")
plt.xlabel("Date")
plt.ylabel("Sentiment")
plt.legend(bbox_to_anchor = (1, 1))

Weighted sentiment scores.

In [None]:
# load data
with open("all_text_split_60.pkl", "rb") as f:
    text = pickle.load(f)
with open("all_dates_split_60.pkl", "rb") as f:
    dates = pickle.load(f)
with open('mittens_lexicon.pkl', 'rb') as f:
    mittens_lexicon = pickle.load(f)

# get Mittens sentiment scores for all text
sentiment_scores = get_sentiment(text, mittens_lexicon)
with open('all_sentiments_60.pkl', 'wb') as f:
    pickle.dump(sentiment_scores, f)

# get topic probabilities
with open('political_probabilities.pkl', 'rb') as f:
    pyx = pickle.load(f)

# calculate and save weighted sentiment: each document's sentiment is multiplied
# by its probability in column 0 (Liberal) or column 1 (Labor) of pyx.
# np.asarray makes the element-wise product explicit (sentiment_scores[i] is a list)
liberal_weighted_sentiment = [np.asarray(sentiment_scores[i])*pyx[i][:, 0] for i in range(len(channel_list))]
labor_weighted_sentiment = [np.asarray(sentiment_scores[i])*pyx[i][:, 1] for i in range(len(channel_list))]
with open('liberal_weighted_sentiment.pkl', 'wb') as f:
    pickle.dump(liberal_weighted_sentiment, f)
with open('labor_weighted_sentiment.pkl', 'wb') as f:
    pickle.dump(labor_weighted_sentiment, f)

a = 20000       # second argument of irulan.moving_average (presumably the window size)
half = a // 2

# make liberal plot (own figure so the labor curves are not drawn on top of it)
plt.figure()
for i, ch in enumerate(channel_list):
    plt.plot(dates[i][half:-half + 1], irulan.moving_average(liberal_weighted_sentiment[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title('Weighted sentiment of Liberal text')
plt.xlabel("Date")
plt.ylabel("Sentiment")

# make labor plot
plt.figure()
for i, ch in enumerate(channel_list):
    plt.plot(dates[i][half:-half + 1], irulan.moving_average(labor_weighted_sentiment[i], a), label = ch)

plt.legend(bbox_to_anchor = (1, 1))
plt.title('Weighted sentiment of Labor text')
plt.xlabel("Date")
plt.ylabel("Sentiment")

Sentiment bias: mean sentiment score.

In [None]:
# load the per-channel sentiment scores saved by the political sentiment cell
with open('liberal_sentiment.pkl', 'rb') as f:
    liberal_sentiment = pickle.load(f)
with open('labor_sentiment.pkl', 'rb') as f:
    labor_sentiment = pickle.load(f)

# the scores are nested lists (as returned by get_sentiment) whose sub-lists
# can differ in length, so flatten them before averaging; np.mean cannot
# average a ragged nested list directly
liberal_average_sentiment = np.mean([score for channel in liberal_sentiment for score in channel])
labor_average_sentiment = np.mean([score for channel in labor_sentiment for score in channel])

print('Liberal mean sentiment:', liberal_average_sentiment)
print('Labor mean sentiment:', labor_average_sentiment)

Sentiment bias: word embeddings.

In [None]:
glove = 'glove.6B.300d.txt'

# create list of Liberal/Labor words
liberal_words = ['liberal', 'scott', 'morrison', 'malcolm', 'turnbull']
labor_words = ['labor', 'anthony', 'albanese', 'bill', 'shorten']

# load the text of all channels once, rather than re-reading the file per channel
with open("all_text_split.pkl", "rb") as f:
    all_text = pickle.load(f)

# create a Mittens model trained on each channel
for i, channel in enumerate(channel_list):

    # text from one channel
    text = all_text[i]

    # train a Mittens model on text from this channel
    new_glove = irulan.train_mittens(text, glove)
    with open(f'mittens_model_{channel}.pkl', 'wb') as f:
        pickle.dump(new_glove, f)

    # find the sentiment values for Liberal and Labor terms
    # NOTE(review): glove_sentiment is not passed new_glove - confirm it scores
    # the word against the model trained for this channel
    liberal_sentiment = 0
    labor_sentiment = 0

    for word in liberal_words:
        liberal_sentiment += irulan.glove_sentiment(word)

    for word in labor_words:
        labor_sentiment += irulan.glove_sentiment(word)

    # label the output so the six pairs of numbers can be told apart, and
    # average over the actual list lengths instead of a hard-coded 5
    print(f'Channel: {channel}')
    print('Liberal sentiment:', liberal_sentiment/len(liberal_words))
    print('Labor sentiment:', labor_sentiment/len(labor_words))

Add sentiment to the bias measure.

In [None]:
# load the weighted sentiment scores calculated previously
# (context managers close each file handle after reading)
with open('liberal_weighted_sentiment.pkl', 'rb') as f:
    liberal_weighted_sentiment = pickle.load(f)
with open('labor_weighted_sentiment.pkl', 'rb') as f:
    labor_weighted_sentiment = pickle.load(f)

for i, channel in enumerate(channel_list):
    # mean over documents of (Liberal - Labor) weighted sentiment for this channel
    liberal_minus_labor = np.array(liberal_weighted_sentiment[i]) - np.array(labor_weighted_sentiment[i])
    mod = np.mean(liberal_minus_labor)

    print(f'MOD for {channel}:', mod)

Compare with polling and election data.

In [None]:
# manually scraped, inserted into an excel spreadsheet and saved as a csv
polls = pd.read_csv('opinion_polls.csv')

# load other data
with open('liberal_weighted_sentiment.pkl', 'rb') as f:
    liberal_weighted_sentiment = pickle.load(f)
with open('labor_weighted_sentiment.pkl', 'rb') as f:
    labor_weighted_sentiment = pickle.load(f)
with open('all_dates.pkl', 'rb') as f:
    dates = pickle.load(f)

# create one long array, rather than separated by channel, ordered by date
dates = [date for channel in dates for date in channel]
liberal_weighted_sentiment = [sentiment for channel in liberal_weighted_sentiment for sentiment in channel]
liberal_weighted_sentiment = np.array([x for _, x in sorted(zip(dates, liberal_weighted_sentiment), key=lambda pair: pair[0])])
labor_weighted_sentiment = [sentiment for channel in labor_weighted_sentiment for sentiment in channel]
labor_weighted_sentiment = np.array([x for _, x in sorted(zip(dates, labor_weighted_sentiment), key=lambda pair: pair[0])])
dates = sorted(dates)

# the stored dates are passed to fromtimestamp, i.e. treated as POSIX timestamps
dates = [datetime.datetime.fromtimestamp(d) for d in dates]

# plot the weighted sentiments: values on the x-axis, dates on the y-axis.
# The difference is multiplied by 10 before plotting it next to the polls
a = 300000      # second argument of irulan.moving_average (presumably the window size)
half = a // 2
plt.plot(10*irulan.moving_average(liberal_weighted_sentiment - labor_weighted_sentiment, a), dates[half:-half + 1], color = 'tab:blue', alpha = 0.4)

# plot the polling data
a = 4
half = a // 2

# make sure to remove NaN rows: keep a row only when BOTH parties have a value
# whose text starts with '0', so the two poll arrays and the dates stay
# aligned even if only one of the columns is missing in some row
has_poll = [str(l)[0] == '0' and str(p)[0] == '0' for l, p in zip(polls["LNP"], polls["ALP"])]
liberal_polls = np.array([float(l) for l, keep in zip(polls["LNP"], has_poll) if keep])
labor_polls = np.array([float(p) for p, keep in zip(polls["ALP"], has_poll) if keep])
poll_dates = [datetime.datetime.fromisoformat(d) for d, keep in zip(polls["DATE"], has_poll) if keep]

# plot polling data
plt.plot(irulan.moving_average(liberal_polls-labor_polls, a), poll_dates[half:-half + 1], color = 'black', alpha = 0.8)

# the curves are drawn with the difference on x and time on y, so label the
# axes accordingly (the two labels were previously swapped)
plt.xlabel('Difference in polling percentage')
plt.ylabel('Date')

# dashed vertical reference line at a difference of zero
plt.plot([0, 0], [datetime.datetime(2015, 1, 1), datetime.datetime(2023, 1, 1)], color = "black", ls = "dashed")