This notebook replicates word shift graphs for all tables in the thesis. 

Import required packages

In [None]:
import shifterator as sh
import pickle
import numpy as np
import datetime
from nltk import FreqDist
import irulan

Function to calculate monthly Tsallis entropy and sentiment shifts

In [None]:
def _draw_shift_graph(shift, title, filename=None):
    # Draw one shift graph with the layout shared by every graph in this
    # function; the graph is written to disk only when a filename is given.
    options = dict(title=title,
                   text_size_inset=False,
                   cumulative_inset=False,
                   height=8,
                   xlabel='Score shift')
    if filename:
        options['filename'] = filename
    shift.get_shift_graph(**options)


def get_monthly_shift(text, dates, month, year, lexicon, file_name = None):
    """Plot entropy and sentiment word shifts of one month against all text.

    Word counts of every document in ``text`` form system 1 (the reference);
    word counts of the documents dated in ``month``/``year`` form system 2.
    Two graphs are drawn: a Tsallis entropy shift (alpha = 0.3) and a
    weighted-average shift scored with ``lexicon``.

    Parameters
    ----------
    text : sequence of str
        Documents; each one is split on whitespace to count words.
    dates : sequence
        One entry per document, index-aligned with ``text``. Every entry
        must expose ``.month`` and ``.year``.
    month, year : int
        Calendar month and year selecting the comparison documents.
    lexicon : dict
        Word scores, used for both systems in the sentiment shift.
    file_name : str, optional
        Output prefix. When truthy, the graphs are saved as
        ``{file_name}_entropy.pdf`` and ``{file_name}_sentiment.pdf``.

    Returns
    -------
    None
    """
    # filter to only the text within that month
    peak = [text[i] for i, date in enumerate(dates)
            if date.month == month and date.year == year]

    # count words in all of the text, then in the selected month only
    counts = FreqDist(" ".join(text).split())
    peak_counts = FreqDist(" ".join(peak).split())
    del peak

    period = f'{month}/{year}'

    # calculate Tsallis entropy shifts and make the shift graph
    entropy_shift = sh.EntropyShift(type2freq_1=counts,
                                    type2freq_2=peak_counts,
                                    alpha=0.3)
    # this graph shows the entropy shift, so it must not carry the
    # "Sentiment" title used for the second graph
    _draw_shift_graph(entropy_shift,
                      f'Entropy difference between {period} and all text',
                      f'{file_name}_entropy.pdf' if file_name else None)

    # calculate sentiment shifts and make the shift graph
    sentiment_shift = sh.WeightedAvgShift(type2freq_1=counts,
                                          type2freq_2=peak_counts,
                                          type2score_1=lexicon,
                                          type2score_2=lexicon,
                                          reference_value='average')
    _draw_shift_graph(sentiment_shift,
                      f'Sentiment difference between {period} and all text',
                      f'{file_name}_sentiment.pdf' if file_name else None)

Channel 10/ABC24

In [None]:
# load Channel 10 and ABC24 text/dates
# Each pickle is read once inside a `with` block so its file handle is closed;
# entries 3 and 5 are taken for Channel 10 and ABC24 respectively.
with open('all_text.pkl', 'rb') as fh:
    all_text = pickle.load(fh)
ch10_text, abc24_text = all_text[3], all_text[5]
del all_text

with open('all_dates.pkl', 'rb') as fh:
    all_dates = pickle.load(fh)
ch10_dates, abc24_dates = all_dates[3], all_dates[5]
del all_dates

with open('nrc_lexicon.pkl', 'rb') as fh:
    nrc_lexicon = pickle.load(fh)

In [None]:
# Word shifts of ABC24 text (system 2) relative to Channel 10 text (system 1).
file_name = 'ch10_abc24'

# count words in the Channel 10 text
ch10_counts = FreqDist(" ".join(ch10_text).split())
del ch10_text

# count words in the ABC24 text
abc24_counts = FreqDist(" ".join(abc24_text).split())
del abc24_text

# calculate Tsallis entropy shifts
entropy_shift = sh.EntropyShift(type2freq_1=ch10_counts,
                                type2freq_2=abc24_counts,
                                alpha=0.3)

# make the shift graph
# (the title names the two corpora compared here; the previous title was
# copied from a lexicon comparison and did not describe this graph)
entropy_shift.get_shift_graph(title='Entropy difference between Channel 10 and ABC24 text',
                              text_size_inset=False,
                              cumulative_inset=False,
                              height=8,
                              xlabel='Score shift',
                              filename=f'{file_name}_entropy.pdf')

# calculate sentiment shifts, scoring both corpora with the NRC lexicon
sentiment_shift = sh.WeightedAvgShift(type2freq_1=ch10_counts,
                                      type2freq_2=abc24_counts,
                                      type2score_1=nrc_lexicon,
                                      type2score_2=nrc_lexicon,
                                      reference_value='average')

# make the shift graph
sentiment_shift.get_shift_graph(title='Sentiment difference between Channel 10 and ABC24 text',
                                text_size_inset=False,
                                cumulative_inset=False,
                                height=8,
                                xlabel='Score shift',
                                filename=f'{file_name}_sentiment.pdf')

Channel 7 all text monthly analysis

In [None]:
# load Channel 7 text/dates
# `with` closes each file handle as soon as the pickle has been read;
# entry 1 of each pickle is taken as Channel 7.
with open('all_text.pkl', 'rb') as fh:
    text = pickle.load(fh)[1]
with open('all_dates.pkl', 'rb') as fh:
    dates = pickle.load(fh)[1]
with open('nrc_lexicon.pkl', 'rb') as fh:
    nrc_lexicon = pickle.load(fh)

In [None]:
# Draw the entropy and sentiment shift graphs for three selected months of
# the Channel 7 text, scored with the NRC lexicon. No file name is passed,
# so get_monthly_shift is called with file_name left at its default (None).
get_monthly_shift(text, dates, month=2, year=2018, lexicon=nrc_lexicon)
get_monthly_shift(text, dates, month=1, year=2020, lexicon=nrc_lexicon)
get_monthly_shift(text, dates, month=9, year=2022, lexicon=nrc_lexicon)

ABC24 news text 2020/2021 and other dates

In [None]:
# load data
# `with` closes each file handle as soon as the pickle has been read.
with open('all_news.pkl', 'rb') as fh:
    text = pickle.load(fh)[5]
# NOTE(review): `dates` is read from the same pickle and index as `text`, yet
# the next cell reads `.year` on every entry. This presumably should load a
# separate dates pickle - confirm the correct file name before relying on it.
with open('all_news.pkl', 'rb') as fh:
    dates = pickle.load(fh)[5]
with open('nrc_lexicon.pkl', 'rb') as fh:
    nrc_lexicon = pickle.load(fh)

In [None]:
# Word shifts of the 2020/2021 ABC24 news text (system 2) relative to all of
# the ABC24 news text (system 1).
file_name = 'abc24_news_peak'

# filter to only the text in 2020/2021
peak = [text[i] for i, date in enumerate(dates) if date.year in (2020, 2021)]

# count words in all of the text
text = " ".join(text)
counts = FreqDist(text.split())
del text

# count words in the peak
peak = " ".join(peak)
peak_counts = FreqDist(peak.split())
del peak

# calculate Tsallis entropy shifts
entropy_shift = sh.EntropyShift(type2freq_1=counts,
                                type2freq_2=peak_counts,
                                alpha=0.3)

# make the shift graph
# (the title names the two corpora compared here; the previous title was
# copied from a lexicon comparison and did not describe this graph)
entropy_shift.get_shift_graph(title='Entropy difference between 2020/2021 ABC24 news text and all text',
                              text_size_inset=False,
                              cumulative_inset=False,
                              height=8,
                              xlabel='Score shift',
                              filename=f'{file_name}_entropy.pdf')

# calculate sentiment shifts, scoring both corpora with the NRC lexicon
sentiment_shift = sh.WeightedAvgShift(type2freq_1=counts,
                                      type2freq_2=peak_counts,
                                      type2score_1=nrc_lexicon,
                                      type2score_2=nrc_lexicon,
                                      reference_value='average')

# make the shift graph
sentiment_shift.get_shift_graph(title='Sentiment difference between 2020/2021 ABC24 news text and all text',
                                text_size_inset=False,
                                cumulative_inset=False,
                                height=8,
                                xlabel='Score shift',
                                filename=f'{file_name}_sentiment.pdf')

Compare Mittens lexicon with NRC

In [None]:
# read in data
# `with` closes each file handle as soon as the pickle has been read.
# NOTE(review): nrc_lexicon is loaded here but never used below - confirm
# whether it was meant to be one of the two score dictionaries.
with open('nrc_lexicon.pkl', 'rb') as fh:
    nrc_lexicon = pickle.load(fh)
with open('mittens_lexicon.pkl', 'rb') as fh:
    mittens_lexicon = pickle.load(fh)
# NOTE(review): other cells index this pickle (e.g. [1], [3], [5]) before
# using it as a list of documents; here it is used unindexed - confirm.
with open("all_text.pkl", "rb") as fh:
    text = pickle.load(fh)

# get a 10% subsample of data
# (randint draws indices with replacement, so a document can appear twice)
np.random.seed(seed = 0)
p = 0.1
ints = np.random.randint(0, int(len(text)), int(len(text)*p))
text_sampled = [text[i] for i in ints]
del text

# get the counts of words in the text
text_sampled = " ".join(text_sampled)
counts = FreqDist(text_sampled.split())
del text_sampled

file_name = 'mittens_vs_nrc'

# calculate entropy shifts
# NOTE(review): peak_counts is not defined in this cell; it is whatever an
# earlier cell left in the notebook namespace. Confirm which word counts
# the sample is meant to be compared against.
entropy_shift = sh.EntropyShift(type2freq_1=counts,
                                type2freq_2=peak_counts,
                                alpha=0.3)

# make the shift graph
entropy_shift.get_shift_graph(title='Entropy difference between the Mittens lexicon and the NRC lexicon',
                              text_size_inset=False,
                              cumulative_inset=False,
                              height=8,
                              xlabel='Score shift',
                              filename=f'{file_name}_entropy.pdf')

# calculate sentiment shifts
# NOTE(review): both score dictionaries are mittens_lexicon, so the two
# systems differ only in their word counts, not in their lexicon, although
# the title announces a Mittens-vs-NRC comparison - confirm the intent.
sentiment_shift = sh.WeightedAvgShift(type2freq_1=counts,
                                      type2freq_2=peak_counts,
                                      type2score_1=mittens_lexicon,
                                      type2score_2=mittens_lexicon,
                                      reference_value='average')

# make the shift graph
sentiment_shift.get_shift_graph(title='Sentiment difference between the Mittens lexicon and the NRC lexicon',
                                text_size_inset=False,
                                cumulative_inset=False,
                                height=8,
                                xlabel='Score shift',
                                filename=f'{file_name}_sentiment.pdf')

Compare monthly news text from all channels with Mittens

In [None]:
# load news text from all channels
# `with` closes each file handle as soon as the pickle has been read.
with open('all_news_total.pkl', 'rb') as fh:
    text = pickle.load(fh)
with open('all_news_dates_total.pkl', 'rb') as fh:
    dates = pickle.load(fh)
with open('mittens_lexicon.pkl', 'rb') as fh:
    mittens_lexicon = pickle.load(fh)

In [None]:
# Draw the entropy and sentiment shift graphs for four selected months of
# the loaded news text, scored with the Mittens lexicon. No file name is
# passed, so get_monthly_shift is called with file_name left at None.
get_monthly_shift(text, dates, month=1, year=2018, lexicon=mittens_lexicon)
get_monthly_shift(text, dates, month=4, year=2020, lexicon=mittens_lexicon)
get_monthly_shift(text, dates, month=10, year=2021, lexicon=mittens_lexicon)
get_monthly_shift(text, dates, month=3, year=2022, lexicon=mittens_lexicon)

Investigate the spike in SBS sentiment

In [None]:
# load SBS news text
# `with` closes each file handle as soon as the pickle has been read;
# entry 4 of each pickle is taken as SBS.
with open('all_news_total.pkl', 'rb') as fh:
    text = pickle.load(fh)[4]
with open('all_news_dates_total.pkl', 'rb') as fh:
    dates = pickle.load(fh)[4]
with open('mittens_lexicon.pkl', 'rb') as fh:
    mittens_lexicon = pickle.load(fh)

In [None]:
# Draw the entropy and sentiment shift graphs for May 2022 of the loaded
# text, scored with the Mittens lexicon (nothing is saved: no file name).
get_monthly_shift(text=text, dates=dates, month=5, year=2022, lexicon=mittens_lexicon)

In [None]:
# generate daily stats
# NOTE(review): this binds the pickle at './Data/nrc_lexicon.pkl' to the name
# mittens_lexicon, and uses a './Data/' prefix no other cell uses - confirm
# which lexicon and which path are intended.
with open('./Data/nrc_lexicon.pkl', 'rb') as fh:
    mittens_lexicon = pickle.load(fh)
year = 2022
month = 5
day = 15

# Give this cell its own output prefix. Without it, file_name still holds the
# value set by an earlier cell and the PDFs written there are overwritten.
file_name = f'sbs_{year}_{month:02d}_{day:02d}'
# label used in both titles, e.g. '15/5/2022'
peak_label = f'{day}/{month}/{year}'

# keep only the documents dated on the selected day
peak_news = [text[i] for i, date in enumerate(dates)
             if date.month == month and date.year == year and date.day == day]
del dates

# count words in all of the text
text = " ".join(text)
counts = FreqDist(text.split())
del text

# count words in the selected day
peak_news = " ".join(peak_news)
peak_counts = FreqDist(peak_news.split())

# calculate Tsallis entropy shifts and make the shift graph
entropy_shift = sh.EntropyShift(type2freq_1=counts,
                                type2freq_2=peak_counts,
                                alpha=0.3)

entropy_shift.get_shift_graph(title=f'Entropy difference between {peak_label} SBS text and other',
                              text_size_inset=False,
                              cumulative_inset=False,
                              height=8,
                              xlabel='Score shift',
                              filename=f'{file_name}_entropy.pdf')

# calculate sentiment shifts and make the shift graph
sentiment_shift = sh.WeightedAvgShift(type2freq_1=counts,
                                      type2freq_2=peak_counts,
                                      type2score_1=mittens_lexicon,
                                      type2score_2=mittens_lexicon,
                                      reference_value='average')

sentiment_shift.get_shift_graph(title=f'Sentiment difference between {peak_label} SBS text and other',
                                text_size_inset=False,
                                cumulative_inset=False,
                                height=8,
                                xlabel='Score shift',
                                filename=f'{file_name}_sentiment.pdf')

Compare political 1-minute and 5-minute documents

In [None]:
# load text
# `with` closes each file handle as soon as the pickle has been read.
with open('liberal_text_5_min.pkl', 'rb') as fh:
    liberal_text_5_min = pickle.load(fh)
with open('labor_text_5_min.pkl', 'rb') as fh:
    labor_text_5_min = pickle.load(fh)
with open('liberal_text_1_min.pkl', 'rb') as fh:
    liberal_text_1_min = pickle.load(fh)
with open('labor_text_1_min.pkl', 'rb') as fh:
    labor_text_1_min = pickle.load(fh)

# generate counts (one irulan.get_counts call per document length; each call
# returns the Liberal counts first and the Labor counts second)
counts_5_min_liberal, counts_5_min_labor = irulan.get_counts(liberal_text_5_min, labor_text_5_min)
counts_1_min_liberal, counts_1_min_labor = irulan.get_counts(liberal_text_1_min, labor_text_1_min)

# get word shifts: 5-minute text (system 2) relative to 1-minute text (system 1)
# NOTE(review): no filename is passed below, so these graphs are presumably
# only displayed, not saved - add filename=... if PDFs are required.
entropy_shift = sh.EntropyShift(type2freq_1=counts_1_min_liberal,
                                type2freq_2=counts_5_min_liberal,
                                alpha=0.3)

entropy_shift.get_shift_graph(system_names=['1-minute Liberal text', '5-minute Liberal text'],
                              title='Entropy difference between 1-minute and 5-minute Liberal text',
                              text_size_inset=False,
                              cumulative_inset=False,
                              top_n=20,
                              height=8)

entropy_shift = sh.EntropyShift(type2freq_1=counts_1_min_labor,
                                type2freq_2=counts_5_min_labor,
                                alpha=0.3)

entropy_shift.get_shift_graph(system_names=['1-minute Labor text', '5-minute Labor text'],
                              title='Entropy difference between 1-minute and 5-minute Labor text',
                              text_size_inset=False,
                              cumulative_inset=False,
                              top_n=20,
                              height=8)

Compare political monthly text

In [None]:
# load text
# `with` closes each file handle as soon as the pickle has been read.
with open('liberal_text_1_min.pkl', 'rb') as fh:
    liberal_text_1_min = pickle.load(fh)
with open('labor_text_1_min.pkl', 'rb') as fh:
    labor_text_1_min = pickle.load(fh)
with open('liberal_dates_1_min.pkl', 'rb') as fh:
    liberal_dates_1_min = pickle.load(fh)
with open('labor_dates_1_min.pkl', 'rb') as fh:
    labor_dates_1_min = pickle.load(fh)
with open('mittens_lexicon.pkl', 'rb') as fh:
    mittens_lexicon = pickle.load(fh)

In [None]:
# generate monthly word shifts
# get_monthly_shift takes (text, dates, month, year, lexicon). The matching
# dates list must be passed as the second argument; without it the month,
# year and lexicon each land one position early and the call raises TypeError
# because `lexicon` is missing.
get_monthly_shift(liberal_text_1_min, liberal_dates_1_min, 2, 2021, mittens_lexicon)

get_monthly_shift(labor_text_1_min, labor_dates_1_min, 9, 2019, mittens_lexicon)

get_monthly_shift(labor_text_1_min, labor_dates_1_min, 3, 2022, mittens_lexicon)