Import relevant packages

In [6]:
import sys # to get "unresolved" packages
sys.path.append("/Users/a1765262/opt/anaconda3/lib/python3.9/site-packages")

import pandas as pd # for data frames
import os # for working directories
import numpy as np # for numbers
import matplotlib.pyplot as plt # for plotting
from corextopic import corextopic as ct # for topic model
import string # for preprocessing
import sklearn.feature_extraction.text # for vectoriser
from nltk.sentiment import SentimentIntensityAnalyzer # for sentiment
import pickle
import scipy
from nltk import FreqDist
import shifterator as sh
import scipy.stats as st
import datetime
from mpl_toolkits.axes_grid1 import AxesGrid
import irulan
from sklearn.feature_extraction.text import CountVectorizer


# channel identifiers; later cells enumerate this list and use the position
# to index the per-channel lists stored in the pickled text/date files
channel_list = ["ABC1", "Ch7", "Ch9", "Ch10", "SBS", "ABC24"]

Test a topic model anchored on 'sport' for different document lengths. Plot the number of words in each document for 5-minute and program lengths. 

In [None]:
# vector of seconds that we will look at (9999 seconds represents the program split)
seconds = [10, 60, 120, 300, 600, 1200, 1800, 9999]

# the two splits that get a histogram, with the wording used in the plot title
histogram_splits = {300: "5-minute document", 9999: "program"}

abc1_2022 = pd.read_csv("ABC1_2022.csv")

for s in seconds:

    # clean text so that documents are s seconds long
    text = irulan.clean_split_mins(abc1_2022, s)

    # train topic model anchored on 'sport' and show the first topic
    _, tm = irulan.tm(text, anchors = ['sport'], num_topics= 40)

    print(tm.get_topics()[0])

    # if we have split into 5-minute or program-length intervals, make histogram
    if s in histogram_splits:

        # use a fresh figure per split, otherwise both histograms are drawn
        # on top of each other on the same axes
        fig, ax = plt.subplots()
        ax.hist([len(doc.split()) for doc in text], bins = 50)
        ax.set_title(f"The number of words contained in each {histogram_splits[s]}")
        ax.set_xlabel("Number of words")
        ax.set_ylabel("Frequency")

Find the total correlation of topic models with the number of topics adjusted from 10 to 100.

In [None]:
# NOTE(review): other cells index this pickle with [0] to select one channel's
# documents; here the whole object is vectorised - confirm that is intended.
with open("2022_text_split.pkl", "rb") as f:
    text = pickle.load(f)

# numbers of topics to test, and how many seeds to train for each
n_topics = np.array(range(10, 110, 10))
iterations = 30

# tcs[k, i] is the total correlation for n_topics[k] topics and seed i
tcs = np.empty((len(n_topics), iterations))

# initialise vectorizer 
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True) 

# get sparse binary matrix marking which words appear in each document
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

for k, n in enumerate(n_topics):
    for i in range(iterations):

        # train the topic model
        tm = ct.Corex(n_hidden = n, words = words, max_iter = 400, seed = i) 
        tm.fit(matrix, words = words, anchor_strength = 10)

        tcs[k, i] = tm.tc

        # save the topic model for later (the model object must be passed to
        # pickle.dump, and the file is closed as soon as it is written)
        with open(f'./{n}_topics_{i}.pkl', 'wb') as f:
            pickle.dump(tm, f)

        print(f'Value {i} for {n} topics is: {tm.tc}')
        del tm

# make violin plot; violinplot treats each COLUMN of a 2D array as one dataset,
# so transpose to get one violin per number of topics
plt.violinplot(tcs.T, positions = n_topics, widths = 5)
plt.title("The total correlation of topic models on 2022 data adjusting the number of topics")
plt.ylabel("Total correlation")
plt.xlabel("Number of topics");

Pearson and Spearman similarities for topic models with an adjacent number of topics.

In [29]:
def topic_similarity(tm1, tm2, similarity_type = 'pearson'):
    """Average correlation between the matching topics of two topic models.

    Two topics "match" when their top-100 word lists share at least 5 words.
    For every matching pair, the corresponding columns of the models'
    ``p_y_given_x`` arrays are correlated, and the mean over all matching
    pairs is returned.

    Parameters
    ----------
    tm1, tm2 : objects exposing ``p_y_given_x`` (indexed as ``[:, topic]``)
        and ``get_topics(n_words=...)`` returning, per topic, a sequence of
        entries whose first element is the word.
    similarity_type : 'pearson' or 'spearman'
        Which correlation coefficient to use.

    Returns
    -------
    The mean correlation over matching topic pairs, or 0 if no pair matches.

    Raises
    ------
    AssertionError
        If ``similarity_type`` is not one of the two supported values.
    """

    assert similarity_type in ['pearson', 'spearman'], "'similarity_type' should be 'pearson' or 'spearman'."

    # document-level topic probabilities of each model
    probs1 = tm1.p_y_given_x
    probs2 = tm2.p_y_given_x

    # the words of every topic, as sets so that overlaps are cheap to compute
    word_sets1 = [{entry[0] for entry in topic} for topic in tm1.get_topics(n_words = 100)]
    word_sets2 = [{entry[0] for entry in topic} for topic in tm2.get_topics(n_words = 100)]

    total = 0
    n_matches = 0

    # compare every topic of the first model with every topic of the second
    for i, words1 in enumerate(word_sets1):
        for j, words2 in enumerate(word_sets2):

            # skip pairs with fewer than 5 words in common
            if len(words1 & words2) < 5:
                continue

            column1 = probs1[:, i]
            column2 = probs2[:, j]

            if similarity_type == 'pearson':
                total += scipy.stats.pearsonr(column1, column2)[0]
            else:
                total += scipy.stats.spearmanr(column1, column2, nan_policy='omit')[0]

            n_matches += 1

    # guard against division by zero when no topics match
    return total/max(n_matches, 1)
    

In [None]:
# assume that we already have 30 topic models for each number of topics...

# numbers of topics that were trained; row/column r of the matrices below
# corresponds to topic_counts[r]
topic_counts = list(range(10, 110, 10))

# adjacent pairs (10, 20), (20, 30), ..., (90, 100); stored as a list because a
# bare zip object would be exhausted after the first iteration
piecewise_comparisons = list(zip(topic_counts[:-1], topic_counts[1:]))
iterations = 30

# the diagonal (a model compared with itself) is 1 after taking the mean
pearson_matrix = np.identity(len(topic_counts))*iterations
spearman_matrix = np.identity(len(topic_counts))*iterations

for i in range(iterations):

    for row, (n1, n2) in enumerate(piecewise_comparisons):

        # matrix position of the larger model of the pair
        col = row + 1

        # load both topic models
        with open(f'./{n1}_topics_{i}.pkl', 'rb') as f:
            tm1 = pickle.load(f)
        with open(f'./{n2}_topics_{i}.pkl', 'rb') as f:
            tm2 = pickle.load(f)

        # calculate the pearson similarity between them
        pearson = topic_similarity(tm1, tm2, 'pearson')
        pearson_matrix[row, col] += pearson
        pearson_matrix[col, row] += pearson

        # calculate the spearman similarity between them
        spearman = topic_similarity(tm1, tm2, 'spearman')
        spearman_matrix[row, col] += spearman
        spearman_matrix[col, row] += spearman

# take the mean
pearson_matrix /= iterations
spearman_matrix /= iterations

# blank out everything that is not on the diagonal or directly next to it,
# since only adjacent numbers of topics were compared
for r in range(len(topic_counts)):
    for c in range(len(topic_counts)):

        if abs(r - c) > 1:
            spearman_matrix[r, c] = np.nan
            pearson_matrix[r, c] = np.nan

# plot as two heatmaps
for name, similarity in [("Spearman", spearman_matrix), ("Pearson", pearson_matrix)]:

    fig, ax = plt.subplots()
    im = ax.imshow(similarity, vmin = 0.1)

    # write the value into every cell that holds one
    for r in range(len(topic_counts)):
        for c in range(len(topic_counts)):

            if not np.isnan(similarity[r, c]):
                ax.text(c, r, round(similarity[r, c], 2), ha = 'center', va = 'center', color = 'w')

    ax.set_xticks(np.arange(len(topic_counts)), labels = topic_counts)
    ax.set_yticks(np.arange(len(topic_counts)), labels = topic_counts)
    ax.set_xlabel("Number of topics")
    ax.set_ylabel("Number of topics")
    ax.set_title(f"{name} similarity")

Generate unsupervised topic models trained on 2022 data from ABC1.

In [None]:
# read in text and dates from ABC1 (closing the files once they are read)
with open("2022_text_split.pkl", "rb") as f:
    text = pickle.load(f)[0]
with open("2022_dates_split.pkl", "rb") as f:
    dates = pickle.load(f)[0]

# initialise vectorizer 
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True) 

# get sparse binary matrix marking which words appear in each document
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 5

# window passed to irulan.moving_average
a = 5000

# dates trimmed by half a window at each end to line up with the smoothed series
window_dates = dates[int(a/2):1-int(a/2)]

for i in range(iterations):

    # train the topic model
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i) 
    tm.fit(matrix, words = words)
    print(tm.get_topics()[:5])

    # sport will be topic 0 for all topic models except the second one, where it is topic 4
    # NOTE(review): these indices were read off the printed topics; re-check them if
    # the data, seeds or library version change
    sport_topic = 4 if i == 1 else 0

    plt.plot(window_dates, irulan.moving_average(tm.p_y_given_x[:, sport_topic], a), alpha = 0.8, label = i+1)

    del tm

# make the plot nice
plt.xlabel("Date")
plt.ylabel("Probability")
plt.xticks([datetime.date(year = 2022, month = 2, day = 1), datetime.date(year = 2022, month = 5, day = 1), datetime.date(year = 2022, month = 8, day = 1), datetime.date(year = 2022, month = 11, day = 1)])
plt.legend(bbox_to_anchor=(1, 1));

Train 30 topic models on ABC1 text with the anchor 'sport'.

In [None]:
# read in text and dates from ABC1 (closing the files once they are read)
with open('2022_text_split.pkl', 'rb') as f:
    text = pickle.load(f)[0]
with open('2022_dates_split.pkl', 'rb') as f:
    dates = pickle.load(f)[0]

# initialise vectorizer 
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True) 

# get sparse binary matrix marking which words appear in each document
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 30

# window passed to irulan.moving_average
a = 5000

# dates trimmed by half a window at each end to line up with the smoothed series
window_dates = dates[int(a/2):1-int(a/2)]

for i in range(iterations):

    # train the topic model with 'sport' anchored to the first topic
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i) 
    tm.fit(matrix, words = words, anchors = ['sport'], anchor_strength = 10)
    print(tm.get_topics()[0])

    # overlay the smoothed probability of the anchored topic for this seed
    plt.plot(window_dates, irulan.moving_average(tm.p_y_given_x[:, 0], a), color = 'black', alpha = 0.5)
    del tm

plt.xticks([datetime.date(year = 2022, month = 2, day = 1), datetime.date(year = 2022, month = 5, day = 1), datetime.date(year = 2022, month = 8, day = 1), datetime.date(year = 2022, month = 11, day = 1)])
plt.xlabel("Date")
plt.ylabel("Probability");

Train 30 topic models with anchors for each method of inference and calculate Pearson correlation. 

In [None]:
# read in text and dates from ABC1 (closing the files once they are read)
with open('all_text_split.pkl', 'rb') as f:
    text = pickle.load(f)[0]
with open('all_dates_split.pkl', 'rb') as f:
    dates = pickle.load(f)[0]
anchors = [['covid'],['sport'],['gardening'],['election']]

# initialise vectorizer 
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True) 

# get sparse binary matrix marking which words appear in each document
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 30

# window passed to irulan.moving_average
a = 5000

# subsample proportion used by method 2
p = 0.2

# correlations[0, i] is method 1 and correlations[1, i] is method 2 for seed i
correlations = np.zeros((2, iterations))

# method 1 trains on the 2022 documents only; this does not depend on the seed
text_2022 = [doc for doc, date in zip(text, dates) if date.year == 2022]

for i in range(iterations):

    # ground truth
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i) 
    tm.fit(matrix, words = words, anchors = anchors, anchor_strength = 10)

    # method 1
    vec1, tm1 = irulan.tm(text_2022, num_topics = n_topics, anchors = anchors, seed = i)

    # method 2: random subsample of proportion p (drawn with replacement)
    np.random.seed(seed = i)
    ints = np.random.randint(0, len(text), int(len(text)*p))
    text_sampled = [text[idx] for idx in ints]

    vec2, tm2 = irulan.tm(text_sampled, num_topics = n_topics, anchors = anchors, seed = i)

    # make predictions from the sampled topic models
    matrix1 = vec1.transform(text)
    p1 = tm1.transform(matrix1, details = True)[0]

    matrix2 = vec2.transform(text)
    p2 = tm2.transform(matrix2, details = True)[0]

    # pearsonr returns (statistic, p-value); keep only the coefficient so it
    # can be stored in a single array cell
    correlation1 = scipy.stats.pearsonr(tm.p_y_given_x[:, 0], p1[:, 0])[0]
    correlation2 = scipy.stats.pearsonr(tm.p_y_given_x[:, 0], p2[:, 0])[0]

    # plot the first iteration
    if i == 0:
        window_dates = dates[int(a/2):-int(a/2)+1]
        plt.plot(window_dates, irulan.moving_average(tm.p_y_given_x[:, 0], a), label = 'Ground truth')
        plt.plot(window_dates, irulan.moving_average(p1[:, 0], a), label = 'Method 1')
        plt.plot(window_dates, irulan.moving_average(p2[:, 0], a), label = 'Method 2')
        plt.legend()

        # only topic 0 (the first anchor) is compared
        for anchor in anchors[0]:
            print(f'{anchor} Method 1: {correlation1}')
            print(f'{anchor} Method 2: {correlation2}')

    correlations[0, i] = correlation1
    correlations[1, i] = correlation2

# mean correlation of each method over all seeds
correlation_means = np.mean(correlations, axis = 1)

Compare different percentages to subsample with the ground truth.

In [None]:
# read in text and dates from ABC1 (closing the files once they are read)
with open('2022_text_split.pkl', 'rb') as f:
    text = pickle.load(f)[0]
with open('2022_dates_split.pkl', 'rb') as f:
    dates = pickle.load(f)[0]
anchors = [['covid'],['sport'],['gardening'],['election']]
subsampled_proportions = [0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8]

# initialise vectorizer 
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True) 

# get sparse binary matrix marking which words appear in each document
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 30
a = 5000

# correlations[i, c] is the correlation for seed i and subsampled_proportions[c]
correlations = np.zeros((iterations, len(subsampled_proportions)))

for i in range(iterations):

    # ground truth; it only depends on the seed, so train it once per seed
    # instead of once per proportion
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i) 
    tm.fit(matrix, words = words, anchors = anchors, anchor_strength = 10)

    # the proportion is a float, so use its position as the column index
    for col, p in enumerate(subsampled_proportions):

        # method 2
        np.random.seed(seed = i)

        # subsample proportion p (drawn with replacement)
        ints = np.random.randint(0, len(text), int(len(text)*p))
        text_sampled = [text[idx] for idx in ints]

        vec2, tm2 = irulan.tm(text_sampled, num_topics = n_topics, anchors = anchors, seed = i)

        # make predictions from the sampled topic model
        matrix2 = vec2.transform(text)
        p2 = tm2.transform(matrix2, details = True)[0]

        # pearsonr returns (statistic, p-value); keep only the coefficient
        correlations[i, col] = scipy.stats.pearsonr(tm.p_y_given_x[:, 0], p2[:, 0])[0]

# plot one violin per proportion (violinplot uses the columns as datasets)
plt.violinplot(correlations);
plt.xticks(range(1, len(subsampled_proportions) + 1), [f"{100*p:g}" for p in subsampled_proportions])
plt.ylabel("Correlation")
plt.xlabel("Percentage of subsample");

Generate unsupervised topic models trained on 2022 data for each channel.

In [None]:
# window passed to irulan.moving_average; set here so the cell does not rely on
# whatever value an earlier cell left in `a`
a = 5000

# read in text and dates for every channel once
with open('2022_text_split.pkl', 'rb') as f:
    all_text = pickle.load(f)
with open('2022_dates_split.pkl', 'rb') as f:
    all_dates = pickle.load(f)

for i, channel in enumerate(channel_list):

    plt.figure()

    # pick out this channel's text and dates
    text = all_text[i]
    dates = all_dates[i]

    # train topic model on this channel's text (not on a subsample left over
    # from an earlier cell)
    _, tm = irulan.tm(text)

    # print topics
    print(tm.get_topics()[:5])

    # plot the smoothed probability of the first five topics
    for j in range(5):
        plt.plot(dates[int(a/2):1-int(a/2)], irulan.moving_average(tm.p_y_given_x[:,j], a))

    plt.title(channel)
    plt.xticks([datetime.date(year = 2022, month = 2, day = 1), datetime.date(year = 2022, month = 5, day = 1), datetime.date(year = 2022, month = 8, day = 1), datetime.date(year = 2022, month = 11, day = 1)])
    plt.xlabel("Date")
    plt.ylabel("Probability")

Compare unsupervised topic models trained on each channel with Pearson similarity.

In [None]:
# initialise
iterations = 30
n_channels = len(channel_list)
similarity_matrix = np.zeros((n_channels, n_channels))

# load the text of every channel once
with open('2022_text_split.pkl', 'rb') as f:
    channel_texts = pickle.load(f)

for i in range(iterations):

    # train one topic model per channel for this seed and reuse it in every
    # pairing, rather than retraining the second model for each pair
    # NOTE(review): assumes irulan.tm gives the same model for the same text and seed
    models = [irulan.tm(channel_texts[c], seed = i) for c in range(n_channels)]

    # the words of every topic of every model, as sets for fast overlaps
    topic_words = [[{entry[0] for entry in line} for line in tm.get_topics(n_words = 100)] for _, tm in models]

    for j, (vec1, tm1) in enumerate(models):

        for k, (vec2, tm2) in enumerate(models):

            # predict values for channel 2s text using channel 1s topic model
            matrix1 = vec1.transform(channel_texts[k])
            p1 = tm1.transform(matrix1, details = True)[0]

            p2 = tm2.p_y_given_x

            # initialise the similarity score
            similarity_score = 0
            sums = 0

            # loop through topics 
            for m, t1 in enumerate(topic_words[j]):
                for n, t2 in enumerate(topic_words[k]):

                    # check if the topics share at least 5 words
                    if len(t1 & t2) >= 5:

                        # add to similarity score (no temporaries named `a`,
                        # which other cells use for the moving-average window)
                        similarity_score += scipy.stats.pearsonr(p1[:, m], p2[:, n])[0]
                        sums += 1

            similarity_score /= max([sums, 1])

            # add similarity score to the matrix
            similarity_matrix[j, k] += similarity_score

# take the mean of the scores
similarity_matrix /= iterations

# plot the scores
fig, ax = plt.subplots()
im = ax.imshow(similarity_matrix)

# to get the numbers on the heatmap
for i, channel1 in enumerate(channel_list):
    for j, channel2 in enumerate(channel_list):

        ax.text(j, i, round(similarity_matrix[i,j], 2), ha = 'center', va = 'center', color = 'w')

ax.set_xticks(np.arange(n_channels), labels = channel_list, rotation = 45)
ax.set_yticks(np.arange(n_channels), labels = channel_list)
ax.tick_params(left = False, bottom = False)

Compare anchored topic models trained on each channel with Pearson similarity.

In [None]:
# initialise
iterations = 30
anchors = [['covid'], ['ukraine'], ['election'], ['flood'], ['queen'], ['sport']]
n_channels = len(channel_list)

# similarity_matrix[j, k, m]: channel j's model vs channel k's model on anchored topic m
similarity_matrix = np.zeros((n_channels, n_channels, len(anchors)))

# load the text of every channel once
with open('2022_text_split.pkl', 'rb') as f:
    channel_texts = pickle.load(f)

for i in range(iterations):

    # train one anchored topic model per channel for this seed and reuse it in
    # every pairing, rather than retraining the second model for each pair
    # NOTE(review): assumes irulan.tm gives the same model for the same text and seed
    models = [irulan.tm(channel_texts[c], anchors = anchors, seed = i) for c in range(n_channels)]

    for j, (vec1, tm1) in enumerate(models):

        for k, (vec2, tm2) in enumerate(models):

            # predict values for channel 2s text from channel 1s topic model
            matrix1 = vec1.transform(channel_texts[k])
            p1 = tm1.transform(matrix1, details = True)[0]

            p2 = tm2.p_y_given_x

            # the anchored topics are the first len(anchors) topics of both
            # models, so they are compared index by index
            for m in range(len(anchors)):

                similarity_matrix[j, k, m] += scipy.stats.pearsonr(p1[:, m], p2[:, m])[0]

        if i == 0:
            # print out the contents of the sport topic
            print(tm1.get_topics(n_words=40)[5])


# take the mean of the scores
similarity_matrix /= iterations
average_similarity = np.mean(similarity_matrix, axis = 2)

# plot the scores averaged over topics (imshow needs the 2D average, not the
# 3D per-topic array)
fig, ax = plt.subplots()
im = ax.imshow(average_similarity)

# to get the numbers on the heatmap
for i, channel1 in enumerate(channel_list):
    for j, channel2 in enumerate(channel_list):

        ax.text(j, i, round(average_similarity[i,j], 2), ha = 'center', va = 'center', color = 'w')

ax.set_xticks(np.arange(n_channels), labels = channel_list, rotation = 45)
ax.set_yticks(np.arange(n_channels), labels = channel_list)
ax.tick_params(left = False, bottom = False)

# create multiple heatmaps - one for each topic
fig = plt.figure()
grid = AxesGrid(fig, 111,
                nrows_ncols=(3, 2),
                axes_pad=0.5,
                share_all=True,
                label_mode="L",
                cbar_location="right",
                cbar_mode="single",
                )

# plot a heatmap for each topic
for k, topic, ax in zip(range(len(anchors)), ['covid', 'ukraine', 'election', 'flood', 'queen', 'sport'], grid):

    im = ax.imshow(similarity_matrix[:, :, k], vmin = 0.4, vmax = 1)

    for i in range(n_channels):
        for j in range(n_channels):

            ax.text(j, i, round(similarity_matrix[i,j, k], 2), ha = 'center', va = 'center', color = 'w')

    ax.set_title(f"`{topic}' topic")
    ax.set_xticks(np.arange(n_channels), labels = channel_list, rotation  = 45)
    ax.set_yticks(np.arange(n_channels), labels = channel_list)

    ax.tick_params(left = False, bottom = False)

# every panel shares the same colour limits, so one colourbar drawn once is enough
grid.cbar_axes[0].colorbar(im)

# NOTE(review): toggle_label may be deprecated in recent Matplotlib releases - confirm
for cax in grid.cbar_axes:
    cax.toggle_label(True)

Create a hierarchical topic model to combine sport topics into one larger overarching sport topic. 

In [None]:
# text of every channel (closing the file once it is read)
with open('2022_text_split.pkl', 'rb') as f:
    text = pickle.load(f)

# terms that mark a higher-level topic as sport-related
sport_terms = ['sport', 'ball', 'game', 'player']

for ch, channel_text in zip(channel_list, text):

    vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True) 

    # get sparse binary matrix marking which words appear in each document
    matrix = vectorizer.fit_transform(channel_text)

    # get the list of words
    words = sorted(vectorizer.vocabulary_.keys())

    # train the CorEx topic model
    tm1 = ct.Corex(n_hidden = 200, words = words, seed = 0) 
    tm1.fit(matrix, words = words)

    # train the second topic model on the first topic model's topics
    tm2 = ct.Corex(n_hidden = 20, words = words, seed = 0)
    tm2.fit(scipy.sparse.csr_matrix(tm1.labels))

    # get the content of the first topic model's topics
    tm1_topics = tm1.get_topics()

    # say which channel the following topics belong to
    print(ch)

    for topic in tm2.get_topics():

        # the words of each first-level topic grouped under this higher-level
        # topic; the first element of each entry is used as an index into tm1_topics
        topic_list_sep = [[entry[0] for entry in tm1_topics[sub_topic[0]]] for sub_topic in topic]

        # the same words as one flat list
        topic_list = [word for group in topic_list_sep for word in group]

        # check if sport-related terms are in any of the topics - if so, this is a larger sport topic and we will print its contents
        if any(term in topic_list for term in sport_terms):

            print(topic_list_sep)

Plot media attention for supervised topic models.

In [None]:
# initialise
iterations = 30

# window passed to irulan.moving_average
a = 5000
anchors = [['covid'], ['ukraine'], ['election'], ['flood'], ['queen'], ['sport']]

# one mean p(y|x) array and one date list per channel; filled in below because
# every channel has its own number of documents (a list built with [array]*6
# would hold six references to the same array)
mean_pyx = [None]*len(channel_list)
channel_dates = [None]*len(channel_list)

# load in the text and dates of every channel once
with open('2022_all_clean_split.pkl', 'rb') as f:
    all_text = pickle.load(f)
with open('2022_all_clean_dates_split.pkl', 'rb') as f:
    all_dates = pickle.load(f)

for j, channel in enumerate(channel_list):

    # pick out this channel's text and dates
    text = all_text[j]
    dates = all_dates[j]
    channel_dates[j] = dates

    # running sum of p(y|x) over the seeds
    pyx_sum = 0

    for i in range(iterations):

        # make a topic model for each iteration
        vec, tm = irulan.tm(text, anchors = anchors, seed = i)

        pyx_sum = pyx_sum + tm.p_y_given_x

    # take the mean of the p(y|x) terms
    mean_pyx[j] = pyx_sum / iterations

# plot for each topic
for t, topic in enumerate(anchors):

    plt.figure()

    for j, channel in enumerate(channel_list):

        # each channel is plotted against its own dates, smoothed with window a
        plt.plot(channel_dates[j][int(a/2):1-int(a/2)], irulan.moving_average(mean_pyx[j][:, t], a), label = channel)

    # topic is a one-word list, so show the word rather than the list
    plt.title(f"{topic[0]} topic")
    plt.xlabel("Date")
    plt.ylabel("Probability")
    plt.xticks([datetime.date(year = 2022, month = 2, day = 1), datetime.date(year = 2022, month = 5, day = 1), datetime.date(year = 2022, month = 8, day = 1), datetime.date(year = 2022, month = 11, day = 1)])
    plt.legend(loc = 'upper left', bbox_to_anchor = (1,1));

Calculate the coverage bias for some topics using word counts.

In [None]:
word_list = ['covid', 'ukraine', 'election', 'flood', 'queen', 'sport']

# load in the text of every channel once
with open('all_clean_split.pkl', 'rb') as f:
    all_text = pickle.load(f)

for i, channel in enumerate(channel_list):

    # pick out this channel's documents
    text = all_text[i]

    # get all counts of the text (fit on the whole list of documents, not on a
    # single document string)
    vecs = CountVectorizer()
    counts = vecs.fit_transform(text)


    for word in word_list:

        # find the counts of the words in the word list and print; a word that
        # never occurs on this channel counts as zero instead of raising KeyError
        column = vecs.vocabulary_.get(word)
        total = 0 if column is None else counts[:, column].sum()

        # average number of occurrences per document
        s = total/len(text)
        print(f'{channel}, {word}: {s}')

Calculate the coverage bias for some topics using a topic model.

In [None]:
# similar to getting media attention

# initialise
iterations = 30
a = 5000

# proportion of documents each topic model is trained on
p = 0.1
anchors = [['covid'], ['ukraine'], ['election'], ['flood'], ['queen'], ['sport']]
bias_matrix = np.zeros((6, 6))

# one mean p(y|x) array per channel; filled in below because every channel has
# its own number of documents (a list built with [array]*6 would hold six
# references to the same array)
mean_pyx = [None]*len(channel_list)

# load in the text and dates of every channel once
with open('all_text_split.pkl', 'rb') as f:
    all_text = pickle.load(f)
with open('all_dates_split.pkl', 'rb') as f:
    all_dates = pickle.load(f)

for j, channel in enumerate(channel_list):

        # pick out this channel's text and dates
        text = all_text[j]
        dates = all_dates[j]

        # running sum of the predicted p(y|x) over the seeds
        pyx_sum = 0

        for i in range(iterations):

            # subsample text (drawn with replacement)
            np.random.seed(i)
            ints = np.random.randint(0, len(text), int(len(text)*p))
            subsampled_text = [text[idx] for idx in ints]

            # train topic model with anchors on the subsample
            # NOTE(review): the original trained on the full text although it
            # built a subsample first - confirm the subsample is what was intended
            vec, tm = irulan.tm(subsampled_text, anchors = anchors, seed = i)

            # make predictions for the full text from the sampled topic model
            matrix = vec.transform(text)
            pyx = tm.transform(matrix, details = True)[0]

            # add to the mean p(y|x)
            pyx_sum = pyx_sum + pyx

        # take the mean of each channel's p(y|x)
        mean_pyx[j] = pyx_sum / iterations

        for t, topic in enumerate(anchors):

            # coverage bias: mean probability of the topic over all documents
            bias = np.mean(mean_pyx[j][:, t])
            bias_matrix[j, t] = bias

            print(f'Coverage bias for {topic[0]} topic on {channel}:', bias)

Calculate political coverage bias using word counts. 

In [None]:
# one word list per party; the first word is used as the party's name in the output
word_lists = [['liberal', 'scott', 'morrison', 'malcolm', 'turnbull'], ['labor', 'anthony', 'albanese', 'bill', 'shorten']]

# load in the text of every channel once
with open('all_text_split.pkl', 'rb') as f:
    all_text = pickle.load(f)

for i, channel in enumerate(channel_list):

    # pick out this channel's documents
    text = all_text[i]

    # get all counts of the text (fit on the whole list of documents, not on a
    # single document string)
    vecs = CountVectorizer()
    counts = vecs.fit_transform(text)


    for party in word_lists:

        s = 0

        for word in party:

            # add up the counts of the party's words; a word that never occurs
            # on this channel contributes zero instead of raising KeyError
            column = vecs.vocabulary_.get(word)
            if column is not None:
                s += counts[:, column].sum()

        # report per party (not per last word): average occurrences per document
        print(f'{channel}, {party[0]}: {s/len(text)}')

Calculate political coverage bias using a topic model.

In [None]:
# similar to getting media attention

# initialise
iterations = 30
a = 5000

# proportion of documents each topic model is trained on
p = 0.1
anchors = [['liberal', 'scott', 'morrison', 'malcolm', 'turnbull'], ['labor', 'anthony', 'albanese', 'bill', 'shorten']]

# only the first len(anchors) columns are filled in
bias_matrix = np.zeros((6, 6))
mod = np.zeros(6)
ros = np.zeros(6)

# one mean p(y|x) array per channel; filled in below because every channel has
# its own number of documents (a list built with [array]*6 would hold six
# references to the same array)
mean_pyx = [None]*len(channel_list)

# load in the text and dates of every channel once
with open('all_text_split.pkl', 'rb') as f:
    all_text = pickle.load(f)
with open('all_dates_split.pkl', 'rb') as f:
    all_dates = pickle.load(f)

for j, channel in enumerate(channel_list):

        # pick out this channel's text and dates
        text = all_text[j]
        dates = all_dates[j]

        # running sum of the predicted p(y|x) over the seeds
        pyx_sum = 0

        for i in range(iterations):

            # subsample text (drawn with replacement)
            np.random.seed(i)
            ints = np.random.randint(0, len(text), int(len(text)*p))
            subsampled_text = [text[idx] for idx in ints]

            # train topic model with anchors on the subsample
            # NOTE(review): the original trained on the full text although it
            # built a subsample first - confirm the subsample is what was intended
            vec, tm = irulan.tm(subsampled_text, anchors = anchors, seed = i)

            # make predictions for the full text from the sampled topic model
            matrix = vec.transform(text)
            pyx = tm.transform(matrix, details = True)[0]

            # add to the mean p(y|x)
            pyx_sum = pyx_sum + pyx

        # take the mean of each channel's p(y|x)
        mean_pyx[j] = pyx_sum / iterations

        for t, party in enumerate(anchors):

            # coverage bias: mean probability of the party's topic over all documents
            bias = np.mean(mean_pyx[j][:, t])
            bias_matrix[j, t] = bias

            print(f'Coverage bias for {party[0]} topic on {channel}:', bias)

        # calculate MOD: mean difference between the first and second party's topic
        mod[j] = np.mean(mean_pyx[j][:, 0] - mean_pyx[j][:, 1])
        print(f'MOD for {channel}:', mod[j])

        # calculate ROS: ratio of the summed probabilities of the two topics
        ros[j] = np.sum(mean_pyx[j][:, 0])/np.sum(mean_pyx[j][:, 1])
        print(f'ROS for {channel}:', ros[j])

# save the mean of the p(y|x) terms
with open('political_probabilities.pkl', 'wb') as f:
    pickle.dump(mean_pyx, f)