Import relevant packages

In [6]:
import sys # to get "unresolved" packages
sys.path.append("/Users/a1765262/opt/anaconda3/lib/python3.9/site-packages")

import pandas as pd # for data frames
import os # for working directories
import numpy as np # for numbers
import matplotlib.pyplot as plt # for plotting
from corextopic import corextopic as ct # for topic model
import string # for preprocessing
import sklearn.feature_extraction.text # for vectoriser
from nltk.sentiment import SentimentIntensityAnalyzer # for sentiment
import pickle
import scipy
from nltk import FreqDist
import shifterator as sh
import scipy.stats as st
import datetime
# from readability import readability

import irulan

# Channels analysed in this notebook.
# NOTE(review): the per-channel cell uses each name's position to index the lists stored in
# the "2022_all_*_split.pkl" files (index 0 is loaded as ABC1 elsewhere in this notebook), so
# this order presumably has to match the order used when those files were written -- confirm.
channel_list = ["ABC1", "Ch7", "Ch9", "Ch10", "SBS", "ABC24"]

Test a topic model anchored on 'sport' for different document lengths. Plot the number of words in each document for 5-minute and program lengths. 

In [None]:
# document lengths (in seconds) that we will look at (9999 seconds represents the program split)
seconds = [10, 60, 120, 300, 600, 1200, 1800, 9999]

# document lengths that also get a words-per-document histogram, with their titles:
# 5 minutes is 300 seconds (the original check used 60, i.e. 1-minute documents)
histogram_titles = {
    300: "The number of words contained in each 5-minute document",
    9999: "The number of words contained in each program",
}

abc1_2022 = pd.read_csv("ABC1_2022.csv")

for s in seconds:

    # clean text so that documents are s seconds long
    text = irulan.clean_split_mins(abc1_2022, s)

    # train topic model anchored on 'sport'
    _, tm = irulan.tm(text, anchors = ['sport'], num_topics = 40)

    # show the first topic (presumably the 'sport'-anchored one -- confirm against irulan.tm)
    print(tm.get_topics()[0])

    if s in histogram_titles:

        # start a new figure so the two histograms are not drawn on top of each other
        plt.figure()
        plt.hist([len(doc.split()) for doc in text], bins = 50)
        plt.title(histogram_titles[s])
        plt.xlabel("Number of words")
        plt.ylabel("Frequency")

Find the total correlation of topic models as the number of topics is varied from 10 to 100 in steps of 10, training 30 models for each setting.

In [None]:
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project
with open("2022_all_clean.pkl", "rb") as f:
    text = pickle.load(f)

# numbers of topics to try, and how many differently-seeded models to train for each
n_topics = np.array(range(10, 110, 10))
iterations = 30

# tcs[k, i] is the total correlation of the model with n_topics[k] topics and seed i
tcs = np.empty((len(n_topics), iterations))

# initialise vectorizer
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True)

# get sparse document-word matrix (binary = True: 1 if the word appears in the document, else 0)
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

for k, n in enumerate(n_topics):
    for i in range(iterations):

        # train the topic model
        tm = ct.Corex(n_hidden = n, words = words, max_iter = 400, seed = i)
        tm.fit(matrix, words = words, anchor_strength = 10)

        tcs[k, i] = tm.tc

        # save the topic model for later (pickle.dump needs the object as well as the file)
        with open(f'./{n}_topics_{i}.pkl', 'wb') as f:
            pickle.dump(tm, f)

        print(f'Value {i} for {n} topics is: {tm.tc}')
        del tm

# make violin plot; violinplot draws one violin per COLUMN of a 2-D array, so transpose
# to get one violin per number of topics (10 violins at 10 positions)
plt.violinplot(tcs.T, positions = n_topics, widths = 5)
plt.title("The total correlation of topic models on 2022 data adjusting the number of topics")
plt.ylabel("Total correlation")
plt.xlabel("Number of topics");

Pearson and Spearman similarities for topic models with an adjacent number of topics.

In [29]:
def topic_similarity(tm1, tm2, similarity_type = 'pearson'):
    """Average correlation between the matching topics of two topic models.

    Two topics (one from each model) are treated as a match when their top-100
    word lists, as returned by `get_topics(n_words = 100)`, have at least five
    words in common. For every match, the corresponding columns of the two
    models' `p_y_given_x` arrays are correlated, and the mean of those
    correlations is returned.

    Parameters
    ----------
    tm1, tm2 : objects exposing `p_y_given_x` (2-D, indexed `[:, topic]`) and
        `get_topics(n_words = ...)`, whose entries have the word at position 0.
    similarity_type : 'pearson' or 'spearman'
        Which correlation to use; Spearman is computed with `nan_policy='omit'`.

    Returns
    -------
    The mean correlation over all matched topic pairs, or 0 when no pair matches.

    Raises
    ------
    AssertionError
        If `similarity_type` is not one of the two supported values.
    """

    assert similarity_type in ['pearson', 'spearman'], "'similarity_type' should be 'pearson' or 'spearman'."

    doc_probs_1 = tm1.p_y_given_x
    doc_probs_2 = tm2.p_y_given_x

    # keep only the word (first element) of every entry of every topic
    top_words_1 = [[entry[0] for entry in topic] for topic in tm1.get_topics(n_words = 100)]
    top_words_2 = [[entry[0] for entry in topic] for topic in tm2.get_topics(n_words = 100)]

    running_total = 0
    matched_pairs = 0

    for row, words_1 in enumerate(top_words_1):
        vocab_1 = set(words_1)

        for col, words_2 in enumerate(top_words_2):

            # fewer than five shared words: not the same topic, skip the pair
            if len(vocab_1 & set(words_2)) < 5:
                continue

            probs_1 = doc_probs_1[:, row]
            probs_2 = doc_probs_2[:, col]

            if similarity_type == 'pearson':
                score = scipy.stats.pearsonr(probs_1, probs_2)[0]
            else:
                score = scipy.stats.spearmanr(probs_1, probs_2, nan_policy='omit')[0]

            running_total += score
            matched_pairs += 1

    # guard against division by zero when nothing matched
    return running_total / max(matched_pairs, 1)
    

In [None]:
# assume that we already have 30 topic models for each number of topics
# (saved as './{number of topics}_topics_{seed}.pkl')

topic_counts = list(range(10, 110, 10))
n_models = len(topic_counts)
iterations = 30

# adjacent pairs of matrix positions: (0, 1), (1, 2), ..., (8, 9), i.e. 10 vs 20, ..., 90 vs 100 topics.
# A list (not a bare zip, which is exhausted after one pass) so it can be reused for every seed.
adjacent_pairs = list(zip(range(n_models - 1), range(1, n_models)))

# sums of similarities over all seeds; averaged below
pearson_matrix = np.zeros((n_models, n_models))
spearman_matrix = np.zeros((n_models, n_models))

for i in range(iterations):

    for r, c in adjacent_pairs:

        # NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project
        with open(f'./{topic_counts[r]}_topics_{i}.pkl', 'rb') as f:
            tm1 = pickle.load(f)
        with open(f'./{topic_counts[c]}_topics_{i}.pkl', 'rb') as f:
            tm2 = pickle.load(f)

        # matrices are indexed by position (0..9), not by the number of topics (10..100)
        pearson = topic_similarity(tm1, tm2, 'pearson')
        pearson_matrix[r, c] += pearson
        pearson_matrix[c, r] += pearson

        spearman = topic_similarity(tm1, tm2, 'spearman')
        spearman_matrix[r, c] += spearman
        spearman_matrix[c, r] += spearman

pearson_matrix /= iterations
spearman_matrix /= iterations

# a model compared with itself has similarity 1
np.fill_diagonal(pearson_matrix, 1)
np.fill_diagonal(spearman_matrix, 1)

# only the diagonal and adjacent numbers of topics were compared; blank out everything else
positions = np.arange(n_models)
not_compared = np.abs(positions[:, None] - positions[None, :]) > 1
pearson_matrix[not_compared] = np.nan
spearman_matrix[not_compared] = np.nan

fig, ax = plt.subplots()
im = ax.imshow(spearman_matrix, vmin = 0.1)

# annotate the cells that hold a value (the result is not assigned to `text`,
# so the corpus variable of that name is left untouched)
for r in range(n_models):
    for c in range(n_models):

        if not np.isnan(spearman_matrix[r, c]):
            ax.text(c, r, round(spearman_matrix[r, c], 2), ha = 'center', va = 'center', color = 'w')

ax.set_xticks(positions, labels = topic_counts)
ax.set_yticks(positions, labels = topic_counts)
ax.set_xlabel("Number of topics")
ax.set_ylabel("Number of topics");

Generate unsupervised topic models trained on 2022 data from ABC1.

In [None]:
# read in text and dates from ABC1 (index 0); `with` closes the files once they are read
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project
with open("2022_all_clean_split.pkl", "rb") as f:
    text = pickle.load(f)[0]
with open("2022_all_dates_split.pkl", "rb") as f:
    dates = pickle.load(f)[0]

# initialise vectorizer
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True)

# get sparse document-word matrix (binary = True: 1 if the word appears in the document, else 0)
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 5
a = 5000  # moving-average window passed to irulan.moving_average

# dates trimmed by half a window at each end (same slice for every model, so compute it once)
window_dates = dates[int(a/2):1-int(a/2)]

for i in range(iterations):

    # train the topic model
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i)
    tm.fit(matrix, words = words)
    print(tm.get_topics()[:5])

    # sport will be topic 0 for all topic models except the second one, where it is topic 4
    sport_topic = 4 if i == 1 else 0

    plt.plot(window_dates, irulan.moving_average(tm.p_y_given_x[:, sport_topic], a), alpha = 0.8, label = i+1)

    del tm

# make the plot nice
plt.xlabel("Date")
plt.ylabel("Probability")
plt.xticks([datetime.date(year = 2022, month = 2, day = 1), datetime.date(year = 2022, month = 5, day = 1), datetime.date(year = 2022, month = 8, day = 1), datetime.date(year = 2022, month = 11, day = 1)])
plt.legend(bbox_to_anchor=(1, 1));

Train 30 topic models on ABC1 text with the anchor 'sport'.

In [None]:
# read in text and dates from ABC1 (index 0); `with` closes the files once they are read
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project
with open('2022_all_clean_split.pkl', 'rb') as f:
    text = pickle.load(f)[0]
with open('2022_all_dates_split.pkl', 'rb') as f:
    dates = pickle.load(f)[0]

# initialise vectorizer
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True)

# get sparse document-word matrix (binary = True: 1 if the word appears in the document, else 0)
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 30
a = 5000  # moving-average window passed to irulan.moving_average

# dates trimmed by half a window at each end (same slice for every model, so compute it once)
window_dates = dates[int(a/2):1-int(a/2)]

for i in range(iterations):

    # train the topic model with topic 0 anchored on 'sport'
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i)
    tm.fit(matrix, words = words, anchors = ['sport'], anchor_strength = 10)
    print(tm.get_topics()[0])

    # overlay the smoothed probability of the anchored topic for every seed
    plt.plot(window_dates, irulan.moving_average(tm.p_y_given_x[:, 0], a), color = 'black', alpha = 0.5)
    del tm

plt.xticks([datetime.date(year = 2022, month = 2, day = 1), datetime.date(year = 2022, month = 5, day = 1), datetime.date(year = 2022, month = 8, day = 1), datetime.date(year = 2022, month = 11, day = 1)])
plt.xlabel("Date")
plt.ylabel("Probability");

Train 30 topic models with anchors for each method of inference and calculate the Pearson correlation between each method's predictions and the ground-truth model.

In [None]:
# read in text and dates from ABC1 (index 0); `with` closes the files once they are read
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project
with open('2022_all_clean_split.pkl', 'rb') as f:
    text = pickle.load(f)[0]
with open('2022_all_dates_split.pkl', 'rb') as f:
    dates = pickle.load(f)[0]
anchors = [['covid'],['sport'],['gardening'],['election']]

# initialise vectorizer
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True)

# get sparse document-word matrix (binary = True: 1 if the word appears in the document, else 0)
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 30
a = 5000  # moving-average window passed to irulan.moving_average
p = 0.2   # proportion of documents subsampled by method 2

# row 0: method 1, row 1: method 2; one column per seed
correlations = np.zeros((2, iterations))

# dates trimmed by half a window at each end, for the moving-average plots
window_dates = dates[int(a/2):-int(a/2)+1]

# method 1 training set: documents dated 2022 (does not depend on the seed, so build it once)
text_2022 = [doc for doc, date in zip(text, dates) if date.year == 2022]

for i in range(iterations):

    # ground truth
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i)
    tm.fit(matrix, words = words, anchors = anchors, anchor_strength = 10)

    # method 1
    vec1, tm1 = irulan.tm(text_2022, num_topics = n_topics, anchors = anchors, seed = i)

    # method 2: random subsample (drawn with replacement) of a proportion p of the documents
    np.random.seed(seed = i)
    ints = np.random.randint(0, len(text), int(len(text)*p))
    text_sampled = [text[idx] for idx in ints]

    vec2, tm2 = irulan.tm(text_sampled, num_topics = n_topics, anchors = anchors, seed = i)

    # make predictions for ALL documents from the sampled topic models,
    # using each model's own vectorizer
    p1 = tm1.transform(vec1.transform(text), details = True)[0]
    p2 = tm2.transform(vec2.transform(text), details = True)[0]

    # pearsonr returns (statistic, p-value); keep only the correlation coefficient.
    # Column 0 is compared, presumably the topic of the first anchor -- confirm.
    correlation1 = scipy.stats.pearsonr(tm.p_y_given_x[:, 0], p1[:, 0])[0]
    correlation2 = scipy.stats.pearsonr(tm.p_y_given_x[:, 0], p2[:, 0])[0]

    if i == 0:
        plt.plot(window_dates, irulan.moving_average(tm.p_y_given_x[:, 0], a), label = 'Ground truth')
        plt.plot(window_dates, irulan.moving_average(p1[:, 0], a), label = 'Method 1')
        plt.plot(window_dates, irulan.moving_average(p2[:, 0], a), label = 'Method 2')

        # the lines are labelled, so show the legend
        plt.xlabel("Date")
        plt.ylabel("Probability")
        plt.legend()

        for anchor in anchors[0]:
            print(f'{anchor} Method 1: {correlation1}')
            print(f'{anchor} Method 2: {correlation2}')

    correlations[0, i] = correlation1
    correlations[1, i] = correlation2

# mean correlation per method, computed once every seed has been filled in
correlation_means = np.mean(correlations, axis = 1)

Compare different percentages to subsample with the ground truth.

In [None]:
# read in text and dates from ABC1 (index 0); `with` closes the files once they are read
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project
with open('2022_all_clean_split.pkl', 'rb') as f:
    text = pickle.load(f)[0]
with open('2022_all_dates_split.pkl', 'rb') as f:
    dates = pickle.load(f)[0]
anchors = [['covid'],['sport'],['gardening'],['election']]
subsampled_proportions = [0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8]

# initialise vectorizer
vectorizer = sklearn.feature_extraction.text.CountVectorizer(max_features = 10000, binary = True)

# get sparse document-word matrix (binary = True: 1 if the word appears in the document, else 0)
matrix = vectorizer.fit_transform(text)

# get the list of words
words = sorted(vectorizer.vocabulary_.keys())

# make the topic model
n_topics = 40
iterations = 30
a = 5000

# correlations[i, j]: seed i, proportion subsampled_proportions[j]
correlations = np.zeros((iterations, len(subsampled_proportions)))

for i in range(iterations):

    # ground truth: depends only on the seed, so train it once per seed
    # rather than once per proportion
    tm = ct.Corex(n_hidden = n_topics, words = words, max_iter = 400, seed = i)
    tm.fit(matrix, words = words, anchors = anchors, anchor_strength = 10)
    truth = tm.p_y_given_x[:, 0]
    del tm

    for j, p in enumerate(subsampled_proportions):

        # method 2
        np.random.seed(seed = i)

        # subsample proportion p (drawn with replacement)
        ints = np.random.randint(0, len(text), int(len(text)*p))
        text_sampled = [text[idx] for idx in ints]

        vec2, tm2 = irulan.tm(text_sampled, num_topics = n_topics, anchors = anchors, seed = i)

        # make predictions for all documents from the sampled topic model
        p2 = tm2.transform(vec2.transform(text), details = True)[0]

        # index by the proportion's position (a float cannot index an array) and keep only
        # the correlation coefficient from pearsonr's (statistic, p-value) result
        correlations[i, j] = scipy.stats.pearsonr(truth, p2[:, 0])[0]

# one violin per column, i.e. per subsampled proportion
plt.violinplot(correlations);
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9], ["0.1", "1", "2", "5", "10", "20", "40", "60", "80"])
plt.ylabel("Correlation")
plt.xlabel("Percentage of subsample");

Generate unsupervised topic models trained on 2022 data for each channel.

In [None]:
# read in text and dates for every channel once, instead of re-reading both files per channel
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced by this project
with open('2022_all_clean_split.pkl', 'rb') as f:
    all_text = pickle.load(f)
with open('2022_all_dates_split.pkl', 'rb') as f:
    all_dates = pickle.load(f)

# moving-average window; defined here so the cell does not rely on a value left over
# from an earlier cell (5000 is the window used for the other plots in this notebook)
a = 5000

# enumerate gives the channel's position, which indexes the per-channel lists
for i, channel in enumerate(channel_list):

    text = all_text[i]
    dates = all_dates[i]

    # train topic model on this channel's own text
    _, channel_tm = irulan.tm(text, num_topics = 40)

    # print topics of the model that was just trained
    print(channel_tm.get_topics()[:5])

    plt.figure()

    # dates trimmed by half a window at each end
    window_dates = dates[int(a/2):1-int(a/2)]

    # plot the first five topics, labelled so that the legend has entries
    for j in range(5):
        plt.plot(window_dates, irulan.moving_average(channel_tm.p_y_given_x[:, j], a), label = f"Topic {j}")

    plt.title(channel)
    plt.xticks([datetime.date(year = 2022, month = 2, day = 1), datetime.date(year = 2022, month = 5, day = 1), datetime.date(year = 2022, month = 8, day = 1), datetime.date(year = 2022, month = 11, day = 1)])
    plt.xlabel("Date")
    plt.ylabel("Probability")
    plt.legend(bbox_to_anchor = (1,1))

Compare unsupervised topic models trained on each channel with Pearson similarity.

Compare anchored topic models trained on each channel with Pearson similarity.

Create a hierarchical topic model to combine sport topics into one larger overarching sport topic. 

Plot media attention for supervised topic models.

Calculate the coverage bias for some topics.

Get political word counts. 

Calculate political coverage bias.