Import relevant packages

In [1]:
import sys # to get "unresolved" packages
sys.path.append("/Users/a1765262/opt/anaconda3/lib/python3.9/site-packages")

import pandas as pd # for data frames
import os # for working directories
import numpy as np # for numbers
import matplotlib.pyplot as plt # for plotting
from corextopic import corextopic as ct # for topic model
import string # for preprocessing
import sklearn.feature_extraction.text # for vectoriser
from nltk.sentiment import SentimentIntensityAnalyzer # for sentiment
import pickle
import scipy
from nltk import FreqDist
import shifterator as sh
import datetime
# from readability import readability

# Move the working directory up one level so that the relative "./Data" and
# "./Other" paths used in this notebook resolve from there.
# NOTE(review): the path is relative, so every re-run of this cell moves up
# one further level; run it once per kernel session.
os.chdir("..")

# Make the local "./Other" directory importable before importing irulan
# (irulan.doc_sentiment is used by the sentiment cells below).
sys.path.append("./Other")
import irulan

# Channels analysed; the per-channel data below is indexed by position in this list.
channel_list = ["ABC1", "Ch7", "Ch9", "Ch10", "SBS", "ABC24"]

Load in text data from 2022. This is split up by channel.

In [2]:
# Load the cleaned 2022 text; later cells index it by channel position
# (text_split[i] goes with channel_list[i]).
# The context manager closes the file handle once loading is finished.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open("./Data/2022_all_clean_split.pkl", "rb") as pkl_file:
    text_split = pickle.load(pkl_file)

### Word analysis

Find the total number of words, and the number of unique words

In [None]:
# Reload the per-channel text so this cell can be run on its own; the context
# manager makes sure the file handle is closed after loading.
with open("./Data/2022_all_clean_split.pkl", "rb") as pkl_file:
    text_split = pickle.load(pkl_file)

for i, channel in enumerate(channel_list):

    # Pool every document of the channel into one flat list of
    # whitespace-separated tokens.
    text = " ".join(text_split[i]).split()

    print(f"Total number of words ({channel}):", len(text))
    print(f"Number of unique words ({channel}):", len(set(text)))

Get the sentiment of each channel

In [None]:
# Load both sentiment lexicons, closing each file as soon as it has been read.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open('./Data/NRC_lexicon.pkl', 'rb') as pkl_file:
    nrc_lexicon = pickle.load(pkl_file)

with open('./Data/mittens_lexicon.pkl', 'rb') as pkl_file:
    mittens_lexicon = pickle.load(pkl_file)

for i, channel in enumerate(channel_list):

    text = text_split[i]

    # Mean document-level sentiment of the channel under each lexicon,
    # reported in the same order as before (Mittens first, then NRC).
    for lexicon_name, lexicon in (("Mittens", mittens_lexicon), ("NRC", nrc_lexicon)):

        s = np.mean(np.array([irulan.doc_sentiment(doc, lexicon) for doc in text]))
        print(lexicon_name + " sentiment of " + str(channel) + ": " + str(s))

Find the terms with the greatest difference in Tsallis entropy on each channel. Use the terms with the greatest difference in entropy to create word clouds. While we do this, we can also find the Shannon entropy of text from each channel.

In [None]:
# function to get the colour of each word

def word_col_labmt(word, lexicon, font_size, position, orientation, random_state = None, **kwargs):
    """Pick a colour name for `word` from its score in `lexicon`.

    Returns 'green' when the score is above 0.2, 'red' when it is below -0.2,
    and 'black' otherwise (including when the word is not in the lexicon).

    `font_size`, `position`, `orientation`, `random_state` and `**kwargs` are
    accepted but not used by this function.
    NOTE(review): the signature looks like a word-cloud colour callback with
    an extra `lexicon` argument; confirm against the caller.
    """
    # Words without a lexicon entry keep the neutral colour
    if word not in lexicon.keys():
        return 'black'

    score = lexicon[word]

    if score > 0.2:
        return 'green'

    if score < -0.2:
        return 'red'

    return 'black'

In [None]:
# Load the per-channel text, closing the file once it has been read.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open("./Data/2022_all_clean_split.pkl", "rb") as pkl_file:
    all_text = pickle.load(pkl_file)

# Pool the documents of every channel in channel_list into one string. The
# channel count comes from channel_list instead of a hard-coded range.
all_text_for_counts = " ".join(
    doc for channel_docs in all_text[:len(channel_list)] for doc in channel_docs
)

# Reference word frequencies over all channels combined
all_counts = FreqDist(all_text_for_counts.split())

for i, channel in enumerate(channel_list):

    text = " ".join(all_text[i])

    # Word frequencies for this channel only
    channel_counts = FreqDist(text.split())

    # calculate Tsallis entropy shifts (all channels vs. this channel)
    entropy_shift = sh.EntropyShift(type2freq_1=all_counts,
                                    type2freq_2=channel_counts,
                                    alpha = 0.3)

    # Not used yet; kept for the word-cloud code still to be added (see TODO)
    shft_scores = entropy_shift.get_shift_scores()

    counts_array = np.array(list(channel_counts.values()))

    # Shannon entropy of the channel's word distribution; scipy normalises the
    # raw counts to probabilities. Labelled so the six values can be told apart.
    print(f"Shannon entropy ({channel}):", scipy.stats.entropy(counts_array))

# TODO include word cloud code here

Get plots for periodicity

In [None]:
# initialise
daily_dates = [datetime.date(2022, 1, 1) + datetime.timedelta(days = i) for i in range(365)]
# One row per channel, one column per day of 2022
daily_counts = np.zeros((len(channel_list), len(daily_dates)))
colours = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'tab:pink']

# Load the per-channel dates once, not on every loop iteration, and close the
# file after reading.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open("./Data/all_dates_channel_split.pkl", "rb") as pkl_file:
    all_dates = pickle.load(pkl_file)

# Day offsets are measured from the start of 2022
year_start = datetime.datetime(2022, 1, 1)

for i, ch in enumerate(channel_list):

    # get text and dates
    text = text_split[i]
    dates = all_dates[i]

    # filter to just 2022
    text = [t for d, t in zip(dates, text) if d.year == 2022]
    dates = [d for d in dates if d.year == 2022]

    # count the number of words on each day
    for d, t in zip(dates, text):

        # zero-based day of the year, used as the column index
        k = (d - year_start).days

        daily_counts[i, k] += len(t.split())

    # plot
    plt.plot(daily_dates, daily_counts[i], label = ch, color = colours[i])

# Figure-level decorations only need to be applied once, after all lines are drawn
plt.title('Daily word counts in 2022')
plt.xlabel('Date')
plt.ylabel('Word count')
plt.legend(bbox_to_anchor = (1,1))

### Program analysis

Find the number of unique programs, and the number of genres in each channel

In [None]:
for channel in channel_list:

    # NOTE(review): the other cells read "./Data/Yearly Data/{channel}_2022.csv";
    # confirm whether this path should match them.
    data = pd.read_csv(f'./Data/{channel}_2022.csv')

    print(f"Number of individual programs ({channel}):", len(set(data["program"])))
    print(f"Number of genres ({channel}):", len(set(data["genre"])))

    # Row counts per genre for this channel. This must group the frame loaded
    # above: `text` is not this channel's DataFrame at this point.
    print(data.groupby("genre").count())

Find the most common programs

In [None]:
for channel in channel_list:

    # Program column of this channel's 2022 data
    programs = pd.read_csv(f"./Data/Yearly Data/{channel}_2022.csv")["program"]

    # Five most frequent programs (value_counts sorts in descending order)
    top_programs = programs.value_counts().head(5)

    print(channel, top_programs)

Find the most common genres

In [None]:
for channel in channel_list:

    channel_rows = pd.read_csv(f"./Data/Yearly Data/{channel}_2022.csv")

    # Five most frequent genres, as a share of all rows in the file
    genre_counts = channel_rows["genre"].value_counts()
    top_genre_share = genre_counts.iloc[:5] / len(channel_rows)

    print(channel, top_genre_share)

Calculate the sentiment of each genre

In [None]:
# Group every text line from every channel by its genre
genre_dict_to_text = {}

for ch in channel_list:

    text = pd.read_csv("./Data/Yearly Data/" + ch + "_2022.csv")

    for genre, line in zip(text["genre"], text["text"]):

        genre_dict_to_text.setdefault(genre, []).append(line)

# Translation table that deletes every ASCII punctuation character in one pass
# (string.punctuation already contains the apostrophe).
punctuation_table = str.maketrans("", "", string.punctuation)

for genre, text_list in genre_dict_to_text.items():

    mittens_sentiment = 0
    nrc_sentiment = 0

    for doc in text_list:

        # Lower-case and strip punctuation; str() is applied first because the
        # column values are not guaranteed to be strings.
        doc = str(doc).lower().translate(punctuation_table)

        mittens_sentiment += irulan.doc_sentiment(doc, mittens_lexicon)
        nrc_sentiment += irulan.doc_sentiment(doc, nrc_lexicon)

    # Number of documents in the genre; the max() guards against dividing by zero
    k = len(text_list)

    mittens_sentiment /= max(1, k)
    nrc_sentiment /= max(1, k)

    print(f'{genre} mittens sentiment: {mittens_sentiment}')
    print(f'{genre} NRC sentiment: {nrc_sentiment}')