This Jupyter notebook describes how we obtain data from the Tveeder API and convert it into a format that is easy for us to work with.

Import relevant packages

In [4]:
#packages
# import requests
import numpy as np
import pandas as pd
import os
import json
import datetime
from dateutil.relativedelta import relativedelta
import re
import string
import pickle

# every relative path used below is resolved from the parent directory
os.chdir("..")

# vectorising channels
# Tveeder channel id for each channel, keyed by the name used for the caption text
channel_codes = {
    'ABC24': "560",
    'ABC1': "561",
    'Ch10': "1589",
    'Ch9': "1072",
    'Ch7': "1328",
    'SBS': "785",
}
channel_list = list(channel_codes)

# the same ids, keyed by the name used for the program schedules
channel_codes_2 = {
    'ABC News 24': "560",
    'ABC 1': "561",
    'Ten Digital': "1589",
    'Nine Digital': "1072",
    '7 Digital': "1328",
    'SBS One': "785",
}
channel_list_2 = list(channel_codes_2)

Firstly, we obtain a month of data. 

In [None]:
# code adapted from Benjamin Lang, Alexandra Stephenson, Tyson Rowe, and Niamh Jones

def monthly_update(month, year):
    """
    Download one month of captions and program schedules from the Tveeder API.

    For every channel in `channel_list` / `channel_list_2`:

    * captions are requested one hour at a time; the raw response is saved to
      ./raw/<channel>/<year>/<date>/<date>_<hour>.json and the caption text,
      joined into one string, to ./text/<channel>/<year>/<date>/<date>_<hour>.txt
    * the program guide is requested once per day and saved to
      ./Program schedule (raw)/<channel>/<year>/<date>.json

    Parameters
    ----------
    month : int or str
        Calendar month; it is left-padded to two digits.
    year : int or str
        Four-digit year.

    Notes
    -----
    All paths are relative to the current working directory. Hour labels come
    from datetime.fromtimestamp, i.e. they are in the machine's local time
    zone. Network and JSON errors are not caught and propagate to the caller.
    """
    # imported here because the notebook's import cell has `requests` commented out
    import requests

    month = str(month).rjust(2, '0')
    year = str(year)

    # Days on which daylight saving ends (clocks back) or starts (clocks forward).
    # NOTE: the tables only cover 2015-2023; other years get no DST handling.
    DST_back = {'2015-04-05', '2016-04-03', '2017-04-02', '2018-04-01', '2019-04-07',
                '2020-04-05', '2021-04-04', '2022-04-03', '2023-04-02'}
    DST_fwd = {'2015-10-04', '2016-10-02', '2017-10-01', '2018-10-07', '2019-10-06',
               '2020-10-04', '2021-10-03', '2022-10-02', '2023-10-01'}

    api_root = "http://beta.tveeder.com/api/channel/"

    # first day of this month and of the next one
    firstday = datetime.datetime.strptime(year + "-" + month + "-01", "%Y-%m-%d")
    lastday = firstday + relativedelta(months=1)

    # epoch time of the start of the month
    starttime = firstday.timestamp()

    # every date of the month as 'YYYY-MM-DD'
    days_in_month = (lastday - firstday).days
    month_dates = [year + '-' + month + '-' + str(day).rjust(2, '0')
                   for day in range(1, days_in_month + 1)]

    # loop through all channels
    for channel, channel_prog in zip(channel_list, channel_list_2):

        # ---- caption text, one request per hour ----

        clock_back = 0
        clock_fwd = 0

        # the hour window keeps advancing across day boundaries
        start_hour = starttime
        end_hour = starttime + 3600

        for date in month_dates:
            if date in DST_back:
                clock_back = 1
            if date in DST_fwd:
                clock_fwd = 1

            # create the folders for this day if they don't already exist
            raw_file_loc = './raw/' + channel + '/' + year + '/' + date
            os.makedirs(raw_file_loc, exist_ok=True)
            text_file_loc = './text/' + channel + '/' + year + '/' + date
            os.makedirs(text_file_loc, exist_ok=True)

            # loop through the day and save files for each hour
            i = 0
            while i < 24:
                # get date and time info
                time_stamp = datetime.datetime.fromtimestamp(start_hour).strftime("%Y-%m-%d_%H")
                raw_label = time_stamp + ".json"
                text_label = time_stamp + ".txt"

                # account for daylight savings
                if clock_back == 1 and i == 3:
                    # clocks went back: this hour is tagged _DST so its label
                    # stays distinct, and the counter is wound back so the day
                    # gets 25 requests
                    raw_label = time_stamp + "_DST" + ".json"
                    text_label = time_stamp + "_DST" + ".txt"
                    i = 2
                    clock_back = 0
                if clock_fwd == 1 and i == 2:
                    # clocks went forward: one hour does not exist, so the day
                    # gets 23 requests
                    i = 3
                    clock_fwd = 0

                # obtain file from tveeder
                req = requests.get(api_root + channel_codes[channel] + "/range/" + str(start_hour) + "/" + str(end_hour))
                data = req.json()  # parse the response once and reuse it

                # put text into one string (each caption is preceded by a space)
                text = "".join(' ' + item['text'] for item in data['range'])

                # save json file
                with open(raw_file_loc + '/' + raw_label, 'w') as file_raw:
                    json.dump(data, file_raw)
                # save text file
                with open(text_file_loc + '/' + text_label, 'w') as file_text:
                    file_text.write(text)

                # change time for next hour
                start_hour, end_hour = end_hour, end_hour + 3600
                i += 1

        # ---- program data, one request per day ----

        file_loc = './Program schedule (raw)/' + channel_prog + '/' + year
        os.makedirs(file_loc, exist_ok=True)

        # The day window starts one hour before local midnight
        # (original note: "since based on VIC/NSW time").
        endday = (starttime - 60*60) + 3600*24

        for date in month_dates:

            # account for DST: the day is an hour longer when clocks go back
            # and an hour shorter when they go forward
            if date in DST_back:
                endday = endday + 3600
            elif date in DST_fwd:
                endday = endday - 3600

            # collecting programs
            req = requests.get(api_root + channel_codes_2[channel_prog] + "/epg/" + str(endday))

            # save json file
            with open(file_loc + '/' + str(date) + ".json", "w") as json_file:
                json.dump(req.json()['epg'], json_file)

            # the end of this day is the start of the next
            endday = endday + 3600*24

In [None]:
# example usage: download everything for January 2023

monthly_update(month=1, year=2023)

We now combine the program information with the caption text and filter down to just the text, date, program, and genre.

from 5 minute docs

check this again!

In [None]:
def get_data(channel1, channel2, year, m1, m2, comp, folder=None):
    """
    Split a channel's captions into documents using its program schedule.

    Captions are walked in order alongside the shows of the schedule. A new
    document is opened whenever a caption is more than 300 seconds later than
    the caption that opened the current 5-minute window, and again at the end
    of every show. Captions that fall outside the current show's
    [start_epoch, end_epoch] interval are skipped.

    Parameters
    ----------
    channel1 : str
        Channel name used in the schedule CSV folder and the captions file name.
    channel2 : str
        Channel name used in the 'Program schedule (raw)' folder.
    year : int or str
        Year of the schedule files to read.
    m1, m2 : int
        First and last month (inclusive) of schedule files to read.
    comp : str
        "mac" selects the macOS Box folder, anything else the other one.
    folder : str, optional
        Root folder to use instead of the two built-in ones.

    Returns
    -------
    (dates, text) : tuple of two lists of equal length
        `text[k]` is the cleaned, lower-cased text of document k (only letters
        and apostrophes survive; every document starts with a space that comes
        from its "0" placeholder). `dates[k]` is the local-time datetime
        recorded when document k was opened. Both are empty when there are no
        captions at all.
    """
    import calendar  # not part of the notebook's import cell

    if folder is None:
        if comp == "mac":
            folder = r"/Users/a1765262/Library/CloudStorage/Box-Box"
        else:
            folder = r"/Users/irula/Box"

    # every calendar day of the requested months as 'YYYY-MM-DD'
    # (monthrange handles 29 February in leap years)
    day_stamps = [
        f"{year}-{month:02d}-{day:02d}"
        for month in range(m1, m2 + 1)
        for day in range(1, calendar.monthrange(int(year), month)[1] + 1)
    ]

    # ---- program schedule: one CSV per day, missing days are skipped ----
    schedule_dir = f"{folder}/tveeder/program_schedule/{channel1}/{year}"
    schedule_frames = []
    for stamp in day_stamps:
        csv_path = f"{schedule_dir}/{stamp}.csv"
        if os.path.exists(csv_path):
            schedule_frames.append(pd.read_csv(csv_path))

    if schedule_frames:
        data_prog = pd.concat(schedule_frames, axis=0, ignore_index=True)
        show_starts = data_prog["start_epoch"].tolist()
        show_ends = data_prog["end_epoch"].tolist()
    else:
        show_starts, show_ends = [], []

    # ---- captions: the file holds a JSON string that itself contains JSON ----
    with open(folder + '/MPhil_1/AAAProject/Data/' + channel1 + '_data.json', 'r') as capt_file:
        capt = json.loads(json.load(capt_file))
    data_capt = pd.DataFrame({'text': capt.get('text'), 'date': capt.get('date')})

    # plain lists give positional access regardless of the frame's index
    capt_dates = data_capt["date"].tolist()
    capt_text = data_capt["text"].tolist()
    n_capt = len(capt_dates)
    if n_capt == 0:
        return ([], [])

    # each document is a list of pieces, starting with the "0" placeholder
    docs = [["0"]]
    date_init = capt_dates[0]
    dates = [date_init]
    i = 0
    last_show = len(show_starts) - 1

    # loop through all shows and find their captions
    for show in range(len(show_starts)):
        show_start = show_starts[show]
        show_end = show_ends[show]

        # captions are consumed up to the start of the next show, or up to the
        # end of the show itself for the final one
        limit = show_starts[show + 1] if show != last_show else show_end

        # the bounds check also covers shows that start after the last caption
        while i < n_capt and capt_dates[i] <= limit:
            stamp = capt_dates[i]

            if stamp - date_init > 300:  # if more than 5 minutes
                docs.append(["0"])
                date_init = stamp
                dates.append(date_init)

            if show_start <= stamp <= show_end:  # ensure the captions are within the show
                docs[-1].append(capt_text[i])

            i += 1  # next line of captions, whether or not it was used

        # close the document at the end of every show
        docs.append(["0"])
        dates.append(date_init)

    # preprocessing: keep letters and apostrophes only, lower-case
    not_word = re.compile("[^A-Za-z']+")
    text = [not_word.sub(' ', " ".join(pieces)).lower() for pieces in docs]

    # ---- genres: raw program guide of the same year, one JSON per day ----
    genre_dir = f"{folder}/tveeder/Program schedule (raw)/{channel2}/{year}"
    genre_frames = []
    for stamp in day_stamps:
        json_path = f"{genre_dir}/{stamp}.json"
        if os.path.exists(json_path):
            with open(json_path, 'r') as epg_file:
                entries = json.load(epg_file)
            if entries:
                genre_frames.append(pd.DataFrame(entries))

    # dictionary {title: genre}
    # NOTE(review): this is built but not returned or used yet - confirm
    # whether genre labels are meant to be part of the result.
    genre_dict = dict()
    if genre_frames:
        genres = pd.concat(genre_frames, ignore_index=True)
        genre_dict = dict(zip(genres["title"], genres["contentinfo"]))

    dates = [datetime.datetime.fromtimestamp(stamp) for stamp in dates]

    return (dates, text)

We then combine months' worth of data into one data frame spanning 2015-2022.

(from Combine Monthly.ipynb)

In [None]:
month_list = range(1, 13)
year_list = range(2015, 2023)

# Loop through each channel, year, and month.
# We aim to generate a data set for each channel consisting of all text with
# labels for dates, genre etc. Expected columns: text, date, program, genre.

for channel in channel_list:

    # collect this channel's monthly tables; starting a fresh list per channel
    # keeps one channel's rows out of the next channel's file
    monthly_frames = []

    for year in year_list:

        for month in month_list:

            month = str(month).rjust(2,'0')

            monthly_frames.append(pd.read_csv(f"./Data/{channel}_{month}_{year}.csv"))

    # concatenate once instead of growing the frame inside the loop
    data = pd.concat(monthly_frames, ignore_index = True)

    # to_csv is a DataFrame method (there is no pd.to_csv); the index is just
    # a running row number, so it is not written
    data.to_csv(f'./Data/{channel}_all_text.csv', index = False)

Clean and reduce each channel to 5-minute documents. Combine data from all channels into one list for ease of use. 

In [None]:
def clean_split_mins(data, secs):
    """
    Split captions into documents by time and clean their text.

    A new document is started whenever a caption is more than `secs` later
    than the first caption of the current document. Every caption is followed
    by one space. The text is then cleaned: apostrophes are removed, the text
    is lower-cased and every other punctuation character becomes a space.

    Parameters
    ----------
    data : DataFrame (or mapping of sequences) with "text" and "date" entries
        Captions in time order; "date" values must support subtraction.
    secs : number
        Maximum time span of one document, in the units of "date".

    Returns
    -------
    (txt_list, date_list) : two lists of equal length
        The cleaned documents and the "date" of the first caption of each.
        Both are empty when `data` is empty.
    """
    if len(data) == 0:
        return [], []

    # take the first date by position: the frame may be a filtered slice whose
    # index labels do not start at 0
    date0 = next(iter(data["date"]))
    docs = [[]]
    date_list = [date0]

    # make into documents of length 'secs'
    for text, date in zip(data["text"], data["date"]):

        if date - date0 > secs:
            docs.append([])
            date0 = date
            date_list.append(date0)

        docs[-1].append(str(text) + " ")

    # one pass over the text replaces every punctuation character by a space;
    # apostrophes are deleted beforehand so they never turn into spaces
    punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    txt_list = [
        "".join(pieces).replace("'", "").lower().translate(punct_to_space)
        for pieces in docs
    ]

    return txt_list, date_list

In [None]:
all_channels_text_list = list()
all_channels_date_list = list()

for channel in channel_list:

    # load this channel's combined table (without this every channel would be
    # built from whatever `data` happened to hold)
    data = pd.read_csv(f'./Data/{channel}_all_text.csv')

    # clean data and split it into 5-minute (300 s) documents
    text_list, date_list = clean_split_mins(data, 300)

    # convert from epoch time
    date_list = [datetime.datetime.fromtimestamp(date) for date in date_list]

    # append each channel to final list
    all_channels_text_list.append(text_list)
    all_channels_date_list.append(date_list)

# `with` closes the files, so the pickles are fully flushed to disk
with open('./Data/all_text.pkl', 'wb') as text_file:
    pickle.dump(all_channels_text_list, text_file)
with open('./Data/all_dates.pkl', 'wb') as dates_file:
    pickle.dump(all_channels_date_list, dates_file)

From the tabulated data, select just the news text. Clean and reduce it to 5-minute documents.

In [None]:
all_news_text = list()
all_news_dates = list()

for channel in channel_list:

    data = pd.read_csv(f'./Data/{channel}_all_text.csv')

    # filter to just one genre; the index is renumbered from 0 because
    # clean_split_mins looks up the first row by the label 0
    news_data = data[data['genre'] == 'news/current affairs (general)'].reset_index(drop=True)

    # clean data and split it into 5-minute (300 s) documents
    news_text, news_dates = clean_split_mins(news_data, 300)

    # convert from epoch time
    news_dates = [datetime.datetime.fromtimestamp(date) for date in news_dates]

    all_news_text.append(news_text)
    all_news_dates.append(news_dates)

# `with` closes the files, so the pickles are fully flushed to disk
with open('./Data/all_news_text.pkl', 'wb') as text_file:
    pickle.dump(all_news_text, text_file)
with open('./Data/all_news_dates.pkl', 'wb') as dates_file:
    pickle.dump(all_news_dates, dates_file)