evaluation objectives:

1. fetch FNC (Fox News Channel) and MSNBC broadcasts that mention 'abortion' from UCLA NewsScape
2. extract 1000 stories for each channel
3. run the pipeline to extract topics, emotion, sentiment and stance

In [15]:
# NOTE(review): IPython runs `!` commands in a temporary subshell, so environment
# changes made by `module load` presumably do not carry over to the kernel or to
# later cells — confirm FFmpeg is actually available where it is needed.
!module load FFmpeg

In [1]:
import os
import pandas as pd


In [2]:
# root directory under which the broadcast transcript files are looked up
TV_PATH = "/mnt/rds/redhen/gallina/tv"

# search keywords, grouped by topic
KEYS = {
    "climate_change": [
        "climate", "environment", "warming", "greenhouse", "emissions",
        "fossil", "renewable", "sustainable", "methane",
    ],
    "abortion": ["abortion"],
}

# load the query metadata (malformed CSV rows are skipped) and keep a single
# row per broadcast file, renumbering the index from 0
meta = (
    pd.read_csv("meta_abortion.csv", on_bad_lines='skip')
    .drop_duplicates(subset=["filename"])
    .reset_index(drop=True)
)

meta.head(3)

Unnamed: 0,filename,uuid,subquery,time,text,permalink
0,2022-12-31_2015_US_MSNBC_Morning_Joe.txt,ce432176-8947-11ed-94e9-089e01ba034d,abortion,2022-12-31 20:27:29,WHO HAS LEARNED\nFROM THEM WHO KNEW EXACTLY WH...,"http://www.sscnet.ucla.edu/tna/edge/video,ce43..."
1,2022-12-31_1500_US_MSNBC_AM_Joy.txt,cd22a37e-891b-11ed-bcf7-089e01ba034d,abortion,2022-12-31 16:39:32,"AS SECOND CLASS\nCITIZENS.\n>> FOR OTHERS, IT ...","http://www.sscnet.ucla.edu/tna/edge/video,cd22..."
2,2022-12-31_1300_US_FOX-News_FOX_and_Friends_Sa...,09e08f4e-890b-11ed-ad10-089e01ba0338,abortion,2022-12-31 13:42:39,HAS COMMITMENTS TO\nPRIORITIES AND THIS IS\nCO...,"http://www.sscnet.ucla.edu/tna/edge/video,09e0..."


In [3]:
# intended number of stories per outlet (FNC and MSNBC)
n = 1000

# row indices of `meta`, grouped by the outlet tag found in the filename
inds = {"FOX-News": [], "MSNBC": []}

for ind, row in meta.iterrows():
    # outlets are tried in insertion order; the first tag that matches wins
    for outlet, rows in inds.items():
        if outlet in row['filename']:
            rows.append(ind)
            break


In [8]:
# first 150 row indices of each outlet, FOX-News entries first, then MSNBC
sample = [i for outlet in ("FOX-News", "MSNBC") for i in inds[outlet][:150]]


In [9]:
from tqdm import tqdm


def convert_format(x):
    '''Render the trailing six characters of a timestamp string as "AA:BB:CC".

    Everything from the first '.' onward is dropped; the last six characters
    of what remains are then cut into three two-character fields joined by
    colons (e.g. "20221230170012.000" -> "17:00:12").
    '''
    tail = x.partition('.')[0][-6:]

    return ":".join((tail[:2], tail[2:4], tail[4:]))


# markers used to cut the transcript text: cues[0] separates a line's prefix
# from its text, cues[1] separates segments, cues[3] identifies story segments
cues, data = ["|CC1|", "|SEG_00|", "Type=Commercial", "Type=Story start"], []

# (row index, error) pairs for broadcasts that could not be processed
failed = []

# fetch stories with timestamps
for ind in tqdm(sample, total=len(sample)):
    try:
        filename = meta.iloc[ind].filename

        # date entities: the filename starts with "<year>-<month>-<day>_"
        year, month, rest = filename.split("-", 2)
        day = rest.split("_")[0]

        # filepath: <TV_PATH>/<year>/<year>-<month>/<year>-<month>-<day>/<filename>
        f_path = os.path.join(TV_PATH, year, f"{year}-{month}", f"{year}-{month}-{day}", filename)

        # read text file
        with open(f_path) as f:
            lines = f.read()

        # keep only story segments; the first line of each piece is the rest
        # of the segment-marker line and is dropped
        stories = [story.split("\n")[1 : ] for story in lines.split(cues[1]) if cues[3] in story]
        meta_stories = []

        for story in stories:
            # text after the cue on every line, joined into one string
            trs = " ".join(line.split(cues[0])[-1] for line in story).replace(">>", "")

            if any(key.upper() in trs for key in KEYS["abortion"]):
                # first "|"-delimited field of the first / last line of the story
                start = story[0].split("|")[0]
                end = story[-1].split("|")[0]

                meta_stories.append({
                    "file_path": f_path,
                    "trs": trs,
                    "start": convert_format(start),
                    "end": convert_format(end)
                    })

        # a broadcast contributes its stories only if all of it parsed cleanly
        data.extend(meta_stories)

    except Exception as err:
        # best-effort: skip unreadable or malformed broadcasts, but keep a
        # record of them; KeyboardInterrupt is no longer swallowed, so the
        # loop can be interrupted
        failed.append((ind, repr(err)))

if failed:
    print(f"skipped {len(failed)} of {len(sample)} broadcasts; inspect `failed` for details")

100%|██████████| 300/300 [00:00<00:00, 780.27it/s]


In [10]:
# spot-check: display the third extracted story record
data[2]

{'file_path': '/mnt/rds/redhen/gallina/tv/2022/2022-12/2022-12-30/2022-12-30_1700_US_FOX-News_Outnumbered.txt',
 'trs': ' Kennedy: HELLO, WELCOME BACK KAMALA HARRIS OUR VICE PRESIDENT RUNNING AT HER SECOND YEAR AS BP AND SHE IS FRUSTRATED WITH HER MEDIA COVERAGE, HOW DARE YOU! SHE IS NOT HOLDING BACK ABOUT IT. SHE SAT DOWN WITH "WASHINGTON POST" COLUMNIST JONATHAN CAPE HARD HER BIGGEST FAN BOY APPARENTLY TO REFLECT ON THE PAST YEAR. THE ENTITLING PIECE CALLED KAMALA HARRIS HAD A MOST EXCELLENT YEAR, THANKS BILL AND TED. THE LACK OF COVERAGE OR LEADERSHIP IS GETTING AND SHE COMPARES THAT TO COVERAGE OF THE SUPREME COURT\'S LANDMARK ABORTION RIGHTS SAYING "THERE ARE THINGS I HAVE DONE AS VICE PRESIDENT THAT FULLY DEMONSTRATE THE STRENGTH OF MY LEADERSHIP AS VICE PRESIDENT DOES NOT RECEIVE THE KIND OF COVERAGE THAT I THINK DOGS DID RECEIVE." TODD, I\'M A LITTLE CONFUSED PERIODS SHE IS MAD A LANDMARK ONCE IN A GENERATION SUPREME COURT CASE GOT MORE COVERAGE THAN JUST THE FACT SHE WAS SUCH 

In [2]:
import json

# write the extracted story records to disk as indented JSON
with open("abortion.json", 'w') as out_file:
    json.dump(data, out_file, indent=4)

In [3]:
# read the results file and parse its JSON content into `result`
with open("result.json") as in_file:
    result = json.loads(in_file.read())