Week 5 objectives:
1. Explore the UCLA NewsScape data.
2. Set keywords for each topic: climate change, gun control, immigration.
3. Fetch query results for the channels CNN, FNC, and MSNBC.
4. Identify cues that segment a broadcast.
5. Validate the segments in context.


In [1]:
import os
import csv
import pandas as pd

import pprint

In [2]:
# Root directory under which the broadcast text files are looked up.
TV_PATH = "/mnt/rds/redhen/gallina/tv"

# Search terms per topic; a story is kept when its transcript contains
# the upper-cased form of at least one term.
KEYS = {
    "climate_change": [
        "climate", "environment", "warming",
        "greenhouse", "emissions", "fossil",
        "renewable", "sustainable", "methane",
    ],
}


In [3]:
# Load the query results and keep a single row per broadcast file
# (the first occurrence of each filename), renumbering rows from 0.
meta = (
    pd.read_csv("meta_climate.csv")
    .drop_duplicates(subset=["filename"])
    .reset_index(drop=True)
)

meta.head(3)

Unnamed: 0,filename,uuid,subquery,time,text,permalink
0,2022-12-31_2015_US_MSNBC_Morning_Joe.txt,ce432176-8947-11ed-94e9-089e01ba034d,environment,2022-12-31 20:47:17,IS REALLY HARD FOR\nCOMPANIES ACROSS THE COUNT...,"http://www.sscnet.ucla.edu/tna/edge/video,ce43..."
1,2022-12-31_2000_US_FOX-News_The_Journal_Editor...,b5f7ce7a-8945-11ed-b9fc-089e01ba0338,fossil,2022-12-31 20:14:24,BEG VENEZUELA WE\nDO NOT INCREASE PRODUCTION H...,"http://www.sscnet.ucla.edu/tna/edge/video,b5f7..."
2,2022-12-31_1700_US_CNN_CNN_Newsroom_With_Fredr...,91c395a2-892c-11ed-b597-089e01ba0338,environment,2022-12-31 17:20:55,RIGHT AMERICAN POLITICS.\nWHEN IT COMES TO ABO...,"http://www.sscnet.ucla.edu/tna/edge/video,91c3..."


In [4]:
# One row per unique broadcast file after de-duplication.
n_broadcasts = len(meta)
print(f"number of broadcasts: {n_broadcasts}")

number of broadcasts: 88232


In [5]:
# Markers used to parse a broadcast text file:
#   cues[0] caption-line cue, cues[1] segment boundary,
#   cues[2] commercial segment type (not used in this cell),
#   cues[3] story-start segment type.
cues, data = ["|CC1|", "|SEG_00|", "Type=Commercial", "Type=Story start"], []

# Number of broadcasts (leading rows of `meta`) to scan in this cell.
N_BROADCASTS = 10


def _broadcast_path(filename):
    """Return the path of a broadcast text file under ``TV_PATH``.

    The leading ``YYYY-MM-DD`` token of the file name (everything before the
    first underscore) selects the directory
    ``TV_PATH/YYYY/YYYY-MM/YYYY-MM-DD/``.

    Raises:
        ValueError: if the leading token does not have three "-"-separated parts.
    """
    year, month, day = filename.split("_", 1)[0].split("-", 2)
    return os.path.join(
        TV_PATH, year, f"{year}-{month}", f"{year}-{month}-{day}", filename
    )


def _matching_stories(text, f_path, keywords):
    """Return one record per story segment of ``text`` that mentions a keyword.

    ``text`` is split on the segment cue; only segments containing the
    story-start cue are considered. A segment is kept when the upper-cased form
    of any keyword occurs in its transcript. Each record holds the file path,
    the transcript, and the first "|"-separated field of the segment's first
    and last lines as ``start`` and ``end``.
    """
    found = []
    for segment in text.split(cues[1]):
        if cues[3] not in segment:
            continue

        # Drop the first line of the segment (the remainder of the boundary line).
        story = segment.split("\n")[1:]

        # Keep the text after the caption cue on every line and strip the ">>"
        # markers. Lines lacking the cue are kept whole, because split() returns
        # the line unchanged.
        # NOTE(review): confirm that keeping non-caption lines is intended.
        trs = " ".join(line.split(cues[0])[-1] for line in story).replace(">>", "")

        if any(key.upper() in trs for key in keywords):
            found.append({
                "file_path": f_path,
                "trs": trs,
                # Presumably timestamps -- verify against the file format.
                "start": story[0].split(cues[0])[0].split("|")[0],
                "end": story[-1].split("|")[0],
            })
    return found


# fetch stories with timestamps
for filename in meta["filename"].head(N_BROADCASTS):
    try:
        f_path = _broadcast_path(str(filename))
        # read text file
        with open(f_path) as f:
            lines = f.read()
        data.extend(_matching_stories(lines, f_path, KEYS["climate_change"]))
    except (OSError, ValueError, IndexError) as err:
        # Best effort: skip unreadable or malformed broadcasts, but say so
        # instead of silently hiding the failure.
        print(f"skipped {filename}: {err!r}")

In [8]:
import json

# Persist the collected story records as pretty-printed JSON.
with open("climate.json", "w") as out_file:
    json.dump(data, out_file, indent=4)

In [6]:
# Show the first collected story record (file path, transcript, start, end).
data[0]

{'file_path': '/mnt/rds/redhen/gallina/tv/2022/2022-12/2022-12-31/2022-12-31_2015_US_MSNBC_Morning_Joe.txt',
 'trs': "WELCOME TO A SPECIAL EDITION OF -- LOOKING BACK IN 25 NEWS MAKERS AND POLITICS, BUSINESS, POP ULTRA, ENTERTAINMENT, AND SPORTS. AND WHETHER THEY WERE UP OR DOWN BY THE END OF THE YEAR. AND, DONNY GEORGE IS HERE TO TAKE US THROUGH THEM ALL. YIKES. A LOT OF BRANDS DOWNS. SOME GROUND. UPS  AND YOU KNOW, THE THING, IS SOME OF THESE ACTUALLY ARE A BIT OBVIOUS. BUT WHAT MAKES IT SO INTERESTING IS, WE'VE GOT JOHNNY. HERE HE'S GOT THAT SPECIAL SAUCE. HE OBVIOUSLY. AN ADVERTISING LEGEND. A BRANDING LEGEND. HE'S GOT THE BRANDING. A NO YOU'VE EVER BEEN OVER TO HIS PLACE. HE HAS THE SUPERCOMPUTER. THE BRANDING SUPERCOMPUTER. WE GO TO THIS GUY TO FIGURE OUT WHO IS UP, WHO IS, DOWN AND WHY.  THERE'S ABOUT 2017 YEARS SCIENTISTS WORKING AROUND THE CLOCK. OUT OF THE WAY THEY ARE, DANISH BUT THAT'S JUST. SOMEHOW THEY KIND OF GOT TOGETHER. AND THEY WORK WITH THE SUPERCOMPUTER. THEY USE A 

In [7]:
# Read the transcript of the first record: it passed the keyword filter,
# yet the story itself is not related to climate change.
data[0]['trs']  # not related to climate change

"WELCOME TO A SPECIAL EDITION OF -- LOOKING BACK IN 25 NEWS MAKERS AND POLITICS, BUSINESS, POP ULTRA, ENTERTAINMENT, AND SPORTS. AND WHETHER THEY WERE UP OR DOWN BY THE END OF THE YEAR. AND, DONNY GEORGE IS HERE TO TAKE US THROUGH THEM ALL. YIKES. A LOT OF BRANDS DOWNS. SOME GROUND. UPS  AND YOU KNOW, THE THING, IS SOME OF THESE ACTUALLY ARE A BIT OBVIOUS. BUT WHAT MAKES IT SO INTERESTING IS, WE'VE GOT JOHNNY. HERE HE'S GOT THAT SPECIAL SAUCE. HE OBVIOUSLY. AN ADVERTISING LEGEND. A BRANDING LEGEND. HE'S GOT THE BRANDING. A NO YOU'VE EVER BEEN OVER TO HIS PLACE. HE HAS THE SUPERCOMPUTER. THE BRANDING SUPERCOMPUTER. WE GO TO THIS GUY TO FIGURE OUT WHO IS UP, WHO IS, DOWN AND WHY.  THERE'S ABOUT 2017 YEARS SCIENTISTS WORKING AROUND THE CLOCK. OUT OF THE WAY THEY ARE, DANISH BUT THAT'S JUST. SOMEHOW THEY KIND OF GOT TOGETHER. AND THEY WORK WITH THE SUPERCOMPUTER. THEY USE A GOOGLE MACHINE, ALSO. THEY DO A LOT OF DIFFERENT THINGS. GEORGE, SAID A LOT OF THESE. A LOT OF TIMES WE DO THIS ON A 