In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import json
import random
import csv
import re
import datetime

In [2]:
# Lower bound of the `full_range` date window defined in a later cell.
# NOTE(review): "CAMGAIN" looks like a typo for "CAMPAIGN"; the name is left
# unchanged because other cells reference it.
TRUMP_CAMGAIN_BEGINS = datetime.datetime(2015,6,16)

In [3]:
# Directory that get_shows() prepends (joined with '/') to every transcript file name.
data_dir = 'Captions'

In [4]:
def str_to_datetime(s):
    """Parse a 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' string into a datetime.

    Spaces and colons are treated as extra '-' separators, and every
    resulting field is converted to int and passed positionally to
    datetime.datetime.
    """
    normalized = s.replace(' ', '-')
    normalized = normalized.replace(':', '-')
    fields = normalized.split('-')
    return datetime.datetime(*map(int, fields))

In [5]:
def combine_transcript_files(transcripts_list, constraints):
    """Concatenate several transcript lists, filtering each with its own predicate.

    The i-th entry of `constraints` is called on every record of the i-th
    entry of `transcripts_list`; records for which it returns a truthy
    value are kept, in their original order.
    """
    merged = []
    for position, transcripts in enumerate(transcripts_list):
        for record in transcripts:
            # Look the predicate up per record so an empty transcript list
            # never touches `constraints`.
            if constraints[position](record):
                merged.append(record)
    return merged

In [6]:
#    transcripts_old = [show_info for show_info in transcripts_old if 
#                       c>str_to_datetime(show_info['start_localtime'].split()[0])>=TRUMP_CAMGAIN_BEGINS]
def get_shows(fs, constraints=None):
    """Load the JSON transcript files named in `fs` and merge them into one list.

    Parameters
    ----------
    fs : list of str
        File names; each one is opened as data_dir + '/' + name.
    constraints : list of callables, optional
        One predicate per file, applied to every record of that file; only
        records for which it returns a truthy value are kept. When omitted
        (or empty) every record is kept.

    Returns
    -------
    list or None
        The combined records, or None (after printing a message) when
        `constraints` is given but its length differs from `fs`.
    """
    # `None` replaces the former mutable default `[]`; both are falsy, so
    # existing callers behave the same.
    if constraints:
        if len(constraints) != len(fs):
            print('wrong number of constraints')
            return None
    else:
        constraints = [lambda x: True] * len(fs)
    transcripts_list = []
    for f in fs:
        with open(data_dir + '/' + f) as json_f:
            transcripts_list.append(json.load(json_f))
    return combine_transcript_files(transcripts_list, constraints)

# Transcript file lists per network; the names are resolved relative to data_dir
# by get_shows(). The trailing comments keep previously used file lists.
cnn_fs = ['processed_CNN_april_september_2019.json']#['processed_CNN_2019.json', 'processed_CNN_p2_2019.json'] # processed_CNN_2018.json, processed_cnn_2014_feb26_IDS.json
# Empty list: get_shows(msnbc_fs) opens no file and returns an empty list.
msnbc_fs = []#['processed_MSNBC_2019.json', 'processed_MSNBC_p2_2019.json'] # processed_MSNBC_2018.json, processed_msnbc_2014_feb16_IDS.json
# NOTE(review): this is the same CNN file as in cnn_fs -- looks like a copy/paste
# slip; confirm the intended FOXNEWS file name.
foxnews_fs = ['processed_CNN_april_september_2019.json']#['processed_FOXNEWS_2019.json', 'processed_FOXNEWS_p2_2019.json'] #processed_FOXNEWS_2018.json, processed_foxn_2014_feb26_IDS.json

In [16]:
# Load one list of show records per network from the file lists defined above.
# NOTE(review): the "_2018jan" suffix does not match the file names currently in
# cnn_fs / foxnews_fs (april-september 2019) -- confirm which period is intended.
CNN_shows_2018jan = get_shows(cnn_fs)
FOX_shows_2018jan = get_shows(foxnews_fs)
MSNBC_shows_2018jan = get_shows(msnbc_fs)

In [8]:
# NOTE(review): MSNBC_shows_2019 is not assigned in any cell visible here (the
# load cell creates MSNBC_shows_2018jan); this presumably relies on leftover
# kernel state and would fail after Restart & Run All -- confirm.
len(MSNBC_shows_2019)

712

In [9]:
def count_shows(show_data, min_shows=0):
    """Print how many records each program has, most frequent first.

    Only programs whose count is strictly greater than `min_shows` are
    printed, one per line as "<count> <program>". Programs with equal
    counts keep the order in which they were first seen.
    """
    tally = {}
    for record in show_data:
        name = record['program']
        if name in tally:
            tally[name] += 1
        else:
            tally[name] = 1
    ranked = sorted(tally.items(), key=lambda pair: -pair[1])
    for name, n_records in ranked:
        if n_records > min_shows:
            print(n_records, name)

In [10]:
def print_sample(show_data, date=None, program_contains='', i=None):
    """Print one show's metadata followed by its captions, one '>>' chunk per line.

    Parameters
    ----------
    show_data : list of dict
        Show records with the keys 'program', 'start_localtime',
        'start_time', 'stop_time', 'cc' and 'times'.
    date : str, optional
        Keep only shows whose 'start_localtime' date part equals this string.
    program_contains : str, optional
        Keep only shows whose 'program' contains this substring.
    i : int, optional
        Index into the filtered list. When None, a random show is picked.
        An explicit 0 selects the first match.

    Prints "0 matches" and returns when the filters leave nothing to show.
    """
    # first, constrain possible shows by date and program
    shows_subset = show_data
    if date:
        shows_subset = [show_info for show_info in shows_subset
                        if show_info['start_localtime'].split()[0] == date]
    if program_contains:
        shows_subset = [show_info for show_info in shows_subset
                        if program_contains in show_info['program']]
    if not shows_subset:
        # random.randrange(0, 0) would raise ValueError; report the empty result instead.
        print(0, 'matches')
        return
    # `is None` rather than `not i`, so that i=0 is honoured as an index.
    if i is None:
        i = random.randrange(0, len(shows_subset))
    show_info = shows_subset[i]
    print(len(shows_subset), 'matches')
    print(show_info['program'], show_info['start_localtime'], show_info['start_time'], show_info['stop_time'])
    print()
    timestamps = [int(x) for x in show_info['times']]
    last_index = len(timestamps) - 1
    # ts_pointer counts the newlines seen so far and is used as the index of the
    # timestamp for the first line of the current chunk.
    ts_pointer = 0
    for chunk in show_info['cc'].split('>>'):
        # Clamp so a final chunk that starts past the last timestamp reuses it
        # instead of raising IndexError.
        print(timestamps[min(ts_pointer, last_index)], '>>' + chunk.replace('\n', ' '))
        ts_pointer += chunk.count('\n')


In [11]:
def day_name(i):
    """Return the English weekday name for index `i` (0 = Monday ... 6 = Sunday).

    Raises KeyError for any other value.
    """
    names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday')
    # A dict (not the tuple itself) so out-of-range and negative indices
    # raise KeyError.
    lookup = dict(enumerate(names))
    return lookup[i]
def build_primetime_lineup(show_data, out_f):
    """Write a per-day CSV grid of the programs that start in the 5p-10p slots.

    Parameters
    ----------
    show_data : list of dict
        Show records with 'start_localtime' ("<date> <HH:MM:SS>"),
        'start_time', 'stop_time' and 'program'.
    out_f : str
        Path of the CSV file to write. The first row is a header; each
        following row is one date (sorted ascending) with its weekday name
        and one column per slot from 5 to 10.

    If several shows map to the same date and slot, the last one seen wins.
    """
    lineup = {}
    for show_info in show_data:
        day, time = show_info['start_localtime'].split()
        slots = lineup.setdefault(day, {})
        # Slot number = local start hour + 3, expressed as a pm hour.
        # NOTE(review): presumably start_localtime is Pacific time and the
        # columns are Eastern pm hours -- confirm.
        t_h = int(time.split(':')[0]) + 3 - 12
        if 10 >= t_h >= 5:
            duration = str_to_datetime(show_info['stop_time']) - str_to_datetime(show_info['start_time'])
            h = round(duration.seconds / 60 / 60, 1)
            # Shows longer than 1.25 hours are annotated with their whole-hour length.
            suffix = ' ({} hours)'.format(int(h)) if h > 1.25 else ''
            slots[t_h] = show_info['program'] + suffix
    rows = [["Date", "Day", "5p", "6p", "7p", "8p", "9p", "10p"]]
    for day in sorted(lineup):
        shows = lineup[day]
        rows.append([day, day_name(str_to_datetime(day).weekday())]
                    + [shows.get(slot, '') for slot in range(5, 11)])
    # newline='' lets the csv module control line endings (avoids blank rows
    # on platforms that translate '\n').
    with open(out_f, 'w', newline='') as csv_f:
        csv.writer(csv_f).writerows(rows)

In [12]:
# Matches two or more lowercase letters, starting at a position not immediately
# preceded by a digit, that are followed either by a non-word character other
# than ':' or by the end of the string.
# Raw string: '\W' is not a valid escape sequence in a normal string literal.
commercial_match = re.compile(r'(?<![0-9])[a-z]{2,}\W(?<!:)|(?<![0-9])[a-z]{2,}$')
def is_commercial(s):
    """Return a match object if `s` looks like commercial text, else None.

    Spaces are removed from `s` before `commercial_match` is searched, so
    the result is truthy when the line contains lowercase text as
    described by that pattern.
    NOTE(review): presumably program captions are upper-case and ads are
    mixed-case, which is why lowercase runs are used as the signal -- confirm.
    """
    return commercial_match.search(s.replace(' ', ''))

In [13]:
def show_lineup(show_data, date):
    """Print "<start time> <program>" for every show on `date`, earliest first.

    `date` is compared with the date part of each record's
    'start_localtime'; matching shows are ordered by the full
    'start_localtime' string.
    """
    matching = []
    for record in show_data:
        if record['start_localtime'].split()[0] == date:
            matching.append(record)
    matching.sort(key=lambda record: record['start_localtime'])
    for record in matching:
        start_clock = record['start_localtime'].split()[1]
        print(start_clock, record['program'])

In [14]:
def segment(show_info):
    """Split one show's captions into segments and shorter snippets.

    The caption text show_info['cc'] is split on '>>'. A chunk that then
    still begins with '>' (i.e. '>>>' in the raw text) closes the current
    segment, and so does any line for which is_commercial() is truthy.
    Snippets are closed at the same points and additionally whenever the
    accumulated chunk time exceeds `snippet_time`.

    show_info['times'] is converted to ints and indexed by the number of
    newlines consumed so far (presumably one timestamp per caption line --
    confirm against the data).

    Returns
    -------
    (snippets, segments)
        segments: list of (text, start, stop) tuples.
        snippets: list of (text, start, stop, n) tuples, where n is
        len(segments) at the moment the snippet was appended.

    NOTE(review): text still accumulated in `current_segment` / `snippet`
    when the loop ends is not appended to either list -- confirm whether the
    final segment and snippet should be flushed.
    """
    ts_pointer = 0
    timestamps = [int(x) for x in show_info['times']]
    # Pad with a copy of the last timestamp so that indexing by newline count
    # stays in range when there are exactly as many timestamps as newlines.
    if len(timestamps)==show_info['cc'].count('\n'):
        timestamps = timestamps + [timestamps[-1]]
    
    # Segment accumulator.
    segments = []
    current_segment = ''
    segment_start = timestamps[ts_pointer]
    
    # Snippet accumulator; `clock` sums chunk durations since the last snippet
    # boundary and is compared with `snippet_time` (60*4, in timestamp units).
    snippets = []
    clock = 0
    snippet_time = 60*4
    snippet = ''
    snippet_start = timestamps[ts_pointer]
    
    commercial_break = False
    for chunk in show_info['cc'].split('>>'):
        # Duration of this chunk: timestamp after its last newline minus the
        # timestamp at its start.
        chunk_time = timestamps[ts_pointer + chunk.count('\n')] - timestamps[ts_pointer]
        # a triple caret ('>>>') in the raw text: close the running snippet and segment
        if chunk.startswith('>'):
            if current_segment:
                snippets.append((snippet, snippet_start, timestamps[ts_pointer], len(segments)))
                snippet=''; snippet_start = timestamps[ts_pointer]; clock = 0
                segments.append((current_segment, segment_start, timestamps[ts_pointer]))
                current_segment = ''; segment_start = timestamps[ts_pointer]
            # Drop the leftover '>' so it does not end up in the text.
            chunk = chunk[1:]
        # Re-insert the '>>' marker between chunks of the same segment.
        if current_segment:
            current_segment += '>> '
            snippet += '>> '
        for i,line in enumerate(chunk.split('\n')):
            if is_commercial(line):
                # Close the running snippet and segment at this line's timestamp
                # and skip the remaining lines of the chunk.
                if current_segment:
                    snippets.append((snippet, snippet_start, timestamps[ts_pointer+i], len(segments)))
                    snippet=''; clock = 0
                    segments.append((current_segment, segment_start, timestamps[ts_pointer+i]))
                    current_segment = ''
                commercial_break = True
                break
            current_segment += line.strip() + ' '
            snippet += line.strip() + ' '
        # Advance by the whole chunk even when a commercial cut it short.
        ts_pointer += chunk.count('\n')
        if commercial_break:
            # The next segment/snippet starts at the timestamp following this chunk;
            # the chunk's time is not added to `clock`.
            segment_start = timestamps[ts_pointer]
            snippet_start = timestamps[ts_pointer]
            commercial_break = False
        else:
            clock += chunk_time
        # Time-based snippet boundary; the segment keeps running.
        if clock > snippet_time:
            snippets.append((snippet, snippet_start, timestamps[ts_pointer], len(segments)))
            snippet = ''; snippet_start = timestamps[ts_pointer]; clock = 0
    
    return snippets, segments

In [24]:
# NOTE(review): CNN_shows_2019 is not assigned in any cell visible here; this
# presumably relies on leftover kernel state -- confirm.
print_sample(CNN_shows_2019, i=300)

2637 matches
State of the Union With Jake Tapper 2019-01-13 09:00:00 2019-01-13 17:00:00 2019-01-13 18:00:59

6 >>At Comcast, it's my job to develop, apps and tools that simplify your experience. My name is Mike, I'm in product development at Comcast. We're working to make things simple, easy and awesome. 
36 >>> INVESTIGATING TRUMP. A NEW REPORT SAYS THE FBI INVESTIGATED THE PRESIDENT FEARING HE COULD BE WORKING FOR RUSSIA. AS WE ALSO LEARN PRESIDENT TRUMP REPORTEDLY CONCEALED DETAILS OF HIS MEETINGS WITH VLADIMIR PUTIN. THE WHITE HOUSE CALLS THE REPORTS ABSURD. WE'LL ASK THE TOP DEMOCRAT ON THE SENATE INTELLIGENCE COMMITTEE, SENATOR MARK WARNER, NEXT. 
57 >>> PLUS, HISTORIC SHUTDOWN. DAY 23 OF THE LONGEST GOVERNMENT SHUTDOWN IN AMERICAN HISTORY. AND NEITHER SIDE IS BUDGING ON THE WALL. THE PRESIDENT SAYS IF DEMOCRATS DON'T ACT, HE WILL. 
67 >> I WILL DECLARE A NATIONAL EMERGENCY. 
72 >> THE CHAIRMAN OF THE HOMELAND SECURITY COMMITTEE, REPUBLICAN SENATOR RON JOHNSON, RESPONDS IN MOMEN

In [15]:
# test sample
def segment_shows(show_data, date_range, time_range, f_name, include_wknd=False, show_id_start=0):
    """Segment every selected show and write the results to two CSV files.

    Parameters
    ----------
    show_data : list of dict
        Show records; each needs 'start_localtime', 'program' and whatever
        segment() reads.
    date_range : sequence of two datetimes
        Inclusive lower and upper bound on the parsed 'start_localtime'.
    time_range : container of int
        Accepted values of (start hour + 3 - 12).
    f_name : str
        Output path prefix; '<f_name>_segments.csv' and
        '<f_name>_snippets.csv' are written.
    include_wknd : bool, optional
        When False (default) shows starting on Saturday or Sunday are skipped.
    show_id_start : int, optional
        ID given to the first selected show; later shows count up from it.

    Shows are processed in ascending start time. Segment IDs have the form
    '<show id>_<segment index>'; each snippet row carries the segment ID
    recorded by segment() plus its own running index.
    """
    segment_rows = [["Segment ID", "Date", "Time (pt)", "Program", "Start", "Stop", "Duration", "Text"]]
    snippet_rows = [["Segment ID", "Snippet ID", "Day", "Time (pt)", "Program", "Start", "Stop", "Duration", "Text"]]
    show_ID = show_id_start
    # Parse each start time once and sort on it (stable, so ties keep input order).
    dated_shows = sorted(((str_to_datetime(show_info['start_localtime']), show_info)
                          for show_info in show_data),
                         key=lambda pair: pair[0])
    for dt, show_info in dated_shows:
        if date_range[0] <= dt <= date_range[1] and (dt.hour+3-12) in time_range and (dt.weekday()<5 or include_wknd):
            p = show_info['program']
            day, time = str(dt).split()
            snippets, segments = segment(show_info)
            for segment_index, (s, t0, tn) in enumerate(segments):
                segment_rows.append(['{}_{}'.format(show_ID, segment_index), day, time, p, t0, tn, tn-t0, s])
            for snippet_index, (s, t0, tn, parent_segment) in enumerate(snippets):
                snippet_rows.append(['{}_{}'.format(show_ID, parent_segment), snippet_index, day, time, p, t0, tn, tn-t0, s])
            show_ID += 1
    # newline='' lets the csv module control line endings (avoids blank rows
    # on platforms that translate '\n').
    with open(f_name + '_segments.csv', 'w', newline='') as csv_f:
        csv.writer(csv_f).writerows(segment_rows)
    with open(f_name + '_snippets.csv', 'w', newline='') as csv_f:
        csv.writer(csv_f).writerows(snippet_rows)

In [20]:
# Date windows passed as `date_range` to segment_shows (both ends are compared inclusively).
full_range = [TRUMP_CAMGAIN_BEGINS, datetime.datetime(2019,1,1)]
range_2019 = [datetime.datetime(2019,1,1), datetime.datetime(2019,4,23)]
# NOTE(review): the upper bound is midnight at the start of Jan 31, so shows
# starting later that day fall outside the window -- confirm this is intended.
jan_2018_range = [datetime.datetime(2018,1,1), datetime.datetime(2018,1,31)]
# Sample segmentation run; writes Transcript_Segments/FOXNEWS_jan2018_segments.csv
# and Transcript_Segments/FOXNEWS_jan2018_snippets.csv.
# NOTE(review): this comment used to say "cnn" while the call uses the FOX
# variable and a FOXNEWS output name -- confirm which network is intended.
segment_shows(FOX_shows_2018jan, jan_2018_range, [8,9,10], 'Transcript_Segments/FOXNEWS_jan2018', show_id_start=10000)

In [67]:
# NOTE(review): CNN_shows is not assigned in any cell visible here; this
# presumably relies on leftover kernel state -- confirm.
show_lineup(CNN_shows, '2018-11-22')

02:00:00 Early Start With Christine Romans and Dave Briggs
06:00:00 CNN Newsroom with Poppy Harlow and Jim Sciutto
07:00:00 CNN Newsroom with Poppy Harlow and Jim Sciutto
08:00:00 At This Hour With Kate Bolduan
09:00:00 Inside Politics
11:00:00 CNN Newsroom With Brooke Baldwin
12:00:00 CNN Newsroom With Brooke Baldwin


In [93]:
# Inspect the fields of a single show record.
# NOTE(review): CNN_shows is not assigned in any cell visible here -- confirm.
CNN_shows[0].keys()

dict_keys(['title', 'program', 'cc', 'topics', 'times', 'contributor', 'start_time', 'stop_time', 'start_localtime'])

In [66]:
# Print a randomly chosen show from the given date (no index is passed).
# NOTE(review): CNN_shows is not assigned in any cell visible here -- confirm.
print_sample(CNN_shows, date='2018-11-22')#, program_contains='Wolf Blitzer')

7 matches
CNN Newsroom with Poppy Harlow and Jim Sciutto 2018-11-22 06:00:00 2018-11-22 14:00:00 2018-11-22 15:00:54

1 >>LOTS OF MEMBERS OF OUR EXTENDED CNN FAMILY HERE TODAY TO YOURS. ON THE COUNT OF THREE, YOU GUYS, CAN WE DO A HAPPY THANKSGIVING. ONE, TWO, THREE -- HAPPY THANKSGIVING! HAVE A GREAT HOLIDAY. "CNN NEWSROOM" STARTS RIGHT NOW. 
23 >>> I SHOULD HAVE BROUGHT MY KIDS TODAY, THEN. VERY GOOD MORNING TO YOU, AND A HAPPY, WARM, FOOD FILLED THANKSGIVING TO YOU. I'M JIM SCIUTTO. POPPY HAS THE DAY OFF. 
33 >>> PRESIDENT TRUMP IS SERVING UP ANOTHER HEALTHY PLATTER OF CONTEMPT THIS MORNING FOR THE U.S. COURT OF APPEALS. HE IS CALLING THAT COURT A COMPLETE AND TOTAL DISASTER, OUT OF CONTROL WITH A HORRIBLE REPUTATION, HE CLAIMS. GOING FURTHER, BEDLAM, CHAOS, INJURY AND DEATH WILL FOLLOW IF THE ANYO9th CIRCUIT KEEPS VOTIN AGAINST HIS ISSUES. THE CHIEF JUSTICE OF THE UNITED STATES OF AMERICA WHO YESTERDAY RESPONDED TO THE PRESIDENT FOR ATTACKING THE JUDGE WHO BLOCKED HIS EXECUTIVE ORD

In [13]:
# List programs that occur more than twice (count_shows uses a strict '>' comparison).
# NOTE(review): CNN_shows is not assigned in any cell visible here -- confirm.
count_shows(CNN_shows, min_shows=2)

786 CNN Tonight With Don Lemon
630 Anderson Cooper 360
473 CNN Newsroom With Brooke Baldwin
470 Early Start With Christine Romans and Dave Briggs
469 Situation Room With Wolf Blitzer
354 CNN Newsroom With Ana Cabrera
329 CNN Newsroom With Fredricka Whitfield
303 CNN Newsroom Live
301 CNN Newsroom With John Berman and Poppy Harlow
266 Inside Politics
250 At This Hour With Kate Bolduan
236 The Lead With Jake Tapper
232 Erin Burnett OutFront
210 Cuomo Primetime
201 Wolf
148 CNN Newsroom with Poppy Harlow and Jim Sciutto
141 New Day
140 New Day Saturday
97 New Day Sunday
97 State of the Union With Jake Tapper
95 Fareed Zakaria GPS
74 Cuomo Prime Time
68 Smerconish
57 CNN Special Report
48 Reliable Sources
44 CNN Newsroom With Victor Blackwell and Christi Paul
32 The Nineties
27 The Eighties
26 The Van Jones Show
23 The Seventies
20 CNN Newsroom With Poppy Harlow
13 The Sixties
12 The Axe Files
12 Election Night in America
8 CNN Special Program
4 Race for the White House
3 Sen. Collins on C