In [1]:
import os
import pandas as pd

In [2]:
bbc_ids = [54, 106, 107, 175, 279]
years = range(2015, 2018)

prefix = {
    54: 'BBC+News',
    106: 'News',
    107: 'News',
    175: 'News',
    279: ''
}

In [3]:
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

In [11]:
for bbc_id in bbc_ids:
    rows = []
    for year in years:
        data_path = '../data/bbc/{}/{}/transcripts'.format(bbc_id, year)
        for month in months:
            csv_file = os.path.join(data_path, '{} {}-{} {}.csv'.format(prefix[bbc_id], month, year, bbc_id))
            
            if not os.path.exists(csv_file):
                continue
            df = pd.read_csv(csv_file)
            df = df.drop(['Unnamed: 0', 'Has Transcript', 'Unavailable link', 'Unavailable reason'], axis=1)
            df['word_count'] = df['Transcript'].apply(lambda x: len(x.split()))
            shows = df['Program Name'].unique()

            for show in shows:
                df_show = df.loc[df['Program Name'] == show]
                total_show_length = df_show['word_count'].sum()
                rows.append([year, month, show, total_show_length])

    res_df = pd.DataFrame(rows, columns=['year', 'month', 'show title', 'word count in show'])
    display(res_df)
    res_df.to_csv('bbc_{}_show_word_counts.csv'.format(bbc_id))

Unnamed: 0,year,month,show title,word count in show
0,2015,jan,BBC Weekend News,50960
1,2015,jan,BBC London News,74521
2,2015,jan,BBC News at One,101231
3,2015,jan,BBC News at Six,90586
4,2015,jan,BBC News at Ten,92269
...,...,...,...,...
208,2017,dec,BBC News,1684856
209,2017,dec,BBC London News,315045
210,2017,dec,BBC News at One,137517
211,2017,dec,BBC News at Six,128431


Unnamed: 0,year,month,show title,word count in show
0,2015,jan,Channel 4 News,213321
1,2015,jan,Channel 4 News Summary,1028
2,2015,jan,The Simpsons (2004),3385
3,2015,feb,Channel 4 News,189590
4,2015,feb,Channel 4 News Summary,473
...,...,...,...,...
68,2017,oct,Channel 4 News Summary,27801
69,2017,nov,Channel 4 News,410285
70,2017,nov,Channel 4 News Summary,26473
71,2017,dec,Channel 4 News,334525


Unnamed: 0,year,month,show title,word count in show
0,2015,jan,5 News,602
1,2015,jan,5 News at 5,63397
2,2015,jan,5 News Tonight,78829
3,2015,jan,5 News Weekend,3346
4,2015,feb,5 News Weekend,4908
...,...,...,...,...
176,2017,dec,5 News,13585
177,2017,dec,5 News Weekend,11194
178,2017,dec,5 News Lunchtime,14433
179,2017,dec,5 News at 5,81180


Unnamed: 0,year,month,show title,word count in show
0,2015,jan,ITV News at Ten & Weather,143047
1,2015,jan,ITV News & Weather,194809
2,2015,jan,ITV News London,92595
3,2015,jan,ITV News,3248
4,2015,feb,ITV News & Weather,203499
...,...,...,...,...
195,2017,dec,ITV News London,288801
196,2017,dec,ITV Lunchtime News,121907
197,2017,dec,ITV Evening News,124548
198,2017,dec,ITV News at Ten,97823


Unnamed: 0,year,month,show title,word count in show
0,2015,may,Sky News Tonight with Adam Boulton and Anna Jones,32336
1,2015,may,Sky News at Ten,114789
2,2015,may,Sky News at Seven with Steve Dixon,3984
3,2015,may,Sky News at Nine,2818
4,2015,may,Sky News at 11 with Lorna Dunkley,6637
...,...,...,...,...
549,2017,dec,Sky News Tonight,41863
550,2017,dec,Sky News with Colin Brazier and Jayne Secker,45562
551,2017,dec,Sky News on the Hour,16409
552,2017,dec,Press Preview,40947


In [29]:
rows = []
for year in years:
    for bbc_id in bbc_ids:
        data_path = '../data/bbc/{}/{}/transcripts'.format(bbc_id, year)
        
        if not os.path.exists(data_path):
            continue
        
        for month in months:
            print(year, bbc_id, month)
            csv_file = os.path.join(data_path, '{}{}-{} {}.csv'.format(prefix[bbc_id], month, year, bbc_id))
            if not os.path.exists(csv_file):
                rows.append([bbc_id, year, month, 0, 0])
                continue
            df = pd.read_csv(csv_file)
            df = df.drop(['Unnamed: 0', 'Has Transcript', 'Unavailable link', 'Unavailable reason'], axis=1)
            df['word_count'] = df['Transcript'].apply(lambda x: len(x.split()))
            n_shows = len(df['Program Name'])
            word_count = df['word_count'].sum()
            rows.append([bbc_id, year, month, word_count, n_shows])
            
            
res_df = pd.DataFrame(rows, columns=['bbc_id', 'year', 'month', 'word_count', 'show_count'])

2007 54 jan
2007 54 feb
2007 54 mar
2007 54 apr
2007 54 may
2007 54 jun
2007 54 jul
2007 54 aug
2007 54 sep
2007 54 oct
2007 54 nov
2007 54 dec
2008 54 jan
2008 54 feb
2008 54 mar
2008 54 apr
2008 54 may
2008 54 jun
2008 54 jul
2008 54 aug
2008 54 sep
2008 54 oct
2008 54 nov
2008 54 dec
2009 54 jan
2009 54 feb
2009 54 mar
2009 54 apr
2009 54 may
2009 54 jun
2009 54 jul
2009 54 aug
2009 54 sep
2009 54 oct
2009 54 nov
2009 54 dec
2009 106 jan
2009 106 feb
2009 106 mar
2009 106 apr
2009 106 may
2009 106 jun
2009 106 jul
2009 106 aug
2009 106 sep
2009 106 oct
2009 106 nov
2009 106 dec
2009 175 jan
2009 175 feb
2009 175 mar
2009 175 apr
2009 175 may
2009 175 jun
2009 175 jul
2009 175 aug
2009 175 sep
2009 175 oct
2009 175 nov
2009 175 dec
2009 279 jan
2009 279 feb
2009 279 mar
2009 279 apr
2009 279 may
2009 279 jun
2009 279 jul
2009 279 aug
2009 279 sep
2009 279 oct
2009 279 nov
2009 279 dec
2010 54 jan
2010 54 feb
2010 54 mar
2010 54 apr
2010 54 may
2010 54 jun
2010 54 jul
2010 54 aug
2010

In [30]:
res_df

Unnamed: 0,bbc_id,year,month,word_count,show_count
0,54,2007,jan,0,0
1,54,2007,feb,0,0
2,54,2007,mar,0,0
3,54,2007,apr,0,0
4,54,2007,may,0,0
...,...,...,...,...,...
607,279,2018,aug,0,0
608,279,2018,sep,0,0
609,279,2018,oct,0,0
610,279,2018,nov,0,0


In [31]:
res_df.to_csv('bbc_word_and_show_counts.csv')

In [4]:
for bbc_id in bbc_ids:
    rows = []
    shows = []
    for year in years:
        data_path = '../data/bbc/{}/{}/transcripts'.format(bbc_id, year)
        for month in months:
            csv_file = os.path.join(data_path, '{} {}-{} {}.csv'.format(prefix[bbc_id], month, year, bbc_id))
            
            if not os.path.exists(csv_file):
                continue
            df = pd.read_csv(csv_file)
            df = df.drop(['Unnamed: 0', 'Has Transcript', 'Unavailable link', 'Unavailable reason'], axis=1)
            df['word_count'] = df['Transcript'].apply(lambda x: len(x.split()))
            shows.extend(list(df['Program Name'].unique()))

    shows = list(set(shows))
    
    res_df = pd.DataFrame(shows, columns=['shows'])
    display(res_df)
    res_df.to_csv('bbc_{}_shows.csv'.format(bbc_id))

Unnamed: 0,shows
0,Joins BBC News
1,BBC News at One
2,BBC Weekend News
3,BBC London News
4,BBC News
5,Better In or Out?
6,BBC News at Six
7,BBC News Special
8,BBC News Special - Royal Engagement
9,BBC News at Ten


Unnamed: 0,shows
0,Channel 4 News Special
1,The Fake News Show
2,Man Down
3,Channel 4 News
4,Channel 4 News Summary
5,The Simpsons (2004)


Unnamed: 0,shows
0,Parliament Under Attack: 5 News Special
1,5 News Special - Manchester: A City United
2,Brexit Begins: A 5 News Special
3,Shock Election: A 5 News Special
4,5 News Weekend
5,Angelina Ballerina (2009)
6,Paris Attacks - A 5 News Special
7,The Queen - A Five News Tonight Special
8,5 News in Film
9,Bananas in Pyjamas


Unnamed: 0,shows
0,New: And Here Is the News...
1,The Chancellor’s Budget 2017: An ITV News Special
2,ITV News: Paris Terror Attacks
3,Election 2017: ITV News Special
4,New: After the News
5,ITV News - Election 2015
6,ITV News: The Budget
7,ITV News Special: The New Prime Minister
8,ITV Lunchtime News
9,ITV News - VE Day 70


Unnamed: 0,shows
0,Sky News at Nine
1,Sky News at 11 with Mark Longhurst
2,Sky News at 11
3,The Queen
4,Sky World Review and Business Report
...,...
110,Press Preview
111,Sportsline
112,Sky Midnight News
113,Sunrise with Sarah-Jane Mee and Jonathan Samuels
