In [2]:
# Imports
import glob
import json
import os

import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Constants, variables and important data

# Glob pattern for the per-topic entries inside the BBC data directory.
ROOT = "./../../data/BBC/*"
folders = glob.glob(ROOT)

# Flat list of every .html path found directly inside each matched folder,
# kept in the order glob returns them.
files = []
for folder in folders:
    files.extend(glob.glob(f'{folder}/*.html'))

In [20]:
# Internal names of the values read from the embedded iPlayer JSON state.
# Every one of them defaults to '' when a page carries no such JSON.
_EPISODE_FIELDS = (
    'title', 'image', 'synops_small', 'synops_med', 'synops_long',
    'category', 'channel', 'language', 'release_date', 'duration_sec',
)


def _meta_content(soup, attrs):
    """Return the ``content`` of the first ``<meta>`` tag matching ``attrs``, or ''."""
    tag = soup.find('meta', attrs=attrs)
    return tag['content'] if tag is not None else ''


def _load_redux_state(script_text):
    """Parse the JSON value assigned to ``__IPLAYER_REDUX_STATE__`` in ``script_text``."""
    payload = script_text.partition("__IPLAYER_REDUX_STATE__ = ")[2]
    # raw_decode reads exactly one JSON value and ignores whatever follows it,
    # so a trailing ';', newline or other whitespace no longer breaks parsing.
    state, _end = json.JSONDecoder().raw_decode(payload.lstrip())
    return state


def _episode_fields(state):
    """Pick the episode/version values of interest out of the parsed JSON state."""
    episode = state['episode']
    synopses = episode['synopses']
    version = state['versions'][0]  # only the first listed version is used
    return {
        'title': episode['title'],
        'image': episode['images']['standard'] if 'images' in episode else '',
        'synops_small': synopses.get('small', ''),
        'synops_med': synopses.get('medium', ''),
        'synops_long': synopses.get('large', ''),
        'category': episode.get('labels', {}).get('category', ''),
        'channel': episode['masterBrand']['id'],
        # NOTE: the 'language' value is the version's 'guidance' entry.
        'language': version['guidance'],
        'release_date': version.get('firstBroadcast', ''),
        'duration_sec': version['duration']['seconds'],
    }


def extract_data(files):
    """Extract one metadata record per HTML page.

    Paths containing './data/show' are skipped (they still consume an index,
    so ids follow the position in ``files``).

    Args:
        files: iterable of paths to HTML files.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``description``,
        ``image``, ``keywords``, ``synopses_small``, ``synops_med``,
        ``synops_long``, ``category``, ``channel``, ``language``,
        ``release_date``, ``duration_sec`` and ``topic``. ``description`` and
        ``keywords`` come from ``<meta>`` tags; ``topic`` is the name of the
        directory that directly contains the file; the remaining values come
        from the JSON in the element with id ``tvip-script-app-store`` and are
        '' when that element is absent.

    Raises:
        KeyError, IndexError: the embedded JSON lacks a mandatory entry
            (``episode``, ``title``, ``synopses``, ``masterBrand``,
            ``versions``, ``guidance`` or ``duration``).
        json.JSONDecodeError: the embedded state is not valid JSON.
    """
    data = []
    for index, file in enumerate(files):
        if './data/show' in file:
            continue

        # Close the handle deterministically instead of leaking it.
        # Pages are assumed to be UTF-8 encoded -- TODO confirm for all sources.
        with open(file, encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'html.parser')
        print(file)

        script = soup.find(id="tvip-script-app-store")
        if script is not None:
            episode = _episode_fields(_load_redux_state(script.get_text()))
        else:
            episode = dict.fromkeys(_EPISODE_FIELDS, '')

        # Key order is significant: it becomes the DataFrame column order.
        data.append({
            'id': index,
            'title': episode['title'],
            'description': _meta_content(soup, {'property': 'og:description'}),
            'image': episode['image'],
            'keywords': _meta_content(soup, {'name': 'keywords'}),
            'synopses_small': episode['synops_small'],
            'synops_med': episode['synops_med'],
            'synops_long': episode['synops_long'],
            'category': episode['category'],
            'channel': episode['channel'],
            'language': episode['language'],
            'release_date': episode['release_date'],
            'duration_sec': episode['duration_sec'],
            # Parent directory name, independent of how deep the data root is.
            'topic': os.path.basename(os.path.dirname(file)),
        })
    return data

In [13]:


# load html files


In [21]:
# Build one record (dict) per discovered HTML page; extract_data prints each path as it goes.
data = extract_data(files)

./../data/BBC/music/iplayer-episode-m000zp6r-bbc-proms-2021-last-night-of-the-proms-part-2.html
./../data/BBC/music/iplayer-episode-m000kqv6-port-series-5-episode-4.html
./../data/BBC/music/iplayer-episode-p0bn4w25-sit-down-stand-up-with-greg-james.html
./../data/BBC/music/iplayer-episode-b08jktzz-ceiliuradh-na-feile-padraig.html
./../data/BBC/music/iplayer-episode-m0014scx-songs-of-praise-edinburgh.html
./../data/BBC/music/iplayer-episode-m0010rtb-radio-1s-out-out-live-best-bits.html
./../data/BBC/music/iplayer-episode-m00142lm-big-night-of-musicals-by-the-national-lottery.html
./../data/BBC/music/iplayer-episode-m000x9rb-paul-weller-live-at-the-barbican.html
./../data/BBC/music/iplayer-episode-m000y2xd-reclaiming-amy.html
./../data/BBC/music/iplayer-episode-p02jppyv-julian-bream-masterclass-5-julian-bream-festival.html
./../data/BBC/music/iplayer-episode-b08n8hpk-wayfaring-stranger-with-phil-cunningham-series-1-episode-3.html
./../data/BBC/music/iplayer-episode-b0b61qfz-arctic-monkey

In [22]:
# Preview only the first few records; echoing the whole list floods the notebook output.
data[:3]

[{'id': 0,
  'title': 'BBC Proms',
  'description': 'Australian tenor Stuart Skelton is joined by Latvian accordionist Ksenija Sidorova.',
  'image': 'https://ichef.bbci.co.uk/images/ic/{recipe}/p09w0p59.jpg',
  'keywords': 'BBC, iPlayer, TV, BBC Proms, 2021: Last Night of the Proms, Part 2',
  'synopses_small': 'Australian tenor Stuart Skelton is joined by Latvian accordionist Ksenija Sidorova.',
  'synops_med': 'Australian tenor Stuart Skelton is joined by Latvian accordionist Ksenija Sidorova and the BBC Symphony Orchestra conducted by Sakari Oramo for the climax of the Proms season. ',
  'synops_long': 'Katie Derham hosts continued live coverage from the Royal Albert Hall, at the climax of the world’s greatest classical music festival.  Latvian accordionist Ksenija Sidorova and Australian tenor Stuart Skelton join Sakari Oramo and the BBC Symphony Orchestra and BBC Singers for a jubilant programme including music by Florence Price, Latin flavours from Piazzolla and Troilo, English 

In [23]:
# Turn the list of records into a DataFrame, using each record's 'id' value as the row index.
df = pd.DataFrame.from_records(data, index='id')

In [7]:
# Write the table to bbc_data.csv in the parent of the current working directory.
# Fields are separated by ';' (presumably because the text columns contain commas -- confirm).
df.to_csv('./../bbc_data.csv', sep=';')

In [24]:
# Display the frame; the rendered output below is the truncated head/tail view.
df

Unnamed: 0_level_0,title,description,image,keywords,synopses_small,synops_med,synops_long,category,channel,language,release_date,duration_sec,topic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,BBC Proms,Australian tenor Stuart Skelton is joined by L...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, BBC Proms, 2021: Last Night ...",Australian tenor Stuart Skelton is joined by L...,Australian tenor Stuart Skelton is joined by L...,Katie Derham hosts continued live coverage fro...,Music,bbc_radio_three,False,9pm 11 Sep 2021,5247,music
1,Port,"Previously unseen music from the series Port, ...",https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Port, Series 5: Episode 4","Previously unseen music from the series Port, ...",Julie Fowlis a’ lìbhrigeadh ceòl bhon t-sreath...,Bidh Druthag Bheag Eile a’ tarraing ri chèile ...,Music,bbc_alba,False,9 Jul 2020,1037,music
2,"Sit Down, Stand Up with Greg James",Tom Holland and Greg James cold call celebrity...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Sit Down, Stand Up with Greg...",Tom Holland and Greg James cold call celebrity...,Tom Holland and Greg James cold call celebrity...,,Entertainment,bbc_radio_one,False,10 Feb 2022,611,music
3,Ceiliúradh na Féile Pádraig,John Toal and Pauline Scanlon introduce an eve...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Ceiliúradh na Féile Pádraig",John Toal and Pauline Scanlon introduce an eve...,John Toal and Pauline Scanlon introduce an eve...,John Toal and Pauline Scanlon introduce an eve...,Music,bbc_two_northern_ireland_digital,False,17 Mar 2017,3538,music
4,Songs of Praise,Claire McCollum explores Edinburgh and shares ...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Songs of Praise, Edinburgh",Claire McCollum explores Edinburgh and shares ...,Claire McCollum goes on a Christian heritage t...,Claire McCollum explores Edinburgh and discove...,Music,bbc_one,False,1:15pm 20 Feb 2022,2044,music
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3798,Murder Trial: The Disappearance of Margaret Fl...,The trial continues as Cairney and Jones stand...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Murder Trial: The Disappeara...",The trial continues as Cairney and Jones stand...,Second part of the documentary about the trial...,As the murder trial that gripped a nation cont...,Documentary,bbc_two,Contains some upsetting scenes.,10pm 8 Jan 2020,3540,documentaries
3799,"Paul Merson: Football, Gambling and Me",Footballer Paul Merson sets out to understand ...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Paul Merson: Football, Gambl...",Footballer Paul Merson sets out to understand ...,Former footballer Paul Merson sets out to unde...,"Over the past 35 years, former Arsenal and Eng...",Sport,bbc_one,Contains some strong language.,9pm 11 Oct 2021,3480,documentaries
3800,Avicii: True Stories,Avicii: True Stories is Tim Bergling’s own sto...,https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Avicii: True Stories",Avicii: True Stories is Tim Bergling’s own sto...,Avicii: True Stories is Tim Bergling’s own sto...,"In 2006, Tim Bergling was just a regular teena...",Documentary,bbc_three,Contains strong language.,20 Apr 2019,5766,documentaries
3801,Heaven Made,"On the Isle of Wight, Brother Matthew, a monk ...",https://ichef.bbci.co.uk/images/ic/{recipe}/p0...,"BBC, iPlayer, TV, Heaven Made, Series 1: Episo...","On the Isle of Wight, Brother Matthew, a monk ...","At Quarr Abbey on the Isle of Wight, Brother M...",The Benedictine nuns of Kylemore Abbey on the ...,Documentary,bbc_one,False,10:30am 13 Feb 2022,3244,documentaries
