# Data collection - BBC website
# Part 1: Episode information

In [1]:
import pandas as pd
import json

import urllib.request
from urllib.request import urlopen
from lxml.html import parse, fromstring
pd.set_option('display.max_rows', 10)

In [None]:
## The packages used on Google Colab

!pip install git+https://github.com/openai/whisper.git 
!sudo apt update && sudo apt install ffmpeg
from google.colab import drive

## Get links of episodes

In [2]:
# Episode guide (index) page of BBC programme b006qnmr. get_episodes starts
# here and appends each page's rel='next' href to this URL to reach the next page.
BASEURL = 'https://www.bbc.co.uk/programmes/b006qnmr/episodes/guide'

def get_episodes(url, total=111):
    """Yield one ``{'names', 'links'}`` dict per episode listed in the guide.

    Starting at *url*, each guide page is fetched and parsed, its episodes
    are yielded, and the page's ``rel='next'`` href (appended to ``BASEURL``)
    is followed to the next page.

    Args:
        url: Address of the first guide page to fetch.
        total: Page-count threshold. Paging stops once the number of pages
            fetched exceeds this value, so at most ``total + 1`` pages are
            retrieved. Defaults to 111, the value that used to be hard-coded.

    Yields:
        dict: ``'names'`` is the episode title text and ``'links'`` the href
        of an anchor inside a ``programme__body`` div; the two XPath result
        lists are paired positionally.
    """
    n = 0
    while True:
        n += 1
        print(f"Retrieving page {n}...")
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url) as response:
            tree = parse(response)
        names = [
            e.text_content()
            for e in tree.xpath("//span[@class='programme__title gamma']")
        ]
        links = [
            e.attrib["href"]
            for e in tree.xpath("//div[@class='programme__body']//a")
        ]
        for name, link in zip(names, links):
            yield {'names': name, 'links': link}

        # A page without a rel='next' anchor (presumably the last one) yields
        # an empty list; indexing it with [0] used to raise IndexError here
        # instead of ending the iteration.
        next_href = tree.xpath("//a[@rel='next']/@href")
        if n > total or not next_href:
            print("No more pages found.")
            break
        url = BASEURL + str(next_href[0])

## Get information of episodes

In [3]:
def _first_match(tree, query, pick=0):
    """Return the XPath result of *query* at position *pick*, or '' if absent."""
    try:
        return tree.xpath(query)[pick]
    except IndexError:
        return ''


def get_info(url):
    """Scrape the summary fields of a single episode page.

    Args:
        url: Address of the episode page.

    Returns:
        dict: with the keys
            ``'time'`` - ``title`` attribute of the broadcast-time div,
            ``'duration'`` - last non-blank text node of the meta paragraph,
            ``'book'`` / ``'luxury'`` - first title listed under the
            "Book Choice" / "Luxury Choice" heading,
            ``'favourite'`` - first artist listed under the "Favourite" heading,
            ``'availibility'`` - True when the page has a download-buttons div,
            ``'number'`` - count of ``span.artist`` elements on the page.
        Every text field falls back to ``''`` when the page lacks it.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        tree = parse(response)

    broadcast_time = _first_match(
        tree, "//div[@class='broadcast-event__time beta']/@title"
    )

    # Previously the only field that raised IndexError when missing; it now
    # degrades to '' like the others.
    duration = _first_match(
        tree,
        "//div[@class='map__intro']/p[@class='episode-panel__meta']"
        "/text()[normalize-space()]",
        pick=-1,
    ).strip()

    book = _first_match(
        tree,
        "//h3[contains(text(), 'Book Choice')]/"
        "following-sibling::ul//h4//span[@class='title']/text()",
    )

    luxury = _first_match(
        tree,
        "//h3[contains(text(), 'Luxury Choice')]/"
        "following-sibling::ul//h4//span[@class='title']/text()",
    )

    favourite = _first_match(
        tree,
        "//h3[contains(text(), 'Favourite')]/following-sibling::ul"
        "//h4[@class='gamma no-margin']/span[@class='artist']/text()",
    )

    # Availability check. xpath() is used instead of ElementTree.find():
    # find() does not take absolute '//' paths, and lxml only accepted the old
    # call by rewriting the path and emitting a FutureWarning.
    has_download = bool(tree.xpath("//div[@class='buttons__download']"))

    artist_count = len(tree.xpath("//span[@class='artist']"))

    # The 'availibility' spelling is kept: later cells read that column name.
    return {
        'time': broadcast_time,
        'duration': duration,
        'book': book,
        'luxury': luxury,
        'favourite': favourite,
        'availibility': has_download,
        'number': artist_count,
    }


## Get information of songs

In [4]:
def get_songs(url):
    """Return one ``(artist, song, album, label)`` tuple per music segment.

    Each ``<li>`` whose class contains ``segments-list__item--music`` becomes
    exactly one row. Within a segment the first match of each field is used,
    and a field with no match is filled with the placeholder ``' '``.

    The fields used to be collected into four flat lists across all segments
    and zipped at the end. Any segment with more (or fewer) than one match for
    a field then shifted every following row, and ``zip`` silently dropped
    the surplus. Building the row per segment keeps the fields aligned.

    Args:
        url: Address of the episode page.

    Returns:
        list[tuple[str, str, str, str]]: rows in document order.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        tree = parse(response)

    # Segment-relative queries, in output column order.
    field_queries = (
        ".//span[@class='artist']",             # artist
        ".//p[@class='no-margin']/span",        # song
        ".//div[@class='segment__track']//em",  # album
        ".//abbr[@title='Record Label']",       # label
    )

    rows = []
    for block in tree.xpath("//li[contains(@class, 'segments-list__item--music')]"):
        row = []
        for query in field_queries:
            matches = block.xpath(query)
            row.append(matches[0].text_content() if matches else ' ')
        rows.append(tuple(row))
    return rows


## Retrieve the data

### About episodes

In [None]:
# Scrape every episode page and store its details as one JSON object per line.
with open("Guests.json", mode="w") as out_file:
    for episode in get_episodes(BASEURL):
        title, href = episode['names'], episode['links']
        print(f"Processing {title}...")
        record = get_info(href)
        # Attach the guest name and page link after the scraped fields.
        record.update(guests=title, links=href)
        out_file.write(json.dumps(record) + "\n")

### About songs

In [None]:
# Scrape the track list of every episode and store one JSON array per track:
# the scraped song fields followed by the guest name and the episode link.
with open("Songs.json", mode="w") as out_file:
    for episode in get_episodes(BASEURL):
        title, href = episode['names'], episode['links']
        print(f"Processing {title}...")
        out_file.writelines(
            json.dumps([*track, title, href]) + "\n"
            for track in get_songs(href)
        )

## Save as files

In [None]:
# Load the line-delimited JSON in Guests.json into a DataFrame (one row per line).
df = pd.read_json("Guests.json", lines=True)
df

In [None]:
# Export the episode table to CSV.
df.to_csv('Guests.csv')

In [None]:
# Load the song rows (each line of Songs.json is a JSON array) and name the
# six positional columns.
df2 = pd.read_json("Songs.json", lines=True)
df2.columns = ['artists', 'songs', 'albums', 'labels', 'guests', 'links']
df2

In [None]:
# Export the song table to CSV as UTF-8, keeping the row index as a column.
df2.to_csv('Songs.csv', encoding='utf-8', index=True)

# Part 2: Download Episodes

## Determine which episodes to download

In [None]:
# Copy the row labels into a regular column; the download loop uses it to
# name the audio files.
df['index'] = df.index
# Parse the scraped broadcast timestamps.
# NOTE(review): rows scraped with an empty 'time' presumably become NaT - confirm.
df['time'] = pd.to_datetime(df['time'])
# Broadcast year, used below to group the episodes.
df['year'] = df['time'].dt.year

In [None]:
# Per year, count the episodes flagged as available that list at least one artist.
df[(df['availibility'] == True) & (df['number'] > 0)].year.value_counts()

In [None]:
# Sort first, then build the mask from the *sorted* frame so the boolean
# index is already aligned with the rows it selects. A mask taken from the
# unsorted `df` made pandas reindex it and emit a
# "Boolean Series key will be reindexed to match DataFrame index" UserWarning.
sorted_df = df.sort_values(by='time')
is_usable = (sorted_df['availibility'] == True) & (sorted_df['number'] > 0)
# Keep the five earliest usable episodes of each year.
filtered_df = sorted_df[is_usable].groupby('year').head(5)
filtered_df

In [None]:
# Save the selection of episodes to download.
filtered_df.to_csv('Guests_filter.csv')

## Retrieve the audio data

In [None]:
# XPath queries for the download href, tried in order: the primary download
# button first, then the popup list entry used as a fallback.
DOWNLOAD_LINK_QUERIES = (
    "//a[@class='link-complex br-linkinvert buttons__download__link']/@href",
    "//a[@class='link-complex popup__list__item island--squashed br-secondary-bg-ontext br-secondary-bg-onbg--hover br-secondary-link-ontext--hover']/@href",
)

for link, index in zip(filtered_df['links'], filtered_df['index']):
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(link) as response:
        tree = parse(response)

    # Take the last match of the first query that finds anything. This
    # replaces a bare `except:` whose fallback could itself raise IndexError
    # and abort all remaining downloads.
    file_link = None
    for query in DOWNLOAD_LINK_QUERIES:
        hrefs = tree.xpath(query)
        if hrefs:
            file_link = hrefs[-1]
            break
    if file_link is None:
        print(f"No download link found for {link}; skipping.")
        continue

    # Generate a unique file name from the row's original index.
    file_name = f"file_{index}.mp3"
    # Download the file; "https:" is prepended to the scraped href.
    urllib.request.urlretrieve(f"https:{file_link}", file_name)
    print(f"Downloaded {file_name}")

# Transcribe the audio data to text data

> This part was done on Google Colab, so the code below is just copied and pasted from there. The idea is to set a path on my Google Drive for saving the finished text data, and to use the base model of Whisper to transcribe the audio data.

In [None]:
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

In [None]:
# Transcribe one audio file with Whisper's base model.
# NOTE(review): 'file_xxx.mp3' looks like a placeholder for a downloaded episode file - confirm.
!whisper 'file_xxx.mp3' --model base

In [None]:
# Copy a transcript text file into the mounted Google Drive folder.
!cp file_1784.txt /content/drive/MyDrive