In [106]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET
import boto3
s3_resource = boto3.resource('s3')

# Podcast Audio (MP3) Retrieval

We want to retrieve as many podcasts as possible in mp3 format and then transcribe them. This is step 1.

In [14]:
read_data = pd.read_csv("data/cbc/train.csv")
#read_data = pd.read_csv("data/radio-canada/train.csv")

In [5]:
read_data.sample(10)

Unnamed: 0,sessionId,contentId,eventId
721147,user-230758,1.478305,0
582543,user-185753,1.477905,0
1839304,user-596317,1.478343,0
1913867,user-620782,1.478333,8
1796829,user-582469,1.478185,2
771376,user-247288,1.478407,19
1462119,user-473156,1.477673,1
1806987,user-585865,1.478358,1
297978,user-093144,1.478386,2
1203871,user-388793,1.478235,4


In [13]:
print('There are', read_data.size, 'reads')

There are 6217836 reads


In [11]:
unique_content = read_data['contentId'].unique()
print('There are', unique_content.size, 'unique articles')

There are 38059 unique articles


# Retrieving Podcast 'Shows'

In [109]:
def mp3_url_from_rss(rss_url):
    rss = requests.get(url=rss_url)
    root = ET.fromstring(rss.content)
    channel = root[0]
    # Structure is
    #   <xml><channel>...<item></item><item></item>...</channel></xml>
    mp3s = []
    for episode in channel.findall('item'):
        #print(episode.find('title').text)
        mp3_url = episode.find('enclosure').attrib.get('url')
        #print('url: ', mp3_url)
        mp3s.append(mp3_url) 
    return mp3s

def download_mp3(url, name):
    response = requests.get(url)
    with open(name, 'wb') as f:
        f.write(response.content)

def upload_to_s3(bucket, local_filename, s3_filename):
    s3_resource.Object(bucket, s3_filename).upload_file(
        Filename=local_filename)

In [111]:
upload_to_s3('cbc-ds-podcasts', '0.mp3', '0.mp3')

In [107]:
mp3s = mp3_url_from_rss('https://www.cbc.ca/podcasting/includes/2050.xml')
download_mp3(mp3s[0], '0.mp3')

In [119]:
tmp_mp3_name = 'temp.mp3'
bucket = 'cbc-ds-podcasts'
def process_shows(shows, just_names=False):
    all_uploaded = []
    for show in shows.json():
        name = show['title']
        print(">    NAME", name)
        
        # Try to find rss xml and download mp3s from there
        try:
            rss_url = show['rssUrl']
            print('RSS Request: ', rss_url)
            mp3s = mp3_url_from_rss(rss_url)
            for mp3 in mp3s:
                mp3_name = mp3.split('/')[-1]
                if not just_names:
                    # Download mp3
                    download_mp3(mp3, tmp_mp3_name)
                
                    # Upload mp3 to s3
                    print("Uploading ", mp3_name)
                    upload_to_s3(bucket, tmp_mp3_name, mp3_name)
                all_uploaded.append(mp3_name)
                
        except:
            print('WARN: No RSS found...')
            
    return all_uploaded
    

In [115]:
shows_api = 'https://api-gw.radio-canada.ca/audio/v1/shows'
shows = requests.get(url=shows_api)
uploaded = process_shows(shows)

>    NAME The 180
WARN: No RSS found...
>    NAME 2050: Degrees of Change
RSS Request:  https://www.cbc.ca/podcasting/includes/2050.xml
Uploading  2050-3qWkhA7S-20181122.mp3
Uploading  2050_20170608_55493.mp3
Uploading  2050_20170608_79050.mp3
Uploading  2050_20170608_54364.mp3
Uploading  2050_20170608_99144.mp3
Uploading  2050_20170608_68298.mp3
Uploading  2050_20170608_49800.mp3
Uploading  2050_20170601_85504.mp3
>    NAME Afternoon Drive
WARN: No RSS found...
>    NAME Airplay
RSS Request:  https://www.cbc.ca/podcasting/includes/airplayyk1.xml
Uploading  airplayyk1-UBw3UAR4-20190131.mp3
Uploading  airplayyk1-SArX1n8D-20190117.mp3
Uploading  airplayyk1-p8wChbFz-20181206.mp3
Uploading  airplayyk1-pBHS6ZhU-20181204.mp3
Uploading  airplayyk1-81vD8znp-20181204.mp3
Uploading  airplayyk1-a5WzV9FF-20181128.mp3
Uploading  airplayyk1-ZOq5ozC1-20181122.mp3
Uploading  airplayyk1-NllmW08x-20181115.mp3
Uploading  airplayyk1-tXX5DdU4-20181107.mp3
Uploading  airplayyk1-DVtasZGW-20181102.mp3
Uploadi

# Names of MP3 Files in S3

The following MP3s have been copied to S3 for storage where they can then be transcribed.

In [120]:
uploaded = process_shows(shows, just_names=True)
print(uploaded)

>    NAME The 180
WARN: No RSS found...
>    NAME 2050: Degrees of Change
RSS Request:  https://www.cbc.ca/podcasting/includes/2050.xml
>    NAME Afternoon Drive
WARN: No RSS found...
>    NAME Airplay
RSS Request:  https://www.cbc.ca/podcasting/includes/airplayyk1.xml
>    NAME Alberta at Noon
RSS Request:  http://www.cbc.ca/podcasting/includes/calgwildrose.xml
>    NAME All in a Day
RSS Request:  https://www.cbc.ca/podcasting/includes/ottallinaday.xml
>    NAME All in a Weekend Montreal
WARN: No RSS found...
>    NAME All Points West
WARN: No RSS found...
>    NAME Alone: A Love Story
RSS Request:  https://www.cbc.ca/podcasting/includes/alone.xml
>    NAME Ambushed
RSS Request:  https://www.cbc.ca/podcasting/includes/ambushed.xml
['2050-3qWkhA7S-20181122.mp3', '2050_20170608_55493.mp3', '2050_20170608_79050.mp3', '2050_20170608_54364.mp3', '2050_20170608_99144.mp3', '2050_20170608_68298.mp3', '2050_20170608_49800.mp3', '2050_20170601_85504.mp3', 'airplayyk1-UBw3UAR4-20190131.mp3', 'a

# Transcribe MP3s

See `transcribe.ipynb`

In [116]:
transcribe = boto3.client('transcribe')

## Output transcriptions to S3

Here we've already started transcribing some so lets make a basic training set with what we've got so far

In [151]:
import codecs

In [152]:
def output_transcript_s3(bucket, job_names):
    bucket = bucket
    output = []
    
    for name in job_names:
        #print('Uploading ', name)
        try:
            status = transcribe.get_transcription_job(TranscriptionJobName=name)
            status_pretty = status['TranscriptionJob']['TranscriptionJobStatus']
            if status_pretty not in ['COMPLETED']:
                print('not completed, skipping...')
                continue
            transcript_uri = status['TranscriptionJob']['Transcript']['TranscriptFileUri']
            response = requests.get(transcript_uri)
            json = response.json()
            transcript = json['results']['transcripts'][0]['transcript']
            s3_name = name+'.txt'
            tmpname = 'temp.txt'
            with codecs.open(tmpname, 'w', encoding='utf8') as f:
                f.write(transcript)
            # upload fname to s3
            upload_to_s3(bucket, tmpname, s3_name)
            output.append(s3_name)
        except:
            print("na")
    return output

In [153]:
output_transcript_s3('cbc-ds-transcripts', uploaded)

['2050-3qWkhA7S-20181122.mp3.txt',
 '2050_20170608_55493.mp3.txt',
 '2050_20170608_79050.mp3.txt',
 '2050_20170608_54364.mp3.txt',
 '2050_20170608_99144.mp3.txt',
 '2050_20170608_68298.mp3.txt',
 '2050_20170608_49800.mp3.txt',
 '2050_20170601_85504.mp3.txt',
 'airplayyk1-UBw3UAR4-20190131.mp3.txt',
 'airplayyk1-SArX1n8D-20190117.mp3.txt',
 'airplayyk1-p8wChbFz-20181206.mp3.txt',
 'airplayyk1-pBHS6ZhU-20181204.mp3.txt',
 'airplayyk1-81vD8znp-20181204.mp3.txt',
 'airplayyk1-a5WzV9FF-20181128.mp3.txt',
 'airplayyk1-ZOq5ozC1-20181122.mp3.txt',
 'airplayyk1-NllmW08x-20181115.mp3.txt',
 'airplayyk1-tXX5DdU4-20181107.mp3.txt',
 'airplayyk1-DVtasZGW-20181102.mp3.txt',
 'airplayyk1-SBVtzKbx-20181026.mp3.txt',
 'airplayyk1-myJmPcgZ-20181025.mp3.txt',
 'airplayyk1-E5vcgz72-20181012.mp3.txt',
 'airplayyk1-CSJNmcFm-20181010.mp3.txt',
 'airplayyk1-5hO1p1fy-20181002.mp3.txt',
 'airplayyk1-cJdktsqY-20181002.mp3.txt',
 'airplayyk1-Gp7vMB2D-20180919.mp3.txt',
 'calgwildrose-wULhtPRR-20190131.mp3.txt',
 