# Telugu (tel)

Manually scrape Telugu podcasts from HTML pages.

In [1]:
from pyquery import PyQuery as pq
import parse
import dateutil
import sys

import sys; sys.path.append("../src")
import index
import fetch_rss_feed

## The Hindu

The Hindu publishes a list of its editorials translated into Telugu and recorded as audio. We fetch them here.

In [2]:
def date_from_title(t):
    """Extract the publication date embedded in an episode title.

    Titles are expected to look like
    ``... Editorial, December 5, 2012. Translated by ...``. The pattern only
    anchors on ``orial, `` so that misspellings of the word before the comma
    (e.g. ``Eidtorial``) still match.

    Args:
        t: The episode title text.

    Returns:
        A ``datetime.date`` for the date found in the title.

    Raises:
        AttributeError: If the title does not match the expected pattern
            (``parse.parse`` yields ``None`` in that case).
        ValueError: If the captured text cannot be interpreted as a date.
    """
    # ``import dateutil`` on its own does not reliably expose the ``parser``
    # submodule on older python-dateutil releases. Importing it here keeps a
    # missing submodule from surfacing as an AttributeError, which callers
    # would mistake for a badly formatted title.
    import dateutil.parser

    match = parse.parse(
        '{prefix}orial, {date} Translated by{suffix}',
        t
    )
    # The captured date keeps the sentence's trailing full stop; drop it.
    date_s = match.named['date'].rstrip('.')
    return dateutil.parser.parse(date_s).date()

In [3]:
def fetch_thehindu_episodes(max_episodes=20):
    """Scrape The Hindu's Telugu editorial listing and index new episodes.

    Walks the first ``max_episodes`` entries of the listing page, resolves the
    audio link for each one and saves a sample record for every episode that
    is not already known to the index. Progress is printed per entry.

    An entry is skipped when:
      * it has no link or no audio link,
      * its page URL or media URL is already in the index,
      * no date can be extracted from its title, or
      * the checksum of its staged audio is already in the index.

    Args:
        max_episodes: Maximum number of listing entries to examine.
    """
    url = 'http://www.thehindu.com/opinion/editorial/article2695389.ece'
    doc = pq(url)
    # NOTE(review): presumably a collection of known URLs and checksums; it is
    # only ever used for membership tests here.
    seen = index.scan()

    for i, div in enumerate(doc('div.stdArtpageRelCt')[:max_episodes]):
        title = div.text_content().strip()

        # Guard against listing entries that carry no usable anchor instead of
        # aborting the whole run with an AttributeError/KeyError.
        anchor = div.find('.//a')
        href = anchor.attrib.get('href') if anchor is not None else None
        if not href:
            print('{0}. {1} (no link, skipping)'.format(i + 1, title))
            continue

        media_url = fetch_rss_feed.get_audio_link(href)
        if not media_url:
            print('{0}. {1} (no audio, skipping)'.format(i + 1, title))
            continue

        if href in seen or media_url in seen:
            print('{0}. {1} (skipping)'.format(i + 1, title))
            continue

        print('{0}. {1}'.format(i + 1, title))
        try:
            date = date_from_title(title)
        except (AttributeError, ValueError):
            print('SKIPPING: bad title')
            continue

        _, checksum = index.stage_audio(media_url, 'tel')
        if checksum in seen:
            print('SKIPPING: checksum already in index')
            # Actually skip the entry; without this the duplicate would be
            # announced as skipped and then saved anyway.
            continue

        sample = {
            'language': 'tel',
            'title': title,
            'source_name': 'The Hindu: Podcasts in Telugu',
            'source_url': href,
            'media_urls': [media_url],
            'date': str(date),
            'checksum': checksum,
        }
        index.save(sample)

In [4]:
# Scrape The Hindu's listing using the default limit (max_episodes=20).
fetch_thehindu_episodes()

1. Where there is a will [Telugu: Manasunte maargamundi] Editorial, December 5, 2012. Translated by V.R. Subrahmanyam, presented by Srinivas Ghantasala (skipping)
2. Memorialising Thackeray [Telugu: Thackeray smaraka chihnam erpatu vivadam] Eidtorial, December 4, 2012. Translated by V.R. Subrahmanyam, presented by Srinivas Ghantasala (skipping)
3. Mining politics for a second lease [Telugu: Koththa jeevitham kosam gannu rajakiyalu] Editorial, December 3, 2012. Translated by V.R. Subrahmanyam, presented by Srinivas Ghantasala (skipping)
4. Mamata scores, for Manmohan [Telugu: Manmohan vijayam kosam Mamata saayam] Editorial, November 24, 2012. Translated by V.R. Subrahmanyam, presented by Srinivas Ghantasala (skipping)
5. Sealing the cracks [Telugu: Pagulla moosivetha] Editorial, November 23, 2012. Translated by V.R. Subrahmanyam, presented by Srinivas Ghantasala (skipping)
6. Congress in the time of Rahul [Telugu: Rahul Gandhi saaradyamlo Congress] Editorial, November 20, 2012. Translat

## Telugu One Radio

In [5]:
def get_host_name(doc):
    """Return the host's name as shown in an archive page heading.

    Takes the text of the second element matched by ``.heading > a``, which
    is expected to read ``<name> ( Archives )``, and returns the name part
    with surrounding whitespace removed.
    """
    heading_links = doc('.heading > a')
    heading_text = heading_links[1].text
    match = parse.parse('{} ( Archives )', heading_text)
    name = match.fixed[0]
    return name.strip()

def fetch_telugu_one_mp3(url):
    """Resolve the absolute MP3 URL for a Telugu One Radio episode page.

    Loads the page at ``url``, pulls the path passed to
    ``so.addVariable('file', ...)`` out of its markup and prefixes it with
    the site's address.
    """
    site_root = 'http://www.teluguoneradio.com'
    page = pq(url)
    markup = page.html()
    # Three captures: text before the call, the file path, text after it.
    pieces = parse.parse("{}so.addVariable('file', '{}');{}", markup)
    mp3_path = pieces.fixed[1]
    return site_root + mp3_path

def fetch_telugu_one_host(url, max_episodes=20):
    """Index new episodes listed on a Telugu One Radio host archive page.

    Reads up to ``max_episodes`` rows of the archive table at ``url``. Each
    row whose episode page URL is not yet in the index has its MP3 resolved
    and staged; a sample record is saved unless the staged audio's checksum
    is already in the index. One progress line is printed per row.
    """
    site_prefix = 'http://www.teluguoneradio.com/'
    known = index.scan()

    page = pq(url)
    presenter = get_host_name(page)

    # The first table row is dropped (presumably a header row - confirm).
    episode_rows = page('.archives_content_main1 tr')[1:]
    for number, row in enumerate(episode_rows[:max_episodes], 1):
        # Each row is expected to hold exactly three cells: date, name, link.
        date_cell, name_cell, link_cell = row.findall('.//td')
        episode_anchor = link_cell.find('.//a')
        source_url = site_prefix + episode_anchor.attrib['href']
        aired_on = dateutil.parser.parse(date_cell.text_content()).date()
        title = name_cell.text_content().strip()

        if source_url in known:
            print('{0}. {1} (skipping)'.format(number, title))
            continue

        print('{0}. {1}'.format(number, title))
        # Flush so the progress line is visible before the slower steps below.
        sys.stdout.flush()

        media_url = fetch_telugu_one_mp3(source_url)
        staged = index.stage_audio(media_url, 'tel')
        _, checksum = staged
        if checksum in known:
            print('SKIPPING: checksum already in index')
            continue

        index.save({
            'language': 'tel',
            'source_name': 'Telugu One Radio: {0}'.format(presenter),
            'title': title,
            'date': str(aired_on),
            'source_url': source_url,
            'media_urls': [media_url],
            'checksum': checksum,
        })

# Scrape the archive page for host_id=62 using the default limit (max_episodes=20).
fetch_telugu_one_host('http://www.teluguoneradio.com/archiveshostallprogrames.php?host_id=62')

1. Manalo Mana Mata (skipping)
2. Manalo Mana Mata (skipping)
3. Manalo Mana Mata (skipping)
4. Manalo Manamata (skipping)
5. Manalomanamata (skipping)
6. Manalo Mana Mata (skipping)
7. Manalo Mana Mata (skipping)
8. Manalomanamata (skipping)
9. Manalomanamata (skipping)
10. Manalo Mana Mata (skipping)
11. Manalomanamata (skipping)
12. Manalo Mana Mata (skipping)
13. Manalomanamata (skipping)
14. Manalomanamata (skipping)
15. Manalomanamata (skipping)
16. Manalomanamata (skipping)
17. Manalomanamata (skipping)
18. Manalomanamata (skipping)
19. manalo mana mata (skipping)
20. Manalo Manamata (skipping)
