# Tamil (tam)

Manually scrape Tamil podcasts from HTML pages.

In [1]:
import sys

import dateutil
import dateutil.parser  # `import dateutil` alone does not load the parser submodule
import parse
from pyquery import PyQuery as pq

# Make the project-local modules in ../src importable.
import sys; sys.path.append("../src")
import index
import fetch_rss_feed

In [2]:
# Language code for Tamil; attached to every saved sample and passed to
# index.stage_audio when staging downloaded audio.
LANGUAGE = 'tam'

## The Hindu

The Hindu publishes a list of its editorials translated into Tamil and recorded as audio. We fetch them here.

In [3]:
def date_from_title(t):
    """Extract the publication date embedded in an episode title.

    Titles are expected to look like
    ``'... Editorial, December 8, 2012. Translated and presented by ...'``;
    the text between ``'orial, '`` and ``' Translated '`` is taken as the date.

    Args:
        t: Episode title text.

    Returns:
        The ``.date()`` of whatever ``dateutil.parser.parse`` yields for the
        extracted date text.

    Raises:
        ValueError: If the title does not match the expected pattern.
            ``dateutil.parser.parse`` may raise its own error when the
            extracted text is not a recognisable date.
    """
    match = parse.parse(
        '{prefix}orial, {date} Translated {suffix}',
        t
    )
    if match is None:
        # parse.parse() returns None when the pattern does not match. Fail
        # with a clear message instead of an incidental AttributeError.
        raise ValueError('no date found in title: {0!r}'.format(t))

    # Drop the full stop that closes the date ("December 8, 2012.").
    date_s = match.named['date'].rstrip('.')
    return dateutil.parser.parse(date_s).date()

In [4]:
def fetch_thehindu_episodes(max_episodes=20):
    """Stage and index Tamil audio editorials listed on The Hindu's website.

    Loads the editorial listing page and walks at most ``max_episodes`` of
    its ``div.stdArtpageRelCt`` entries. An entry is skipped when it has no
    link, no audio link, an unparseable title date, or when its page URL,
    media URL or staged-audio checksum is already present in the index.
    Every remaining entry is staged with ``index.stage_audio`` and recorded
    with ``index.save``. One progress line is printed per entry.

    Args:
        max_episodes: Maximum number of listing entries to examine.

    Returns:
        None.
    """
    url = 'http://www.thehindu.com/opinion/editorial/article2612204.ece'
    doc = pq(url)
    seen = index.scan()

    entries = doc('div.stdArtpageRelCt')[:max_episodes]
    for number, div in enumerate(entries, start=1):
        title = div.text_content().strip()

        # An entry without an <a href> cannot be followed; skip it rather
        # than crashing on a missing element. (`is not None`, because the
        # truthiness of an element is not a reliable presence test.)
        anchor = div.find('.//a')
        href = anchor.attrib.get('href') if anchor is not None else None
        if not href:
            print('{0}. {1} (no link, skipping)'.format(number, title))
            continue

        media_url = fetch_rss_feed.get_audio_link(href)
        if not media_url:
            print('{0}. {1} (no audio, skipping)'.format(number, title))
            continue

        if href in seen or media_url in seen:
            print('{0}. {1} (skipping)'.format(number, title))
            continue

        print('{0}. {1}'.format(number, title))
        try:
            date = date_from_title(title)
        except (AttributeError, ValueError):
            print('SKIPPING: bad title')
            continue

        # Push the progress line out before the staging step runs.
        sys.stdout.flush()

        staged = index.stage_audio(media_url, LANGUAGE)
        if staged.checksum in seen:
            # Same audio already indexed under a different URL: really skip
            # it instead of saving a duplicate record.
            print('SKIPPING: checksum already in index')
            continue

        index.save({
            'language': LANGUAGE,
            'title': title,
            'source_name': 'The Hindu: Podcasts in Tamil',
            'source_url': href,
            'media_urls': [media_url],
            'date': str(date),
            'checksum': staged.checksum,
        })

In [5]:
fetch_thehindu_episodes()

1. Reviving dead rivers [Tamil: Uyirkkodukkavendiya neerottamatra aarugal] Editorial, December 8, 2012. Translated and presented by V.B. Ganesan
   downloading http://www.thehindu.com/multimedia/archive/01292/Yamunai_1292506a.mp3
   downloading audio...
2. Who's afraid of moral defeat [Tamil: Nerumurai tholvi patri yaar kavalaip patrathu?] Editorial, December 07, 2012. Translated and presented by V.B. Ganesan
   downloading http://www.thehindu.com/multimedia/archive/01291/FDI_1291407a.mp3
   downloading audio...
3. A party in retreat [Tamil: Pin vangum oru katchi] Editorial, December 6, 2012. Translated and presented by T.A. Narasimhan
   downloading http://www.thehindu.com/multimedia/archive/01290/Pinvaangum_oru_kat_1290316a.mp3
   downloading audio...
4. Time to clean up our game [Tamil: Nam vilaiyattai suththapadutha vendiya neram] Editorial, December 5, 2012. Translated by A. Kumaresan, presented by V.B. Ganesan
   downloading http://www.thehindu.com/multimedia/archive/01289/Nam_Vi