# Malayalam (mal)

Manually scraping Malayalam podcasts from HTML pages.

In [1]:
from pyquery import PyQuery as pq
import parse
import dateutil
import sys

import sys; sys.path.append("../src")
import index
import fetch_rss_feed

In [2]:
# Language code recorded in each saved sample's 'language' field and passed to
# index.stage_audio() when staging audio.
LANGUAGE = 'mal'

## The Hindu

The Hindu has a list of editorials translated into Malayalam audio. We fetch them here.

In [3]:
def date_from_title(t):
    """Extract the publication date embedded in an editorial title.

    The title is matched against the pattern
    ``'{prefix}orial, {date} Translated {suffix}'``. The captured ``date``
    text has any trailing '.' removed and is then parsed with dateutil.

    Args:
        t: Title text, e.g.
            ``'... Editorial, October 13, 2012. Translated by ...'``.

    Returns:
        A ``datetime.date`` for the date found in the title.

    Raises:
        AttributeError: if the title does not match the pattern
            (``parse.parse`` then returns ``None``).
        ValueError: if dateutil cannot interpret the captured date text.
    """
    # A plain `import dateutil` does not guarantee that the `parser` submodule
    # is loaded. Without this import, `dateutil.parser` can raise
    # AttributeError, which callers catching AttributeError would mistake for
    # a non-matching title.
    import dateutil.parser

    match = parse.parse(
        '{prefix}orial, {date} Translated {suffix}',
        t
    )
    # `match` is None for a non-matching title; the attribute access below
    # then raises AttributeError, as documented above.
    date_s = match.named['date'].rstrip('.')
    return dateutil.parser.parse(date_s).date()

In [4]:
# Page that is loaded and scanned for `div.stdArtpageRelCt` episode entries.
THEHINDU_URL = 'http://www.thehindu.com/opinion/editorial/article2692451.ece'
# Stored as 'source_name' in every saved sample.
# NOTE(review): 'Malayam' looks like a misspelling of 'Malayalam'; it is left
# unchanged here because the string is written into saved samples -- confirm
# whether existing index entries depend on it before correcting.
THEHINDU_NAME = 'The Hindu: Podcasts in Malayam'

def fetch_thehindu_episodes(max_episodes=20):
    """Fetch The Hindu's audio editorials and save new ones to the index.

    Loads the listing page at ``THEHINDU_URL`` and walks up to
    ``max_episodes`` ``div.stdArtpageRelCt`` entries. For each entry the
    article link and its audio link are resolved; entries that are already
    known (by article URL, media URL or audio checksum), that have no audio,
    or whose title carries no parseable date are skipped. Every remaining
    entry is staged via ``index.stage_audio`` and saved via ``index.save``.
    One progress line is printed per entry.

    Args:
        max_episodes: Maximum number of listing entries to examine.

    Returns:
        None.
    """
    doc = pq(THEHINDU_URL)
    # Presumably a collection of already-indexed URLs and checksums; it is
    # only used for membership tests here -- TODO confirm.
    seen = index.scan()

    for i, div in enumerate(doc('div.stdArtpageRelCt')[:max_episodes]):
        sys.stdout.flush()
        title = div.text_content().strip()

        # An entry without an anchor (or without an href) must not abort the
        # whole run; skip it like any other unusable entry.
        anchor = div.find('.//a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            print('{0}. {1} (no link, skipping)'.format(i + 1, title))
            continue

        media_url = fetch_rss_feed.get_audio_link(href)
        if not media_url:
            print('{0}. {1} (no audio, skipping)'.format(i + 1, title))
            continue

        if href in seen or media_url in seen:
            print('{0}. {1} (skipping)'.format(i + 1, title))
            continue

        print('{0}. {1}'.format(i + 1, title))
        try:
            date = date_from_title(title)
        except (AttributeError, ValueError):
            # Title does not follow the expected "Editorial, <date> Translated"
            # layout, or the date text could not be parsed.
            print('SKIPPING: bad title')
            continue

        sample = {
            'language': LANGUAGE,
            'title': title,
            'source_name': THEHINDU_NAME,
            'source_url': href,
            'media_urls': [media_url],
            'date': str(date),
        }
        # Make the title line visible before staging starts producing output.
        sys.stdout.flush()

        staged = index.stage_audio(media_url, LANGUAGE)
        sample['checksum'] = staged.checksum
        if staged.checksum in seen:
            print('SKIPPING: checksum already in index')
            # Actually skip: without this the duplicate was saved anyway.
            continue

        index.save(sample)

In [5]:
# Run the scraper with its default max_episodes; progress is printed per entry.
fetch_thehindu_episodes()

1. The rape of reason [Malayalam: Yukthiye manabhangam cheyyumpol] Editorial, October 15, 2012. Translated by Rasmi Binoy, presented by A Correspondent (skipping)
2. A patch to call their own [Malayalam: Swantamennu vilikkaan oridam]. Editorial, October 13, 2012. Translated by Rasmi Binoy, Presented by A Correspondent
   downloading http://www.thehindu.com/multimedia/archive/01236/Malayalam_Patch_to_1236786a.mp3
3. The drive for exclusivism [Malayalam: Thangalkku vendi maathram]. Editorial, October 11, 2012. Translated by Rasmi Binoy, Presented by A Correspondent
   downloading http://www.thehindu.com/multimedia/archive/01236/Malayalam_2_1236785a.mp3
4. (Mis)treating Ms. Gandhi (Malayalam: Apahasyamaya rashtreeya thantram). Editorial, October 5, 2012. Translated by Rasmi Binoy, Presented by A Correspondent
   downloading http://www.thehindu.com/multimedia/archive/01228/MalayalamPodcast_1228990a.mp3
5. The status of Malayalam [Malayalam: Malayalayhinte padavi]. Editorial, October 4, 201