# Marathi (mar)

## Filtered RSS feed

In [1]:
import feedparser
import itertools
import sys
import parse
import dateutil

sys.path.append('../src')
import index
import fetch_rss_feed

In [2]:
# Feed that is downloaded and then filtered for Marathi bulletins.
RSS_URL = "http://www.newsonair.com/reg.asp"
# Stored under 'source_name' in every saved sample.
SOURCE_NAME = "AIR Language Bulletins (Regional)"
# Language code passed to index.stage_audio and stored under 'language'.
LANGUAGE = "mar"

In [3]:
def iter_entries():
    """Yield the entries of the RSS_URL feed whose title mentions 'Marathi'.

    The feed is fetched and parsed with feedparser once iteration starts;
    matching entries are yielded in the order the feed lists them.
    """
    feed = feedparser.parse(RSS_URL)
    yield from (entry for entry in feed['entries']
                if 'Marathi' in entry['title'])

In [4]:
def parse_date(e):
    """Return the date embedded in a feed entry's summary.

    The summary is expected to look like ``<p>DATE</p><a href...``; the
    text between the ``<p>`` tags is handed to dateutil for parsing.

    Args:
        e: Feed entry (mapping) providing a 'summary' HTML string.

    Returns:
        The parsed date as a ``datetime.date``.

    Raises:
        ValueError: If the summary does not match the expected layout, or
            if dateutil rejects the extracted text as a date.
    """
    # A bare 'import dateutil' is not guaranteed to load the parser
    # submodule, so import it explicitly before using it.
    import dateutil.parser

    summary = e['summary']
    match = parse.parse('<p>{}</p><a href{}', summary)
    if match is None:
        # parse.parse returns None on a mismatch; fail with a clear message
        # instead of an AttributeError on None.
        raise ValueError(
            'unexpected summary layout, cannot find date: {!r}'.format(summary))

    # The feed separates date parts with HTML non-breaking-space entities.
    date_s = match.fixed[0].replace('&nbsp;', ' ')
    return dateutil.parser.parse(date_s).date()

In [5]:
# Ingest the first Marathi bulletins of the feed: skip anything whose media
# URL or audio checksum is already in the index, stage and save the rest.
MAX_ENTRIES = 20  # upper bound on feed entries processed per run

seen = index.scan()
entries = itertools.islice(iter_entries(), MAX_ENTRIES)
for i, e in enumerate(entries):
    title = e['title']
    media_url = fetch_rss_feed.detect_media_url(e, {})

    # First dedupe pass: by media URL, before any further work is done.
    if media_url in seen:
        print('{0}. {1} (skipping)'.format(i + 1, title), flush=True)
        continue

    print('{0}. {1}'.format(i + 1, title), flush=True)

    # Parse the date only for entries that are not skipped by URL, and
    # before staging so a malformed summary fails ahead of the audio work.
    date = parse_date(e)

    # Second dedupe pass: by checksum of the staged audio.
    staged = index.stage_audio(media_url, LANGUAGE)
    if staged.checksum in seen:
        print('   SKIPPING: checksum already present', flush=True)
        continue

    sample = {
        'title': title,
        'media_urls': [media_url],
        'source_url': media_url,
        'source_name': SOURCE_NAME,
        'language': LANGUAGE,
        'date': str(date),
        'checksum': staged.checksum,
    }
    index.save(sample)
    index.mark_as_seen(sample, seen)

1. Aurangabad : Marathi : 0650 hrs (skipping)
2. Aurangabad : Marathi : 1300 hrs (skipping)
3. Aurangabad : Marathi : 1725 hrs (skipping)
4. Mumbai : Marathi : 1035 hrs (skipping)
5. Mumbai : Marathi : 1345 hrs (skipping)
6. Mumbai : Marathi : 1815 hrs (skipping)
7. Mumbai : Marathi : 1900 hrs (skipping)
8. Nagpur : Marathi : 1845 hrs (skipping)
9. Pune : Marathi : 0710 hrs (skipping)
