Permalink
Browse files

fixed get_date_path and media logic

  • Loading branch information...
1 parent 4a403a4 commit fbae0bc9954c5b9010c14582e5ab5f1d4796d26c @nueces nueces committed Apr 13, 2012
Showing with 15 additions and 16 deletions.
  1. +15 −16 src/transmogrify/nitf/xmlsource.py
@@ -26,12 +26,13 @@ def get_text(dom, subelemet, attribute=None):
def get_date_path(dom, subelemet, attribute):
- """ Return a path ibased on a date value normalized into ISO8601
+ """ Return a path 'YYYY/MM/DD' based on a date value normalized into
+ ISO-8601
Note: Only works with the ISO-8601 basic format (YYYYMMDD..., no separators).
"""
- text = get_text(dom, subelemet, attribute)
+ text = get_text(dom, subelemet, attribute)
# We only need the YYYYMMDD part from the string
- date =
+ return "/".join([text[:4], text[4:6], text[6:8]])
class XMLSource(object):
@@ -45,8 +46,6 @@ def __init__(self, transmogrifier, name, options, previous):
def __iter__(self):
for data in self.previous:
- images = []
- videos = []
item = {'id': '',
'path': '',
'title': '',
@@ -57,7 +56,10 @@ def __iter__(self):
'genre': '',
'section': '',
'urgency': '',
- 'location': ''}
+ 'location': '',
+ 'media': {'image': [],
+ 'video': []}
+ }
dom = etree.fromstring(data)
head = dom.find('head')
@@ -70,25 +72,22 @@ def __iter__(self):
'tobject.property.type')
item['section'] = get_text(head, 'pubdata', 'position.section')
item['urgency'] = get_text(head, 'docdata/urgency', 'ed-urg')
- item['location'] = ", ".join([
- get_text(head, 'docdata/evloc', 'city'),
- get_text(head, 'docdata/evloc', 'state-prov'),
- get_text(head, 'docdata/evloc', 'iso-cc')])
+ item['location'] = get_text(body, 'body.head/dateline/location')
item['subtitle'] = get_text(body, 'body.head/hedline/hl2')
item['description'] = get_text(body, 'body.head/abstract')
item['byline'] = get_text(body, 'body.head/byline/person')
for elem in list(body.find('body.content')):
- if elem.tag == 'media':
+ if elem.tag == 'media' and elem.get('media-type') == 'image':
image = dict(elem.find('media-reference'))
- image['alt'] = get_text(elem, 'media-caption')
- images.append(image)
+ image['media-caption'] = get_text(elem, 'media-caption')
+ item['media']['image'].append(image)
- elif elem.tag == 'video':
+ elif elem.tag == 'media' and elem.get('media-type') == 'video':
video = dict(elem.find('media-reference'))
- video['alt'] = get_text(elem, 'media-caption')
- videos.append(video)
+ video['media-caption'] = get_text(elem, 'media-caption')
+ item['media']['video'].append(video)
else: # other tags are considered part of the body text and
# should be preserved.

0 comments on commit fbae0bc

Please sign in to comment.