Commit

Include headlines of wiki articles when building SourceArticle objects from them
itsjeyd committed Feb 19, 2012
1 parent f7fd85b commit 8692f7c
Showing 1 changed file with 28 additions and 22 deletions.
50 changes: 28 additions & 22 deletions local_apps/wt_articles/models.py
@@ -53,38 +53,44 @@ def __unicode__(self):

     def save(self, manually_splitting=False, source_sentences=()):
         if not self.sentences_processed and not manually_splitting:
             # Tokenize the HTML that is fetched from a wiki article
             sentences = list()
             segment_id = 0
             soup = BeautifulSoup(self.source_text)
             sentence_splitter = determine_splitter(self.language.code)
             # initial save for foreign key based saves to work
             # save should occur after sent_detector is loaded
             super(SourceArticle, self).save()
-            # find all paragraphs
-            for p in soup.findAll('p'):
-                p_text = ''.join([x.string for x in p.findAll(text=True)
-                                  if not re.match('[\[\]\\d]+$',
-                                                  x.string)])
-                # split all sentences in the paragraph
-
-                sentences = sentence_splitter(p_text.strip())
-                # TODO: remove bad sentences that were missed above
-                sentences = [s for s in sentences if \
-                             not re.match("^\**\[\d+\]\**$", s)]
-
-                for sentence in sentences:
-                    # Clean up bad spaces ( )
-                    sentence = sentence.replace(" ", " ")
-                    s = SourceSentence(article=self,
-                                       text=sentence,
-                                       segment_id=segment_id)
-                    segment_id += 1
-                    s.save()
-                s.end_of_paragraph = True
-                s.save()
+            for t in soup.findAll(re.compile('^[ph]')):
+                if re.match('p', t.name):
+                    p_text = ''.join([x.string for x in t.findAll(text=True)
+                                      if not re.match('[\[\]\\d]+$',
+                                                      x.string)])
+                    sentences = sentence_splitter(p_text.strip())
+                    for sentence in sentences:
+                        sentence = sentence.replace(" ", " ")
+                        s = SourceSentence(article=self,
+                                           text=sentence,
+                                           segment_id=segment_id)
+                        segment_id += 1
+                        s.save()
+                    s.end_of_paragraph = True
+                    s.save()
+                elif re.match('h', t.name):
+                    headline = t.findAll(attrs={'class': 'mw-headline'})
+                    print 'Found headline!'
+                    if headline:
+                        h = headline[0].string
+                    else:
+                        h = t.string
+                    s = SourceSentence(article=self,
+                                       text=h,
+                                       segment_id=segment_id)
+                    s.end_of_paragraph = True
+                    s.save()
+                    segment_id += 1

             self.sentences_processed = True

         else:
             for sentence in source_sentences:
                 sentence.save()
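
For reference, below is a minimal, self-contained sketch (not part of the commit) of the traversal the new code performs: walk every tag whose name starts with p or h, split paragraph text into sentences, and take the text of a heading's mw-headline span, falling back to the heading's own text. The sample HTML, the naive_splitter stand-in for determine_splitter, and the plain tuples standing in for SourceSentence rows are illustrative assumptions only; the sketch also uses the BeautifulSoup 4 / Python 3 API rather than the older BeautifulSoup import used in this repository.

    # Standalone sketch; WIKI_HTML and naive_splitter are invented for illustration.
    import re
    from bs4 import BeautifulSoup

    WIKI_HTML = """
    <h2><span class="mw-headline" id="History">History</span></h2>
    <p>The first sentence. The second sentence.<sup>[1]</sup></p>
    <h3>Plain heading without an mw-headline span</h3>
    <p>Another paragraph. It has two sentences.</p>
    """

    def naive_splitter(text):
        # Stand-in for determine_splitter(language_code): split on sentence-final periods.
        return [s.strip() + '.' for s in text.split('.') if s.strip()]

    soup = BeautifulSoup(WIKI_HTML, 'html.parser')
    segments = []      # (segment_id, text, end_of_paragraph) tuples instead of SourceSentence rows
    segment_id = 0

    # Visit every tag whose name starts with 'p' or 'h', as the commit does.
    for t in soup.find_all(re.compile('^[ph]')):
        if re.match('p', t.name):
            # Join the paragraph's text nodes, dropping bare citation fragments such as "[1]".
            p_text = ''.join(x for x in t.find_all(string=True)
                             if not re.match(r'[\[\]\d]+$', x))
            sentences = naive_splitter(p_text.strip())
            for i, sentence in enumerate(sentences):
                sentence = sentence.replace('\xa0', ' ')   # normalise non-breaking spaces
                end_of_paragraph = (i == len(sentences) - 1)
                segments.append((segment_id, sentence, end_of_paragraph))
                segment_id += 1
        elif re.match('h', t.name):
            # Prefer the text of the mw-headline span; fall back to the heading's own text.
            headline = t.find_all(attrs={'class': 'mw-headline'})
            h = headline[0].string if headline else t.get_text()
            segments.append((segment_id, h, True))
            segment_id += 1

    for seg in segments:
        print(seg)

One property of this approach worth noting: re.compile('^[ph]') matches any tag name beginning with p or h, so tags such as pre or hr would also be visited if they appeared in the article body; the commit appears to assume the fetched wiki HTML consists mainly of p and h2/h3 elements.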