#!/usr/bin/env ruby
require 'rubygems'
require 'wukong/script'
#
# Use the Stanford NLP parser to split a piece of text into sentences,
# each returned as an array of token strings.
#
# @example
#   SentenceParser.split("Beware the Jabberwock, my son! The jaws that bite, the claws that catch! Beware the Jubjub bird, and shun The frumious Bandersnatch!")
#   # => [["Beware", "the", "Jabberwock", ",", "my", "son", "!"], ["The", "jaws", "that", "bite", ",", "the", "claws", "that", "catch", "!"], ["Beware", "the", "Jubjub", "bird", ",", "and", "shun", "The", "frumious", "Bandersnatch", "!"]]
#
class SentenceParser
  # Lazily require and instantiate the Stanford DocumentPreprocessor, so a
  # worker only pays the JVM startup cost the first time it needs the parser.
  def self.processor
    return @processor if @processor
    require 'stanfordparser'
    @processor = StanfordParser::DocumentPreprocessor.new
  end

  # Returns an array of sentences, each an array of token strings
  # (the Java token objects are converted to Ruby strings with #to_s).
  def self.split line
    processor.getSentencesFromString(line).map{|s| s.map{|w| w.to_s } }
  end
end
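# (Assumption: the stanfordparser gem is a Ruby bridge to the Java Stanford
# Parser, so each worker needs the parser jars and a working Java bridge
# installed locally.)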
#
# Takes one DBpedia abstract (one N-Triples record) per line,
# splits it into sentences, and emits one record per sentence.
#
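# A worked example (hypothetical record, in the short-abstracts dump format;
# the predicate URI varies by dump):
#
#   input line:
#     <http://dbpedia.org/resource/Jabberwocky> <http://www.w3.org/2000/01/rdf-schema#comment> "Beware the Jabberwock, my son! The jaws that bite, the claws that catch!"@en .
#
#   recordize keeps the resource name as the title and strips the RDF
#   wrapper; process then yields one flat record per sentence:
#     ["Jabberwocky", 0, "Beware", "the", "Jabberwock", ",", "my", "son", "!"]
#     ["Jabberwocky", 1, "The", "jaws", "that", "bite", ",", "the", "claws", "that", "catch", "!"]
#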
class WordNGrams < Wukong::Streamer::LineStreamer
  def recordize line
    line.strip!
    # Pull the resource name out of the N-Triples subject URI and strip the
    # subject/predicate preamble, leaving the quoted abstract text.
    line.gsub!(%r{^<http://dbpedia.org/resource/([^>]+)> <[^>]+> \"}, '') ; title = $1
    # Strip the trailing close-quote, language tag and statement terminator.
    line.gsub!(%r{\"@en \.},'')
    [title, SentenceParser.split(line)]
  end

  def process title, sentences
    # Emit one flat record per sentence: title, sentence index, then tokens.
    sentences.each_with_index do |words, idx|
      yield [title, idx, words].flatten
    end
  end
end
Wukong.run WordNGrams, nil, :partition_fields => 1, :sort_fields => 2
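# NB: :partition_fields => 1 should partition reducer input on the first
# output field (the title), and :sort_fields => 2 should sort on the first
# two fields, so each title's sentences arrive grouped and in index order.
# (This reflects Wukong's mapping onto Hadoop streaming's key-field options;
# verify against your Wukong version.)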
# ---------------------------------------------------------------------------
#
# Run Time:
#
# Job Name: dbpedia_abstract_to_sentences.rb---/data/rawd/encyc/dbpedia/dbpedia_dumps/short_abstracts_en.nt---/data/rawd/encyc/dbpedia/dbpedia_parsed/short_abstract_sentences
# Status: Succeeded
# Started at: Fri Jan 28 03:14:45 UTC 2011
# Finished in: 41mins, 50sec
# 3 machines: m1.xlarge master, 2 c1.xlarge workers; the c1.xlarge workers were hitting some over-memory issues
#
#   Counter                        Reduce           Total
#   SLOTS_MILLIS_MAPS                   0      10,126,566
#   Launched map tasks                  0              15
#   Data-local map tasks                0              15
#   SLOTS_MILLIS_REDUCES                0           1,217
#   HDFS_BYTES_READ         1,327,116,133   1,327,116,133
#   HDFS_BYTES_WRITTEN      1,229,841,020   1,229,841,020
#   Map input records           3,261,096       3,261,096
#   Spilled Records                     0               0
#   Map input bytes         1,326,524,800   1,326,524,800
#   SPLIT_RAW_BYTES                 1,500           1,500
#   Map output records          9,026,343       9,026,343
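#   (9,026,343 output sentences / 3,261,096 input abstracts ≈ 2.8 sentences
#   per short abstract)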
#
# Job Name: dbpedia_abstract_to_sentences.rb---/data/rawd/encyc/dbpedia/dbpedia_dumps/long_abstracts_en.nt---/data/rawd/encyc/dbpedia/dbpedia_parsed/long_abstract_sentences
# Status: Succeeded
# Started at: Fri Jan 28 03:23:08 UTC 2011
# Finished in: 41mins, 11sec
# 3 machines: m1.xlarge master, 2 c1.xlarge workers; the c1.xlarge workers were hitting some over-memory issues
#
#   Counter                        Reduce           Total
#   SLOTS_MILLIS_MAPS                   0      19,872,357
#   Launched map tasks                  0              29
#   Data-local map tasks                0              29
#   SLOTS_MILLIS_REDUCES                0           5,504
#   HDFS_BYTES_READ         2,175,900,769   2,175,900,769
#   HDFS_BYTES_WRITTEN      2,280,332,736   2,280,332,736
#   Map input records           3,261,096       3,261,096
#   Spilled Records                     0               0
#   Map input bytes         2,174,849,644   2,174,849,644
#   SPLIT_RAW_BYTES                 2,533           2,533
#   Map output records         15,425,467      15,425,467
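#   (15,425,467 output sentences / 3,261,096 input abstracts ≈ 4.7 sentences
#   per long abstract)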