Skip to content

Commit

Permalink
code translation extractor added
Browse files Browse the repository at this point in the history
  • Loading branch information
kristianmandrup committed Apr 18, 2012
1 parent 62c3e67 commit af6d679
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 0 deletions.
34 changes: 34 additions & 0 deletions tasks/extract_code_translations.rb
@@ -0,0 +1,34 @@
require 'nokogiri'

# html = open('http://www.biblegateway.com/passage/?search=Mateo1-2&version=NVI')
# doc = Nokogiri::HTML(html.read)

# Extracts, for every row of an HTML language table, the code found in the
# row's first <td> together with the texts of the remaining <td> cells, then
# prints the list of codes followed by the resulting Hash.

# Path of the saved HTML page that contains the table.
# NOTE(review): the path is resolved against the current working directory,
# and the commit that introduced this script adds `language_extract.html`
# rather than `language_table.html` — confirm which file is intended.
link = 'language_table.html'

# File.read closes the file handle when done (the former `open(link).read`
# never closed it) and, unlike Kernel#open, does not treat a leading "|" in
# the path as a command to execute.
doc = Nokogiri::HTML(File.read(link, encoding: 'UTF-8'), nil, 'utf-8')

# Superscript annotations such as "[citation needed]" are markup noise, not
# part of a language name; remove them before reading cell texts.
doc.css('sup').remove

# Maps each code to the labels of the other cells in its row, in column
# order. Cells are selected by position (everything after the first <td>)
# rather than by comparing text with the code, so a label that happens to be
# spelled like the code is kept and the columns stay aligned.
translations = doc.css('tr').each_with_object({}) do |row, result|
  code_cell, *label_cells = row.css('td').to_a
  next unless code_cell # rows made only of <th> cells carry no code

  result[code_cell.content.strip] = label_cells.map { |cell| cell.content.strip }
end

codes = translations.keys
puts codes

puts translations
73 changes: 73 additions & 0 deletions tasks/language_extract.html
@@ -0,0 +1,73 @@
<table class="wikitable sortable">
<tr>
<th>ISO 639-1</th>
<th>bg</th>
<th>cs</th>
<th>da</th>
<th>de</th>
<th>el</th>
<th>en</th>
<th>es</th>
<th>et</th>
<th>fi</th>
</tr>
<tr>
<td>ab</td>
<td>абхазки</td>
<td>abchazština</td>
<td>Abkhazian<sup class="Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources from December 2010">citation needed</span></a></i>]</sup></td>
<td>Abchasisch</td>
<td>αμπχαζικά</td>
<td><a href="/wiki/Abkhazian_language" title="Abkhazian language" class="mw-redirect">Abkhazian</a></td>
<td>abjazio</td>
<td>abhaasi</td>
<td>abhaasi</td>
</tr>
<tr>
<td>af</td>
<td>африканс</td>
<td>afrikánština</td>
<td>Afrikaans</td>
<td>Afrikaans</td>
<td>αφρικάνς</td>
<td><a href="/wiki/Afrikaans_language" title="Afrikaans language" class="mw-redirect">Afrikaans</a></td>
<td>afrikaans</td>
<td>afrikaani</td>
<td>afrikaans</td>
</tr>
<tr>
<td>an</td>
<td>арагонски</td>
<td>aragonština</td>
<td>Aragonesisk</td>
<td>Aragonesisch</td>
<td>γλώσσα της Aragon</td>
<td><a href="/wiki/Aragonese_language" title="Aragonese language">Aragonese</a></td>
<td>aragonés</td>
<td>aragoni</td>
<td>aragonia</td>
</tr>
<tr>
<td>ar</td>
<td>арабски</td>
<td>arabština</td>
<td>Arabisk</td>
<td>Arabisch</td>
<td>αραβικά</td>
<td><a href="/wiki/Arabic_language" title="Arabic language">Arabic</a></td>
<td>árabe</td>
<td>araabia</td>
<td>arabia</td>
</tr>
<tr>
<td>as</td>
<td>асамски</td>
<td>ásámština</td>
<td>Assamesisk</td>
<td>Assamesisch</td>
<td>ασαμέζικα</td>
<td><a href="/wiki/Assamese_language" title="Assamese language">Assamese</a></td>
<td>asamés</td>
<td>assami</td>
<td>assami</td>
</tr>
</table>

0 comments on commit af6d679

Please sign in to comment.