Permalink
Browse files

Remove HTML tags from a CSV file.

  • Loading branch information...
1 parent 7465352 commit ddb591f06d875ff03be433c0958f195f909e96ca Jenn committed Jul 26, 2013
Showing with 29 additions and 0 deletions.
  1. +29 −0 textmining/remove_html_tags/remove_html.rb
@@ -0,0 +1,29 @@
+# encoding: utf-8
+
+require 'rubygems'
+require 'sanitize'
+require 'CSV'
+
# Lookup table mapping escaped character codes to the accented letters they
# stand for.
#
# The keys are single-quoted, so each one is the literal four-character text
# backslash, "x", and two hex digits (e.g. \xe1) -- NOT the single byte 0xE1.
# NOTE(review): presumably the input data contains such escaped text; confirm
# against a sample of the CSV.
terms = {
  '\xe1' => 'á',
  '\xe9' => 'é',
  '\xed' => 'í',
  '\xfa' => 'ú',
  '\xf3' => 'ó'
}.freeze
+
# Replaces every occurrence of each key of +terms+ inside +word+ with the
# value mapped to that key.
#
# Replacements are applied one key at a time, in the hash's iteration order,
# and each key is matched as a literal substring (String#gsub with a String
# pattern), not as a regular expression. The receiver is never mutated.
#
# @param word [String] text to process
# @param terms [Hash{String => String}] substring => replacement pairs
# @return [String] the text with all replacements applied; +word+ itself when
#   +terms+ is empty
def removeaccent(word, terms)
  terms.reduce(word) do |text, (code, replacement)|
    text.gsub(code, replacement)
  end
end
+
# Read the source CSV row by row, strip markup from two columns with
# Sanitize.clean, expand the escaped accent codes, and write the result as a
# two-column CSV (title, content).
#
# The block form of File.open closes the output file when the block exits,
# including when a row raises, so the handle is not leaked.
File.open("lanacion.com.ar.csv.data.clean", "w") do |file_clean|
  CSV.foreach("lanacion.com.ar.csv.data", encoding: 'UTF-8') do |row|
    # Input column 1 is the title and column 0 the content; the output row
    # is written in (title, content) order.
    title, content = [row[1], row[0]].map do |cell|
      # to_s turns a missing column (nil) into an empty string before it
      # reaches the sanitizer.
      cleaned = Sanitize.clean(cell.to_s).force_encoding('UTF-8')
      removeaccent(cleaned, terms)
    end

    file_clean.write([title, content].to_csv)
  end
end
+
+

0 comments on commit ddb591f

Please sign in to comment.