From bfe96d68ae403424731fa63115586964bfc0f9ca Mon Sep 17 00:00:00 2001 From: Jari Bakken Date: Sun, 10 Mar 2013 16:44:04 +0100 Subject: [PATCH] Find discrepancies between minutes and vote data (154) --- tools/154_read_vote.rb | 277 ++++++++++++++++++++++++++++++++--------- 1 file changed, 216 insertions(+), 61 deletions(-) diff --git a/tools/154_read_vote.rb b/tools/154_read_vote.rb index e9b116a5af37..1e4e2cd6c64f 100755 --- a/tools/154_read_vote.rb +++ b/tools/154_read_vote.rb @@ -1,69 +1,125 @@ #!/usr/bin/env ruby + require 'pp' require 'csv' require 'json' +require 'time' +require 'pathname' +require 'pry' -class IssueFinder - def self.instance - @instance ||= new - end +class VoteReader + attr_reader :identifier - def self.find(kartnr, saknr) - instance.index[[kartnr, saknr]] - end + class << self + include Enumerable - def initialize - @data ||= CSV.parse(File.read(("./rawdata/Fra NSD/154_saksopplysninger.csv"))) - end + def print_counts + by_minutes.each do |minutes, votes| + puts "#{minutes}" - COLUMNS = %w[ - period - date - time - session - room - kartnr - saknr - votnr - issue_type - vote_type - committee - issue_reference - issue_register - topic - president - president_party - internal_comment - link - ] + votes.sort_by { |e| e.time }.each do |vote| + puts "\t #{vote.time.to_s.ljust(40)}: #{vote.counts.inspect}" + end + end + end - def index - @index ||= @data.inject({}) do |mem, var| - issue = {} + def by_minutes + groups = Hash.new { |hash, key| hash[key] = [] } - var.map(&:strip).each_with_index do |col, idx| - issue[COLUMNS.fetch(idx).to_sym] = col + each do |votes| + votes.each { |vote| groups[vote.minutes] << vote } end - if issue[:time] =~ /^0:/ - issue[:time] = "0#{issue[:time]}" + groups.sort_by { |m, v| m } + end + + def find_errors + cache = Pathname.new(File.expand_path('.minutes-cache')) + cache.mkdir unless cache.exist? 
+ + vote_count = 0 + error_count = 0 + + by_minutes.each do |minutes, votes| + vote_count += votes.size + + local_minutes = cache.join(File.basename(minutes)) + local_text = cache.join(File.basename(minutes).gsub(".pdf", ".txt")) + + unless local_text.exist? + unless local_minutes.exist? + ok = system "curl -s -o #{local_minutes.to_s} #{minutes}" + ok or raise "unable to download #{minutes}" + end + + ok = system "java -jar ~/Downloads/pdfbox-app-1.7.1.jar ExtractText #{local_minutes} #{local_text.to_s}" + ok or raise "could not convert #{local_minutes} to text" + end + + lines = local_text.read.split("\n") + minute_votes = {} + current_vote = nil + + lines.each_with_index do |line, index| + case line + when "Vo t e r i n g :" + current_vote = [] + when /Voteringsutskrift kl\. (\d{2}\.\d{2}\.\d{2})/ + next unless current_vote + + minute_votes[$1] = current_vote.join(" ") + current_vote = nil + when /enstemmig bifalt/ + current_vote = nil + else + current_vote << line if current_vote + end + end + + votes.sort_by { |e| e.time }.each do |vote| + mvote = minute_votes[vote.time.strftime("%H.%M.%S")] + counts = vote.counts + + if mvote + nums = mvote.scan(/\d+/).map { |e| e.to_i } + unless nums.include?(counts[:for]) && nums.include?(counts[:against]) + error_count += 1 + + if ENV['HTML'] + puts %{ + + #{vote.time} + #{counts[:for]} + #{counts[:against]} + #{mvote} + + } + else + puts "FEIL: #{vote.time} | for=#{counts[:for]}, mot=#{counts[:against]} | #{mvote}" + end + end + end + end end - votes = mem[[issue[:kartnr], issue[:saknr]]] ||= [] - votes << issue + puts "#{error_count} / #{vote_count} = #{error_count * 100 / vote_count.to_f}%" + end - mem + def each(&blk) + Dir['./rawdata/stortinget-voteringer-154/*.154'].each do |path| + if File.basename(path) =~ /SK(\d+)S(\d+)/ + yield VoteReader.new($1, $2).results + else + raise "bad path: #{path.inspect}" + end + end end end end -end -class VoteReader - attr_reader :identifier - - def initialize(kartnr, saksnr) + def 
initialize(kartnr, saknr) @kartnr = kartnr - @saksnr = saksnr - @identifier = "SK#{kartnr}S#{saksnr}" + @saknr = saknr + @identifier = "SK#{kartnr}S#{saknr}" end def results @@ -76,25 +132,42 @@ def results end end - result.map do |time, results| - Vote.new(time, results, issue_for(time)) + result = result.map do |time, results| + begin + Vote.new(time, results, issue_for(time)) + rescue NoIssueFoundError => ex + # No issue found for this timestamp: allocate a bare Vote (skipping initialize) only to tally counts for the STDERR message; this branch evaluates to nil, which result.compact removes. + vote = Vote.allocate + vote.instance_variable_set("@results", results) + counts = vote.counts + + STDERR.puts "#{ex.message}: #{counts.inspect}" + end end + + result.compact end private + class NoIssueFoundError < StandardError + end + def issue_for(time) - issue = issues[time] or raise "no issue found for #{time}, found: #{issues.keys}" + issue = issues[time] + unless issue + raise NoIssueFoundError, "no issue found for kartnr=#{@kartnr} saknr=#{@saknr} @ #{time}, found: #{issues.keys}\n #{issues.values.map { |e| e.first[:link] }.uniq}" + end if issue.size == 1 issue.first else - raise "multiple issues for timestamp: #{time.inspect}" + raise "multiple issues for kartnr=#{@kartnr} saknr=#{@saknr} @ #{time}" end end def issues - @issues ||= IssueFinder.find(@kartnr, @saksnr).group_by { |data| data[:time] } + @issues ||= IssueFinder.find(@kartnr, @saknr).group_by { |data| data[:time] } end def representatives @@ -134,12 +207,32 @@ def votes end class Vote + attr_reader :time + def initialize(time, results, issue) @time = time @results = results @issue = issue - check_handicap_seat + unless time == issue[:time] + raise "time #{time.inspect} doesn't match issue: #{issue.inspect}" + end + + @time = Time.parse(issue.values_at(:date, :time).join(' ')) + + fix_handicap_seat + end + + def minutes + @issue.fetch(:link) + end + + def saknr + @issue.fetch(:saknr) + end + + def kartnr + 
@issue.fetch(:kartnr) end def counts @@ -163,7 +256,7 @@ def counts ) end - def print + def print(include_votes = true) puts "Tidspunkt : #{@time.inspect}" puts "For : 
#{counts[:for]}" puts "Mot : #{counts[:against]}" @@ -178,7 +271,7 @@ def print private - def check_handicap_seat + def fix_handicap_seat s62 = @results.find { |e| e[:seat] == 62 } s172 = @results.find { |e| e[:seat] == 172 } @@ -199,13 +292,75 @@ def check_handicap_seat end end +class IssueFinder + def self.instance + @instance ||= new + end + + def self.find(kartnr, saknr) + instance.index[[kartnr, saknr]] + end + + def initialize + @data ||= CSV.parse(File.read(("./rawdata/Fra NSD/154_saksopplysninger.csv"))) + end + + COLUMNS = %w[ + period + date + time + session + room + kartnr + saknr + votnr + issue_type + vote_type + committee + issue_reference + issue_register + topic + president + president_party + internal_comment + link + ] + + def index + @index ||= @data.inject({}) do |mem, var| + issue = {} + + var.map(&:strip).each_with_index do |col, idx| + issue[COLUMNS.fetch(idx).to_sym] = col + end + + if issue[:time] =~ /^0:/ + issue[:time] = "0#{issue[:time]}" + end + + votes = mem[[issue[:kartnr], issue[:saknr]]] ||= [] + votes << issue + + mem + end + end +end + + if __FILE__ == $0 if ARGV.size == 2 - kartnr, saksnr = ARGV - results = VoteReader.new(kartnr, saksnr).results + kartnr, saknr = ARGV + results = VoteReader.new(kartnr, saknr).results results.first.print - else - # TODO: read all - abort "USAGE: #{$0} " + elsif ARGV.size == 1 + cmd = ARGV.first + case cmd + when 'print-counts' + VoteReader.print_counts + when 'find-errors' + VoteReader.find_errors + else + raise "unknown command: #{cmd.inspect}" + end end -end \ No newline at end of file +end