From 0bc5ee213d09e20bd39675bc8649148f2ad6a2af Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Tue, 2 Apr 2019 10:24:54 +0100 Subject: [PATCH] Rewrite in modern style --- Gemfile | 20 ++++++--- Gemfile.lock | 102 ++++++++++++++++++++++++++----------------- lib/cabinet.rb | 109 ++++++++++++++++++++++++++++++++++++++++++++++ lib/politician.rb | 38 ---------------- scraper.rb | 14 ++---- 5 files changed, 187 insertions(+), 96 deletions(-) create mode 100644 lib/cabinet.rb delete mode 100644 lib/politician.rb diff --git a/Gemfile b/Gemfile index 6527119..57f2acf 100644 --- a/Gemfile +++ b/Gemfile @@ -1,15 +1,23 @@ # frozen_string_literal: true + # It's easy to add more libraries or choose different versions. Any libraries # specified here will be installed and made available to your morph.io scraper. # Find out more: https://morph.io/documentation/ruby +ruby '2.4.4' + source 'https://rubygems.org' git_source(:github) { |repo_name| "https://github.com/#{repo_name}.git" } -ruby '2.3.3' - -gem 'everypolitician', github: 'everypolitician/everypolitician-ruby' -gem 'pry' -gem 'rubocop' +gem 'rest-client' +gem 'scraped', github: 'everypolitician/scraped', branch: 'scraper-class' gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby', branch: 'morph_defaults' -gem 'wikisnakker', github: 'everypolitician/wikisnakker' +gem 'sqlite_magic', github: 'openc/sqlite_magic' + +group :quality do + gem 'rubocop' +end + +group :development do + gem 'pry' +end diff --git a/Gemfile.lock b/Gemfile.lock index 7635633..4dff380 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,19 +1,13 @@ GIT - remote: https://github.com/everypolitician/everypolitician-ruby.git - revision: 20582b1512358a0e8789ea289201e37ab5f085c2 + remote: https://github.com/everypolitician/scraped.git + revision: ecb23adeca95fba5356509d6445d528e212b3905 + branch: scraper-class specs: - everypolitician (0.20.0) - everypolitician-popolo (>= 0.8.0) + scraped (0.6.2) + field_serializer (>= 0.3.0) + nokogiri require_all -GIT - remote: https://github.com/everypolitician/wikisnakker.git - revision: 4e091cdc9619b6c12db8903075effef361071132 - specs: - wikisnakker (0.9.1) - require_all - yajl-ruby - GIT remote: https://github.com/openaustralia/scraperwiki-ruby.git revision: fc50176812505e463077d5c673d504a6a234aa78 @@ -23,50 +17,76 @@ GIT httpclient sqlite_magic +GIT + remote: https://github.com/openc/sqlite_magic.git + revision: 4df975eb4e9891de54f870077c83f63762af9bf9 + specs: + sqlite_magic (0.0.6) + sqlite3 + GEM remote: https://rubygems.org/ specs: - ast (2.3.0) - coderay (1.1.0) - everypolitician-popolo (0.8.0) - require_all - httpclient (2.6.0.1) - method_source (0.8.2) - parser (2.3.1.2) - ast (~> 2.2) - powerpack (0.1.1) - pry (0.10.1) + ast (2.4.0) + coderay (1.1.2) + domain_name (0.5.20180417) + unf (>= 0.0.5, < 1.0.0) + field_serializer (0.3.0) + http-cookie (1.0.3) + domain_name (~> 0.5) + httpclient (2.8.3) + jaro_winkler (1.5.2) + method_source (0.9.2) + mime-types (3.2.2) + mime-types-data (~> 3.2015) + mime-types-data (3.2018.0812) + mini_portile2 (2.4.0) + netrc (0.11.0) + nokogiri (1.10.1) + mini_portile2 (~> 2.4.0) + parallel (1.14.0) + parser (2.6.0.0) + ast (~> 2.4.0) + powerpack (0.1.2) + pry (0.12.2) coderay (~> 1.1.0) - method_source (~> 0.8.1) - slop (~> 3.4) - rainbow (2.1.0) - require_all (1.4.0) - rubocop (0.42.0) - parser (>= 2.3.1.1, < 3.0) + method_source (~> 0.9.0) + psych (3.1.0) + rainbow (3.0.0) + require_all (2.0.0) + rest-client (2.0.2) + http-cookie (>= 1.0.2, < 2.0) + mime-types (>= 1.16, < 4.0) + netrc (~> 0.8) + rubocop (0.65.0) + jaro_winkler (~> 1.5.1) + parallel (~> 1.10) + parser (>= 2.5, != 2.5.1.1) powerpack (~> 0.1) - rainbow (>= 1.99.1, < 3.0) + psych (>= 3.1.0) + rainbow (>= 2.2.2, < 4.0) ruby-progressbar (~> 1.7) - unicode-display_width (~> 1.0, >= 1.0.1) - ruby-progressbar (1.8.1) - slop (3.6.0) - sqlite3 (1.3.10) - sqlite_magic (0.0.3) - sqlite3 - unicode-display_width (1.1.1) - yajl-ruby (1.3.0) + unicode-display_width (~> 1.4.0) + ruby-progressbar (1.10.0) + sqlite3 (1.4.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.7.5) + unicode-display_width (1.4.1) PLATFORMS ruby DEPENDENCIES - everypolitician! pry + rest-client rubocop + scraped! scraperwiki! - wikisnakker! + sqlite_magic! RUBY VERSION - ruby 2.3.3p222 + ruby 2.4.4p296 BUNDLED WITH - 1.13.6 + 1.16.5 diff --git a/lib/cabinet.rb b/lib/cabinet.rb new file mode 100644 index 0000000..81bfcc2 --- /dev/null +++ b/lib/cabinet.rb @@ -0,0 +1,109 @@ +# frozen_string_literal: true + +# TODO: extend Scraped::Scraper with ability to add Strategies +class Scraped::Request::Strategy::LiveRequest + require 'rest-client' + + def url + SPARQL_URL % CGI.escape(raw_query) + end + + private + + SPARQL_URL = 'https://query.wikidata.org/sparql?format=json&query=%s' + + QUERY = <<~SPARQL + SELECT DISTINCT ?ps ?item ?itemLabel ?minister ?ministerLabel ?ordinal ?start ?startprecision ?end ?endprecision ?cabinet ?cabinetLabel { + { + SELECT DISTINCT ?ps ?item ?minister ?ordinal ?start ?startprecision ?end ?endprecision ?cabinet { + ?item p:P39/ps:P39/wdt:P279* wd:%s . + ?item p:P39 ?ps . + ?ps ps:P39 ?minister . + ?minister wdt:P279* wd:Q83307 . + OPTIONAL { ?ps pq:P1545 ?ordinal } + OPTIONAL { ?ps pqv:P580 [wikibase:timeValue ?start ; wikibase:timePrecision ?startprecision ] } + OPTIONAL { ?ps pqv:P582 [wikibase:timeValue ?end ; wikibase:timePrecision ?endprecision ] } + + # Ignore anything with a different jurisdiction + OPTIONAL { wd:%s wdt:P1001 ?legislative_jurisdiction } + OPTIONAL { ?minister wdt:P1001 ?executive_jurisdiction } + FILTER (!BOUND(?legislative_jurisdiction) || !BOUND(?executive_jurisdiction) || (?legislative_jurisdiction = ?executive_jurisdiction)) + } + } + SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } + } + SPARQL + + def raw_query + QUERY % [@url, @url] + end +end + +class CabinetScraper < Scraped::JSON + field :memberships do + json[:results][:bindings].map { |result| fragment(result => Membership).to_h } + end + + class Wikidate + def initialize(date, precision) + @date = date + @precision = precision + end + + # not to_s, as this can return 'nil' + def as_string + return unless date && precision + return unless slice_point + + date.slice(0, slice_point) + end + + private + + attr_reader :date, :precision + + PRECISION_LENGTH = { + '9' => 4, # year + '10' => 7, # month + '11' => 10, # day + }.freeze + + def slice_point + PRECISION_LENGTH[precision] + end + end + + class Membership < Scraped::JSON + field :id do + json.dig(:item, :value).to_s.split('/').last + end + + field :name do + json.dig(:itemLabel, :value) + end + + field :position_id do + json.dig(:ps, :value).to_s.split('/').last + end + + field :position do + json.dig(:minister, :value).to_s.split('/').last + end + + field :label do + json.dig(:ministerLabel, :value) + end + + field :start_date do + Wikidate.new(json.dig(:start, :value), json.dig(:startprecision, :value)).as_string + end + + field :end_date do + Wikidate.new(json.dig(:end, :value), json.dig(:endprecision, :value)).as_string + end + + field :ordinal do + json.dig(:ordinal, :value).to_i + end + end +end diff --git a/lib/politician.rb b/lib/politician.rb deleted file mode 100644 index 4090587..0000000 --- a/lib/politician.rb +++ /dev/null @@ -1,38 +0,0 @@ -require 'wikisnakker' - -module Wikisnakker - class Politician < Item - P39_QUALIFIERS = { - P102: :party, - P155: :follows, - P156: :followed_by, - P194: :body, - P580: :start_date, - P582: :end_date, - P642: :of, - P768: :constituency, - P1365: :replaces, - P1366: :replaced_by, - P2715: :election, - P2937: :term, - }.freeze - - def positions - return [] if self.P39s.empty? - self.P39s.map do |posn| - quals = posn.qualifiers - qdata = quals.properties.partition { |p| P39_QUALIFIERS[p] } - warn "#{id}: #{posn.value.id} + unknown #{qdata.last.join(', ')}" unless qdata.last.empty? - - qgood = qdata.first.map { |p| [P39_QUALIFIERS[p], quals[p].value.to_s] }.to_h - { - id: id, - position: posn.value.id, - label: posn.value.to_s, - description: posn.value.description(:en).to_s, - start_date: '' # need _something_ here so we can key on it - }.merge(qgood) rescue {} - end - end - end -end diff --git a/scraper.rb b/scraper.rb index 3953487..751b1d1 100644 --- a/scraper.rb +++ b/scraper.rb @@ -1,16 +1,8 @@ #!/bin/env ruby -# encoding: utf-8 # frozen_string_literal: true -require 'everypolitician' -require 'pry' +require 'scraped' require 'scraperwiki' +require_relative 'lib/cabinet' -require_relative 'lib/politician' - -ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil -house = EveryPolitician::Index.new.country('United-States-of-America').upper_house -house.popolo.persons.map(&:wikidata).compact.each_slice(100) do |wanted| - data = Wikisnakker::Politician.find(wanted).flat_map(&:positions).compact - ScraperWiki.save_sqlite(%i(id position start_date), data) -end +Scraped::Scraper.new('Q13217683' => CabinetScraper).store(:memberships, index: %i[position_id])