Skip to content

Commit

Permalink
Update scraper.rb to fix parsing of dates in DD.MM.YY format
Browse files Browse the repository at this point in the history
  • Loading branch information
jamasalbertastash committed Aug 9, 2023
1 parent 385149e commit 5706a21
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,36 @@ def scrape_page(page, comment_url)
table = page.at("table")

table.search("tr")[1..-1].each do |tr|
day, month, year = tr.search("td")[3].inner_text.gsub(/[[:space:]]/, ' ').split(" ")
date_str = tr.search("td")[3].inner_text.gsub(/[[:space:]]/, ' ')

# Split the date string on periods
day, month, year = date_str.split(".")

# Print the date values for debugging
puts "Parsed Date: Day: #{day}, Month: #{month}, Year: #{year}"

month_i = Date::MONTHNAMES.index(month)

# Guard clause
unless day && month_i && year
unless day && month && year
puts "Invalid date values. Skipping..."
next
end

# Adjust the year to YYYY format
year = "20" + year if year.length == 2

record = {
"info_url" => tr.search("td a")[0].attributes['href'].to_s,
"comment_url" => comment_url,
"council_reference" => tr.search("td")[0].inner_text,
"description" => tr.search("td")[1].inner_text,
"address" => tr.search("td")[2].inner_text + ", VIC",
"on_notice_to" => Date.new(year.to_i, month_i, day.to_i).to_s,
"on_notice_to" => Date.new(year.to_i, month.to_i, day.to_i).to_s,
"date_scraped" => Date.today.to_s
}

# Check if record already exists
if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
puts "Saving record " + record['council_reference'] + ", " + record['address']
# puts record
ScraperWiki.save_sqlite(['council_reference'], record)
else
puts "Skipping already saved record " + record['council_reference']
Expand Down

0 comments on commit 5706a21

Please sign in to comment.