Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

initial commit

  • Loading branch information...
commit 80839795f85069fcfd153f8228653081c334a684 1 parent 3b1d335
Joel Grus authored
47 back_predict.rb
... ... @@ -0,0 +1,47 @@
  1 +# load the default model, and make predictions for every story in the database
  2 +
  3 +require_relative "story"
  4 +require_relative "model"
  5 +
  6 +
# Classify stories with the saved default model and persist each prediction.
#
# only_new - when true (the default) only stories whose prediction is nil are
#            classified; otherwise every story is classified again.
# verbose  - when true (the default) each story is printed and the flag is
#            passed on to Model#classify.
#
# Returns nil without touching any story when no saved model can be loaded.
def back_predict(only_new = true, verbose = true)
  model = Model.load

  # Model.load returns nil when the model file is missing; stop here instead
  # of failing with NoMethodError on the first story.
  unless model
    puts "no saved model found, nothing to classify"
    return
  end

  stories = only_new ? Story.where(:prediction => nil) : Story.all

  puts "found #{stories.count} stories to classify"

  stories.each do |story|
    puts story.inspect if verbose

    story.prediction = model.classify(story, verbose)
    story.save
  end
end
  23 +
# Print the n most badly mispredicted stories on each side: disliked stories
# with the highest predictions and liked stories with the lowest ones. Only
# stories with a prediction greater than 0 are considered.
def worst_predictions(n = 25)
  false_alarms = Story.where(:prediction.gt => 0, :like => false).sort(:prediction.desc).take(n)
  misses = Story.where(:prediction.gt => 0, :like => true).sort(:prediction).take(n)

  # Shared printer: heading, then story / prediction / blank line per entry.
  report = lambda do |heading, stories|
    puts heading
    stories.each do |story|
      puts story.inspect
      puts story.prediction
      puts
    end
  end

  report.call("worst false negatives:", misses)
  report.call("worst false positives:", false_alarms)
end
  42 +
# Only run the two reports when this file is executed directly, not when it
# is pulled in with require_relative.
if $PROGRAM_NAME == __FILE__
  back_predict
  worst_predictions
end
  47 +
78 blogger.rb
... ... @@ -0,0 +1,78 @@
  1 +# post any new stories to blogger
  2 +# using credentials from the yaml file .blogger in your home directory
  3 +
  4 +require 'twitter'
  5 +require 'maruku'
  6 +require_relative 'story'
  7 +require_relative 'model'
  8 +require "blogger"
  9 +require "yaml"
  10 +
  11 +puts Time.now
  12 +
  13 +# this is for conditionally formatting the probabilities,
  14 +# although it turns out that blogger just ignores it
# Map a probability to a six-digit uppercase hex colour running from red
# (p = 0 -> "FF0000") to green (p = 1 -> "00FF00").
#
# p - numeric probability; values outside 0..1 are clamped so that each
#     channel always formats as exactly two hex digits.
#
# Returns the colour as a String without a leading '#'.
def color_from_prob(p)
  clamped = [[p, 0].max, 1].min

  # floor matches what sprintf("%02x", float) did for in-range values
  red = (255 * (1 - clamped)).floor
  green = (255 * clamped).floor

  format("%02X%02X%02X", red, green, 0)
end
  23 +
# Pick up to 200 stories that have not been posted yet (tweeted is nil),
# highest hnid first.
new_stories = Story.where(:tweeted => nil).sort(:hnid.desc).take(200)

puts "found #{new_stories.size} new stories"

# Nothing to post: stop before reading credentials or contacting Blogger.
exit if new_stories.empty?

model = Model.load
# Model.load returns nil when the model file is missing; classify would then
# fail with NoMethodError.
abort "no saved model found, cannot rank stories" unless model

# Score every story, then order by probability, highest first.
joel_probs = new_stories.map { |s| model.classify(s, true) }
sorted_stories = joel_probs.zip(new_stories).sort { |s1, s2| s2.first <=> s1.first }

content = sorted_stories.map do |joel_prob, s|
  title = s.link_title.gsub("&", "&amp;").gsub("<", "&lt;").gsub(">", "&gt;")
  hn_link = "http://news.ycombinator.com/item?id=#{s.hnid}"
  # Fall back to the HN item page when link_url does not start with "http".
  link = /^http/ =~ s.link_url ? s.link_url : hn_link

  # One Markdown line per story: bold probability, title link, comments link.
  # (color_from_prob is not used here; the comment on it says Blogger ignores
  # the colour styling.)
  %Q[**#{sprintf("%.3f", joel_prob)}** [#{title}](#{link}) [*comments*](#{hn_link})\n]
end.join("\n")

title = "Hacker News stories for #{Time.now.strftime("%l %p on %A %b %d, %Y").strip}"

# load_file opens and closes the credentials file itself, so no handle leaks.
params = YAML.load_file("#{ENV['HOME']}/.blogger")

account = Blogger::Account.new(params["username"], params["password"])
blogid = params["blogid"]

post = Blogger::Post.new(:title => title, :formatter => :maruku)
post.content = content

begin
  account.post(blogid, post)

  # Mark stories as posted only after the post call returned without raising.
  new_stories.each do |s|
    s.tweeted = true
    s.tweeted_at = Time.now
    s.save
  end
rescue StandardError => e
  # StandardError rather than Exception, so Ctrl-C and exit are not swallowed.
  puts e.inspect
  puts new_stories.map { |s| s.link_title }.join("\n")
end
71 featurizer.rb
... ... @@ -0,0 +1,71 @@
  1 +# turns a story into a set of features
  2 +
  3 +require "set"
  4 +require_relative "stopwords"
  5 +require_relative "story"
  6 +require_relative "utils"
  7 +require_relative "stemmable"
  8 +
# Mix the stemmer into String so that every string responds to #stem.
String.send(:include, Stemmable)
  12 +
  13 +
class Story
  # Turn this story into the set of feature strings used by the model.
  #
  # Features produced:
  #   word_<stem>        - every title word that is not a stopword
  #   bigram_<s1>_<s2>   - every adjacent word pair unless both are stopwords
  #   domain_<domain>    - always added
  #   user_<user>        - when user is set
  #   userindomain       - when the domain contains the user name
  #   pdf, isquestion, dollaramount, hoursamount, yearsamount, ycclass,
  #   inquotes           - pattern flags on the URL / title
  #
  # Returns a Set of Strings.
  def features
    feature_set = Set.new

    # A story without a title has no text features; treat it as an empty
    # title rather than failing on nil.
    title_text = self.link_title.to_s

    # words: lowercase, split on anything that is not a-z, 0-9, -, ', + or #
    words = title_text.downcase.split(/[^a-z0-9\-'+#]/).reject { |w| w.empty? }

    words.reject { |w| is_stopword(w) }
         .each { |w| feature_set.add "word_#{w.stem}" }

    # bigrams: keep a pair when at least one of its words is not a stopword
    words.each_cons(2)
         .reject { |first, second| is_stopword(first) && is_stopword(second) }
         .each { |first, second| feature_set.add "bigram_#{first.stem}_#{second.stem}" }

    # domain
    feature_set.add "domain_#{self.domain}"

    if self.user
      # user
      feature_set.add "user_#{self.user}"

      # user_in_domain: to_s keeps a nil domain from raising NoMethodError
      feature_set.add "userindomain" if self.domain.to_s.include?(self.user)
    end

    # is_pdf
    feature_set.add("pdf") if /\.pdf/i =~ self.link_url

    # is_question
    feature_set.add("isquestion") if /\?$/ =~ title_text

    # dollar_amount
    feature_set.add("dollaramount") if /\$[0-9]+/ =~ title_text

    # hours amount
    feature_set.add("hoursamount") if /\b[1-9][0-9]* hours?\b/i =~ title_text

    # years amount
    feature_set.add("yearsamount") if /\b[1-9][0-9]* years?\b/i =~ title_text

    # yc class
    feature_set.add("ycclass") if /YC [A-Z][0-9]{2}/i =~ title_text

    # in quotes
    feature_set.add("inquotes") if /^".*"$/ =~ title_text

    feature_set
  end
end
  71 +
68 judger.rb
... ... @@ -0,0 +1,68 @@
  1 +# manually score stories as like or dislike, for use in building the model
  2 +# "ruby judger.rb" to see unjudged stories
  3 +# "ruby judger.rb word" to see all stories with "word" in their title
  4 +
  5 +require_relative "story"
  6 +require_relative "model"
  7 +require_relative "utils"
  8 +
  9 +# unjudged stories, sorted by hnid descending
total = Story.count

puts ARGV

# With a command-line argument, review every story whose title matches it
# (as a case-insensitive regular expression); otherwise review the stories
# that have no judgment yet. Either way the order is shuffled.
if ARGV.size > 0
  term = ARGV.first
  re = Regexp.new(term, Regexp::IGNORECASE)
  stories = Story.where(:link_title => re).to_a.shuffle

  puts "#{stories.count} stories containing #{term}"
else
  stories = Story.where(:like => nil).to_a.shuffle
  unjudged_count = stories.count

  puts "#{total} stories in database"
  puts "#{total - unjudged_count} judged"
  puts "#{unjudged_count} left to judge"
  puts
end

stories.each do |s|
  puts "hnid: #{s.hnid}"
  puts "title: #{s.link_title}"
  puts "domain: #{s.domain}"
  puts "url: #{s.link_url}"
  puts "user: #{s.user}"
  puts
  # like is false for a disliked story, so test for nil instead of truthiness;
  # otherwise an earlier "no" would never be shown.
  puts "previous judgment: #{s.like}" unless s.like.nil?
  puts "prediction: #{s.prediction}" if s.prediction
  print "good? >> "
  answer = STDIN.gets

  # Anything that is not a yes/no answer (including end of input) ends the
  # judging session.
  if /^y/i =~ answer
    puts "you liked it!"
    s.like = true
  elsif /^n/i =~ answer
    puts "you didn't like it!"
    s.like = false
  else
    break
  end

  s.save
  puts
end

puts "retraining model"
# after judging, retrain the model
model = Model.new
model.train(2)
model.save

# and reclassify everything
require_relative "back_predict"
back_predict(false, false)

# loading this file prints the feature report as a side effect
require_relative "test_featurizer"
152 model.rb
... ... @@ -0,0 +1,152 @@
  1 +# class to build a naive bayes model
  2 +# and save it to disk or load it back
  3 +
  4 +require_relative 'story'
  5 +require_relative 'featurizer'
  6 +
# Randomly divide stories into a training list and a test list.
#
# Each story independently lands in the test list when a fresh rand draw is
# below test_split, and in the training list otherwise.
#
# Returns [train, test].
def split(stories, test_split)
  buckets = { true => [], false => [] }

  stories.each do |story|
    goes_to_test = rand < test_split
    buckets[goes_to_test] << story
  end

  [buckets[false], buckets[true]]
end
  21 +
  22 +
require "yaml"

# Naive-Bayes style like/dislike model over story features.
#
# State (all plain data so the object can be dumped to YAML):
#   feature_counts - Hash of feature => {:positives, :negatives, :total},
#                    pseudo-counts included
#   positives      - sum of :positives over all kept features
#   negatives      - sum of :negatives over all kept features
#   created_at     - Time the model was trained
class Model
  attr_accessor :created_at, :positives, :negatives, :feature_counts

  # Evaluate the model on held-out stories.
  #
  # Returns one row per threshold from 1.0 down to 0.0 in steps of 0.05:
  # [threshold, stories at or above it, precision, recall]. Precision and
  # recall are NaN when their denominator is zero.
  def test(liked, disliked)
    results = liked.map { |s| [classify(s), 1] } +
              disliked.map { |s| [classify(s), 0] }

    total_pos = results.count { |_score, label| label == 1 }
    20.downto(0).map do |n|
      pct = n.to_f / 20
      true_pos = results.count { |score, label| score >= pct && label == 1 }
      false_pos = results.count { |score, label| score >= pct && label == 0 }
      precision = true_pos.to_f / (true_pos + false_pos)
      recall = true_pos.to_f / total_pos
      [pct, true_pos + false_pos, precision, recall]
    end
  end

  # Train on every judged story in the database and print a precision/recall
  # table for the held-out part.
  #
  # min_feature_freq - features seen in fewer training stories are dropped
  # test_split       - per-story probability of being held out for testing
  # verbose          - when true, prints the resulting counts
  def train(min_feature_freq = 3, test_split = 0.2, verbose = false)
    liked = Story.where(:like => true)
    disliked = Story.where(:like => false)

    liked_train, liked_test = split(liked, test_split)
    disliked_train, disliked_test = split(disliked, test_split)

    puts "training: #{liked_train.size + disliked_train.size} examples"
    puts "test: #{liked_test.size + disliked_test.size} examples"

    liked_features = count_features(liked_train)
    disliked_features = count_features(disliked_train)

    counts = {}
    (liked_features.keys + disliked_features.keys).uniq.each do |feature|
      pos = liked_features.fetch(feature, 0)
      neg = disliked_features.fetch(feature, 0)
      next if pos + neg < min_feature_freq

      # pseudo-counts: +2 on each side so no feature has a zero count
      counts[feature] = { :positives => pos + 2,
                          :negatives => neg + 2,
                          :total => pos + neg + 4 }
    end

    self.feature_counts = counts
    self.created_at = Time.now
    self.positives = counts.values.map { |c| c[:positives] }.sum
    self.negatives = counts.values.map { |c| c[:negatives] }.sum

    if verbose
      puts self.feature_counts.inspect
      puts self.positives
      puts self.negatives
    end

    test(liked_test, disliked_test).each do |pct, num, precision, recall|
      puts "#{pct}\t#{num}\t#{precision}\t#{recall}"
    end
  end

  # Log of P(feature | liked) / P(feature | disliked) for a known feature.
  def log_p_feature(feature)
    p_feature_given_positive = self.feature_counts[feature][:positives].to_f / self.positives.to_f
    p_feature_given_negative = self.feature_counts[feature][:negatives].to_f / self.negatives.to_f
    Math.log(p_feature_given_positive / p_feature_given_negative)
  end

  # Probability (0..1) that the story is liked.
  #
  # story   - anything responding to #features with a collection of feature
  #           strings; features the model has not seen are ignored
  # verbose - when true, prints the features and each log-odds contribution
  def classify(story, verbose = false)
    features = story.features
    puts features.inspect if verbose

    base_odds = self.positives.to_f / self.negatives.to_f

    usable_features = features.select { |f| self.feature_counts.has_key? f }
    feature_odds = usable_features.map { |f| log_p_feature(f) }

    if verbose
      puts "#{Math.log(base_odds).round(3)}\t -- base log odds"

      usable_features.zip(feature_odds).each do |f, lo|
        puts "#{lo.round(3)}\t -- #{f}"
      end
    end

    odds = base_odds * Math.exp(feature_odds.sum)

    # odds = p / (1 - p)  =>  p = odds / (1 + odds)
    p = odds / (1 + odds)

    puts p if verbose

    p
  end

  # Write the model as YAML to filename (default: model.mod next to this file).
  def save(filename = File.join(File.dirname(__FILE__), 'model.mod'))
    File.write(filename, YAML.dump(self))
  end

  # Read a model previously written by #save.
  #
  # Returns the Model, or nil when the file does not exist.
  def self.load(filename = File.join(File.dirname(__FILE__), 'model.mod'))
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    return nil unless File.exist?(filename)

    yaml = File.read(filename)

    # NOTE(review): this deserializes arbitrary Ruby objects, which is needed
    # to get a Model instance back. Only load files written by #save. Newer
    # YAML versions reject custom classes in plain YAML.load, hence unsafe_load.
    YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(yaml) : YAML.load(yaml)
  end

  private

  # Count in how many of the given stories each feature occurs.
  def count_features(stories)
    stories.to_a.each_with_object(Hash.new(0)) do |story, counts|
      story.features.each { |feature| counts[feature] += 1 }
    end
  end
end
  152 +
75 scrape_api.rb
... ... @@ -0,0 +1,75 @@
  1 +# scrape new stories from the unofficial hacker news api
  2 +# the API is broken half the time, so this needs to be made more robust
  3 +
  4 +require 'open-uri'
  5 +require 'json'
  6 +require_relative "story"
  7 +require_relative "utils"
  8 +require_relative "model"
  9 +
  10 +base_url = "http://news.ycombinator.com/newest"
  11 +
  12 +puts Time.now
  13 +
# Fetch pages of new stories from the JSON API, score them with the saved
# model when one is available, and save every story not yet in the database.
#
# max_pages - maximum number of pages to request
# base_url  - URL of the first page; later pages are "#{base_url}/#{nextId}"
#
# Paging stops early once a page contains an already-known story or the
# response carries no nextId. Returns the array of newly saved stories.
def scrape(max_pages = 1, base_url)
  new_stories = []
  url = base_url
  found_known_story = false
  model = Model.load

  (1..max_pages).each do |i|
    puts "opening page #{i}: #{url}"

    # Read the whole body: readline returned only the first line, which
    # truncates multi-line JSON, and left the handle open. URI#read comes
    # from open-uri and does not rely on Kernel#open accepting URLs.
    r = URI.parse(url).read

    puts "found #{r.size} characters"

    doc = JSON.parse r

    next_id = doc["nextId"]

    # Array() tolerates a response without an "items" key
    Array(doc["items"]).each do |item|
      itemid = item["id"]

      if Story.where(:hnid => itemid).count > 0
        found_known_story = true
        puts "known story: #{itemid}"
      else
        story = Story.new
        story.hnid = itemid
        story.link_url = item["url"]
        story.link_title = item["title"]
        # to_s: an item without a url must not crash the domain lookup
        story.domain = domain(item["url"].to_s)
        story.scraped_at = Time.now
        story.user = item["postedBy"]
        story.prediction = model.classify(story) if model
        new_stories << story
      end
    end

    break if found_known_story
    break unless next_id

    url = "#{base_url}/#{next_id}"
    puts "moving ahead to #{url}"
  end

  puts "found #{new_stories.size} new stories"
  new_stories.each do |s|
    s.save
    puts "new story:"
    puts s.hnid
    puts s.link_title
    puts s.link_url
  end
end
  69 +
  70 +
# Best-effort run: a failure is reported and the script ends normally so the
# next scheduled run can try again.
begin
  scrape(20, "http://api.ihackernews.com/new")
rescue StandardError => e
  puts "new failed, try again later"
  # say what went wrong instead of hiding it
  puts "#{e.class}: #{e.message}"
end
52 scrape_old.rb
... ... @@ -0,0 +1,52 @@
  1 +# scrapes the archives of hn daily to get old data for training
  2 +
  3 +require "nokogiri"
  4 +require "open-uri"
  5 +load "story.rb"
  6 +load "utils.rb"
  7 +
  8 +root_url = "http://www.daemonology.net/hn-daily" #2012-01.html"
  9 +
# List the monthly archive pages linked from the archive root page.
#
# root_url - URL of the archive index
#
# Returns an Array of absolute URLs, one per link whose href looks like
# YYYY-MM.html.
def month_pages(root_url)
  # The block form closes the connection after parsing and does not rely on
  # Kernel#open accepting URLs.
  doc = URI.parse(root_url).open { |io| Nokogiri::HTML(io) }

  doc.xpath("//div[@class='marginlink']/a").to_a
     .map { |node| node["href"] }
     .select { |href| href =~ /[0-9]{4}\-[0-9]{2}\.html/ }
     .map { |href| "#{root_url}/#{href}" }
end
  17 +
  18 +
# Extract the stories listed on one archive page.
#
# url - URL of a monthly archive page
#
# Returns an Array of [title, story_url, hn_comments_url] triples. When the
# page has a different number of story links and comment links they cannot be
# paired reliably, so an empty Array is returned (previously nil, which made
# the caller's .each fail).
def scrape(url)
  doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

  storylinks = doc.xpath("//li/span[@class='storylink']/a")
  commentlinks = doc.xpath("//li/span[@class='commentlink']/a")

  return [] unless storylinks.size == commentlinks.size

  storylinks.zip(commentlinks).map do |story_link, comment_link|
    [story_link.text, story_link["href"], comment_link["href"]]
  end
end
  35 +
# Walk every monthly archive page and save each listed story.
month_pages(root_url).each do |month_url|
  # interpolate the URL; the message used to print the literal word "url"
  puts "scraping #{month_url}"

  # "|| []" keeps a page that could not be parsed from aborting the whole run
  (scrape(month_url) || []).each do |desc, story_url, hnurl|
    puts desc
    puts story_url
    puts hnurl
    story = Story.new
    story.hnid = hnid_from_url(hnurl)
    story.link_url = story_url
    story.link_title = desc
    story.domain = domain(story_url)
    story.scraped_at = Time.now
    puts "saving"
    story.save
    puts
  end
end
212 stemmable.rb
... ... @@ -0,0 +1,212 @@
  1 +#! /local/ruby/bin/ruby
  2 +#
  3 +# $Id: stemmable.rb,v 1.2 2003/02/01 02:07:30 condit Exp $
  4 +#
  5 +# See example usage at the end of this file.
  6 +#
  7 +
# Mixin that adds Porter stemming (#stem / #stem_porter) to any object that
# responds to #to_str, typically String.
module Stemmable

  # Step 2 suffix replacements: matched suffix => replacement.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 suffix replacements: matched suffix => replacement.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }


  # Suffixes looked up in STEP_2_LIST (used by Step 2).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x


  # Suffixes removed outright in Step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x


  # Regexp fragments the measure patterns below are built from.
  C = "[^aeiou]"         # consonant
  V = "[aeiouy]"         # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem

  #
  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl.  It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # Returns a new String holding the stem; the receiver is not modified.
  # Words shorter than three characters are returned unchanged. The patterns
  # only contain lowercase letters, so the input is presumably expected to be
  # lowercased already -- the method itself does not downcase.
  #

  def stem_porter

    # make a copy of the given object and convert it to a string.
    w = self.dup.to_str

    return w if w.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w[0] == ?y

    # Step 1a: strip plural endings (sses/ies lose "es", a final s is dropped
    # unless preceded by another s)
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b: handle -eed, -ed and -ing; after removing -ed/-ing the stem is
    # tidied up (restore a final e, or undouble a final consonant)
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    # a final y becomes i when the rest of the word contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2: replace a suffix per STEP_2_LIST when the stem has measure > 0
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end

    # Step 3: replace a suffix per STEP_3_LIST when the stem has measure > 0
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end

    # Step 4: drop a SUFFIX_2_REGEXP suffix (or "ion" after s/t) when the
    # remaining stem has measure > 1
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end

    #  Step 5: drop a final e (subject to the measure checks below) and
    #  reduce a final "ll" to "l" when the word has measure > 1
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end

    # and turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y

    w
  end


  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter

end
  192 +
  193 +
  194 +
  195 +#
  196 +# Make this script executable, and send it words on stdin, one per
  197 +# line, and it will output the stemmed versions to stdout.
  198 +#
if $0 == __FILE__ then
  class String
    include Stemmable
  end

  # the String class, and any subclasses of it you might have, now know
  # how to stem things.

  $stdin.each do |word|
    # Strip the line terminator first: with it attached the word looks one
    # character longer than it is, so two-letter words slip past the
    # "shorter than three characters" guard and get stemmed.
    puts word.chomp.stem
  end
end
  211 +
  212 +
188 stopwords.rb
... ... @@ -0,0 +1,188 @@
  1 +require "set"
  2 +require_relative "stemmable"
  3 +
# Mix the stemmer into String so the stopword list below can call #stem.
String.send(:include, Stemmable)
  7 +
# Set of stopwords, stored in stemmed form (each word below is passed through
# String#stem before it goes into the set).
STOPWORDS = Set.new(
  %w[
    - a about above after again against all am an and any are aren't as at
    be because been before being below between both but by
    can't cannot could couldn't
    did didn't do does doesn't doing don't down during
    each few for from further
    had hadn't has hasn't have haven't having he he'd he'll he's her here
    here's hers herself him himself his how how's
    i i'd i'll i'm i've if in into is isn't it it's its itself
    let's me more most mustn't my myself
    no nor not of off on once only or other ought our ours ourselves out
    over own
    same shan't she she'd she'll she's should shouldn't so some such
    than that that's the their theirs them themselves then there there's
    these they they'd they'll they're they've this those through to too
    under until up very
    was wasn't we we'd we'll we're we've were weren't what what's when
    when's where where's which while who who's whom why why's with won't
    would wouldn't
    you you'd you'll you're you've your yours yourself yourselves
  ].map(&:stem)
)
  185 +
# Report whether w is a stopword.
#
# STOPWORDS stores the stemmed form of each stopword, while callers pass raw
# title words. A stopword whose stem differs from the word itself would
# therefore never match on a plain lookup, so the word is checked both as
# given and in stemmed form.
#
# w - a String that responds to #stem
#
# Returns true or false.
def is_stopword(w)
  STOPWORDS.include?(w) || STOPWORDS.include?(w.stem)
end
34 story.rb
... ... @@ -0,0 +1,34 @@
  1 +# the class that represents story objects in the database
  2 +
  3 +require 'mongo_mapper'
  4 +
  5 +MongoMapper.database = 'hackernews'
  6 +
# MongoMapper document for one Hacker News story.
class Story
  include MongoMapper::Document

  # identity and link data
  key :hnid,       Integer
  key :link_url,   String
  key :link_title, String
  key :domain,     String
  key :user,       String

  # timestamps
  key :scraped_at, Time
  key :tweeted_at, Time

  # judgment, posting state and model output
  key :like,       Boolean
  key :tweeted,    Boolean
  key :prediction, Float

  scope :untweeted, where(:tweeted => false)

  # Three-line summary: hnid, title and URL.
  def inspect
    ["hnid: #{hnid}", link_title, link_url].join("\n")
  end
end

# Indexes for the queries on tweeted/tweeted_at and like; hnid must be unique.
Story.ensure_index([[:tweeted, 1], [:tweeted_at, -1]])
Story.ensure_index([[:like, 1]])
Story.ensure_index([[:hnid, 1]], :unique => true)
  33 +
  34 +
45 test_featurizer.rb
... ... @@ -0,0 +1,45 @@
  1 +# show the most common features,
  2 +# the most positive features,
  3 +# and the most negative features
  4 +
  5 +require_relative "story"
  6 +require_relative "featurizer"
  7 +
liked = Story.where(:like => true)
disliked = Story.where(:like => false)

# Count in how many of the given stories each feature occurs.
count_features = lambda do |stories|
  occurrences = stories.to_a.map { |story| story.features.to_a }.flatten
  Hash[occurrences.group_by { |feat| feat }.map { |feat, hits| [feat, hits.size] }]
end

liked_features = count_features.call(liked)
disliked_features = count_features.call(disliked)

# One row per feature: [feature, liked count, disliked count, total]
all_features = (liked_features.keys + disliked_features.keys).uniq.map do |feat|
  [feat,
   liked_features.fetch(feat, 0),
   disliked_features.fetch(feat, 0),
   liked_features.fetch(feat, 0) + disliked_features.fetch(feat, 0)]
end

# Print a heading followed by the first 25 rows of an already sorted list.
show_top = lambda do |heading, ranked|
  puts heading
  ranked.take(25).each { |row| puts row.inspect }
end

show_top.call("top 25 features",
              all_features.sort { |r1, r2| r2.last <=> r1.last })
puts

show_top.call("top 25 positive features",
              all_features.sort { |r1, r2| (r2[1] - r2[2]) <=> (r1[1] - r1[2]) })
puts

show_top.call("top 25 negative features",
              all_features.sort { |r1, r2| (r2[2] - r2[1]) <=> (r1[2] - r1[1]) })
20 utils.rb
... ... @@ -0,0 +1,20 @@
  1 +# a couple of useless utils
  2 +
  3 +require 'set'
  4 +
# Pull the numeric story id out of an HN item URL such as
# http://news.ycombinator.com/item?id=3571958.
#
# Returns the id as an Integer, or 0 when the URL has no "id=<digits>" part.
def hnid_from_url(hnurl)
  found = /id=([0-9]+)/.match(hnurl)
  digits = found && found[1]
  digits.to_i
end
  10 +
# Two-part suffixes for which one more host label is needed to tell sites
# apart.
NEED_SUBDOMAIN = Set.new(["wordpress.com", "co.uk"]).freeze

# Reduce a URL to the domain used as a feature.
#
# url     - the link URL; may be nil or a non-http value
# default - returned when url is not an http(s) URL
#
# Returns the last two labels of the host name, or the last three when the
# last two are listed in NEED_SUBDOMAIN. A port, query string or fragment
# that follows the host directly is not part of the result.
def domain(url, default = "ycombinator.com")
  host_match = %r{\Ahttps?://([^/?#:]+)}.match(url.to_s)
  return default unless host_match

  pieces = host_match[1].split(".")
  d = pieces.last(2).join(".")
  d = pieces.last(3).join(".") if NEED_SUBDOMAIN.include? d
  d
end
18 worst_errors.rb
... ... @@ -0,0 +1,18 @@
  1 +require_relative "story"
  2 +
# Disliked stories by prediction, highest first; liked stories by prediction,
# lowest first.
disliked_by_score = Story.where(:like => false).sort(:prediction.desc)
liked_by_score = Story.where(:like => true).sort(:prediction)

# Print the first 25 of each list: prediction, story, blank line.
[
  ["*** worst false positives ***", disliked_by_score],
  ["*** worst false negatives ***", liked_by_score]
].each do |banner, ranked|
  puts banner
  ranked.take(25).each do |story|
    puts story.prediction
    puts story.inspect
    puts
  end
end

0 comments on commit 8083979

Please sign in to comment.
Something went wrong with that request. Please try again.