initial commit

commit 80839795f85069fcfd153f8228653081c334a684 (parent: 3b1d335)
@joelgrus authored
47 back_predict.rb
@@ -0,0 +1,47 @@
+# load the default model, and make predictions for every story in the database
+
+require_relative "story"
+require_relative "model"
+
+
+def back_predict(only_new = true,verbose=true)
+
+ model = Model.load
+
+ stories = only_new ? Story.where( :prediction => nil ) : Story.all
+
+ puts "found #{stories.count} stories to classify"
+
+ stories.each do |story|
+
+ puts story.inspect if verbose
+
+ story.prediction = model.classify(story,verbose)
+ story.save
+ end
+end
+
+def worst_predictions(n = 25)
+ false_positives = Story.where( :prediction.gt => 0, :like => false ).sort( :prediction.desc ).take(n)
+ false_negatives = Story.where( :prediction.gt => 0, :like => true ).sort( :prediction ).take(n)
+
+ puts "worst false negatives:"
+ false_negatives.each do |s|
+ puts s.inspect
+ puts s.prediction
+ puts
+ end
+
+ puts "worst false positives:"
+ false_positives.each do |s|
+ puts s.inspect
+ puts s.prediction
+ puts
+ end
+end
+
+if __FILE__ == $PROGRAM_NAME
+ back_predict
+ worst_predictions
+end
+
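back_predict.rb doubles as a script and a library: judger.rb requires it after retraining and calls back_predict(false, false) to reclassify everything. A minimal sketch of both uses (run from the repo directory with MongoDB up):

    # as a script: classify only stories that have no prediction yet, verbosely
    #   ruby back_predict.rb
    # as a library, the way judger.rb uses it: reclassify everything, quietly
    require_relative "back_predict"
    back_predict(false, false)   # only_new = false, verbose = false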
78 blogger.rb
@@ -0,0 +1,78 @@
+# post any new stories to blogger
+# using credentials from the yaml file .blogger in your home directory
+
+require 'twitter'
+require 'maruku'
+require_relative 'story'
+require_relative 'model'
+require "blogger"
+require "yaml"
+
+puts Time.now
+
+# this is for conditionally formatting the probabilities,
+# although it turns out that blogger just ignores it
+def color_from_prob(p)
+ # want 100% to be green 0,255,0
+ # want 0% to be red 255,0,0
+
+ rgb = [255 * (1-p),255 * p, 0]
+ hex = rgb.map{|i| sprintf("%02x", i).upcase}.join
+ return hex
+end
+
+new_stories = Story.where(:tweeted => nil).sort(:hnid.desc).take(200)
+
+puts "found #{new_stories.size} new stories"
+
+model = Model.load
+
+# now need to order by joel_probability descending
+
+joel_probs = new_stories.map{|s| model.classify(s,true)}
+
+sorted_stories = joel_probs.zip(new_stories).sort{|s1,s2| s2.first <=> s1.first}
+
+content = sorted_stories.map do |joel_prob,s|
+
+ color = color_from_prob joel_prob
+ title = s.link_title.gsub("&","&amp;").gsub("<","&lt;").gsub(">","&gt;")
+ hn_link = "http://news.ycombinator.com/item?id=#{s.hnid}"
+ link = /^http/ =~ s.link_url ? s.link_url : hn_link
+
+ body = %Q[**#{sprintf("%.3f",joel_prob)}** [#{title}](#{link}) [*comments*](#{hn_link})\n]
+ # body = "<div>"
+ # body += %Q[<span style="color:#{color}">#{"%.3f" % joel_probability}</span> ]
+ # body += %Q[<a href="#{link}">#{title}</a> ]
+ # body += %Q[<small><a href ="#{hn_link}">comments</a></small>]
+ # body += "</div>"
+
+ body
+end.join("\n")
+
+title = "Hacker News stories for #{Time.now.strftime("%l %p on %A %b %d, %Y").strip}"
+
+params = YAML::load(File.open("#{ENV['HOME']}/.blogger"))
+
+account = Blogger::Account.new(params["username"],params["password"])
+blogid = params["blogid"]
+userid = params["userid"]
+
+post = Blogger::Post.new(:title => title,:formatter => :maruku)
+post.content = content
+
+
+begin
+
+ account.post(blogid,post) if new_stories.size > 0
+
+ new_stories.each do |s|
+ s.tweeted = true
+ s.tweeted_at = Time.now
+ s.save
+ end
+
+rescue Exception => e
+ puts e.inspect
+ puts new_stories.map{|s| s.link_title}.join("\n")
+end
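The credentials file itself isn't in the commit; going only by the keys the script pulls out of params, ~/.blogger is presumably a small YAML file along these lines (all values here are placeholders):

    # ~/.blogger -- hypothetical example; only the key names come from the code above
    username: you@example.com
    password: your-blogger-password
    blogid: "1234567890"
    userid: "9876543210"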
71 featurizer.rb
@@ -0,0 +1,71 @@
+# turns a story into a set of features
+
+require "set"
+require_relative "stopwords"
+require_relative "story"
+require_relative "utils"
+require_relative "stemmable"
+
+class String
+ include Stemmable
+end
+
+
+class Story
+ def features()
+
+ feature_set = Set.new
+
+ #words
+ regex_splitter = /[^a-z0-9\-'+#]/
+ title = self.link_title.downcase
+ words = title.split(regex_splitter).select{|w| !w.empty?}
+
+ words.select{|w| !is_stopword w}
+ .each {|w| feature_set.add "word_#{w.stem}"}
+
+ # bigrams
+ bigrams = words.each_cons(2)
+ .select{|pair| (!is_stopword pair[0]) or (!is_stopword pair[1])}
+ .map{|pair| pair[0].stem + "_" + pair[1].stem}
+
+ bigrams.each {|b| feature_set.add "bigram_#{b}"}
+
+ #domain
+
+ feature_set.add "domain_#{self.domain}"
+
+ #user
+
+ feature_set.add "user_#{self.user}" if self.user
+
+ #user_in_domain
+
+ user_in_domain = self.domain.include? self.user if self.user
+ feature_set.add "userindomain" if user_in_domain
+
+ # is_pdf
+ feature_set.add("pdf") if /\.pdf/i =~ self.link_url
+
+ # is_question
+ feature_set.add("isquestion") if /\?$/ =~ self.link_title
+
+ # dollar_amount
+ feature_set.add("dollaramount") if /\$[0-9]+/ =~ self.link_title
+
+ # hours amount
+ feature_set.add("hoursamount") if /\b[1-9][0-9]* hours?\b/i =~ self.link_title
+
+ # years amount
+ feature_set.add("yearsamount") if /\b[1-9][0-9]* years?\b/i =~ self.link_title
+
+ # yc class
+ feature_set.add("ycclass") if /YC [A-Z][0-9]{2}/i =~ self.link_title
+
+ # in quotes
+ feature_set.add("inquotes") if /^".*"$/ =~ self.link_title
+
+ return feature_set
+ end
+end
+
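A quick way to see what the featurizer produces is to run it against a judged story from a console. The exact features depend on your data; the prefixes (word_, bigram_, domain_, user_ and the one-off flags) identify the feature type. A rough sketch:

    require_relative "featurizer"   # featurizer.rb pulls in story, stopwords and stemmable

    story = Story.where(:like => true).first
    puts story.features.to_a.sort.inspect if story
    # => e.g. ["bigram_show_hn", "domain_github.com", "user_someuser", "word_show", ...]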
68 judger.rb
@@ -0,0 +1,68 @@
+# manually score stories as like or dislike, for use in building the model
+# "ruby judger.rb" to see unjudged stories
+# "ruby judger.rb word" to see all stories with "word" in their title
+
+require_relative "story"
+require_relative "model"
+require_relative "utils"
+
+# unjudged stories, sorted by hnid descending
+total = Story.count
+
+puts ARGV
+
+# if we gave it args, use the first to match with
+if ARGV.size > 0
+ term = ARGV.first
+ re = Regexp.new(term,true)
+ stories = Story.where(:link_title => re).to_a.shuffle
+
+ puts "#{stories.count} stories containing #{term}"
+else
+
+ stories = Story.where(:like => nil).to_a.shuffle #sort(:hnid.desc)
+ unjudged_count = stories.count
+
+ puts "#{total} stories in database"
+ puts "#{total - unjudged_count} judged"
+ puts "#{unjudged_count} left to judge"
+ puts
+end
+
+
+stories.each do |s|
+ puts "hnid: #{s.hnid}"
+ puts "title: #{s.link_title}"
+ puts "domain: #{s.domain}"
+ puts "url: #{s.link_url}"
+ puts "user: #{s.user}"
+ puts
+ puts "previous judgment: #{s.like}" if s.like
+ puts "prediction: #{s.prediction}" if s.prediction
+ print "good? >> "
+ answer = STDIN.gets
+ if /^y/i =~ answer
+ puts "you liked it!"
+ s.like = true
+ elsif /^n/i =~ answer
+ puts "you didn't like it!"
+ s.like = false
+ else break
+ end
+
+ s.save
+ puts
+end
+
+puts "retraining model"
+# after judging, retrain the model
+model = Model.new
+model.train(2)
+model.save
+
+# and reclassify everything
+
+require_relative "back_predict"
+back_predict(false,false)
+
+require_relative "test_featurizer"
152 model.rb
@@ -0,0 +1,152 @@
+# class to build a naive bayes model
+# and save it to disk or load it back
+
+require 'yaml'   # Model#save and Model.load serialize the model with YAML
+require_relative 'story'
+require_relative 'featurizer'
+
+def split(stories,test_split)
+ test = []
+ train = []
+
+ stories.each do |s|
+ if rand < test_split
+ test << s
+ else
+ train << s
+ end
+ end
+
+ return [train,test]
+end
+
+
+class Model
+ attr_accessor :created_at, :positives, :negatives, :feature_counts
+
+ def test(liked,disliked)
+
+ results = liked.map{|s| [self.classify(s),1]} +
+ disliked.map{|s| [self.classify(s),0]}
+
+ total_pos = results.select{|r| r[1] == 1}.size
+ 20.downto(0).map do |n|
+ pct = n.to_f / 20
+ true_pos = results.select{|r| r[0] >= pct and r[1] == 1}.size
+ false_pos = results.select{|r| r[0] >= pct and r[1] == 0}.size
+ precision = true_pos.to_f / (true_pos + false_pos)
+ recall = true_pos.to_f / total_pos
+ [pct,true_pos + false_pos,precision,recall]
+ end
+ end
+
+
+ def train(min_feature_freq = 3,test_split = 0.2,verbose=false)
+
+ liked = Story.where(:like => true)
+ disliked = Story.where(:like => false)
+
+ liked_train,liked_test = split(liked,test_split)
+ disliked_train,disliked_test = split(disliked,test_split)
+
+ puts "training: #{liked_train.size + disliked_train.size} examples"
+ puts "test: #{liked_test.size + disliked_test.size} examples"
+
+ liked_features = Hash[ liked_train.to_a.map{|story| story.features.to_a}.flatten
+ .group_by{|feat| feat}
+ .map{|k,v| [k,v.size]} ]
+
+ disliked_features = Hash[ disliked_train.to_a.map{|story| story.features.to_a}.flatten
+ .group_by{|feat| feat}
+ .map{|k,v| [k,v.size]} ]
+
+ all_features = (liked_features.keys + disliked_features.keys).uniq
+ .map{|k| {:feature => k,
+ :positives => liked_features.fetch(k,0),
+ :negatives => disliked_features.fetch(k,0),
+ :total => liked_features.fetch(k,0) + disliked_features.fetch(k,0)} }
+ .select{|f| f[:total] >= min_feature_freq}
+
+ # pseudo_counts:
+
+ all_features.each do |f|
+ f[:positives] += 2
+ f[:negatives] += 2
+ f[:total] += 4
+ end
+
+ # and turn into a hash
+
+ self.feature_counts = Hash[ all_features.map do |dict|
+ feature = dict[:feature]
+ dict.delete(:feature)
+ [feature,dict]
+ end]
+
+ self.created_at = Time.now
+ self.positives = all_features.map{|f| f[:positives]}.sum
+ self.negatives = all_features.map{|f| f[:negatives]}.sum
+
+ if verbose
+ puts self.feature_counts.inspect
+ puts self.positives
+ puts self.negatives
+ end
+
+ test(liked_test,disliked_test).each do |pct,num,precision,recall|
+ puts "#{pct}\t#{num}\t#{precision}\t#{recall}"
+ end
+
+ end
+
+ def log_p_feature(feature)
+
+ p_feature_given_positive = self.feature_counts[feature][:positives].to_f / self.positives.to_f
+ p_feature_given_negative = self.feature_counts[feature][:negatives].to_f / self.negatives.to_f
+ return Math.log(p_feature_given_positive / p_feature_given_negative)
+ end
+
+ def classify(story,verbose=false)
+
+ features = story.features
+ puts features.inspect if verbose
+
+ base_odds = self.positives.to_f / self.negatives.to_f
+
+ usable_features = features.select{|f| self.feature_counts.has_key? f}
+ feature_odds = usable_features.map{|f| log_p_feature(f)}
+
+ if verbose
+
+ puts "#{Math.log(base_odds).round(3)}\t -- base log odds"
+
+ usable_features.zip(feature_odds).each do |f,lo|
+ puts "#{lo.round(3)}\t -- #{f}"
+ end
+ end
+
+ odds = base_odds * Math.exp(feature_odds.sum)
+ # odds = p / n = p / (1 - p), so
+ # p = (1 - p) * odds
+ # p * (1 + odds) = odds, i.e. p = odds / (1 + odds)
+
+ p = odds / (1 + odds)
+
+ puts p if verbose
+
+ return p
+
+ end
+
+ def save(filename=File.join(File.dirname(__FILE__), 'model.mod' ))
+ File.open(filename,'w') { |f| f.write(YAML::dump(self)) }
+ end
+
+ def self.load(filename=File.join(File.dirname(__FILE__), 'model.mod' ))
+ if File.exists? filename
+ return YAML.load(File.read(filename))
+ else
+ return nil
+ end
+ end
+end
+
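Putting model.rb's pieces together from a console session; this assumes judged stories already exist in the database, and the arguments are just the defaults made explicit:

    require_relative "model"        # model.rb already requires story.rb and featurizer.rb

    model = Model.new
    model.train(3, 0.2)             # drop features seen fewer than 3 times, hold out ~20% as a test set
    model.save                      # writes model.mod next to model.rb

    model = Model.load              # nil if model.mod doesn't exist yet
    story = Story.where(:prediction => nil).first
    puts model.classify(story, true) if model && story   # per-feature log odds, then P(like)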
75 scrape_api.rb
@@ -0,0 +1,75 @@
+# scrape new stories from the unofficial hacker news api
+# the API is broken half the time, so this needs to be made more robust
+
+require 'open-uri'
+require 'json'
+require_relative "story"
+require_relative "utils"
+require_relative "model"
+
+base_url = "http://news.ycombinator.com/newest"
+
+puts Time.now
+
+def scrape(max_pages = 1, base_url)
+ new_stories = []
+ url = base_url
+ found_known_story = false
+ model = Model.load
+
+
+ (1..max_pages).each do |i|
+
+ puts "opening page #{i}: #{url}"
+
+ r = open(url).read
+
+ puts "found #{r.size} characters"
+
+ doc = JSON.parse r
+
+ nextId = doc["nextId"]
+
+ doc["items"].each do |item|
+
+ itemid = item["id"]
+
+ if Story.where(:hnid => itemid).count > 0
+ found_known_story = true
+ puts "known story: #{itemid}"
+ else
+ story = Story.new
+ story.hnid = itemid
+ story.link_url = item["url"]
+ story.link_title = item["title"]
+ story.domain = domain(item["url"])
+ story.scraped_at = Time.now
+ story.user = item["postedBy"]
+ story.prediction = model.classify(story) if model
+ new_stories << story
+ end
+ end
+
+ break if found_known_story
+ break unless nextId
+
+ url = "#{base_url}/#{nextId}"
+ puts "moving ahead to #{url}"
+ end
+
+ puts "found #{new_stories.size} new stories"
+ new_stories.each do |s|
+ s.save
+ puts "new story:"
+ puts s.hnid
+ puts s.link_title
+ puts s.link_url
+ end
+end
+
+
+begin
+ scrape(20,"http://api.ihackernews.com/new")
+rescue => e
+ puts "scrape failed (#{e.class}: #{e.message}), try again later"
+end
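The scraper only touches a handful of fields, so this is roughly the response shape it assumes from the unofficial API, inferred from the keys read above; the values are placeholders, not real data:

    # response shape assumed by scrape(), written as a Ruby hash
    sample_response = {
      "nextId" => "abc123",          # appended to base_url to build the next page's URL
      "items"  => [
        { "id"       => 1234567,     # stored as story.hnid
          "title"    => "Example story title",
          "url"      => "http://example.com/post",
          "postedBy" => "someuser" }
      ]
    }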
52 scrape_old.rb
@@ -0,0 +1,52 @@
+# scrapes the archives of hn daily to get old data for training
+
+require "nokogiri"
+require "open-uri"
+load "story.rb"
+load "utils.rb"
+
+root_url = "http://www.daemonology.net/hn-daily" # month pages hang off this, e.g. 2012-01.html
+
+def month_pages(root_url)
+ doc = Nokogiri::HTML(open(root_url))
+ doc.xpath("//div[@class='marginlink']/a").to_a
+ .map { |node| node["href"] }
+ .select { |url| url =~ /[0-9]{4}\-[0-9]{2}\.html/ }
+ .map { |url| "#{root_url}/#{url}" }
+end
+
+
+def scrape(url)
+ doc = Nokogiri::HTML(open(url))
+
+ storylinks = doc.xpath("//li/span[@class='storylink']/a")
+ commentlinks = doc.xpath("//li/span[@class='commentlink']/a")
+
+ if storylinks.size == commentlinks.size
+ (storylinks.zip commentlinks).map do |sl,cl|
+ description = sl.text
+ url = sl["href"]
+ hnurl = cl["href"]
+
+ [description,url,hnurl]
+ end
+ end
+end
+
+month_pages(root_url).each do |url|
+ puts "scraping url"
+ scrape(url).each do |desc,url,hnurl|
+ puts desc
+ puts url
+ puts hnurl
+ story = Story.new
+ story.hnid = hnid_from_url(hnurl)
+ story.link_url = url
+ story.link_title = desc
+ story.domain = domain(url)
+ story.scraped_at = Time.now
+ puts "saving"
+ story.save
+ puts
+ end
+end
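The two XPath expressions are essentially the only assumptions this scraper makes about the hn-daily pages. Reconstructed from the selectors (not copied from the site), the expected markup is roughly:

    # archive index:  <div class="marginlink"><a href="2012-01.html">January 2012</a></div>
    # month page:     <li>
    #                   <span class="storylink"><a href="http://example.com/post">Story title</a></span>
    #                   <span class="commentlink"><a href="http://news.ycombinator.com/item?id=1234567">comments</a></span>
    #                 </li>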
212 stemmable.rb
@@ -0,0 +1,212 @@
+#!/usr/bin/env ruby
+#
+# $Id: stemmable.rb,v 1.2 2003/02/01 02:07:30 condit Exp $
+#
+# See example usage at the end of this file.
+#
+
+module Stemmable
+
+ STEP_2_LIST = {
+ 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
+ 'izer'=>'ize', 'bli'=>'ble',
+ 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
+ 'ization'=>'ize', 'ation'=>'ate',
+ 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
+ 'ousness'=>'ous', 'aliti'=>'al',
+ 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
+ }
+
+ STEP_3_LIST = {
+ 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
+ 'ical'=>'ic', 'ful'=>'', 'ness'=>''
+ }
+
+
+ SUFFIX_1_REGEXP = /(
+ ational |
+ tional |
+ enci |
+ anci |
+ izer |
+ bli |
+ alli |
+ entli |
+ eli |
+ ousli |
+ ization |
+ ation |
+ ator |
+ alism |
+ iveness |
+ fulness |
+ ousness |
+ aliti |
+ iviti |
+ biliti |
+ logi)$/x
+
+
+ SUFFIX_2_REGEXP = /(
+ al |
+ ance |
+ ence |
+ er |
+ ic |
+ able |
+ ible |
+ ant |
+ ement |
+ ment |
+ ent |
+ ou |
+ ism |
+ ate |
+ iti |
+ ous |
+ ive |
+ ize)$/x
+
+
+ C = "[^aeiou]" # consonant
+ V = "[aeiouy]" # vowel
+ CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
+ VV = "#{V}(?>[aeiou]*)" # vowel sequence
+
+ MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
+ MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
+ MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
+ VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
+
+ #
+ # Porter stemmer in Ruby.
+ #
+ # This is the Porter stemming algorithm, ported to Ruby from the
+ # version coded up in Perl. It's easy to follow against the rules
+ # in the original paper in:
+ #
+ # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+ # no. 3, pp 130-137,
+ #
+ # See also http://www.tartarus.org/~martin/PorterStemmer
+ #
+ # Send comments to raypereda@hotmail.com
+ #
+
+ def stem_porter
+
+ # make a copy of the given object and convert it to a string.
+ w = self.dup.to_str
+
+ return w if w.length < 3
+
+ # now map initial y to Y so that the patterns never treat it as vowel
+ w[0] = 'Y' if w[0] == ?y
+
+ # Step 1a
+ if w =~ /(ss|i)es$/
+ w = $` + $1
+ elsif w =~ /([^s])s$/
+ w = $` + $1
+ end
+
+ # Step 1b
+ if w =~ /eed$/
+ w.chop! if $` =~ MGR0
+ elsif w =~ /(ed|ing)$/
+ stem = $`
+ if stem =~ VOWEL_IN_STEM
+ w = stem
+ case w
+ when /(at|bl|iz)$/ then w << "e"
+ when /([^aeiouylsz])\1$/ then w.chop!
+ when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
+ end
+ end
+ end
+
+ if w =~ /y$/
+ stem = $`
+ w = stem + "i" if stem =~ VOWEL_IN_STEM
+ end
+
+ # Step 2
+ if w =~ SUFFIX_1_REGEXP
+ stem = $`
+ suffix = $1
+ # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
+ if stem =~ MGR0
+ w = stem + STEP_2_LIST[suffix]
+ end
+ end
+
+ # Step 3
+ if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
+ stem = $`
+ suffix = $1
+ if stem =~ MGR0
+ w = stem + STEP_3_LIST[suffix]
+ end
+ end
+
+ # Step 4
+ if w =~ SUFFIX_2_REGEXP
+ stem = $`
+ if stem =~ MGR1
+ w = stem
+ end
+ elsif w =~ /(s|t)(ion)$/
+ stem = $` + $1
+ if stem =~ MGR1
+ w = stem
+ end
+ end
+
+ # Step 5
+ if w =~ /e$/
+ stem = $`
+ if (stem =~ MGR1) ||
+ (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
+ w = stem
+ end
+ end
+
+ if w =~ /ll$/ && w =~ MGR1
+ w.chop!
+ end
+
+ # and turn initial Y back to y
+ w[0] = 'y' if w[0] == ?Y
+
+ w
+ end
+
+
+ #
+ # make the stem_porter the default stem method, just in case we
+ # feel like having multiple stemmers available later.
+ #
+ alias stem stem_porter
+
+end
+
+
+
+#
+# Make this script executable, and send it words on stdin, one per
+# line, and it will output the stemmed versions to stdout.
+#
+if $0 == __FILE__ then
+ class String
+ include Stemmable
+ end
+
+ # the String class, and any subclasses of it you might have, now know
+ # how to stem things.
+
+ $stdin.each do |word|
+ puts word.stem
+ end
+end
+
+
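To sanity-check the port, here is a minimal sketch that mixes the module into String (exactly as featurizer.rb does) and runs the classic step 1 examples from Porter's paper:

    require_relative "stemmable"

    class String
      include Stemmable
    end

    %w[caresses ponies cats motoring].each { |w| puts "#{w} -> #{w.stem}" }
    # caresses -> caress
    # ponies -> poni
    # cats -> cat
    # motoring -> motor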
188 stopwords.rb
@@ -0,0 +1,188 @@
+require "set"
+require_relative "stemmable"
+
+class String
+ include Stemmable
+end
+
+STOPWORDS = Set.new [
+ "-",
+ "a",
+ "about",
+ "above",
+ "after",
+ "again",
+ "against",
+ "all",
+ "am",
+ "an",
+ "and",
+ "any",
+ "are",
+ "aren't",
+ "as",
+ "at",
+ "be",
+ "because",
+ "been",
+ "before",
+ "being",
+ "below",
+ "between",
+ "both",
+ "but",
+ "by",
+ "can't",
+ "cannot",
+ "could",
+ "couldn't",
+ "did",
+ "didn't",
+ "do",
+ "does",
+ "doesn't",
+ "doing",
+ "don't",
+ "down",
+ "during",
+ "each",
+ "few",
+ "for",
+ "from",
+ "further",
+ "had",
+ "hadn't",
+ "has",
+ "hasn't",
+ "have",
+ "haven't",
+ "having",
+ "he",
+ "he'd",
+ "he'll",
+ "he's",
+ "her",
+ "here",
+ "here's",
+ "hers",
+ "herself",
+ "him",
+ "himself",
+ "his",
+ "how",
+ "how's",
+ "i",
+ "i'd",
+ "i'll",
+ "i'm",
+ "i've",
+ "if",
+ "in",
+ "into",
+ "is",
+ "isn't",
+ "it",
+ "it's",
+ "its",
+ "itself",
+ "let's",
+ "me",
+ "more",
+ "most",
+ "mustn't",
+ "my",
+ "myself",
+ "no",
+ "nor",
+ "not",
+ "of",
+ "off",
+ "on",
+ "once",
+ "only",
+ "or",
+ "other",
+ "ought",
+ "our",
+ "ours",
+ "ourselves",
+ "out",
+ "over",
+ "own",
+ "same",
+ "shan't",
+ "she",
+ "she'd",
+ "she'll",
+ "she's",
+ "should",
+ "shouldn't",
+ "so",
+ "some",
+ "such",
+ "than",
+ "that",
+ "that's",
+ "the",
+ "their",
+ "theirs",
+ "them",
+ "themselves",
+ "then",
+ "there",
+ "there's",
+ "these",
+ "they",
+ "they'd",
+ "they'll",
+ "they're",
+ "they've",
+ "this",
+ "those",
+ "through",
+ "to",
+ "too",
+ "under",
+ "until",
+ "up",
+ "very",
+ "was",
+ "wasn't",
+ "we",
+ "we'd",
+ "we'll",
+ "we're",
+ "we've",
+ "were",
+ "weren't",
+ "what",
+ "what's",
+ "when",
+ "when's",
+ "where",
+ "where's",
+ "which",
+ "while",
+ "who",
+ "who's",
+ "whom",
+ "why",
+ "why's",
+ "with",
+ "won't",
+ "would",
+ "wouldn't",
+ "you",
+ "you'd",
+ "you'll",
+ "you're",
+ "you've",
+ "your",
+ "yours",
+ "yourself",
+ "yourselves"
+].map {|w| w.stem}
+
+def is_stopword(w)
+ return STOPWORDS.include? w
+end
34 story.rb
@@ -0,0 +1,34 @@
+# the class that represents story objects in the database
+
+require 'mongo_mapper'
+
+MongoMapper.database = 'hackernews'
+
+class Story
+ include MongoMapper::Document
+
+ key :hnid, Integer
+ key :link_url, String
+ key :link_title, String
+ key :domain, String
+ key :user, String
+
+ key :scraped_at, Time
+ key :tweeted_at, Time
+
+ key :like, Boolean
+ key :tweeted, Boolean
+ key :prediction, Float
+
+ scope :untweeted, where(:tweeted => false)
+
+ def inspect
+ "hnid: #{self.hnid}\n#{link_title}\n#{link_url}"
+ end
+end
+
+Story.ensure_index [[:tweeted,1],[:tweeted_at,-1]]
+Story.ensure_index [[:like,1]]
+Story.ensure_index [[:hnid,1]], :unique => true
+
+
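Everything else in the repo goes through this model, so a quick console session is the easiest way to poke at the data; this sketch assumes a local MongoDB with the hackernews database:

    require_relative "story"

    puts Story.count                             # everything scraped so far
    puts Story.where(:like => nil).count         # still waiting to be judged
    puts Story.untweeted.count                   # the scope defined above
    puts Story.where(:hnid => 3571958).first.inspect   # hnid lookups use the unique index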
45 test_featurizer.rb
@@ -0,0 +1,45 @@
+# show the most common features,
+# the most positive features,
+# and the most negative features
+
+require_relative "story"
+require_relative "featurizer"
+
+liked = Story.where(:like => true)
+disliked = Story.where(:like => false)
+
+
+liked_features = Hash[ liked.to_a.map{|story| story.features.to_a}.flatten
+ .group_by{|feat| feat}
+ .map{|k,v| [k,v.size]} ]
+
+disliked_features = Hash[ disliked.to_a.map{|story| story.features.to_a}.flatten
+ .group_by{|feat| feat}
+ .map{|k,v| [k,v.size]} ]
+
+all_features = (liked_features.keys + disliked_features.keys).uniq
+ .map{|k| [k,
+ liked_features.fetch(k,0),
+ disliked_features.fetch(k,0),
+ liked_features.fetch(k,0) + disliked_features.fetch(k,0)] }
+
+
+puts "top 25 features"
+topf = all_features.sort{|r1,r2| r2.last <=> r1.last}.take(25)
+topf.each do |f|
+ puts f.inspect
+end
+puts
+
+puts "top 25 positive features"
+topf = all_features.sort{|r1,r2| (r2[1] - r2[2]) <=> (r1[1] - r1[2])}.take(25)
+topf.each do |f|
+ puts f.inspect
+end
+puts
+
+puts "top 25 negative features"
+topf = all_features.sort{|r1,r2| (r2[2] - r2[1]) <=> (r1[2] - r1[1])}.take(25)
+topf.each do |f|
+ puts f.inspect
+end
20 utils.rb
@@ -0,0 +1,20 @@
+# a couple of small helper utils
+
+require 'set'
+
+def hnid_from_url(hnurl)
+ # hnurl is something like http://news.ycombinator.com/item?id=3571958
+ /id=([0-9]+)/.match(hnurl)
+ return $1.to_i
+end
+
+NEED_SUBDOMAIN = Set.new ["wordpress.com","co.uk"]
+
+def domain(url,default = "ycombinator.com")
+ return default unless url && url[0...4] == "http"
+ /https?:\/\/([^\/]+)/.match(url)
+ pieces = $1.split(".")
+ d = pieces.last(2).join(".")
+ d = pieces.last(3).join(".") if NEED_SUBDOMAIN.include? d
+ return d
+end
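Both helpers are easiest to read off a few calls; apart from the item URL quoted in the comment above, the URLs are made up for illustration:

    require_relative "utils"

    hnid_from_url("http://news.ycombinator.com/item?id=3571958")  # => 3571958
    domain("http://example.com/some/post")                        # => "example.com"
    domain("http://blogs.example.co.uk/post")                     # => "example.co.uk"  (co.uk keeps an extra label)
    domain("Ask HN: self posts have no real url")                 # => "ycombinator.com" (the default)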
18 worst_errors.rb
@@ -0,0 +1,18 @@
+require_relative "story"
+
+false_positives = Story.where(:like => false).sort(:prediction.desc)
+false_negatives = Story.where(:like => true).sort(:prediction)
+
+puts "*** worst false positives ***"
+false_positives.take(25).each do |s|
+ puts s.prediction
+ puts s.inspect
+ puts
+end
+
+puts "*** worst false negatives ***"
+false_negatives.take(25).each do |s|
+ puts s.prediction
+ puts s.inspect
+ puts
+end