Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

initial - 0.1

  • Loading branch information...
commit 6ac72ac7db91ec06d1f48f5459ef6fc58bd17c4e 0 parents
brez authored
1  LICENSE
@@ -0,0 +1 @@
+PUBLIC DOMAIN
45 README.textile
@@ -0,0 +1,45 @@
+h1. STOPWORDS
+
+h2. REALLY JUST A LIST OF STOPWORDS
+
+Obviously part of something bigger but worth breaking out for reuse.
+
+h2. USAGE
+
+<pre><code>
+
+require 'stopwords'
+
+#List all stop words
+Stopwords::STOP_WORDS
+
+#Test to see if a token is a stop word
+Stopwords.is?('and')
+
+=>true
+
+#Ensures a token is both a 'word' and not a stop word
+Stopwords.valid?('vector')
+
+=>true
+
+</code></pre>
+
+h2. SPECS
+
+<pre><code>
+$ rake spec
+</code></pre>
+
+h2. SANITIZE
+
+Not part of the library, but you should probably sanitize tokens before using them (if your tokenizer doesn't already).
+
+<pre><code>
+SANITIZE_REGEXP = /('|\"|‘|’|\/|\\)/
+text.downcase.gsub(SANITIZE_REGEXP, '')
+</code></pre>
+
+h2. ENDAX
+
+Software Services shop (primarily Ruby) in Brooklyn, NY.
26 Rakefile
@@ -0,0 +1,26 @@
+require 'rake'
+require 'rubygems'
+require 'rake/gempackagetask'
+require 'spec/rake/spectask'
+
+desc "Run the specs under spec"
+Spec::Rake::SpecTask.new do |t|
+ t.spec_files = FileList['spec/**/*_spec.rb']
+ t.spec_opts << "-c"
+end
+
+spec = Gem::Specification.new do |s|
+ s.name = 'stopwords'
+ s.version = '0.1'
+ s.require_path = 'lib'
+ s.description = 'A stopword library'
+ s.summary = 'A stopword library'
+ s.files = FileList["{bin,docs,lib,test}/**/*"].exclude("rdoc").to_a
+ s.author = "ENDAX, LLC"
+ s.email = "john@endax.com"
+ s.homepage = "http://endax.github.com/"
+end
+
+Rake::GemPackageTask.new(spec) do |pkg|
+ pkg.need_tar = true
+end
2  spec/spec_helper.rb
@@ -0,0 +1,2 @@
+require 'spec'
+require 'stopwords'
29 spec/stopwords_spec.rb
@@ -0,0 +1,29 @@
+require 'spec/spec_helper'
+
+describe Stopwords do
+ describe '.is?' do
+ it "should return true if a given token is stopword" do
+ Stopwords.is?('and').should be(true)
+ end
+ it "should return false if a given token is not stopword" do
+ Stopwords.is?('red').should be(false)
+ end
+ end
+ describe '.valid?' do
+ it "should return true if a given token is a valid word and not a stopword" do
+ Stopwords.valid?('vector').should be(true)
+ end
+ it "should return true if a given token is has got extra characters" do
+ Stopwords.valid?('@#2-+~}v').should be(false)
+ end
+ it "should return false if a given token is stopword" do
+ Stopwords.valid?('and').should be(false)
+ end
+ it "should return false if a given token is just nonsense" do
+ Stopwords.valid?('@#2-+~}v').should be(false)
+ end
+ it "should return false if a given token is blank" do
+ Stopwords.valid?('').should be(false)
+ end
+ end
+end
43 stopwords.rb
@@ -0,0 +1,43 @@
+module Stopwords
+
+ STOP_WORDS = [
+ 'a','cannot','into','our','thus','about','co','is','ours','to','above',
+ 'could','it','ourselves','together','across','down','its','out','too',
+ 'after','during','itself','over','toward','afterwards','each','last','own',
+ 'towards','again','eg','latter','per','under','against','either','latterly',
+ 'perhaps','until','all','else','least','rather','up','almost','elsewhere',
+ 'less','same','upon','alone','enough','ltd','seem','us','along','etc',
+ 'many','seemed','very','already','even','may','seeming','via','also','ever',
+ 'me','seems','was','although','every','meanwhile','several','we','always',
+ 'everyone','might','she','well','among','everything','more','should','were',
+ 'amongst','everywhere','moreover','since','what','an','except','most','so',
+ 'whatever','and','few','mostly','some','when','another','first','much',
+ 'somehow','whence','any','for','must','someone','whenever','anyhow',
+ 'former','my','something','where','anyone','formerly','myself','sometime',
+ 'whereafter','anything','from','namely','sometimes','whereas','anywhere',
+ 'further','neither','somewhere','whereby','are','had','never','still',
+ 'wherein','around','has','nevertheless','such','whereupon','as','have',
+ 'next','than','wherever','at','he','no','that','whether','be','hence',
+ 'nobody','the','whither','became','her','none','their','which','because',
+ 'here','noone','them','while','become','hereafter','nor','themselves','who',
+ 'becomes','hereby','not','then','whoever','becoming','herein','nothing',
+ 'thence','whole','been','hereupon','now','there','whom','before','hers',
+ 'nowhere','thereafter','whose','beforehand','herself','of','thereby','why',
+ 'behind','him','off','therefore','will','being','himself','often','therein',
+ 'with','below','his','on','thereupon','within','beside','how','once',
+ 'these','without','besides','however','one','they','would','between','i',
+ 'only','this','yet','beyond','ie','onto','those','you','both','if','or',
+ 'though','your','but','in','other','through','yours','by','inc','others',
+ 'throughout','yourself','can','indeed','otherwise','thru','yourselves'
+ ]
+ TOKEN_REGEXP = /^[a-z]+$|^\w+\-\w+|^[a-z]+[0-9]+[a-z]+$|^[0-9]+[a-z]+|^[a-z]+[0-9]+$/
+
+ def self.is?(token)
+ STOP_WORDS.member?(token)
+ end
+
+ def self.valid?(token)
+ (((token =~ TOKEN_REGEXP) == 0)) and !(STOP_WORDS.member?(token))
+ end
+
+end
Please sign in to comment.
Something went wrong with that request. Please try again.