Permalink
Browse files

Match tabs as tokens

  • Loading branch information...
1 parent a4af087 · commit 69fdf1ac410fcb635edadffc7317e9c1cdd6256a · @hinrik committed May 5, 2011
Showing with 8 additions and 1 deletion.
  1. +2 −0 Changes
  2. +1 −1 lib/Hailo/Tokenizer/Words.pm
  3. +5 −0 t/tokenizer/Words.t
View
@@ -7,6 +7,8 @@ Revision history for Hailo
- Word tokenizer: Improve matching/capitalization of filenames and
domain names
+ - Word tokenizer: Match tabs as tokens
+
0.68 2011-05-03 13:16:05
- Speed up the learning of repetitive sentences by caching more
@@ -70,7 +70,7 @@ sub make_tokens {
my @tokens;
$input =~ s/$DASH\K\s*\n+\s*//;
$input =~ s/\s*\n+\s*/ /gm;
- my @chunks = split /\s+/, $input;
+ my @chunks = split /[\n ]+/, $input;
# process all whitespace-delimited chunks
for my $chunk (@chunks) {
View
@@ -468,6 +468,11 @@ subtest make_output => sub {
[qw<the file is C:\\hlagh\\bar\\foo.txt . just read it .>],
"The file is C:\\hlagh\\bar\\foo.txt. Just read it.",
],
+ [
+ "Tabs\ttabs\ttabs.",
+ ['tabs', "\t", 'tabs', "\t", 'tabs', '.'],
+ "Tabs\ttabs\ttabs.",
+ ],
);
my $toke = Hailo::Tokenizer::Words->new();

0 comments on commit 69fdf1a

Please sign in to comment.