Permalink
Fetching contributors…
Cannot retrieve contributors at this time
executable file 39 lines (23 sloc) 959 Bytes
#!/usr/bin/env perl
use 5.010;
use Any::Moose;
use WWW::Mechanize;

=head1 NAME

crunchbase-companies - Fetch company names from L<http://crunchbase.com>

=head1 SYNOPSIS

    perl utils/crunchbase-companies > utils/companies.txt
    perl -Ilib -I/home/avar/src/data-section/lib script/hailo --storage SQLite --tokenizer Chars --train utils/companies.txt --brain utils/companies.db

=head1 DESCRIPTION

Collects company names to use as Hailo training data, in order to emulate
L<http://collison.ie/blog/2009/04/using-mathematica-to-generate-web-20-company-names>

=cut

# Use UTF-8 on all three standard handles, so that names printed to STDOUT
# and progress messages on STDERR are encoded consistently.
binmode $_, ':encoding(utf8)' for (*STDIN, *STDOUT, *STDERR);

my $mech = WWW::Mechanize->new;

# Used as a set: the keys are the company names, the values are never read.
my %companies;

# One index page is requested per value of the "c" query parameter:
# 'a' .. 'z' and finally 'other'.
for my $letter ('a' .. 'z', 'other') {
    say STDERR "Getting companies $letter*";
    $mech->get("http://www.crunchbase.com/companies?c=$letter");

    # Every matching link contributes two captures -- its title attribute
    # and its link text -- and both are recorded. Identical strings collapse
    # into a single hash key below.
    my @names = $mech->content
        =~ m[<a href="/company/[^"]+" title="(.*?)">(.*?)</a></li>]g;

    # A page without a single match may mean the site's markup has changed;
    # report it rather than silently producing a shorter list.
    warn "No companies found for $letter*\n" unless @names;

    @companies{@names} = ();
}

# Sort so that repeated runs over the same data produce identical output;
# a bare "keys" returns the names in an unspecified order.
say for sort keys %companies;