-
Notifications
You must be signed in to change notification settings - Fork 0
/
Crawler.pm
95 lines (62 loc) · 2.1 KB
/
Crawler.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package Crawler;
use Moose;
use utf8;
BEGIN { extends qw/ LWP::RobotUA / };
# Per-instance crawl state. All attributes are read/write accessors.
# NOTE(review): the parent class LWP::RobotUA is presumably not a Moose class,
# so these may have to be set through the accessors rather than via new() --
# confirm against the code that constructs the crawler.

# Scalar reference to the decoded content of the page fetched most recently.
has page_source => (
    is  => 'rw',
    isa => 'ScalarRef',
);

# Array references holding the links and the words extracted from the page.
has [qw/ urls words /] => (
    is  => 'rw',
    isa => 'ArrayRef',
);

# Collaborators: `logger` must provide info(), `db` must provide resultset().
has [qw/ logger db /] => (
    is  => 'rw',
    isa => 'Object',
);
# Persist the collected words into the 'Schema::Word' result set.
#
# Every word is lower-cased and written with update_or_create, keyed on the
# `root` column. Undefined and empty words are skipped, and each distinct
# lower-cased word is written only once (in first-seen order), which avoids
# one redundant database round trip per repeated word.
#
# Does nothing when no words have been collected yet.
# Returns nothing.
sub store_results {
    my $self = shift;

    my $collected = $self->words;
    return unless $collected && @{$collected};

    my $word_rs = $self->db->resultset('Schema::Word');

    my %already_stored;
    for my $word ( @{$collected} ) {
        next unless defined $word && length $word;

        my $root = lc $word;
        next if $already_stored{$root}++;

        $word_rs->update_or_create( { root => $root } );
    }

    return;
}
# Extract link targets from the current page source and store them in `urls`.
#
# The value of every href attribute is collected up to (not including) the
# first '?', '#', '>', whitespace or quote character, so query strings and
# fragments are cut off. Double-quoted, single-quoted and unquoted attribute
# values are all accepted, as is whitespace around the '='.
#
# Links using the mailto: scheme are dropped. Only the scheme prefix is
# checked, so ordinary URLs that merely contain the text "mailto" are kept.
#
# When no page source has been loaded, `urls` is set to an empty list.
# The number of links found is logged; the return value is whatever the
# logger's info() returns.
sub parse_urls {
    my $self = shift;

    my $source_ref = $self->page_source;
    my $html = ( $source_ref && defined ${$source_ref} ) ? ${$source_ref} : q{};

    # Pull out every href value.
    my @found;
    while ( $html =~ m/href\s*=\s*["']?([^?#>\s"']+)/ig ) {
        push @found, $1;
    }

    # Drop e-mail links.
    my @urls = grep { $_ !~ m/\Amailto:/i } @found;

    # TODO: drop links to static resources.

    $self->urls( \@urls );
    $self->logger->info( "Найдено " . scalar(@urls) . " урлов." );
}
# Extract the words of the current page's <body> and store them in `words`.
#
# Steps applied to the body markup:
#   1. script, style and form elements are removed together with their
#      content (img is a void element and is removed with the other tags);
#   2. all remaining tags are replaced by a space;
#   3. character entities (&name; &#123; &#x1F;) are replaced by a space;
#   4. every run of non-word characters is collapsed into one space;
#   5. the text is split on whitespace, discarding empty fields.
#
# `words` is always overwritten: when no page source is loaded or the page has
# no <body> element it becomes an empty list, so words from a previously
# parsed page are never carried over.
#
# The number of words found is logged; the return value is whatever the
# logger's info() returns.
sub parse_words {
    my $self = shift;

    my $source_ref = $self->page_source;
    my @words;

    # Isolate the document body.
    if (   $source_ref
        && defined ${$source_ref}
        && ${$source_ref} =~ m{(<body[^>]*>.+</body>)}si )
    {
        my $body = $1;

        # Remove whole elements whose content is not page text. The content
        # may contain '<' (e.g. comparisons in scripts), hence the lazy '.*?'.
        for my $tag (qw/ script style form /) {
            $body =~ s{<\Q$tag\E\b[^>]*>.*?</\Q$tag\E\s*>}{ }gis;
        }

        # Strip the remaining tags.
        $body =~ s/<[^>]+>/ /g;

        # Strip character entities only; a bare '&' must not swallow the text
        # up to the next ';'.
        $body =~ s/&(?:\#\d+|\#x[0-9a-f]+|[a-z][a-z0-9]*);/ /gi;

        # Collapse punctuation and whitespace.
        $body =~ s/\W+/ /g;

        # The special ' ' pattern skips leading whitespace, so no empty word
        # is produced for the space left by the opening <body> tag.
        @words = split ' ', $body;
    }

    $self->words( \@words );
    $self->logger->info( "Найдено " . scalar(@words) . " слов." );
}
# Fetch $url with the inherited get() and remember its decoded content.
#
# On a successful response the URL is logged and a reference to the decoded
# content is stored in `page_source`; the result of that accessor call is
# returned. On any other response the method dies with the response's
# status line.
sub load_page_source {
    my $self = shift;
    my $url  = shift;

    my $response = $self->get($url);

    # Bail out early on a failed request.
    die $response->status_line unless $response->is_success;

    $self->logger->info( 'Ищу на странице ' . $url );
    return $self->page_source( \$response->decoded_content );
}
#__PACKAGE__->meta->make_immutable;
1;