Skip to content

holmofy/spring-spider

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

54 Commits
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Build Status(https://github.com/holmofy/spring-spider/actions/workflows/package.yaml/badge.svg) coverage

A simple crawler library built on Spring Boot.

Features

  • Supports JsonPath, jsoup, and XPath for extracting data from responses
  • Integrates Playwright to support pages that rely on JavaScript, such as single-page applications
  • Supports raw HTTP messages

How to use

  1. Requirements: Spring Boot 3.0, Java 17

  2. Add the dependency

<dependency>
    <groupId>io.github.holmofy</groupId>
    <artifactId>spring-spider</artifactId>
    <version>1.3.3</version>
</dependency>
  3. Use JsonPath, jsoup, and XPath
/**
 * Usage examples for the three response parsers: JsonPath, jsoup and XPath.
 *
 * <p>Each test downloads a public URL with the simple downloader and then
 * extracts a value from the response using one of the parsers.
 */
public class Example {

    /** Reads the {@code current_user_url} field of the GitHub API root document via JsonPath. */
    @Test
    public void test_jsonpath() {
        Downloader simpleDownloader = Downloader.builder().simple();
        var response = simpleDownloader.download(CrawlerRequest.get("https://api.github.com/").build());

        String currentUserUrl = response.jsonPath().read("$.current_user_url");

        Assert.assertEquals("https://api.github.com/user", currentUserUrl);
    }

    /** Selects the repository links of a GitHub search page with a CSS selector via jsoup. */
    @Test
    public void test_jsoup() {
        Downloader simpleDownloader = Downloader.builder().simple();
        var response = simpleDownloader.download(CrawlerRequest.get("https://github.com/search?q=spider").build());

        var repoLinks = response.jsoup()
                .select("div.application-main ul.repo-list > li > div.mt-n1.flex-auto > div.d-flex > div > a");
        List<String> repoNames = repoLinks.eachText();

        Assert.assertEquals(10, repoNames.size());
        System.out.println(repoNames);
    }

    /** Selects the {@code loc} nodes of a sitemap index via XPath and checks the first one. */
    @Test
    public void test_xpath() {
        Downloader simpleDownloader = Downloader.builder().simple();
        var response = simpleDownloader.download(CrawlerRequest.get("https://www.douban.com/sitemap_index.xml").build());

        var locNodes = response.xPath().select("/sitemapindex/sitemap/loc");
        String firstLocation = locNodes.item(0).getTextContent();

        Assert.assertEquals("https://www.douban.com/sitemap.xml.gz", firstLocation);
    }
}

Playwright

/** Minimal example: obtain the Playwright downloader from the downloader builder. */
public class Example {

    /**
     * Creates the Playwright downloader; what to do with it is left to the reader.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        var downloaderBuilder = Downloader.builder();
        Downloader playwrightDownloader = downloaderBuilder.playwright();
        //...
    }
}

Raw HTTP request

import io.github.holmofy.spider.CrawlerResponse;
import io.github.holmofy.spider.Downloader;

/** Example: build a request from a raw HTTP message and download it with the simple downloader. */
public class Example {

    /**
     * Parses a raw HTTP message (request line, headers, blank line, body) into a
     * request and sends it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // The raw message is kept as a text block; the empty line separates headers from the body.
        String rawMessage = """
                POST https://login.example.com/api/users/login
                Accept: application/json, text/plain, */*
                Content-Type: application/x-www-form-urlencoded;charset=UTF-8
                Cookie: 0bd17c6216775852668436416eaee18367962376820602ec6d9cbff1f07b4c
                User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36

                user=admin&password=123456
                """;
        CrawlerRequest loginRequest = CrawlerRequest.parseRaw(rawMessage);

        Downloader simpleDownloader = Downloader.builder().simple();
        CrawlerResponse response = simpleDownloader.download(loginRequest);
        // ...
    }
}

Releases

No releases published

Packages

No packages published

Languages