girie ("go" + "kirie") is a tool for data/metadata extraction from web pages.
- Provide a microservice with a GraphQL API for ETL pipelines.
- Provide a plugin endpoint for another tool, gosquito.
- Extract the primary article (boilerpipe, go-domdistiller) from a web page (HTML and text).
- Extract JSON-LD.
- Extract Microdata.
- Extract Open Graph.
- Extract RDFa.
- Extract images from an entire page or from a page's article.
# Start daemon:
user@localhost ~ $ docker run --name girie -ti --rm ghcr.io/livelace/girie:v1.5.0
INFO[16.01.2021 11:38:59.101] girie v1.5.0
WARN[16.01.2021 11:38:59.102] config error error="Config File \"config.toml\" Not Found in \"[/etc/girie]\""
INFO[16.01.2021 11:38:59.102] listen :8080
# Get API IP:
SERVER=`docker inspect -f "{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}" girie`
# GET + URL:
user@localhost ~ $ docker exec girie curl -s -L -g --request GET \
'http://127.0.0.1:8080/api/?query={data(url:"https://iz.ru/1091344/2020-11-24/effektivnost-vaktciny-sputnik-v-prevysila-95"){article{text_spans{lang,text,tokens_amount}}}}' | jq
# POST + URL:
QUERY=`cat << EOF
{
"query": "{
data(url: \"https://iz.ru/1091344/2020-11-24/effektivnost-vaktciny-sputnik-v-prevysila-95\") {
page{
images{alt,height,src,width}
}
}
}"
}
EOF
`
QUERY=`echo $QUERY | tr -d " \n"`
curl -s -L -X POST "http://${SERVER}:8080/api/?retry=3&timeout=3" \
--header "Content-Type: application/json" \
--data-raw "${QUERY}" | jq
# POST + HTML:
BASE64=`curl -s "https://iz.ru/1091344/2020-11-24/effektivnost-vaktciny-sputnik-v-prevysila-95" | base64 -w0`
QUERY=`cat << EOF
{
"query": "{
data(html: \"${BASE64}\") {
article{
html,
images{alt,height,src,width},
text,
text_spans{lang,text,tokens_amount},
text_spans_append{lang,text,tokens_amount},
text_spans_block{lang,text,tokens_amount},
},
html,
url,
page{
html,
jsonld,
images{alt,height,src,width},
lang,
microdata,
opengraph,
rdfa,
text,
title
}
}
}"
}
EOF
`
echo $QUERY | tr -d " \n" > "/tmp/query.json"
curl -s -L -X POST "http://${SERVER}:8080/api/?retry=3&timeout=3" \
--header "Content-Type: application/json" \
--data "@/tmp/query.json" | jq
[default]
# Options priority order (top -> down):
# 1. Configuration file.
# 2. Environment variables.
# 3. Query options.
# env: GIRIE_LISTEN=":8080"
# listen = ":8080"
# env: GIRIE_PROXY="http://127.0.0.1:3128"
# url: http://127.0.0.1:8080/api/?proxy="http://127.0.0.1:3128"
# proxy = "http://127.0.0.1:3128"
# env: GIRIE_RETRY=2
# url: http://127.0.0.1:8080/api/?retry=2
# retry = 2
# env: GIRIE_TIMEOUT=2
# url: http://127.0.0.1:8080/api/?timeout=2
# timeout = 10
# env: GIRIE_USER_AGENT="girie v1.5.0"
# url: http://127.0.0.1:8080/api/?user_agent="curl 3000"
# user_agent = "girie v1.5.0"