Skip to content

Commit

Permalink
Merge pull request #146 from kermitt2/feature/add-additional-dl-models
Browse files Browse the repository at this point in the history
Add additional DL and transformers models
  • Loading branch information
lfoppiano committed Dec 9, 2022
2 parents 4660fb2 + 0b2b138 commit 3b6ffec
Show file tree
Hide file tree
Showing 53 changed files with 102,058 additions and 45 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/ci-build-unstable.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CI workflow: on every push, build the project with Gradle, then run the
# test suite with the JaCoCo report and Coveralls upload tasks.
name: Build unstable

on: [push]

# Every run joins the single "gradle" concurrency group; a newly queued run
# cancels the one that is still in progress.
concurrency:
  group: gradle
  cancel-in-progress: true


jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v1
      - name: Set up JDK 11
        uses: actions/setup-java@v1
        with:
          # Quoted so YAML keeps the version a string instead of parsing it as a float.
          java-version: '1.11'
      # Compile and package only; tests run in the dedicated step below.
      - name: Build with Gradle
        run: ./gradlew build -x test

      - name: Test with Gradle Jacoco and Coveralls
        run: ./gradlew test jacocoTestReport coveralls --no-daemon

#      - name: Coveralls GitHub Action
#        uses: coverallsapp/github-action@v1.0.1
#        with:
#          github-token: ${{ secrets.GITHUB_TOKEN }}
51 changes: 11 additions & 40 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,16 @@ FROM openjdk:8u342-jdk as builder

USER root

RUN apt-key del 7fa2af80 && \
curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb --output /opt/cuda-keyring_1.0-1_all.deb && \
dpkg -i /opt/cuda-keyring_1.0-1_all.deb && \
apt-get update && \
RUN apt-get update && \
apt-get -y --no-install-recommends install apt-utils libxml2 git

#RUN git clone https://github.com/kermitt2/grobid.git /opt/grobid-source && cd /opt/grobid-source && git checkout 0.7.1
RUN git clone --filter=blob:none --branch 0.7.1 --no-checkout https://github.com/kermitt2/grobid.git /opt/grobid-source && \
cd /opt/grobid-source && \
git sparse-checkout set --cone grobid-home

WORKDIR /opt/grobid-source
COPY gradle.properties .

#RUN git clone https://github.com/kermitt2/grobid-quantities.git ./grobid-quantities && cd grobid-quantities && git checkout 0.7.1
RUN git clone --depth 1 --branch 0.7.1 https://github.com/kermitt2/grobid-quantities.git ./grobid-quantities && \
RUN git clone --depth 1 --branch feature/add-additional-dl-models https://github.com/kermitt2/grobid-quantities.git ./grobid-quantities && \
cd grobid-quantities

WORKDIR /opt/grobid-source/grobid-quantities
#COPY gradle.properties .
COPY gradle.properties .

# Adjust config
RUN sed -i '/#Docker-ignore-log-start/,/#Docker-ignore-log-end/d' ./resources/config/config.yml
Expand All @@ -49,66 +39,47 @@ RUN rm -rf /opt/grobid-source/grobid-home/models/*

WORKDIR /opt/grobid-source/grobid-quantities
RUN ./gradlew clean assemble --no-daemon --stacktrace --info
#RUN ./gradlew installScibert --no-daemon --info --stacktrace && rm -f /opt/grobid-source/grobid-home/models/*.zip
RUN ./gradlew copyModels --no-daemon --info --stacktrace && rm -f /opt/grobid-source/grobid-home/models/*.tar.gz

RUN ./gradlew downloadTransformers --no-daemon --info --stacktrace && rm -f /opt/grobid-source/grobid-home/models/*.zip

WORKDIR /opt

# -------------------
# build runtime image
# -------------------

FROM grobid/grobid:0.7.1u as runtime
FROM grobid/grobid:0.7.2 as runtime

# setting locale is likely useless but to be sure
ENV LANG C.UTF-8

COPY --from=builder /opt/cuda-keyring_1.0-1_all.deb /opt

# install JRE 8, python and other dependencies
RUN apt-key del 7fa2af80 && \
# curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb --output cuda-keyring_1.0-1_all.deb && \
dpkg -i /opt/cuda-keyring_1.0-1_all.deb && \
rm /opt/cuda-keyring*.deb
# rm /etc/apt/sources.list.d/cuda.list && \
# rm /etc/apt/sources.list.d/nvidia-ml.list

RUN apt-get update && \
apt-get -y --no-install-recommends install git wget
# apt-get -y remove python3.6 && \
# DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \
# apt-get -y --no-install-recommends install git python3.7 python3.7-venv python3.7-dev python3.7-distutil

WORKDIR /opt/grobid

RUN mkdir -p /opt/grobid/grobid-quantities/resources/clearnlp/models /opt/grobid/grobid-quantities/resources/clearnlp/config
COPY --from=builder /opt/grobid-source/grobid-home/models ./grobid-home/models
COPY --from=builder /opt/grobid-source/grobid-quantities/build/libs/* ./grobid-quantities/
COPY --from=builder /opt/grobid-source/grobid-quantities/resources/config/config.yml ./grobid-quantities/
COPY --from=builder /opt/grobid-source/grobid-quantities/resources/clearnlp/models/* ./grobid-quantities/resources/clearnlp/models

VOLUME ["/opt/grobid/grobid-home/tmp"]

# Install requirements
WORKDIR /opt/grobid

#RUN ln -s /opt/grobid/delft/ delft
RUN ln -s /opt/grobid/grobid-quantities/resources /opt/grobid/resources

# JProfiler
#RUN wget https://download-gcdn.ej-technologies.com/jprofiler/jprofiler_linux_12_0_2.tar.gz -P /tmp/ && \
# tar -xzf /tmp/jprofiler_linux_12_0_2.tar.gz -C /usr/local &&\
# rm /tmp/jprofiler_linux_12_0_2.tar.gz

EXPOSE 8060 8061 5005

#CMD ["java", "-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=0.0.0.0:5005", "-jar", "grobid-quantities-0.7.1-SNAPSHOT-onejar.jar", "server", "config.yml"]
#CMD ["java", "-agentpath:/usr/local/jprofiler12.0.2/bin/linux-x64/libjprofilerti.so=port=8849", "-jar", "grobid-superconductors/grobid-superconductors-0.2.1-SNAPSHOT-onejar.jar", "server", "grobid-superconductors/config.yml"]
CMD ["java", "-jar", "grobid-quantities/grobid-quantities-0.7.2-SNAPSHOT-onejar.jar", "server", "grobid-quantities/config.yml"]

ARG GROBID_VERSION
ENV GROBID_VERSION=${GROBID_VERSION:-unknown}

EXPOSE 8060 8061 5005

#CMD ["java", "-agentpath:/usr/local/jprofiler12.0.2/bin/linux-x64/libjprofilerti.so=port=8849", "-jar", "grobid-superconductors/grobid-quantities-${GROBID_VERSION}-onejar.jar", "server", "grobid-superconductors/config.yml"]
CMD ["sh", "-c", "java -jar grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar server grobid-quantities/config.yml"]
CMD ["sh", "-c", "java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=0.0.0.0:5005 -jar grobid-quantities/grobid-quantities-${GROBID_VERSION}-onejar.jar server grobid-quantities/config.yml"]

LABEL \
authors="Luca Foppiano, Patrice Lopez" \
Expand Down
36 changes: 32 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ buildscript {
url "https://plugins.gradle.org/m2/"
}
}
dependencies {
classpath group: 'org.yaml', name: 'snakeyaml', version: '1.19'
}
}

plugins {
Expand All @@ -15,6 +18,7 @@ plugins {
id 'jacoco'
id 'com.github.kt3k.coveralls' version '2.12.0'
id 'com.github.johnrengelman.shadow' version '7.0.0'
id "de.undercouch.download" version "4.1.1"
// id 'net.researchgate.release' version '3.0.1'
}

Expand Down Expand Up @@ -59,9 +63,9 @@ dependencies {
testImplementation 'org.powermock:powermock-api-easymock:2.0.9'

//GROBID
implementation 'org.grobid:grobid-core:0.7.1u'
implementation 'org.grobid:grobid-trainer:0.7.1u'
implementation 'org.grobid:grobid-service:0.7.1u'
implementation 'org.grobid:grobid-core:0.7.2'
implementation 'org.grobid:grobid-trainer:0.7.2'
implementation 'org.grobid:grobid-service:0.7.2'
implementation "xerces:xercesImpl:2.12.0"
implementation "net.arnx:jsonic:1.3.10"
implementation "net.sf.saxon:Saxon-HE:9.6.0-9"
Expand Down Expand Up @@ -236,14 +240,38 @@ publishing {
}
}

// Load the service configuration to find out where grobid-home is located.
// `file(...)` resolves the path against the project directory (a bare
// `new File("relative/path")` resolves against the JVM working directory instead),
// and `withInputStream` closes the stream once the YAML has been parsed.
def conf = file("resources/config/config.yml").withInputStream { configStream ->
    new org.yaml.snakeyaml.Yaml().load(configStream)
}
// Strip the "$", "{", "GROBID_HOME:- " and "}" pieces from the configured value so that
// only the path part remains.
def grobidHome = conf.grobidHome.replace("\$", "").replace('{', "").replace("GROBID_HOME:- ", "").replace("}", "")

/** Model management **/

/**
 * Copies the model files found under resources/models (Wapiti models, model and
 * transformer configuration, tokenizer files, weights and preprocessor) into the
 * models directory of grobid-home, skipping feature-engineering and result-log folders.
 */
task copyModels(type: Copy) {
    from "${rootDir}/resources/models"

    include "**/*.wapiti"
    include "**/config.json"
    include "**/transformer-config.json"
    include "**/transformer-tokenizer/**"
    include "**/model_weights.hdf5"
    include "**/preprocessor.json"

    exclude "**/features-engineering/**"
    exclude "**/result-logs/**"

    // A Copy task has a single destination: only the last `into` call is kept, so the
    // destination is declared exactly once, derived from the configured grobid-home.
    into "${rootDir}/${grobidHome}/models/"

    doLast {
        print "Copy models under grobid-home: ${grobidHome}"
    }
}

/**
 * Downloads the zip archive of transformer model files into the grobid-home models
 * directory and unpacks it there. Runs after copyModels.
 */
task downloadTransformers(dependsOn: copyModels) {
    doLast {
        // Both the archive and its extraction target live in the same models directory.
        def modelsDir = "${rootDir}/${grobidHome}/models/"
        def transformersZip = "${modelsDir}quantities-transformers.zip"

        download {
            src("https://kdrive.infomaniak.com/app/share/104844/e458be50-b94c-4e16-b190-6fa17db58ef0/147421/download")
            dest(transformersZip)
            // Keep an archive that is already present instead of fetching it again.
            overwrite(false)
            print "Download bulky transformers files under grobid-home: ${grobidHome}"
        }

        ant.unzip(src: transformersZip, dest: modelsDir)
    }
}

wrapper {
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added resources/dataset/original/pdf/hal-00987664.pdf
Binary file not shown.
Binary file added resources/dataset/original/pdf/hal-01223150.pdf
Binary file not shown.
Binary file added resources/dataset/original/pdf/hal-01252076.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion resources/dataset/values/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The information is provided in the `quantities` dataset. [Here](../quantities/)
| holdout | 1009 | 498 |
| ratio | 25.79% | 26.11% |


## Labels information

| set | `<number>` | `<alpha>` | `<time>` | `<base>` | `<pow>` |
Expand All @@ -17,7 +18,6 @@ The information is provided in the `quantities` dataset. [Here](../quantities/)
| ratio | 27.30% | 18.67% | 23.47% | 35.14% | 38.24% |



## In-domain / out-of-domain information

| label | # in domain | # in domain uniques | # out domain | # out domain unique |
Expand Down
46 changes: 46 additions & 0 deletions resources/models/quantities-BERT_CRF/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"model_name": "quantities-BERT_CRF",
"architecture": "BERT_CRF",
"embeddings_name": null,
"char_vocab_size": 223,
"case_vocab_size": 8,
"char_embedding_size": 25,
"num_char_lstm_units": 25,
"max_char_length": 30,
"features_vocabulary_size": 12,
"features_indices": null,
"features_embedding_size": 4,
"features_lstm_units": 4,
"max_sequence_length": 512,
"word_embedding_size": 0,
"num_word_lstm_units": 100,
"case_embedding_size": 5,
"dropout": 0.5,
"recurrent_dropout": 0.5,
"use_crf": true,
"use_chain_crf": false,
"fold_number": 1,
"batch_size": 10,
"transformer_name": "allenai/scibert_scivocab_cased/dir",
"use_ELMo": false,
"labels": {
"<PAD>": 0,
"B-<unitLeft>": 1,
"B-<unitRight>": 2,
"B-<valueAtomic>": 3,
"B-<valueBase>": 4,
"B-<valueLeast>": 5,
"B-<valueList>": 6,
"B-<valueMost>": 7,
"B-<valueRange>": 8,
"I-<unitLeft>": 9,
"I-<unitRight>": 10,
"I-<valueAtomic>": 11,
"I-<valueBase>": 12,
"I-<valueLeast>": 13,
"I-<valueList>": 14,
"I-<valueMost>": 15,
"I-<valueRange>": 16,
"O": 17
}
}

0 comments on commit 3b6ffec

Please sign in to comment.