<a href="https://colab.research.google.com/github/ivelin/gui2refexp/blob/main/gui_refexp_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GUI RefExp Notebook

*   Based on the LayoutLM model suggested in the [IPA paper](https://github.com/debymf/ipa_probing)
*   Initial fine-tuning on the Rico SCA dataset



Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [11]:
#@title Checkout source files from github repo
![[ -d "gui2refexp" ]] || git clone https://github.com/ivelin/gui2refexp.git

!cd gui2refexp && git pull

!python --version


Already up to date.
Python 3.8.16


In [8]:
!#@title Install third party libs
!pip install -r gui2refexp/requirements.txt


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
!pip show prefect

import prefect

import prefect.engine as e

help(e)
dir(e)

Name: prefect
Version: 0.11.1
Summary: The Prefect Core automation and scheduling engine.
Home-page: https://www.github.com/PrefectHQ/prefect
Author: Prefect Technologies, Inc.
Author-email: help@prefect.io
License: Apache License 2.0
Location: /usr/local/lib/python3.8/dist-packages
Requires: click, cloudpickle, croniter, dask, distributed, docker, marshmallow, marshmallow-oneofschema, mypy-extensions, pendulum, python-box, python-dateutil, python-slugify, pytz, pyyaml, requests, tabulate, toml, urllib3
Required-by: 
Help on module prefect.engine in prefect:

NAME
    prefect.engine - Client-side execution and orchestration of flows and tasks.

DESCRIPTION
    Engine process overview
    
    - The flow or task is called by the user.
        See `Flow.__call__`, `Task.__call__`
    
    - A synchronous engine function acts as an entrypoint to the async engine.
        See `enter_flow_run_engine`, `enter_task_run_engine`
    
    - The async engine creates a run via the API and prepares

['Abort',
 'Any',
 'AsyncExitStack',
 'Awaitable',
 'BaseResult',
 'BaseTaskRunner',
 'CONCURRENCY_MESSAGES',
 'Dict',
 'EngineReturnType',
 'Flow',
 'FlowPauseTimeout',
 'FlowRun',
 'FlowRunContext',
 'FlowRunFilter',
 'FlowRunSort',
 'Iterable',
 'List',
 'Literal',
 'MappingLengthMismatch',
 'MappingMissingIterable',
 'NotPausedError',
 'Optional',
 'OrionClient',
 'OrionHandler',
 'PREFECT_DEBUG_MODE',
 'PREFECT_LOGGING_LOG_PRINTS',
 'PartialModel',
 'Pause',
 'Paused',
 'PausedRun',
 'Pending',
 'PrefectFuture',
 'PrefectObjectRegistry',
 'Quote',
 'R',
 'ResultFactory',
 'Running',
 'Set',
 'SetStateStatus',
 'State',
 'StateDetails',
 'StateType',
 'TagsContext',
 'Task',
 'TaskConcurrencyType',
 'TaskRun',
 'TaskRunContext',
 'TaskRunInput',
 'TaskRunResult',
 'TypeVar',
 'UNTRACKABLE_TYPES',
 'UUID',
 'Union',
 'UpstreamTaskError',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dynamic_key_for_task_run',
 '_

In [9]:
#@title Run LayoutLM training flow for RefExp

# Equivalent command-line entry point, kept for reference:
# !python -m  layout_ipa.flows.layout_lm.layout_lm_train_pair_classification

# NOTE(review): the last recorded run of this cell stopped with ModuleNotFoundError.
# The imports below presumably need the pre-2.0 Prefect package layout
# (`prefect.engine.flow_runner`) and an importable `layout_ipa` package —
# confirm both on a freshly restarted kernel.
import prefect
from dynaconf import settings
from loguru import logger
from prefect import Flow, tags
from prefect.engine.flow_runner import FlowRunner
from prefect.engine.results import LocalResult
from sklearn.metrics import f1_score

from layout_ipa.tasks.datasets_parse.rico_sca import PrepareRicoScaPair
# from layout_ipa.tasks.datasets_parse.pixel_help import PreparePixelHelpPair
from layout_ipa.tasks.layout_lm.data_prep import PrepareLayoutLMPairTask
from layout_ipa.tasks.layout_lm.model_pipeline import LayoutLMPair
from layout_ipa.util.evaluation import pair_evaluation_2d

# --- Configuration -----------------------------------------------------------

# Dataset split locations, read from the dynaconf settings.
train_path = settings["rico_sca"]["train"]
dev_path = settings["rico_sca"]["dev"]
test_path = settings["rico_sca"]["test"]

## Uncomment this if you want to test for pixel_help
#test_path = settings["pixel_help"]

# Change the instruction type that you require here
INSTRUCTION_TYPE = [0,1,2,3]
#  where: 0 and 3 - Extractive
#             1 - Absolute
#             2 - Relative

# Identifier used both for the tokenizer and for the model passed to the trainer.
LAYOUT_LM_MODEL =  "microsoft/layoutlmv2-base-uncased"

# --- Tasks -------------------------------------------------------------------

# One instance of each task; each is called once per split inside the flow below.
prepare_rico_task = PrepareRicoScaPair()
# prepare_pixel_help_task = PreparePixelHelpPair()
prepare_rico_layout_lm_task = PrepareLayoutLMPairTask()
layout_lm_trainer_task = LayoutLMPair()

# --- Flow definition ---------------------------------------------------------

with Flow("Running the Transformers for Pair Classification") as flow1:
    # Every split goes through the same two steps: parse the Rico SCA pairs, then
    # convert the parsed "data" into the dataset handed to the trainer task.
    with tags("train"):
        train_input = prepare_rico_task(train_path, type_instructions=INSTRUCTION_TYPE)
        train_dataset = prepare_rico_layout_lm_task(train_input["data"], tokenizer_model = LAYOUT_LM_MODEL)
    with tags("dev"):
        dev_input = prepare_rico_task(dev_path, type_instructions=INSTRUCTION_TYPE)
        dev_dataset = prepare_rico_layout_lm_task(dev_input["data"], tokenizer_model = LAYOUT_LM_MODEL)
    with tags("test"):
        test_input = prepare_rico_task(test_path, type_instructions=INSTRUCTION_TYPE)
        # test_input = prepare_pixel_help_task(test_path)
        test_dataset = prepare_rico_layout_lm_task(test_input["data"], tokenizer_model = LAYOUT_LM_MODEL)
    # NOTE(review): mode="test" although this cell is titled as a training run —
    # confirm the intended mode.
    layout_lm_trainer_task(
        train_dataset=train_dataset,
        dev_dataset=dev_dataset,
        test_dataset=test_dataset,
        mapping_dev=dev_input["mapping"],
        mapping_test=test_input["mapping"],
        bert_model=LAYOUT_LM_MODEL,
        task_name="layout_lm_pair_rico",
        output_dir="./cache/layout_lm_pair_rico/",
        mode="test",
        eval_fn=pair_evaluation_2d,
    )

# --- Execution ---------------------------------------------------------------

# Keep the value returned by the run so it can be inspected in later cells; leaving
# it as the last expression still displays it as the cell output.
flow_state = FlowRunner(flow=flow1).run()
flow_state


ModuleNotFoundError: ignored