In [None]:
# SPDX-FileCopyrightText: 2023 Idiap Research Institute <contact@idiap.ch>
#
# SPDX-FileContributor: Delmas Maxime maxime.delmas@idiap.ch
#
# SPDX-License-Identifier: GPL-3.0-or-later

# End-to-end Relation Extraction on the natural-product literature

## Install the dependencies

In [None]:
!pip install transformers
!pip install peft
!pip install sacremoses

In [None]:
import gc
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

## Load the model

In [None]:
# Hub identifiers of the base checkpoint and of the LoRA adapters plugged on top of it
model_hf = "microsoft/BioGPT-Large"
# You can also try: mdelmas/BioGPT-Large-Natural-Products-RE-Extended-synt-v1.0
lora_adapters = "mdelmas/BioGPT-Large-Natural-Products-RE-Diversity-synt-v1.0"

# Load the base model (device_map={"": 0} places the whole model on device 0),
# attach the adapters with peft, then merge them so that `model` is a plain
# causal LM again.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(model_hf, device_map={"": 0}),
    lora_adapters,
).merge_and_unload()

# The tokenizer is the one shipped with the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_hf)

### Inference function

In [None]:
def inference(text):
  """Generate relations for `text` with the global `model` and print them.

  The prompt is `text` followed by the tokenizer's EOS and BOS tokens. The
  continuation produced by beam search is decoded, split on "; " and each
  resulting relation is printed as a "- " bullet.

  Args:
    text: input passage (the examples below pass "<title> <abstract>").

  Returns:
    None. The relations are printed to stdout.

  Relies on the module-level `model` and `tokenizer` created in the
  "Load the model" cell.
  """
  # Follow the device the model actually lives on instead of assuming "cuda".
  device = model.device

  # Decoding arguments: deterministic beam search.
  # "temperature" is intentionally not set: it is only used by sampling-based
  # decoding, so with do_sample=False it had no effect besides a warning.
  EVAL_GENERATION_ARGS = {"max_length": 1024,
                          "do_sample": False,
                          "forced_eos_token_id": tokenizer.eos_token_id,
                          "num_beams": 3,
                          "early_stopping": "never",
                          "length_penalty": 1.5}

  # Prepare the input: the passage, then EOS + BOS to mark where the output starts
  input_text = text + tokenizer.eos_token + tokenizer.bos_token

  # Tokenize and move the tensors next to the model
  input_tokens = tokenizer(input_text, return_tensors='pt').to(device)
  prompt_length = input_tokens["input_ids"].shape[1]

  # Generate
  with torch.no_grad():
    beam_output = model.generate(**input_tokens, **EVAL_GENERATION_ARGS)

  # Decode only the newly generated tokens (everything after the prompt)
  output = tokenizer.decode(beam_output[0][prompt_length:], skip_special_tokens=True)

  # Parse and print; skip empty fragments so an empty generation does not
  # produce a dangling "- " bullet.
  rels = [rel for rel in output.strip().split("; ") if rel]
  if not rels:
    print("(no relation extracted)")
  for rel in rels:
    print("- " + rel)

  # Drop the references first so that collecting garbage and emptying the
  # CUDA cache can actually release the memory held by these tensors.
  del input_tokens, beam_output
  gc.collect()
  torch.cuda.empty_cache()

## Inference

<style>body {text-align: justify}</style>

#### Example 1 [PMID 24048364](https://pubmed.ncbi.nlm.nih.gov/24048364/)

**Title: Producers and important dietary sources of ochratoxin A and citrinin**

**Abstract:**
<font color='green'>Ochratoxin A</font> (OTA) is a very important mycotoxin, and its research is focused right now on the new findings of OTA, like being a complete carcinogen, information about OTA producers and new exposure sources of OTA. <font color='green'>Citrinin</font> (CIT) is another important mycotoxin, too, and its research turns towards nephrotoxicity. Both additive and synergistic effects have been described in combination with OTA. OTA is produced in foodstuffs by Aspergillus Section Circumdati (<font color='blue'>Aspergillus ochraceus</font>, <font color='blue'>A. westerdijkiae</font>, <font color='blue'>A. steynii</font>) and Aspergillus Section Nigri (<font color='blue'>Aspergillus carbonarius</font>, <font color='blue'>A. foetidus</font>, <font color='blue'>A. lacticoffeatus</font>, <font color='blue'>A. niger</font>, <font color='blue'>A. sclerotioniger</font>, <font color='blue'>A. tubingensis</font>), mostly in subtropical and tropical areas. OTA is produced in foodstuffs by <font color='blue'>Penicillium verrucosum</font> and <font color='blue'>P. nordicum</font>, notably in temperate and colder zones. CIT is produced in foodstuffs by Monascus species (<font color='blue'>Monascus purpureus</font>, <font color='blue'>M. ruber</font>) and Penicillium species (<font color='blue'>Penicillium citrinum</font>, <font color='blue'>P. expansum</font>, <font color='blue'>P. radicicola</font>, <font color='blue'>P. verrucosum</font>). OTA was frequently found in foodstuffs of both plant origin (e.g., cereal products, coffee, vegetable, liquorice, raisins, wine) and animal origin (e.g., pork/poultry). CIT was also found in foodstuffs of vegetable origin (e.g., cereals, pomaceous fruits, black olive, roasted nuts, spices), food supplements based on rice fermented with red microfungi Monascus purpureus and in foodstuffs of animal origin (e.g., cheese).

In [None]:
# Example 1 (PMID 24048364): the model input is the title followed by the abstract
title_text = "Producers and important dietary sources of ochratoxin A and citrinin."
abstract_text = "Ochratoxin A (OTA) is a very important mycotoxin, and its research is focused right now on the new findings of OTA, like being a complete carcinogen, information about OTA producers and new exposure sources of OTA. Citrinin (CIT) is another important mycotoxin, too, and its research turns towards nephrotoxicity. Both additive and synergistic effects have been described in combination with OTA. OTA is produced in foodstuffs by Aspergillus Section Circumdati (Aspergillus ochraceus, A. westerdijkiae, A. steynii) and Aspergillus Section Nigri (Aspergillus carbonarius, A. foetidus, A. lacticoffeatus, A. niger, A. sclerotioniger, A. tubingensis), mostly in subtropical and tropical areas. OTA is produced in foodstuffs by Penicillium verrucosum and P. nordicum, notably in temperate and colder zones. CIT is produced in foodstuffs by Monascus species (Monascus purpureus, M. ruber) and Penicillium species (Penicillium citrinum, P. expansum, P. radicicola, P. verrucosum). OTA was frequently found in foodstuffs of both plant origin (e.g., cereal products, coffee, vegetable, liquorice, raisins, wine) and animal origin (e.g., pork/poultry). CIT was also found in foodstuffs of vegetable origin (e.g., cereals, pomaceous fruits, black olive, roasted nuts, spices), food supplements based on rice fermented with red microfungi Monascus purpureus and in foodstuffs of animal origin (e.g., cheese)."
# Title and abstract are separated by a single space
text = " ".join((title_text, abstract_text))
inference(text)

#### Example 2 [PMID 32902982](https://pubmed.ncbi.nlm.nih.gov/32902982/)

**Title: Penithoketone and Penithochromones A-L, Polyketides from the Deep-Sea-Derived Fungus Penicillium thomii YPGA3**

**Abstract:**
Twelve new polyketides, including a naphthoquinone derivative, <font color='green'>penithoketone</font> (1), and 11 chromone derivatives, <font color='green'>penithochromones A-L</font> (2-12), together with three known compounds (13-15) were isolated from the deep-sea-derived fungus <font color='blue'>Penicillium thomii YPGA3</font>. The structures of the metabolites were elucidated based on extensive analyses of the spectroscopic data, and the configuration of 1 was resolved by quantum chemical calculations of NMR shifts and ECD spectra and comparisons to experimental data. Compound 1, containing a naphthoquinone-derived moiety substituted with a butenolide unit, represents a new modified naphthoquinone skeleton. Interestingly, the 5,7-dioxygenated chromone derivatives 2-13 possessed different alkyl acid or alkyl ester side chain lengths, and those with side chain lengths of seven carbon atoms were discovered from nature for the first time. The metabolites were evaluated for their cytotoxicity against four cancer cell lines; compounds 1 and 15 were found to be active, with IC50 values ranging from 4.9 to 9.1 μM.

In [None]:
# Example 2 (PMID 32902982): the model input is the title followed by the abstract.
# The title no longer carries a trailing space: the separator is added once,
# below, so title and abstract are joined by a single space as in the other examples.
title_text = "Penithoketone and Penithochromones A-L, Polyketides from the Deep-Sea-Derived Fungus Penicillium thomii YPGA3."
abstract_text = "Twelve new polyketides, including a naphthoquinone derivative, penithoketone (1), and 11 chromone derivatives, penithochromones A-L (2-12), together with three known compounds (13-15) were isolated from the deep-sea-derived fungus Penicillium thomii YPGA3. The structures of the metabolites were elucidated based on extensive analyses of the spectroscopic data, and the configuration of 1 was resolved by quantum chemical calculations of NMR shifts and ECD spectra and comparisons to experimental data. Compound 1, containing a naphthoquinone-derived moiety substituted with a butenolide unit, represents a new modified naphthoquinone skeleton. Interestingly, the 5,7-dioxygenated chromone derivatives 2-13 possessed different alkyl acid or alkyl ester side chain lengths, and those with side chain lengths of seven carbon atoms were discovered from nature for the first time. The metabolites were evaluated for their cytotoxicity against four cancer cell lines; compounds 1 and 15 were found to be active, with IC50 values ranging from 4.9 to 9.1 μM."
text = title_text + " " + abstract_text
inference(text)

#### Example 3 [PMID 20853137](https://pubmed.ncbi.nlm.nih.gov/20853137/)

**Title: Identification of new members within suites of amphiphilic marine siderophores**

**Abstract:**
Marine bacterial isolates <font color='blue'>Vibrio sp. HC0601C5</font> and <font color='blue'>Halomonas meridiana str. HC4321C1</font> were isolated off the coast of southern California and were found to produce an expanded suite of previously identified amphiphilic siderophores. Specifically two new members of the amphibactin family, <font color='green'>amphibactins S and T</font>, which have a C14:1 ω-7 fatty acid and a saturated C12 fatty acid, respectively, were produced by <font color='blue'>Vibrio sp. HC0601C5</font>. These siderophores are produced in addition to a number of previously described amphibactins and are excreted into the culture supernatant. Two new members of the aquachelin family of siderophores, <font color='green'>aquachelins I and J</font>, which have an hydroxylated C12 fatty acid and a saturated C10 fatty acid, respectively, were produced by <font color='blue'>Halomonas meridiana str. HC4321C1</font>. These four new siderophores are more hydrophilic than their previously reported relatives, aquachelins A-D and the amphibactin suite of siderophores.

In [None]:
# Example 3 (PMID 20853137): the model input is the title followed by the abstract
title_text = "Identification of new members within suites of amphiphilic marine siderophores."
abstract_text = "Marine bacterial isolates Vibrio sp. HC0601C5 and Halomonas meridiana str. HC4321C1 were isolated off the coast of southern California and were found to produce an expanded suite of previously identified amphiphilic siderophores. Specifically two new members of the amphibactin family, amphibactins S and T, which have a C14:1 ω-7 fatty acid and a saturated C12 fatty acid, respectively, were produced by Vibrio sp. HC0601C5. These siderophores are produced in addition to a number of previously described amphibactins and are excreted into the culture supernatant. Two new members of the aquachelin family of siderophores, aquachelins I and J, which have an hydroxylated C12 fatty acid and a saturated C10 fatty acid, respectively, were produced by Halomonas meridiana str. HC4321C1. These four new siderophores are more hydrophilic than their previously reported relatives, aquachelins A-D and the amphibactin suite of siderophores."
# Title and abstract are separated by a single space
text = " ".join((title_text, abstract_text))
inference(text)

See for instance alternative examples with PMID [20853137](https://pubmed.ncbi.nlm.nih.gov/20853137), [9586194](https://pubmed.ncbi.nlm.nih.gov/9586194), [17252498](https://pubmed.ncbi.nlm.nih.gov/17252498), [12193025](https://pubmed.ncbi.nlm.nih.gov/12193025), [35841670](https://pubmed.ncbi.nlm.nih.gov/35841670)

#### Example 4 [PMID 12193025](https://pubmed.ncbi.nlm.nih.gov/12193025/)

**Title: 5-hydroxytryptamine-derived alkaloids from two marine sponges of the genus Hyrtios**

**Abstract:**
Indonesian specimens of the marine sponges <font color='blue'>Hyrtios erectus</font> and <font color='blue'>H. reticulatus</font> were found to contain 5-hydroxytryptamine-derived alkaloids. Their structures were determined on the basis of their spectral properties. <font color='blue'>H. erectus</font> contained <font color='green'>hyrtiosulawesine</font> (4), a new beta-carboline alkaloid, together with the already known alkaloids <font color='green'>5-hydroxyindole-3-carbaldehyde</font> (1), <font color='green'>hyrtiosin B</font> (2), and <font color='green'>5-hydroxy-3-(2-hydroxyethyl)indole</font> (3). <font color='blue'>H. reticulatus</font> contained the novel derivative <font color='green'>1,6-dihydroxy-1,2,3,4-tetrahydro-beta-carboline</font> (11) together with <font color='green'>serotonin</font> (5), <font color='green'>6-hydroxy-1-methyl-1,2,3,4-tetrahydro-beta-carboline</font> (7), and <font color='green'>6-hydroxy-3,4-dihydro-1-oxo-beta-carboline</font> (9).

In [None]:
# Example 4 (PMID 12193025): the model input is the title followed by the abstract.
# The title ends with a period, like the other examples, so that it does not
# run into the first sentence of the abstract once the two are concatenated.
title_text = "5-hydroxytryptamine-derived alkaloids from two marine sponges of the genus Hyrtios."
abstract_text = "Indonesian specimens of the marine sponges Hyrtios erectus and H. reticulatus were found to contain 5-hydroxytryptamine-derived alkaloids. Their structures were determined on the basis of their spectral properties. H. erectus contained hyrtiosulawesine (4), a new beta-carboline alkaloid, together with the already known alkaloids 5-hydroxyindole-3-carbaldehyde (1), hyrtiosin B (2), and 5-hydroxy-3-(2-hydroxyethyl)indole (3). H. reticulatus contained the novel derivative 1,6-dihydroxy-1,2,3,4-tetrahydro-beta-carboline (11) together with serotonin (5), 6-hydroxy-1-methyl-1,2,3,4-tetrahydro-beta-carboline (7), and 6-hydroxy-3,4-dihydro-1-oxo-beta-carboline (9)."
text = title_text + " " + abstract_text
inference(text)

#### Example 5 [PMID 35841670](https://pubmed.ncbi.nlm.nih.gov/35841670/)

**Title: [11]-chaetoglobosins with cytotoxic activities from Pseudeurotium bakeri**

**Abstract:**
Fourteen new <font color='green'>[11]-chaetoglobosins</font> (1-14), along with two known congeners, <font color='green'>cytochalasins X and Y</font> (15 and 16), were isolated from the cultures of an endophytic fungus <font color='blue'>Pseudeurotium bakeri</font> P1-1-1. Their structures incorporating absolute configurations were elucidated based on the comprehensive analyses of one- and two-dimensional NMR data, HRESIMS spectrometry, chemical methods, and single-crystal X-ray diffraction analysis (Cu Kα). All isolates were evaluated for their cytotoxic activities and <font color='green'>chaetopseudeurin M</font> (1) displayed significant cytotoxic effects against seven human cancer cell lines, with IC50 values ranging from 5.1 ± 0.9 to 10.8 ± 0.1 μM. Western blot experiments exhibited that compound 1 exerted its cytotoxic effect in MCF-7 cells by inducing G2/M cell cycle arrest and apoptosis via downregulating the expression of cyclin B1 and Cdk1, and activating Bcl-2/caspase-3/PARP pathway, respectively.

In [None]:
# Example 5 (PMID 35841670): the model input is the title followed by the abstract
title_text = "[11]-chaetoglobosins with cytotoxic activities from Pseudeurotium bakeri."
abstract_text = "Fourteen new [11]-chaetoglobosins (1-14), along with two known congeners, cytochalasins X and Y (15 and 16), were isolated from the cultures of an endophytic fungus Pseudeurotium bakeri P1-1-1. Their structures incorporating absolute configurations were elucidated based on the comprehensive analyses of one- and two-dimensional NMR data, HRESIMS spectrometry, chemical methods, and single-crystal X-ray diffraction analysis (Cu Kα). All isolates were evaluated for their cytotoxic activities and chaetopseudeurin M (1) displayed significant cytotoxic effects against seven human cancer cell lines, with IC50 values ranging from 5.1 ± 0.9 to 10.8 ± 0.1 μM. Western blot experiments exhibited that compound 1 exerted its cytotoxic effect in MCF-7 cells by inducing G2/M cell cycle arrest and apoptosis via downregulating the expression of cyclin B1 and Cdk1, and activating Bcl-2/caspase-3/PARP pathway, respectively."
# Title and abstract are separated by a single space
text = " ".join((title_text, abstract_text))
inference(text)

In this example, the model correctly infers the names of almost all 14 chaetoglobosins, given that chaetopseudeurin M was annotated as (1).