In [1]:
from transformers import pipeline

triplet_extractor = pipeline(
    'text2text-generation',
    model='Babelscape/rebel-large',
    tokenizer='Babelscape/rebel-large',
)

# The pipeline's own text output is not used: the <triplet>/<subj>/<obj>
# markers are special tokens, so we request token ids and decode them
# ourselves with the pipeline's tokenizer.

text = "It aims to consolidate the understanding of accounting theory and practice gained in the other units. The corporate governance, economic consequences/costly contracting, conceptual framework, and social responsibility perspectives are examined in relation to a number of current accounting issues. The outcomes of this unit are: critically review current issues of relevance to accounting professionals; provide an overview of the issues relating to accounting policy choice, costly contracting and economic consequences; critically review the benefits, and potential risks, of ICT for accounting practice, business systems, communication, and decision-making; demonstrate the ability to apply research skills to investigate a contemporary accounting issue; demonstrate the ability to produce clear and concise written communication in a complex accounting context to accountants and non-accountants; reflect on performance feedback to identify and action opportunities for learning and self-improvement; demonstrate an understanding and respond appropriately to ethical, cultural, social and sustainability issues."

# Step 1: generate, asking for tensors (token ids) instead of decoded text.
generation_output = triplet_extractor(text, return_tensors=True, return_text=False)
# Step 2: take the token ids of the first (only) generated sequence.
generated_ids = generation_output[0]["generated_token_ids"]
# Step 3: decode as a batch of one; the printed string still carries the markers.
extracted_text = triplet_extractor.tokenizer.batch_decode([generated_ids])
print(extracted_text[0])

<s><triplet> accounting theory <subj> accounting <obj> studies <triplet> accounting <subj> accounting theory <obj> studied by <subj> non-accountants <obj> practiced by <triplet> non-accountants <subj> accounting <obj> field of this occupation</s>


In [2]:
# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    """Parse a linearised triplet string into a list of triplet dicts.

    The input is expected to be a decoded generation in which whitespace
    separated marker tokens introduce each part of a triplet:

    * ``<triplet>`` starts a new head entity,
    * ``<subj>`` starts a tail entity for the current head,
    * ``<obj>`` starts the relation label linking the current head and tail.

    Several ``<subj> ... <obj> ...`` pairs may follow one ``<triplet>``; each
    pair yields its own triplet sharing that head. ``<s>``, ``<pad>`` and
    ``</s>`` are removed before parsing, and tokens that appear before any
    marker are ignored.

    Args:
        text: Decoded model output, special tokens included.

    Returns:
        A list of ``{'head': str, 'type': str, 'tail': str}`` dicts in the
        order they appear in ``text``. Only complete triplets (all three
        parts non-empty) are returned, so truncated or malformed output never
        produces entries with empty fields or with a relation carried over
        from a previous triplet.
    """
    triplets = []
    subject, relation, object_ = '', '', ''
    current = 'x'  # no marker seen yet: plain tokens are ignored in this state

    def emit(head, type_, tail):
        # Record a triplet only when every part has been collected.
        head, type_, tail = head.strip(), type_.strip(), tail.strip()
        if head and type_ and tail:
            triplets.append({'head': head, 'type': type_, 'tail': tail})

    cleaned = text.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in cleaned.split():
        if token == "<triplet>":
            # A new head begins: flush the pending triplet and start from scratch
            # so nothing from the previous head can leak into the next one.
            emit(subject, relation, object_)
            current = 't'
            subject, relation, object_ = '', '', ''
        elif token == "<subj>":
            # Another tail for the same head: flush the pending triplet, then
            # clear BOTH tail and relation. Leaving the old relation in place
            # would pair it with the next tail if the text ends before <obj>.
            emit(subject, relation, object_)
            current = 's'
            object_, relation = '', ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        elif current == 't':
            subject += ' ' + token
        elif current == 's':
            object_ += ' ' + token
        elif current == 'o':
            relation += ' ' + token

    # Flush whatever is pending at end of input (no-op if it is incomplete).
    emit(subject, relation, object_)
    return triplets


# Parse the single decoded sequence produced by the pipeline cell and show
# the resulting list of {'head', 'type', 'tail'} dicts.
first_decoded = extracted_text[0]
extracted_triplets = extract_triplets(first_decoded)
print(extracted_triplets)


[{'head': 'accounting theory', 'type': 'studies', 'tail': 'accounting'}, {'head': 'accounting', 'type': 'studied by', 'tail': 'accounting theory'}, {'head': 'accounting', 'type': 'practiced by', 'tail': 'non-accountants'}, {'head': 'non-accountants', 'type': 'field of this occupation', 'tail': 'accounting'}]


In [27]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def extract_triplets(text):
    """Parse a linearised triplet string into a list of triplet dicts.

    The input is expected to be a decoded generation in which whitespace
    separated marker tokens introduce each part of a triplet:

    * ``<triplet>`` starts a new head entity,
    * ``<subj>`` starts a tail entity for the current head,
    * ``<obj>`` starts the relation label linking the current head and tail.

    Several ``<subj> ... <obj> ...`` pairs may follow one ``<triplet>``; each
    pair yields its own triplet sharing that head. ``<s>``, ``<pad>`` and
    ``</s>`` are removed before parsing, and tokens that appear before any
    marker are ignored.

    Args:
        text: Decoded model output, special tokens included.

    Returns:
        A list of ``{'head': str, 'type': str, 'tail': str}`` dicts in the
        order they appear in ``text``. Only complete triplets (all three
        parts non-empty) are returned, so truncated or malformed output never
        produces entries with empty fields or with a relation carried over
        from a previous triplet.
    """
    triplets = []
    subject, relation, object_ = '', '', ''
    current = 'x'  # no marker seen yet: plain tokens are ignored in this state

    def emit(head, type_, tail):
        # Record a triplet only when every part has been collected.
        head, type_, tail = head.strip(), type_.strip(), tail.strip()
        if head and type_ and tail:
            triplets.append({'head': head, 'type': type_, 'tail': tail})

    cleaned = text.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in cleaned.split():
        if token == "<triplet>":
            # A new head begins: flush the pending triplet and start from scratch
            # so nothing from the previous head can leak into the next one.
            emit(subject, relation, object_)
            current = 't'
            subject, relation, object_ = '', '', ''
        elif token == "<subj>":
            # Another tail for the same head: flush the pending triplet, then
            # clear BOTH tail and relation. Leaving the old relation in place
            # would pair it with the next tail if the text ends before <obj>.
            emit(subject, relation, object_)
            current = 's'
            object_, relation = '', ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        elif current == 't':
            subject += ' ' + token
        elif current == 's':
            object_ += ' ' + token
        elif current == 'o':
            relation += ' ' + token

    # Flush whatever is pending at end of input (no-op if it is incomplete).
    emit(subject, relation, object_)
    return triplets

# Tokenizer and seq2seq weights are loaded from the same checkpoint.
REBEL_CHECKPOINT = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)

# Keyword arguments forwarded unchanged to model.generate() for every input:
# 3 beams, all 3 returned, so each input text yields three decoded sequences.
gen_kwargs = dict(
    max_length=256,
    length_penalty=0,
    num_beams=3,
    num_return_sequences=3,
)

# Text to extract triplets from
# (single-sentence example kept for reference; currently disabled)
#text = 'Punta Cana is a resort town in the municipality of Higüey, in La Altagracia Province, the easternmost province of the Dominican Republic.'

# A job advertisement split into six consecutive passages.
# NOTE(review): not referenced by any cell visible here; only `unitDescription`
# is iterated below. Confirm whether it is used elsewhere before removing.
jobTextList = [
    "Salary\n$101,955 to $115,005\nOpportunity Type\nFull-Time\nOpportunity Status\nOngoing\nAPS Classification\nExecutive Level 1\nClosing Date\n25/06/2020\nPosted 11/06/2020\nDepartment of Defence\nCost Model & Benchmarking Manager\nOsborne SA\nThe key duties of the position include:",
    "Your Role\nThe key responsibilities for the Cost Model and Benchmarking Manager are:\n* Establishing and maintaining the NCB Construction Cost Model\n* Developing the NCB capability to forecast construction costs and assess the impact of program changes\n* Managing the NCB construction benchmarking program\n* Maintain a historical record of construction costs and cost metrics\n* Develop and prepare estimates of construction costs and cost drivers for future programs.",
    "* Work with the NCB Technical Cost Estimator and Business Cost manager to prepare NCB cost forecasts.\n* Relate actual costs in the shipyard to the cost models and identify areas for efficiency improvements.\n* Assist NCB technical cost estimator in the assessment of contract proposals\n* Contribute to the delivery of the NCB Strategic Plan in relation to cost activities by assisting with the development, review and implementation of processes and systems to support current and future business requirements.",
    "Our Ideal Candidate\nTo succeed in this role you must have personal drive and integrity with the ability to:\n* Build a collaborative team environment that provides services and outcomes consistent with the organization's operating model with a focus on achieving business objectives.\n* Develop and implement initiatives that are aligned with the organizational strategic plan.",
    "* Lead and motivate teams to continuously achieve business outcomes that support the strategic vision.\n* Support the NCB leadership team by monitoring and effectively reporting the progress of assigned tasking in a timely manner\nEligibility\nApplicants must be able to obtain and maintain a security clearance at 'Negative Vetting Level 1'.",
    "RecruitAbility applies to this vacancy. Under the RecruitAbility scheme you will be invited to participate in further assessment activity for the vacancy if you choose"
]
# Description of unit ACCT5501 split into two passages: the overview first,
# then the list of unit outcomes. Each passage is fed to the model separately.
unitDescription = [
    "Contemporary Issues in Accounting - Face to face / Blended learning is a unit at the UWA Business School. The unit code is ACCT5501. This is the capstone unit in the Master of Professional Accounting. It aims to consolidate the understanding of accounting theory and practice gained in the other units. The corporate governance, economic consequences/costly contracting, conceptual framework, and social responsibility perspectives are examined in relation to a number of current accounting issues.",
    "The outcomes of this unit are: critically review current issues of relevance to accounting professionals; provide an overview of the issues relating to accounting policy choice, costly contracting and economic consequences; critically review the benefits, and potential risks, of ICT for accounting practice, business systems, communication, and decision-making; demonstrate the ability to apply research skills to investigate a contemporary accounting issue; demonstrate the ability to produce clear and concise written communication in a complex accounting context to accountants and non-accountants; reflect on performance feedback to identify and action opportunities for learning and self-improvement; demonstrate an understanding and respond appropriately to ethical, cultural, social and sustainability issues."
]




In [28]:


for text in unitDescription:
    # Encode one passage, truncating anything beyond 256 tokens.
    model_inputs = tokenizer(
        text,
        max_length=256,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Move the encoded inputs to the model's device before generating.
    input_ids = model_inputs["input_ids"].to(model.device)
    attention_mask = model_inputs["attention_mask"].to(model.device)
    generated_tokens = model.generate(
        input_ids,
        attention_mask=attention_mask,
        **gen_kwargs,
    )

    # Keep special tokens: extract_triplets relies on the marker tokens.
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # Parse every returned sequence and print its triplets one per line.
    for idx, sentence in enumerate(decoded_preds):
        triplets = extract_triplets(sentence)
        for t in triplets:
            print(t)


{'head': 'Master of Professional Accounting', 'type': 'part of', 'tail': 'UWA Business School'}
{'head': 'Contemporary Issues in Accounting', 'type': 'part of', 'tail': 'UWA Business School'}
{'head': 'Master of Professional Accounting', 'type': 'field of work', 'tail': 'accounting'}
{'head': 'non-accountant', 'type': 'subclass of', 'tail': 'accountant'}
{'head': 'non-accountants', 'type': 'subclass of', 'tail': 'accounting professionals'}
{'head': 'non-accountant', 'type': 'subclass of', 'tail': 'accounting professional'}
