In [3]:
import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
import csv
import os
from dotenv import load_dotenv
from huggingface_hub import login
import datasets

from sklearn.metrics import f1_score
import seqeval.metrics

import json
import re

import pprint as pp

import warnings

warnings.filterwarnings("ignore")

import random

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Load variables from a local .env file, then log in to the Hugging Face Hub
# with the token stored under HF_READ.
load_dotenv()
login(os.getenv('HF_READ'))

  from .autonotebook import tqdm as notebook_tqdm


cuda
Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/kelsey/.cache/huggingface/token
Login successful


# Sentiment Analysis

In [2]:
def generate_instruction(data_point):
    """Turn a ``(label, text)`` pair into an instruction-style record.

    ``data_point[0]`` is the sentiment label (0 = negative, 1 = positive) and
    ``data_point[1]`` is the text to classify.

    Returns a dict with the keys ``'instruction'``, ``'input'`` and
    ``'output'``. Raises ``ValueError`` for any other label.
    """
    label = data_point[0]

    # Walk the known labels in order; the ``else`` arm of the loop only runs
    # when none of them matched.
    for value, word in ((0, "Negattiva"), (1, "Pożittiva")):
        if label == value:
            sentiment = word
            break
    else:
        raise ValueError(f"Invalid sentiment value: {data_point[0]}")

    return {
        'instruction': 'Għidli jekk din is-sentenza hiex pożittiva jew negattiva.',
        'input': data_point[1],
        'output': sentiment,
    }
    
def open_data(file_path):
    """Load the sentiment CSV at ``file_path`` into ``(label, text)`` tuples.

    The first row is treated as a header and skipped. Column 0 is parsed as
    an integer label and column 1 is kept as the raw text. Completely empty
    rows (for example a trailing blank line) are ignored, and an empty file
    yields an empty list.

    Raises ``ValueError`` if a label cannot be parsed as an integer.
    """
    sentiment_data = []

    # encoding='utf-8': the texts contain non-ASCII Maltese letters, so the
    # platform default encoding must not be relied on.
    # newline='': recommended by the csv module so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        csv_reader = csv.reader(f)

        #skip header (the default avoids StopIteration on an empty file)
        next(csv_reader, None)

        for row in csv_reader:
            # csv.reader yields [] for a blank line; there is nothing to parse.
            if not row:
                continue

            sentiment_data.append((int(row[0]), row[1]))

    return sentiment_data

# Build one prompt record per row of the sentiment test split.
test_path = 'data/sentiment/test.csv'
test_data = open_data(test_path)
test_prompts = list(map(generate_instruction, test_data))

# Show a sample record and the size of the test set.
print(test_prompts[1])
print("Number of test entries:",len(test_prompts))

{'instruction': 'Għidli jekk din is-sentenza hiex pożittiva jew negattiva.', 'input': "Meta Dr.Alfred Sant kien iltaqa mal-istudenti tal-universita, is-saħħara l-ħażina u velenuża kienu ibbujawh u swietlu l-elezzjoni li ħareġ tellief fuq il-velenu anke ta' dak li kien għamel bħad-dittaturi meta ħoloq lilu inniffsu bħala kap ta' Malta Repubblika.", 'output': 'Negattiva'}
Number of test entries: 171


In [3]:
# Base checkpoint and the LoRA adapter fine-tuned for sentiment analysis.
BASE_MODEL = "meta-llama/Llama-2-7b-hf"
LORA_WEIGHTS = "kelseybonnici/maltese-alpaca-sa" #hf link

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

if device == "cuda":
    # GPU path: 8-bit base model with float16 dtype, LoRA adapter on top.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16)
else:
    # CPU fallback: without this branch `model` is never assigned and the
    # cell fails with a NameError at `model.eval()`. 8-bit loading is only
    # attempted on CUDA, so the weights are loaded in float32 here.
    model = LlamaForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float32)
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS)
    model.to(device)

# Inference only from here on.
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.65s/it]


In [4]:
def evaluate(
    data_point,
    temperature=0.1,
    top_p = 1.0,
    top_k=40,
    num_beams=1,
    max_new_tokens=5,
    **kwargs,
):
    """Generate the model's answer for one prompt record.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        data_point: Mapping with an ``'instruction'`` entry and an ``'input'``
            entry (which may be empty).
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Forwarded to ``model.generate``.
        **kwargs: Additional ``GenerationConfig`` options.

    Returns:
        The decoded text after the first ``### Risposta:`` marker, stripped of
        surrounding whitespace. Special tokens such as ``</s>`` are kept.
    """

    def generate_prompt(instruction, input=None):
        """Render the prompt template, with or without an input section."""
        if input:
            return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu, flimkien ma' input li jipprovdi aktar kuntest. Ikteb tweġiba li timla t-talba kif xieraq.\n\n### Istruzzjoni: \n{instruction}\n\n### Input:\n{input}\n\n### Risposta:\n"""
        # Empty input: use the instruction-only template so the tokenizer is
        # never handed ``None``.
        return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu. Ikteb tweġiba li tikkompleta t-talba kif xieraq.\n\n### Istruzzjoni:\n{instruction}\n\n### Risposta:\n"""

    prompt = generate_prompt(data_point['instruction'], data_point['input'])

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # NOTE(review): `do_sample` is not set, so temperature/top_p/top_k may have
    # no effect on decoding — confirm against the generation documentation.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # The first returned sequence holds the prompt followed by the continuation.
    s = generation_output.sequences[0]

    output = tokenizer.decode(s)

    # Keep only what follows the answer marker of the prompt.
    return output.split("### Risposta:")[1].strip()


pred_list = []

# Predictions are streamed to disk as a JSON array of one-element arrays, so
# partial results survive an interrupted run. The separator is written BEFORE
# every entry except the first: this avoids rewinding the file to delete a
# trailing comma (arithmetic on text-mode tell() offsets is not supported and
# fails outright when there are no entries).
with open('data/sentiment/ma_results.json', 'w') as json_file:
    json_file.write('[')
    try:
        for idx, data_point in enumerate(test_prompts):

            print(idx+1, 'of', len(test_prompts), end = ' ')

            output = evaluate(data_point)

            print(output)

            pred_list.append(output)

            if idx > 0:
                json_file.write(',\n')
            json_file.write('[')
            json.dump(output, json_file)
            json_file.write(']')
    finally:
        # Always close the array so the file stays valid JSON, even when the
        # loop is interrupted.
        json_file.write(']')

1 of 171 negattiva</s>
2 of 171 negattiva</s>
3 of 171 negattiva</s>
4 of 171 negattiva</s>
5 of 171 negattiva</s>
6 of 171 negattiva</s>
7 of 171 negattiva</s>
8 of 171 negattiva</s>
9 of 171 negattiva</s>
10 of 171 negattiva</s>
11 of 171 negattiva</s>
12 of 171 pożittiva</s>
13 of 171 negattiva</s>
14 of 171 negattiva</s>
15 of 171 negattiva</s>
16 of 171 negattiva</s>
17 of 171 pożittiva</s>
18 of 171 negattiva</s>
19 of 171 negattiva</s>
20 of 171 negattiva</s>
21 of 171 negattiva</s>
22 of 171 negattiva</s>
23 of 171 negattiva</s>
24 of 171 negattiva</s>
25 of 171 pożittiva</s>
26 of 171 negattiva</s>
27 of 171 negattiva</s>
28 of 171 negattiva</s>
29 of 171 pożittiva</s>
30 of 171 negattiva</s>
31 of 171 negattiva</s>
32 of 171 negattiva</s>
33 of 171 pożittiva</s>
34 of 171 pożittiva</s>
35 of 171 negattiva</s>
36 of 171 negattiva</s>
37 of 171 negattiva</s>
38 of 171 negattiva</s>
39 of 171 pożittiva</s>
40 of 171 pożittiva</s>
41 of 171 negattiva</s>
42 of 171 pożittiva</s>
4

In [5]:
# Convert a generated answer into the numeric label used for scoring.
def map_sentiment(word):
    """Return 0 for a negative answer and 1 for a positive one.

    The match is a case-insensitive substring test, with the negative keyword
    checked first. Raises ``ValueError`` when neither keyword occurs.
    """
    normalised = word.lower()
    for keyword, label in (('negattiva', 0), ('pożittiva', 1)):
        if keyword in normalised:
            return label
    raise ValueError(f"Invalid sentiment value: {word}")
    
# Numeric predictions, and the gold labels (first element of each test tuple).
y_pred = list(map(map_sentiment, pred_list))
y_true = [row[0] for row in test_data]

# Macro-averaged F1, reported as a percentage.
f1 = f1_score(y_true, y_pred, average='macro')
print('F1 score macro:', 100 * f1)

F1 score macro: 75.41185653835093


# Part-of-Speech Tagging (UPOS)

In [2]:
# Load the 'mt_mudt' configuration of the 'universal_dependencies' dataset.
dataset = datasets.load_dataset('universal_dependencies', 'mt_mudt')

# Lookup from the dataset's XPOS tags (keys) to the UPOS tags (values) used
# below to derive a 'upos' list from each entry's 'xpos' list.
#mapping taken from bertu code
upos_mapping = {
        "ADJ": "ADJ",
        "ADV": "ADV",
        "COMP": "SCONJ",
        "CONJ_CORD": "CCONJ",
        "CONJ_SUB": "SCONJ",
        "DEF": "DET",
        "FOC": "ADV",
        "FUT": "AUX",
        "GEN": "ADP",
        "GEN_DEF": "ADP",
        "GEN_PRON": "PRON",
        "HEMM": "VERB",
        "INT": "INTJ",
        "KIEN": "AUX",
        "LIL": "ADP",
        "LIL_PRON": "PRON",
        "LIL_DEF": "ADP",
        "NEG": "PART",
        "NOUN": "NOUN",
        "NOUN_PROP": "PROPN",
        "NUM_CRD": "NUM",
        "NUM_FRC": "NUM",
        "NUM_ORD": "NUM",
        "NUM_WHD": "NUM",
        "PART_ACT": "VERB",
        "PART_PASS": "VERB",
        "PREP": "ADP",
        "PREP_DEF": "ADP",
        "PREP_PRON": "PRON",
        "PROG": "AUX",
        "PRON_DEM": "PRON",
        "PRON_DEM_DEF": "PRON",
        "PRON_INDEF": "PRON",
        "PRON_INT": "PRON",
        "PRON_PERS": "PRON",
        "PRON_PERS_NEG": "AUX",
        "PRON_REC": "PRON",
        "PRON_REF": "PRON",
        "QUAN": "DET",
        "VERB": "VERB",
        "VERB_PSEU": "VERB",
        "X_ABV": "NOUN",
        "X_BOR": "SYM",
        "X_DIG": "NUM",
        "X_ENG": "X",
        "X_FOR": "X",
        "X_PUN": "PUNCT",
    }


def build_pos_records(split, mapping):
    """Convert one dataset split into a list of plain dicts.

    Each record keeps the entry's 'text', 'tokens' and 'xpos' fields and adds
    a 'upos' list obtained by looking every XPOS tag up in ``mapping``.
    Raises ``KeyError`` for an XPOS tag missing from ``mapping``.
    """
    records = []
    for entry in split:
        xpos = entry['xpos']
        records.append({
            'text': entry['text'],
            'tokens': entry['tokens'],
            #create upos using mapping
            'upos': [mapping[tag] for tag in xpos],
            'xpos': xpos,
        })
    return records


def save_pos_records(records, path):
    """Write ``records`` to ``path`` as indented UTF-8 JSON.

    The parent directory is created when missing. The encoding is explicit
    because ``ensure_ascii=False`` emits non-ASCII characters verbatim.
    """
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


# One call per split instead of three copies of the same loop.
train_dataset = build_pos_records(dataset['train'], upos_mapping)
save_pos_records(train_dataset, 'data/eval/pos/train.json')

val_dataset = build_pos_records(dataset['validation'], upos_mapping)
save_pos_records(val_dataset, 'data/eval/pos/validation.json')

test_dataset = build_pos_records(dataset['test'], upos_mapping)
save_pos_records(test_dataset, 'data/eval/pos/test.json')

Downloading data: 100%|██████████| 272k/272k [00:01<00:00, 229kB/s]
Downloading data: 100%|██████████| 125k/125k [00:00<00:00, 139kB/s]
Downloading data: 100%|██████████| 137k/137k [00:00<00:00, 143kB/s]
Generating train split: 100%|██████████| 1123/1123 [00:00<00:00, 45353.65 examples/s]
Generating validation split: 100%|██████████| 433/433 [00:00<00:00, 151017.27 examples/s]
Generating test split: 100%|██████████| 518/518 [00:00<00:00, 150627.39 examples/s]


In [4]:
def generate_instruction(data_point, task):
    """Build the part-of-speech tagging prompt record for one sentence.

    ``data_point`` supplies the token list under 'tokens' and the gold tags
    under 'upos' / 'xpos'; ``task`` selects which of the two tag lists becomes
    the expected output. Raises ``ValueError`` for any other ``task``.
    """
    tokens = data_point['tokens']

    # Guard clause: only the two tag sets are supported.
    if task not in ('upos', 'xpos'):
        raise ValueError(f"Invalid task value: {task}")

    return {
        'instruction': "Agħmel part-of-speech tagging ta' din is-sentenza.",
        'input': tokens,
        'output': data_point[task],
    }

#open test data
# Same location the data-preparation cell writes the test split to, and the
# one the XPOS section reads from.
test_path = 'data/eval/pos/test.json'

# The file holds non-ASCII characters (it is written with ensure_ascii=False),
# so decode it explicitly as UTF-8 rather than with the platform default.
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_prompts = [generate_instruction(data_point, 'upos') for data_point in test_data]

print(test_prompts[0])

{'instruction': "Agħmel part-of-speech tagging ta' din is-sentenza.", 'input': ['Philip', 'Schembri', ',', 'iċ-', 'Chairman', 'tal-', 'Bord', "ta'", 'Inkjesta', 'dwar', 'l-', 'allegat', 'rimi', 'illegali', "ta'", 'kimika', 'imsejħa', 'mercaptan', 'mill-', 'Enemalta', ',', 'qal', 'li', 'l-', 'Bord', "ta'", 'Inkjesta', 'talab', 'id-', 'dokumenti', 'li', 'qal', 'li', 'kellu', "f'", 'idejh', 'id-', 'Deputat', 'Laburista', 'Joe', 'Mizzi', 'wara', 'li', 'hu', "b'", 'mod', 'pubbliku', 'qal', 'li', 'kellu', 'dokumenti', 'li', 'kien', 'lest', 'jgħaddihom', 'lil', 'Bord', "ta'", 'Inkjesta', '.'], 'output': ['PROPN', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'DET', 'VERB', 'NOUN', 'ADJ', 'ADP', 'NOUN', 'VERB', 'PROPN', 'ADP', 'PROPN', 'PUNCT', 'VERB', 'SCONJ', 'DET', 'NOUN', 'ADP', 'NOUN', 'VERB', 'DET', 'NOUN', 'SCONJ', 'VERB', 'SCONJ', 'VERB', 'ADP', 'NOUN', 'DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN', 'ADP', 'SCONJ', 'PRON', 'ADP', 'NOUN', 'ADJ', 'VERB', 'SCONJ', 'VERB',

In [4]:
# Base checkpoint and the LoRA adapter fine-tuned for UPOS tagging.
BASE_MODEL = "meta-llama/Llama-2-7b-hf"
LORA_WEIGHTS = "kelseybonnici/maltese-alpaca-pos-upos" #hf link

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

if device == "cuda":
    # GPU path: 8-bit base model with float16 dtype, LoRA adapter on top.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16)
else:
    # CPU fallback: without this branch `model` is never assigned and the
    # cell fails with a NameError at `model.eval()`. 8-bit loading is only
    # attempted on CUDA, so the weights are loaded in float32 here.
    model = LlamaForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float32)
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS)
    model.to(device)

# Inference only from here on.
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.65s/it]


In [5]:
def evaluate(
    data_point,
    temperature=0.1,
    top_p = 0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
    **kwargs,
):
    """Generate the model's answer for one prompt record.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        data_point: Mapping with an ``'instruction'`` entry and an ``'input'``
            entry (which may be empty).
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Forwarded to ``model.generate``.
        **kwargs: Additional ``GenerationConfig`` options.

    Returns:
        The decoded text after the first ``### Risposta:`` marker, stripped of
        surrounding whitespace. Special tokens such as ``</s>`` are kept.
    """

    def generate_prompt(instruction, input=None):
        """Render the prompt template, with or without an input section."""
        if input:
            return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu, flimkien ma' input li jipprovdi aktar kuntest. Ikteb tweġiba li timla t-talba kif xieraq.\n\n### Istruzzjoni: \n{instruction}\n\n### Input:\n{input}\n\n### Risposta:\n"""
        # Empty input: use the instruction-only template so the tokenizer is
        # never handed ``None``.
        return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu. Ikteb tweġiba li tikkompleta t-talba kif xieraq.\n\n### Istruzzjoni:\n{instruction}\n\n### Risposta:\n"""

    prompt = generate_prompt(data_point['instruction'], data_point['input'])

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # NOTE(review): `do_sample` is not set, so temperature/top_p/top_k may have
    # no effect on decoding — confirm against the generation documentation.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # The first returned sequence holds the prompt followed by the continuation.
    s = generation_output.sequences[0]

    output = tokenizer.decode(s)

    # Keep only what follows the answer marker of the prompt.
    return output.split("### Risposta:")[1].strip()

pred_list = []

# Predictions are streamed to disk as a JSON array of one-element arrays, so
# partial results survive an interrupted run. The separator is written BEFORE
# every entry except the first: this avoids rewinding the file to delete a
# trailing comma (arithmetic on text-mode tell() offsets is not supported and
# fails outright when there are no entries).
with open('data/eval/pos/ma_results_upos.json', 'w') as json_file:
    json_file.write('[')
    try:
        for idx, data_point in enumerate(test_prompts):

            print(idx+1, 'of', len(test_prompts), end=' ')

            output = evaluate(data_point)

            print(output)

            pred_list.append(output)

            if idx > 0:
                json_file.write(',\n')
            json_file.write('[')
            json.dump(output, json_file)
            json_file.write(']')
    finally:
        # Always close the array so the file stays valid JSON, even when the
        # loop is interrupted.
        json_file.write(']')

1 of 518 ['PROPN', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'VERB', 'ADJ', 'ADP', 'NOUN', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'VERB', 'SCONJ', 'DET', 'NOUN', 'ADP', 'NOUN', 'VERB', 'DET', 'NOUN', 'SCONJ', 'SCONJ', 'VERB', 'ADP', 'NOUN', 'DET', 'NOUN', 'ADJ', 'PROPN', 'PROPN', 'ADP', 'SCONJ', 'PRON', 'ADP', 'NOUN', 'ADJ', 'VERB', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'PUNCT']</s>
2 of 518 ['DET', 'NOUN', 'VERB', 'SCONJ', 'DET', 'NOUN', 'VERB', 'ADP', 'PROPN', 'PROPN', 'SCONJ', 'PRON', 'AUX', 'ADJ', 'VERB', 'DET', 'NOUN', 'SCONJ', 'VERB', 'CCONJ', 'PROPN', 'PROPN', 'VERB', 'SCONJ', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN', 'ADV', 'ADP', 'PRON', 'PRON', 'PUNCT']</s>
3 of 518 ['PRON', 'VERB', 'SCONJ', 'DET', 'NOUN', 'ADJ', 'ADP', 'DET', 'NOUN', 'ADJ', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', 'ADJ', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'VERB', 'ADJ', 'ADP', 'NOUN', 'VERB', 'NOUN', 'PUNCT']</s>
4

In [8]:
# How often the number of predicted tags matches the gold sequence length,
# and how many generations lack the end-of-sequence marker.
equal_count = less_count = more_count = 0
no_s_tag = 0

for i, pred in enumerate(pred_list):
    true = test_data[i]['upos']

    # A bool adds as 0/1, so this counts outputs without '</s>'.
    no_s_tag += '</s>' not in pred

    # Drop the marker, then pull the tag names out of the list-like string.
    pred = re.findall(r'\b\w+\b', pred.replace('</s>', ''))

    if len(pred) > len(true):
        more_count += 1
    elif len(pred) < len(true):
        less_count += 1
    else:
        equal_count += 1

print('No s tag:', no_s_tag)

print('Equal:', equal_count)
print('Less:', less_count)
print('More:', more_count)

No s tag: 6
Equal: 457
Less: 25
More: 36


In [9]:
# Token-level accuracy of the predicted UPOS tags against the gold tags.
tag_count = 0
correct_count = 0

for i, raw_pred in enumerate(pred_list):

    true = test_data[i]['upos']

    #remove </s> if present, then extract the tag names from the generated text
    pred = re.findall(r'\b\w+\b', raw_pred.replace('</s>', ''))

    #in case where test_pred is shorter: pad with a placeholder that is not a
    #real tag, so every missing position counts as an error
    if len(pred) < len(true):
        pred += ['None'] * (len(true) - len(pred))

    # Score position by position; zip stops at len(true), so surplus
    # predicted tags are ignored. No inner index is needed, which avoids
    # reusing the outer loop variable.
    tag_count += len(true)
    correct_count += sum(gold == guess for gold, guess in zip(true, pred))

#percentage of correct predictions (0.0 when nothing was scored, instead of
#a ZeroDivisionError)
percentage = (correct_count / tag_count) * 100 if tag_count else 0.0
print("Percentage correct:", percentage, "%")

Percentage correct: 87.11279689334417 %


# Part-of-Speech Tagging (XPOS)

In [2]:
def generate_instruction(data_point, task):
    """Build the part-of-speech tagging prompt record for one sentence.

    ``data_point`` supplies the token list under 'tokens' and the gold tags
    under 'upos' / 'xpos'; ``task`` selects which of the two tag lists becomes
    the expected output. Raises ``ValueError`` for any other ``task``.
    """
    tokens = data_point['tokens']

    # Guard clause: only the two tag sets are supported.
    if task not in ('upos', 'xpos'):
        raise ValueError(f"Invalid task value: {task}")

    return {
        'instruction': "Agħmel part-of-speech tagging ta' din is-sentenza.",
        'input': tokens,
        'output': data_point[task],
    }

#open test data
test_path = 'data/eval/pos/test.json'

# The file holds non-ASCII characters (it is written with ensure_ascii=False),
# so decode it explicitly as UTF-8 rather than with the platform default.
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_prompts = [generate_instruction(data_point, 'xpos') for data_point in test_data]

print(test_prompts[0])

{'instruction': "Agħmel part-of-speech tagging ta' din is-sentenza.", 'input': ['Philip', 'Schembri', ',', 'iċ-', 'Chairman', 'tal-', 'Bord', "ta'", 'Inkjesta', 'dwar', 'l-', 'allegat', 'rimi', 'illegali', "ta'", 'kimika', 'imsejħa', 'mercaptan', 'mill-', 'Enemalta', ',', 'qal', 'li', 'l-', 'Bord', "ta'", 'Inkjesta', 'talab', 'id-', 'dokumenti', 'li', 'qal', 'li', 'kellu', "f'", 'idejh', 'id-', 'Deputat', 'Laburista', 'Joe', 'Mizzi', 'wara', 'li', 'hu', "b'", 'mod', 'pubbliku', 'qal', 'li', 'kellu', 'dokumenti', 'li', 'kien', 'lest', 'jgħaddihom', 'lil', 'Bord', "ta'", 'Inkjesta', '.'], 'output': ['NOUN_PROP', 'NOUN_PROP', 'X_PUN', 'DEF', 'NOUN', 'GEN_DEF', 'NOUN', 'GEN', 'NOUN', 'PREP', 'DEF', 'PART_PASS', 'NOUN', 'ADJ', 'GEN', 'NOUN', 'PART_PASS', 'NOUN_PROP', 'PREP_DEF', 'NOUN_PROP', 'X_PUN', 'VERB', 'COMP', 'DEF', 'NOUN', 'GEN', 'NOUN', 'VERB', 'DEF', 'NOUN', 'COMP', 'VERB', 'COMP', 'VERB_PSEU', 'PREP', 'NOUN', 'DEF', 'NOUN', 'ADJ', 'NOUN_PROP', 'NOUN_PROP', 'PREP', 'COMP', 'PRON_P

In [3]:
# Base checkpoint and the LoRA adapter fine-tuned for XPOS tagging.
BASE_MODEL = "meta-llama/Llama-2-7b-hf"
LORA_WEIGHTS = "kelseybonnici/maltese-alpaca-pos-xpos" #hf link

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

if device == "cuda":
    # GPU path: 8-bit base model with float16 dtype, LoRA adapter on top.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16)
else:
    # CPU fallback: without this branch `model` is never assigned and the
    # cell fails with a NameError at `model.eval()`. 8-bit loading is only
    # attempted on CUDA, so the weights are loaded in float32 here.
    model = LlamaForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float32)
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS)
    model.to(device)

# Inference only from here on.
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:38<00:00, 19.01s/it]


In [4]:
def evaluate(
    data_point,
    temperature=0.1,
    top_p = 0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
    **kwargs,
):
    """Generate the model's answer for one prompt record.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        data_point: Mapping with an ``'instruction'`` entry and an ``'input'``
            entry (which may be empty).
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Forwarded to ``model.generate``.
        **kwargs: Additional ``GenerationConfig`` options.

    Returns:
        The decoded text after the first ``### Risposta:`` marker, stripped of
        surrounding whitespace. Special tokens such as ``</s>`` are kept.
    """

    def generate_prompt(instruction, input=None):
        """Render the prompt template, with or without an input section."""
        if input:
            return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu, flimkien ma' input li jipprovdi aktar kuntest. Ikteb tweġiba li timla t-talba kif xieraq.\n\n### Istruzzjoni: \n{instruction}\n\n### Input:\n{input}\n\n### Risposta:\n"""
        # Empty input: use the instruction-only template so the tokenizer is
        # never handed ``None``.
        return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu. Ikteb tweġiba li tikkompleta t-talba kif xieraq.\n\n### Istruzzjoni:\n{instruction}\n\n### Risposta:\n"""

    prompt = generate_prompt(data_point['instruction'], data_point['input'])

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # NOTE(review): `do_sample` is not set, so temperature/top_p/top_k may have
    # no effect on decoding — confirm against the generation documentation.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # The first returned sequence holds the prompt followed by the continuation.
    s = generation_output.sequences[0]

    output = tokenizer.decode(s)

    # Keep only what follows the answer marker of the prompt.
    return output.split("### Risposta:")[1].strip()

pred_list = []

# Predictions are streamed to disk as a JSON array of one-element arrays, so
# partial results survive an interrupted run. The separator is written BEFORE
# every entry except the first: this avoids rewinding the file to delete a
# trailing comma (arithmetic on text-mode tell() offsets is not supported and
# fails outright when there are no entries).
with open('data/eval/pos/ma_results_xpos.json', 'w') as json_file:
    json_file.write('[')
    try:
        for idx, data_point in enumerate(test_prompts):

            print(idx+1, 'of', len(test_prompts), end=' ')

            output = evaluate(data_point)

            print(output)

            pred_list.append(output)

            if idx > 0:
                json_file.write(',\n')
            json_file.write('[')
            json.dump(output, json_file)
            json_file.write(']')
    finally:
        # Always close the array so the file stays valid JSON, even when the
        # loop is interrupted.
        json_file.write(']')

1 of 518 ['NOUN_PROP', 'NOUN_PROP', 'X_PUN', 'DEF', 'NOUN', 'GEN_DEF', 'NOUN', 'GEN', 'NOUN', 'PREP', 'DEF', 'ADJ', 'ADV', 'ADJ', 'GEN', 'NOUN', 'ADJ', 'ADJ', 'PREP_DEF', 'NOUN_PROP', 'X_PUN', 'VERB', 'COMP', 'DEF', 'NOUN', 'GEN', 'NOUN', 'VERB', 'DEF', 'NOUN', 'COMP', 'VERB', 'COMP', 'VERB_PSEU', 'PREP', 'NOUN', 'DEF', 'NOUN', 'NOUN', 'NOUN_PROP', 'NOUN_PROP', 'PREP', 'NOUN', 'ADJ', 'VERB', 'LIL', 'NOUN', 'GEN', 'NOUN', 'X_PUN']
2 of 518 ['DEF', 'X_ENG', 'VERB', 'COMP', 'DEF', 'NOUN', 'VERB', 'LIL', 'NOUN_PROP', 'NOUN_PROP', 'COMP', 'PRON_PERS', 'KIEN', 'ADJ', 'VERB', 'DEF', 'NOUN', 'COMP', 'VERB_PSEU', 'CONJ_CORD', 'NOUN_PROP', 'NOUN_PROP', 'VERB', 'COMP', 'PRON_PERS', 'KIEN', 'NOUN', 'DEF', 'NOUN', 'ADV', 'LIL', 'PRON_INT', 'VERB_PSEU', 'X_PUN']</s>
3 of 518 ['PRON_DEM', 'VERB', 'CONJ_SUB', 'DEF', 'NOUN', 'NOUN', 'PREP', 'DEF', 'NOUN', 'ADJ', 'VERB', 'VERB', 'DEF', 'NOUN', 'GEN_DEF', 'NOUN', 'NOUN_PROP', 'GEN', 'NOUN_PROP', 'NOUN_PROP', 'NOUN_PROP', 'PREP', 'DEF', 'NOUN', 'PREP', 'D

In [5]:
# How often the number of predicted tags matches the gold sequence length,
# and how many generations lack the end-of-sequence marker.
equal_count = less_count = more_count = 0
no_s_tag = 0

for i, pred in enumerate(pred_list):
    true = test_data[i]['xpos']

    # A bool adds as 0/1, so this counts outputs without '</s>'.
    no_s_tag += '</s>' not in pred

    # Drop the marker, then pull the tag names out of the list-like string.
    pred = re.findall(r'\b\w+\b', pred.replace('</s>', ''))

    if len(pred) > len(true):
        more_count += 1
    elif len(pred) < len(true):
        less_count += 1
    else:
        equal_count += 1

print('No s tag:', no_s_tag)

print('Equal:', equal_count)
print('Less:', less_count)
print('More:', more_count)

No s tag: 43
Equal: 438
Less: 55
More: 25


In [6]:
# Token-level accuracy of the predicted XPOS tags against the gold tags.
tag_count = 0
correct_count = 0

for i, raw_pred in enumerate(pred_list):

    true = test_data[i]['xpos']

    #remove </s> if present, then extract the tag names from the generated text
    pred = re.findall(r'\b\w+\b', raw_pred.replace('</s>', ''))

    #in case where test_pred is shorter: pad with a placeholder that is not a
    #real tag, so every missing position counts as an error
    if len(pred) < len(true):
        pred += ['None'] * (len(true) - len(pred))

    # Score position by position; zip stops at len(true), so surplus
    # predicted tags are ignored. No inner index is needed, which avoids
    # reusing the outer loop variable.
    tag_count += len(true)
    correct_count += sum(gold == guess for gold, guess in zip(true, pred))

#percentage of correct predictions (0.0 when nothing was scored, instead of
#a ZeroDivisionError)
percentage = (correct_count / tag_count) * 100 if tag_count else 0.0
print("Percentage correct:", percentage, "%")

Percentage correct: 81.31491014178633 %


# Named Entity Recognition

Dataset: WikiANN, Maltese (`mt`) configuration — https://huggingface.co/datasets/wikiann/viewer/mt/<br>

In [3]:
# Load the 'mt' configuration of the 'wikiann' dataset.
dataset = datasets.load_dataset('wikiann', 'mt')

# Integer tag id -> tag string (BIO scheme over PER / ORG / LOC); used below
# to turn each entry's 'ner_tags' ids into readable labels.
ner_mapping = {
    0: "O",
    1: "B-PER",
    2: "I-PER",
    3: "B-ORG",
    4: "I-ORG",
    5: "B-LOC",
    6: "I-LOC",
    }


# Sentence ending appended to an entity for each span label.
NER_LABEL_PHRASES = {
    'PER': ' huwa persuna.',
    'ORG': ' huwa organizzazzjoni.',
    'LOC': ' huwa post.',
}


def build_ner_records(split, tag_names):
    """Convert one dataset split into a list of plain dicts.

    Each record holds the space-joined sentence, one descriptive sentence per
    span (joined with newlines), the original tokens and spans, and the NER
    tags translated from ids to strings through ``tag_names``.

    Raises ``ValueError`` for a span whose label is not PER, ORG or LOC,
    rather than failing with a NameError or silently reusing the sentence
    built for the previous span.
    """
    records = []
    for entry in split:
        tokens = entry['tokens']
        spans = entry['spans']

        entities_sentences = []
        for span in spans:
            #get first three chars which are either per, org or loc
            label = span[:3]

            #get the rest of the string which is the entity
            entity = span[4:].strip()

            if label not in NER_LABEL_PHRASES:
                raise ValueError(f"Invalid span label: {span}")

            entities_sentences.append(entity + NER_LABEL_PHRASES[label])

        records.append({
            'sentence': ' '.join(tokens),
            'entities': '\n'.join(entities_sentences),
            'tokens': tokens,
            'spans': spans,
            'ner': [tag_names[tag] for tag in entry['ner_tags']],
        })
    return records


def save_ner_records(records, path):
    """Write ``records`` to ``path`` as indented UTF-8 JSON.

    The parent directory is created when missing. The encoding is explicit
    because ``ensure_ascii=False`` emits non-ASCII characters verbatim.
    """
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)


# One call per split instead of three copies of the same loop.
train_dataset = build_ner_records(dataset['train'], ner_mapping)
save_ner_records(train_dataset, 'data/ner/train.json')

val_dataset = build_ner_records(dataset['validation'], ner_mapping)
save_ner_records(val_dataset, 'data/ner/validation.json')

test_dataset = build_ner_records(dataset['test'], ner_mapping)
save_ner_records(test_dataset, 'data/ner/test.json')
    

In [2]:
def generate_instruction(data_point):
    """Build the named-entity prompt record for one sentence.

    The sentence is read from ``data_point['sentence']`` and the expected
    answer from ``data_point['entities']``.
    """
    sentence = data_point['sentence']
    entities = data_point['entities']
    return dict(
        instruction="Għidli l-postijiet, organizzazzjonijiet jew persuni f'din is-sentenza.",
        input=sentence,
        output=entities,
    )

#open test data
test_path = 'data/ner/test.json'

# The file holds non-ASCII characters (it is written with ensure_ascii=False),
# so decode it explicitly as UTF-8 rather than with the platform default.
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_prompts = [generate_instruction(data_point) for data_point in test_data]

print(test_prompts[0])

{'instruction': "Għidli l-postijiet, organizzazzjonijiet jew persuni f'din is-sentenza.", 'input': 'Emerson Ferreira da Rosa', 'output': 'Emerson Ferreira da Rosa huwa persuna.'}


In [3]:
# Base checkpoint and the LoRA adapter fine-tuned for named entity recognition.
BASE_MODEL = "meta-llama/Llama-2-7b-hf"
LORA_WEIGHTS = "kelseybonnici/maltese-alpaca-ner" #hf link

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

if device == "cuda":
    # GPU path: 8-bit base model with float16 dtype, LoRA adapter on top.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16)
else:
    # CPU fallback: without this branch `model` is never assigned and the
    # cell fails with a NameError at `model.eval()`. 8-bit loading is only
    # attempted on CUDA, so the weights are loaded in float32 here.
    model = LlamaForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float32)
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS)
    model.to(device)

# Inference only from here on.
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.31s/it]


In [4]:
def evaluate(
    data_point,
    temperature=0.1,
    top_p = 0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=200,
    **kwargs,
):
    """Generate the model's answer for one prompt record.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        data_point: Mapping with an ``'instruction'`` entry and an ``'input'``
            entry (which may be empty).
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Forwarded to ``model.generate``.
        **kwargs: Additional ``GenerationConfig`` options.

    Returns:
        The decoded text after the first ``### Risposta:`` marker, stripped of
        surrounding whitespace. Special tokens such as ``</s>`` are kept.
    """

    def generate_prompt(instruction, input=None):
        """Render the prompt template, with or without an input section."""
        if input:
            return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu, flimkien ma' input li jipprovdi aktar kuntest. Ikteb tweġiba li timla t-talba kif xieraq.\n\n### Istruzzjoni: \n{instruction}\n\n### Input:\n{input}\n\n### Risposta:\n"""
        # Empty input: use the instruction-only template so the tokenizer is
        # never handed ``None``.
        return f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu. Ikteb tweġiba li tikkompleta t-talba kif xieraq.\n\n### Istruzzjoni:\n{instruction}\n\n### Risposta:\n"""

    prompt = generate_prompt(data_point['instruction'], data_point['input'])

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    # NOTE(review): `do_sample` is not set, so temperature/top_p/top_k may have
    # no effect on decoding — confirm against the generation documentation.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # The first returned sequence holds the prompt followed by the continuation.
    s = generation_output.sequences[0]

    output = tokenizer.decode(s)

    # Keep only what follows the answer marker of the prompt.
    return output.split("### Risposta:")[1].strip()

pred_list = []

# Predictions are streamed to disk as a JSON array of one-element arrays, so
# partial results survive an interrupted run. The separator is written BEFORE
# every entry except the first: this avoids rewinding the file to delete a
# trailing comma (arithmetic on text-mode tell() offsets is not supported and
# fails outright when there are no entries).
with open('data/ner/ma_results.json', 'w') as json_file:
    json_file.write('[')
    try:
        for idx, data_point in enumerate(test_prompts):

            print(idx+1, 'of', len(test_prompts), end=' ')

            output = evaluate(data_point)

            print(output)

            pred_list.append(output)

            if idx > 0:
                json_file.write(',\n')
            json_file.write('[')
            json.dump(output, json_file)
            json_file.write(']')
    finally:
        # Always close the array so the file stays valid JSON, even when the
        # loop is interrupted.
        json_file.write(']')

1 of 100 Emerson Ferreira da Rosa huwa persuna.</s>
2 of 100 Diviżjoni 1 huwa organizzazzjoni.</s>
3 of 100 Muammar Gaddhafi huwa persuna.</s>
4 of 100 Korea t'Isfel huwa post.</s>
5 of 100 Greċja huwa post.</s>
6 of 100 Italja huwa post.
Franza huwa post.
Libja huwa post.
Alġerija huwa post.</s>
7 of 100 Soċjetà tad-Duttrina Nisranija huwa organizzazzjoni.</s>
8 of 100 Igor ' Stravinskij huwa persuna.</s>
9 of 100 Carmel Busuttil huwa persuna.</s>
10 of 100 Danimarka huwa post.</s>
11 of 100 Kanada u l-Karibew huwa post.</s>
12 of 100 Mużew barra Ħamrun huwa post.</s>
13 of 100 Mar Ruma huwa persuna.</s>
14 of 100 Opossum grazzjuż ta ' Aceramarca huwa post.</s>
15 of 100 Żejtun Corinthians huwa organizzazzjoni.
Naxxar huwa organizzazzjoni.
Gudja huwa organizzazzjoni.
Attard huwa organizzazzjoni.
Xgħajra Tornadoes huwa organizzazzjoni.
Qrendi huwa organizzazzjoni.
Sirens huwa organizzazzjoni.
Kirkop huwa organizzazzjoni.
Santa Luċija huwa organizzazzjoni.</s>
16 of 100 Rużar Briffa huw

In [5]:
#change output from sentences to spans like in dataset

# Description that follows ' huwa ' (without the final full stop) -> span label.
NER_DESCRIPTION_LABELS = {
    'persuna': 'PER',
    'organizzazzjoni': 'ORG',
    'post': 'LOC',
}

pred_spans = []

for prediction in pred_list:
    #remove </s> if present
    prediction = prediction.replace('</s>', '')

    spans = []

    #split by \n: one generated sentence per entity
    for line in prediction.split('\n'):

        # Split on the LAST ' huwa ' so an entity that itself contains that
        # word sequence is kept whole, and read the label from the description
        # only — a keyword inside the entity name must not decide the type.
        entity, separator, description = line.rpartition(' huwa ')
        label = NER_DESCRIPTION_LABELS.get(description.strip().rstrip('.').strip())

        # Lines that do not follow "<entity> huwa <type>." (empty or truncated
        # output) carry no usable label. Skip them instead of hitting a
        # NameError or reusing the label of the previous line.
        if not separator or label is None:
            continue

        spans.append(label + ': ' + entity)

    pred_spans.append(spans)

In [6]:
#change from spans to tags for each token

y_true = [data_point['ner'] for data_point in test_data]

true_tokens = [data_point['tokens'] for data_point in test_data]

y_pred = []

for i in range(len(pred_spans)):

    sentence_tokens = true_tokens[i]

    # Start with every token outside any entity.
    pred = ['O'] * len(sentence_tokens)

    for span in pred_spans[i]:

        # Spans look like "<LABEL>: <entity text>".
        label = span[:3]
        entity = span[4:].strip().split(' ')

        # NOTE(review): a token is tagged whenever it occurs anywhere in the
        # entity's word list, not only where the entity appears contiguously —
        # confirm this is the intended matching rule.
        for j, token in enumerate(sentence_tokens):

            if token in entity:
                # The first token has no predecessor. Without the explicit
                # j == 0 check, pred[j-1] would read pred[-1], i.e. the tag of
                # the LAST token in the sentence.
                if j == 0 or pred[j-1] == 'O':
                    pred[j] = 'B-' + label
                else:
                    pred[j] = 'I-' + label

    y_pred.append(pred)


In [7]:
# F1 computed by seqeval over the gold and predicted tag sequences, reported
# as a percentage.
f1 = seqeval.metrics.f1_score(y_true, y_pred)
print('Span F1 score:', 100 * f1)

Span F1 score: 79.14893617021276


# Open-ended Generation

In [2]:
#open test set
test_path = 'data/open-ended generation/test_samples.json'

# Decode explicitly as UTF-8: the samples are Maltese text, and the platform
# default encoding is not guaranteed to be UTF-8.
with open(test_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)


In [3]:
# Base checkpoint and the LoRA adapter used for open-ended generation.
BASE_MODEL = "meta-llama/Llama-2-7b-hf"
LORA_WEIGHTS = "kelseybonnici/maltese-alpaca" #hf link

tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

if device == "cuda":
    # GPU path: 8-bit base model with float16 dtype, LoRA adapter on top.
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS, torch_dtype=torch.float16)
else:
    # CPU fallback: without this branch `model` is never assigned and the
    # cell fails with a NameError at `model.eval()`. 8-bit loading is only
    # attempted on CUDA, so the weights are loaded in float32 here.
    model = LlamaForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float32)
    model = PeftModel.from_pretrained(model, LORA_WEIGHTS)
    model.to(device)

# Inference only from here on.
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:18<00:00,  9.19s/it]


In [4]:
def evaluate(
    data_point,
    temperature=0.1,
    top_p = 0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
    **kwargs,
):
    """Generate the model's answer for one open-ended prompt record.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        data_point: Mapping with an ``'instruction'`` entry and an ``'input'``
            entry; an empty input selects the instruction-only template.
        temperature, top_p, top_k, num_beams: Forwarded to ``GenerationConfig``
            (together with a fixed ``repetition_penalty`` of 1.2).
        max_new_tokens: Forwarded to ``model.generate``.
        **kwargs: Additional ``GenerationConfig`` options.

    Returns:
        The decoded text after the first ``### Risposta:`` marker, stripped of
        surrounding whitespace.
    """
    instruction = data_point['instruction']
    extra_input = data_point['input']

    # Choose the template according to whether extra input is present.
    if extra_input:
        prompt = f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu, flimkien ma' input li jipprovdi aktar kuntest. Ikteb tweġiba li timla t-talba kif xieraq.\n\n### Istruzzjoni: \n{instruction}\n\n### Input:\n{extra_input}\n\n### Risposta:\n"""
    else:
        prompt = f"""Hawn taħt hawn struzzjoni li tiddeskrivi kompitu. Ikteb tweġiba li tikkompleta t-talba kif xieraq.\n\n### Istruzzjoni:\n{instruction}\n\n### Risposta:\n"""

    # Tokenise the prompt and move the ids to the target device.
    prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

    decoding_options = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        repetition_penalty = 1.2,
        **kwargs,
    )

    with torch.no_grad():
        generated = model.generate(
            input_ids=prompt_ids,
            generation_config=decoding_options,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    # Decode the first returned sequence and keep what follows the marker.
    decoded = tokenizer.decode(generated.sequences[0])
    answer = decoded.split("### Risposta:")[1]
    return answer.strip()


pred_list = []

# Predictions are streamed to disk as a JSON array of one-element arrays, so
# partial results survive an interrupted run. The separator is written BEFORE
# every entry except the first: this avoids rewinding the file to delete a
# trailing comma (arithmetic on text-mode tell() offsets is not supported and
# fails outright when there are no entries).
with open('data/open-ended generation/ma_results.json', 'w') as json_file:
    json_file.write('[')
    try:
        for idx, data_point in enumerate(test_data):

            print(idx+1, 'of', len(test_data))

            output = evaluate(data_point)

            # Echo the prompt parts and the answer for manual inspection.
            print('Instruction:')
            print(data_point['instruction'])
            if data_point['input']:
                print('Input:')
                print(data_point['input'])
            print('Output:')
            print(output, '\n')

            pred_list.append(output)

            if idx > 0:
                json_file.write(',\n')
            json_file.write('[')
            json.dump(output, json_file)
            json_file.write(']')
    finally:
        # Always close the array so the file stays valid JSON, even when the
        # loop is interrupted.
        json_file.write(']')

1 of 17
Instruction:
Kif nistgħu nnaqqsu t-tniġġis tal-arja?
Output:
It-tniġġis tal-arja jista' jkollu effetti negattivi fuq l-ambjent u l-ħajja tagħna. Hawn huma xi modi kif nistgħu nnaqqsu t-tniġġis tal-arja:

1. Uża t-tniġġis tal-arja mgħaġġla: L-użu ta 'tniġġis tal-arja mgħaġġla jista' jkollu effetti negattivi fuq l-ambjent u l-ħajja tagħna. L-użu ta 'tniġġis tal-arja mgħaġġla jista' jkollu effetti negattivi fuq l-ambjent u l-ħajja tagħna. L-użu ta 'tniġġis tal-arja mgħaġġla jista' jkollu effetti negattivi fuq l-ambjent u l-ħajja tagħna.

2 

2 of 17
Instruction:
Identifika l-fard.
Input:
Twitter, Instagram, Telegram
Output:
Il-fard għandu jkun Twitter.</s> 

3 of 17
Instruction:
X'inhi l-kapitali ta' Franza?
Output:
Il-kapitali ta' Franza huwa Paris.</s> 

4 of 17
Instruction:
Iddeskrivi l-funzjoni tal-motherboard tal-kompjuter
Output:
Il-motherboard huwa l-element ewlenin tal-kompjuter, u huwa responsabbli għall-iżvilupp u l-funzjonalità tal-kompjuter kollha. Il-motherboard huwa 