In [34]:
pip install openprompt

[0mNote: you may need to restart the kernel to use updated packages.


In [35]:
!pip install transformers
!pip install evaluate
!pip install datasets

[0m

In [36]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from transformers import TrainingArguments, Trainer
import evaluate
from datasets import load_metric


In [37]:
# Experiment configuration. The trailing `#@param` annotations are form-field
# markers and must stay on the same line as the assignment they describe.
# NOTE(review): of these settings only `technique` and `random_state` are read
# by the cells visible in this notebook; the others are not referenced here.
model_type = "XLM-R" #@param ["SinBERT", "Bert", "XLM-R"]
# Selects which label column of the CSV is used as the classification target.
technique = "sentiment" #@param ["humor", "hate speech", "sentiment"]
load_adapter = False #@param {type:"boolean"}
# train = True #@param {type:"boolean"}
unfreeze_model = False #@param {type:"boolean"}
save_adapter = False #@param {type:"boolean"}
oversample_dataset = True #@param {type:"boolean"}
lang_adapter_setting = "none" #@param ["none", "stack", "parallel"]
# Seed passed to train_test_split so the held-out set is reproducible.
random_state = 42 #@param
adapter_config = "pfeiffer" #@param ["houlsby", "pfeiffer"]
over_sampling_technique = "ROS" #@param ["", "ROS","ADASYN", "SMOTE", "BorderlineSMOTE"]
sampling_strategy = "1:0.25:0.25" #@param [] {allow-input: true} 
## eg: 1:0.25:0.25 for hate | 0.5 for humor | 1:1:1:1 or 0.5:1:0.5:0.5 or 0.25:1:0.25:0.25 for sentiment

In [38]:
from openprompt.data_utils.text_classification_dataset import PROCESSORS
from datasets import load_dataset


In [39]:
# Location of the annotated CSV that the next cell reads with pandas.
dataset_path = "../input/cmcs-dataset-task/ompleted_draft - ompleted_draft.csv"

In [40]:
# Pick the label column for the configured task; any value other than
# "humor" / "hate speech" selects the sentiment labels.
if technique == "humor":
    label_column = "Humor"
elif technique == "hate speech":
    label_column = "Hate_speech"
else:
    label_column = "Sentiment"

# rename() returns a fresh DataFrame, so the column assignment below writes to
# an independent frame instead of a column-slice of the frame read from disk
# (assigning into such a slice is what triggers SettingWithCopyWarning).
all_data = (
    pd.read_csv(dataset_path)[["Sentence", label_column]]
    .rename(columns={label_column: "Label"})
)

# factorize() numbers the labels in order of first appearance; `uniq` keeps
# the original label strings, so integer code i corresponds to uniq[i].
all_data["Label"], uniq = pd.factorize(all_data["Label"])

# Plain Python lists of sentences and integer labels for the later cells.
X = all_data["Sentence"].values.tolist()
y = all_data["Label"].values.tolist()

In [41]:
# Preview the first 50 raw sentences.
X[:50]

['Ammage Adarayta❤️Eka Dawasak Madi Neda❤️🙏❤️',
 'We need IPL Champions leak data offers ..please provide it..last year also you all provided 😊',
 '#VPN #ummmaaa #proud_be',
 'chandimal.. uuu thama mulu tem ekama kaaa gahala hari quick karala ganne',
 'sltgo',
 'Dialog 49k dammama eka dawasak wath be',
 'eth anith kattiya sathiyak withara online innawa 49 data eken..',
 'meka salli kanawane...ai e?',
 'Navy🧡❤️💛💗💚💜️',
 'One ring to bomb them all .Terrorist style',
 'Answer no 1) 45 1st president - Jeorge Washington 2nd " - John Adams 3rd " - Thomas Jefferson . . . 44th " - Barack Obama 45th "- Donald Trump 😊',
 'Mage router eka hadala dendooo',
 'මොකක්ද අප්ප මේ වෙනදට වඩා msg එන්නෙ mibitel එකෙන් වාතයක්නෙ',
 'Fiber ඉල්ලලා දැන් අව්රුද්දක් විතර😒 blloh😒',
 'Menna company.... superb hutch superb',
 'Parana sim eka ganna puluwanda',
 'හොඳ වැඩේ....මෙයැයිලගේ හැටි දන්නෑ ඔයැයිලා....❤️❤️❤️❤️❤️',
 'මොකුත් එපා,peak/off peak මඟුල නැති කරහල්ලා ඒ ඇති...',
 'ALL BLACKS HAKA comeing zoon . . .',
 'හුම් 😂'

In [42]:
# Preview the first 10 integer label codes produced by pd.factorize.
y[:10]

[0, 1, 1, 2, 1, 0, 1, 0, 1, 1]

In [43]:
# Show the original label strings; position i is the string for label code i.
uniq

Index(['Negative', 'Neutral', 'Positive', 'Conflict'], dtype='object')

In [44]:
# Display names per task, listed in label-code order. For sentiment this order
# matches the `uniq` index printed above.
# NOTE(review): the humor / hate-speech orders assume pd.factorize meets the
# classes in this order in the CSV — confirm against `uniq` when switching task.
_names_by_task = {
    'humor': ("Non-humorous", "Humorous"),
    'hate speech': ("Not offensive", "Hate-Inducing", "Abusive"),
}
_task_names = _names_by_task.get(technique, ("Negative", "Neutral", "Positive", "Conflict"))

num_labels = len(_task_names)
id2label = dict(enumerate(_task_names))

In [45]:
# Hold out 10% of the data for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)

In [46]:
from openprompt.data_utils import InputExample

# Wrap every held-out sentence and its integer label in an OpenPrompt
# InputExample; the position in X_test doubles as the example id.
examples = [
    InputExample(guid=idx, text_a=sentence, label=y_test[idx])
    for idx, sentence in enumerate(X_test)
]

In [47]:

from transformers.models.auto.tokenization_auto import tokenizer_class_from_name

from openprompt.plms.utils import TokenizerWrapper
from typing import List, Dict
from collections import defaultdict

class MLMTokenizerWrapper(TokenizerWrapper):
    """Tokenizer wrapper that converts a template-wrapped example into masked-LM inputs.

    It relies on helpers that are not defined in this class (``truncate``,
    ``concate_parts``, ``add_special_tokens``, ``padding``) and on the attributes
    ``tokenizer``, ``special_tokens_maps``, ``create_token_type_ids`` and
    ``max_seq_length`` — presumably all supplied by ``TokenizerWrapper``.
    """
    # Names of the model-input fields. How this list is consumed is decided
    # outside this class (presumably by the base class) — TODO confirm.
    add_input_keys = ['input_ids', 'attention_mask', 'token_type_ids']

    @property
    def mask_token(self):
        """The mask token string of the underlying tokenizer."""
        return self.tokenizer.mask_token

    @property
    def mask_token_ids(self):
        """The vocabulary id of the underlying tokenizer's mask token."""
        return self.tokenizer.mask_token_id

    @property
    def num_special_tokens_to_add(self):
        """Number of special tokens the tokenizer adds, cached on first access."""
        if not hasattr(self, '_num_specials'):
            self._num_specials = self.tokenizer.num_special_tokens_to_add()
        return self._num_specials

    def tokenize_one_example(self, wrapped_example, teacher_forcing):
        '''Tokenize one wrapped example into a padded dict of model inputs.

        Args:
            wrapped_example: A pair ``(pieces, others)``. ``pieces`` is iterated as
                dicts that are read through the keys ``'text'``, ``'loss_ids'`` and,
                optionally, ``'soft_token_ids'``; a ``'shortenable_ids'`` entry must
                be present after truncation because it is popped unconditionally.
                ``others`` is a mapping that may contain ``'tgt_text'`` (a string or
                an iterable of strings).
            teacher_forcing: Must be falsy whenever a piece has ``loss_ids == 1``.

        Returns:
            dict: The per-key sequences after truncation, concatenation, special-token
            insertion and padding, plus ``'attention_mask'``, ``'token_type_ids'``
            (only if ``self.create_token_type_ids``) and ``'encoded_tgt_text'``
            (only if target text was supplied).

        Raises:
            RuntimeError: If ``teacher_forcing`` is truthy for a piece with
                ``loss_ids == 1``.
            KeyError: If a piece's text is a key of ``special_tokens_maps`` whose
                value is ``None``.

        TODO: doesn't consider the situation that input has two parts.
        '''

        wrapped_example, others = wrapped_example

        # for some dataset like SuperGLUE.COPA, the answer requires prediction an span of
        # the input. Or in generation tasks, we need to generate a piece of target_text.
        # In these case, it tokenized to the encoded_tgt_text for future use.
        encoded_tgt_text = []
        if 'tgt_text' in others:
            tgt_text = others['tgt_text']
            if isinstance(tgt_text, str):
                tgt_text = [tgt_text]
            for t in tgt_text:
                encoded_tgt_text.append(self.tokenizer.encode(t, add_special_tokens=False))


        # Counts the mask pieces seen so far; it is incremented below but never
        # read again inside this method.
        mask_id = 0 # the i-th the mask token in the template.

        encoder_inputs = defaultdict(list)
        for piece in wrapped_example:
            if piece['loss_ids']==1:
                if teacher_forcing: # fill the mask with the tgt task
                    raise RuntimeError("Masked Language Model can't perform teacher forcing training!")
                else:
                    # NOTE(review): this value is overwritten by the unconditional
                    # if/else further down; the mask id only reaches the output if
                    # encoding the (possibly replaced) piece text yields it —
                    # presumably via special_tokens_maps; confirm.
                    encode_text = [self.mask_token_ids]
                mask_id += 1

            # Replace a template-level special-token marker with the string stored
            # in special_tokens_maps (this mutates the piece in place).
            if piece['text'] in self.special_tokens_maps.keys():
                to_replace = self.special_tokens_maps[piece['text']]
                if to_replace is not None:
                    piece['text'] = to_replace
                else:
                    raise KeyError("This tokenizer doesn't specify {} token.".format(piece['text']))

            if 'soft_token_ids' in piece and piece['soft_token_ids']!=0:
                encode_text = [0] # can be replace by any token, since these token will use their own embeddings
            else:
                encode_text = self.tokenizer.encode(piece['text'], add_special_tokens=False)

            encoding_length = len(encode_text)
            encoder_inputs['input_ids'].append(encode_text)
            # Repeat every other per-piece attribute once per produced token so all
            # keys stay aligned with input_ids.
            for key in piece:
                if key not in ['text']:
                    encoder_inputs[key].append([piece[key]]*encoding_length)

        encoder_inputs = self.truncate(encoder_inputs=encoder_inputs)
        # delete shortenable ids
        encoder_inputs.pop("shortenable_ids")
        encoder_inputs = self.concate_parts(input_dict=encoder_inputs)
        encoder_inputs = self.add_special_tokens(encoder_inputs=encoder_inputs)
        # create special input ids
        encoder_inputs['attention_mask'] = [1] *len(encoder_inputs['input_ids'])
        if self.create_token_type_ids:
            encoder_inputs['token_type_ids'] = [0] *len(encoder_inputs['input_ids'])
        # padding
        encoder_inputs = self.padding(input_dict=encoder_inputs, max_len=self.max_seq_length, pad_id_for_inputs=self.tokenizer.pad_token_id)


        if len(encoded_tgt_text) > 0:
            encoder_inputs = {**encoder_inputs, "encoded_tgt_text": encoded_tgt_text}# convert defaultdict to dict
        else:
            encoder_inputs = {**encoder_inputs}
        return encoder_inputs

In [48]:
from statistics import mode
from typing import List, Optional
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM, \
                         RobertaConfig, RobertaTokenizer, RobertaModel, RobertaForMaskedLM, \
                         XLMRobertaConfig, XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaForMaskedLM
from collections import namedtuple
from yacs.config import CfgNode

from openprompt.utils.logging import logger


# Bundle of the four classes needed to load one model family: the config class,
# the tokenizer class, the masked-LM model class and the tokenizer wrapper.
ModelClass = namedtuple("ModelClass", ["config", "tokenizer", "model", "wrapper"])

# Registry of supported model families, keyed by the short name that
# get_model_class() receives as `plm_type`.
_MODEL_CLASSES = {
    'bert': ModelClass(
        config=BertConfig,
        tokenizer=BertTokenizer,
        model=BertForMaskedLM,
        wrapper=MLMTokenizerWrapper,
    ),
    'roberta': ModelClass(
        config=RobertaConfig,
        tokenizer=RobertaTokenizer,
        model=RobertaForMaskedLM,
        wrapper=MLMTokenizerWrapper,
    ),
    'xlm': ModelClass(
        config=XLMRobertaConfig,
        tokenizer=XLMRobertaTokenizer,
        model=XLMRobertaForMaskedLM,
        wrapper=MLMTokenizerWrapper,
    ),
}


def get_model_class(plm_type: str):
    """Look up the class bundle registered for a model family.

    Args:
        plm_type (:obj:`str`): Key into ``_MODEL_CLASSES``.

    Returns:
        :obj:`ModelClass`: The ``(config, tokenizer, model, wrapper)`` entry.

    Raises:
        KeyError: If ``plm_type`` is not registered. The message lists the
            supported keys so a typo is easy to spot.
    """
    try:
        return _MODEL_CLASSES[plm_type]
    except KeyError:
        supported = ", ".join(sorted(_MODEL_CLASSES))
        # Still a KeyError, so callers that catch it keep working.
        raise KeyError(
            "Unknown plm_type {!r}; supported types: {}".format(plm_type, supported)
        ) from None


def load_plm(model_name, model_path, specials_to_add = None):
    r"""Load a pretrained model together with its tokenizer, config and wrapper class.

    Args:
        model_name (:obj:`str`): Model family key, resolved via :func:`get_model_class`.
        model_path (:obj:`str`): Passed to every ``from_pretrained`` call.
        specials_to_add (:obj:`List[str]`, optional): Forwarded to
            :func:`add_special_tokens`. Replaced by ``["<pad>"]`` when
            ``model_name`` contains ``'gpt'``.

    Returns:
        :obj:`PreTrainedModel`: The pretrained model.
        :obj:`tokenizer`: The pretrained tokenizer.
        :obj:`model_config`: The config of the pretrained model.
        :obj:`wrapper`: The wrapper class of this plm.
    """
    family = get_model_class(plm_type = model_name)
    model_config = family.config.from_pretrained(model_path)

    # gpt-style models: request a pad token.
    if 'gpt' in model_name:
        specials_to_add = ["<pad>"]

    loaded_model = family.model.from_pretrained(model_path, config=model_config)
    loaded_tokenizer = family.tokenizer.from_pretrained(model_path)
    loaded_model, loaded_tokenizer = add_special_tokens(
        loaded_model, loaded_tokenizer, specials_to_add=specials_to_add
    )

    if 'opt' in model_name:
        loaded_tokenizer.add_bos_token=False

    return loaded_model, loaded_tokenizer, model_config, family.wrapper




def load_plm_from_config(config: CfgNode):
    r"""Load a pretrained model, tokenizer, config and wrapper class from a global config.

    Reads ``config.plm.model_name``, ``config.plm.model_path`` and
    ``config.plm.specials_to_add``. When the model name contains ``'gpt'`` and
    ``"<pad>"`` is not yet listed, it is appended to
    ``config.plm.specials_to_add`` in place.

    Args:
        config (:obj:`CfgNode`): The global config from the CfgNode.

    Returns:
        :obj:`PreTrainedModel`: The pretrained model.
        :obj:`tokenizer`: The pretrained tokenizer.
        :obj:`model_config`: The config of the pretrained model.
        :obj:`wrapper`: The wrapper class of this plm.
    """
    plm_config = config.plm
    family = get_model_class(plm_type = plm_config.model_name)
    model_config = family.config.from_pretrained(plm_config.model_path)

    # gpt-style models: make sure a pad token is requested.
    if 'gpt' in plm_config.model_name and "<pad>" not in config.plm.specials_to_add:
        config.plm.specials_to_add.append("<pad>")

    loaded_model = family.model.from_pretrained(plm_config.model_path, config=model_config)
    loaded_tokenizer = family.tokenizer.from_pretrained(plm_config.model_path)
    loaded_model, loaded_tokenizer = add_special_tokens(
        loaded_model, loaded_tokenizer, specials_to_add=config.plm.specials_to_add
    )
    return loaded_model, loaded_tokenizer, model_config, family.wrapper

def add_special_tokens(model: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer,
                       specials_to_add: Optional[List[str]] = None):
    r"""Register a pad token on the tokenizer when it has none.

    Only entries whose lower-cased text contains ``"pad"`` are considered, and
    only while ``tokenizer.pad_token`` is ``None``. After a pad token is added
    the model's token embeddings are resized to the new tokenizer length.

    Args:
        model (:obj:`PreTrainedModel`): The pretrained model to resize embedding
                after adding special tokens.
        tokenizer (:obj:`PreTrainedTokenizer`): The pretrained tokenizer to add special tokens.
        specials_to_add: (:obj:`List[str]`, optional): Candidate special tokens.
                With ``None`` nothing is changed.

    Returns:
        The (possibly resized) model, the (possibly extended) tokenizer.

    """
    if specials_to_add is not None:
        for candidate in specials_to_add:
            # Skip anything that is not a pad token, or arrives after a pad
            # token is already set.
            if "pad" not in candidate.lower() or tokenizer.pad_token is not None:
                continue
            tokenizer.add_special_tokens({'pad_token': candidate})
            model.resize_token_embeddings(len(tokenizer))
            logger.info("pad token is None, set to id {}".format(tokenizer.pad_token_id))
    return model, tokenizer

In [49]:
# Load the "xlm" family entry with the xlm-roberta-base checkpoint.
plm, tokenizer, model_config, WrapperClass = load_plm(
    model_name="xlm",
    model_path="xlm-roberta-base",
)

In [50]:
from openprompt.prompts import ManualTemplate

# Cloze-style prompt: the input sentence followed by "It was <mask>."
promptTemplate = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder": "text_a"}. It was {"mask"}.',
)

In [51]:
# Class names handed to the verbalizer: the four label codes as strings.
classes = [str(label_code) for label_code in range(4)]

In [52]:
from openprompt.prompts import ManualVerbalizer

# Words listed for each class; keys follow the label codes
# 0=Negative, 1=Neutral, 2=Positive, 3=Conflict.
label_words = {
    "0": ["bad", "negative", "wrong"],
    "1": ["neutral", "fine", "safe"],
    "2": ["good", "positive", "useful"],
    "3": ["conflict", "disagree", "clash"],
}

promptVerbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    label_words=label_words,
)

In [53]:
from openprompt import PromptForClassification

# Combine the pretrained model, the prompt template and the verbalizer into
# one classification model.
promptModel = PromptForClassification(
    plm=plm,
    template=promptTemplate,
    verbalizer=promptVerbalizer,
)

In [54]:
MAX_LEN = 128

from openprompt import PromptDataLoader

# PromptDataLoader takes its length limit as `max_seq_length`. The previous
# `max_length`, `truncation` and `padding` keywords are Hugging Face tokenizer
# arguments that the loader does not define, so MAX_LEN was never applied.
# Padding to the limit is done by the tokenizer wrapper itself.
data_loader = PromptDataLoader(
    dataset=examples,
    tokenizer=tokenizer,
    template=promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
    batch_size=1,
    max_seq_length=MAX_LEN,
    truncate_method="tail",
)

tokenizing: 1352it [00:01, 1199.71it/s]


In [55]:
import torch

torch.cuda.empty_cache()
allpreds = []
alllabels = []
promptModel = promptModel.cuda()

# This cell only evaluates: put the model in eval mode and disable gradient
# tracking so no autograd graph is built (and kept in GPU memory) per batch.
promptModel.eval()
with torch.no_grad():
    for inputs in data_loader:
        inputs = inputs.cuda()
        logits = promptModel(inputs)
        labels = inputs['label']
        alllabels.extend(labels.cpu().tolist())
        # Predicted class = index of the highest class logit.
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())


In [56]:
# Fraction of held-out examples whose predicted label equals the gold label.
acc = sum(1 for pred, gold in zip(allpreds, alllabels) if pred == gold) / len(allpreds)
print(acc)

0.21745562130177515


In [57]:
# training_args = TrainingArguments(
#     learning_rate=5e-4,
#     num_train_epochs=6,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     # logging_steps=200,
#     output_dir="./training_output",
#     # overwrite_output_dir=True,
#     # The next line is important to ensure the dataset labels are properly passed to the model
#     remove_unused_columns=False,
#     metric_for_best_model="eval_macro_f1",
#     load_best_model_at_end=True,
#     save_strategy="epoch",
#     evaluation_strategy="epoch"
# )

# # def compute_accuracy(p: EvalPrediction):
# #   preds = np.argmax(p.predictions, axis=1)
# #   return {"acc": (preds == p.label_ids).mean()}

# def compute_metrics(eval_pred):
#     metric1 = load_metric("precision")
#     metric2 = load_metric("recall")
#     metric3 = load_metric("f1")
#     metric4 = load_metric("accuracy")
    
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     precision = metric1.compute(predictions=predictions, references=labels, average="weighted")["precision"]
#     recall = metric2.compute(predictions=predictions, references=labels, average="weighted")["recall"]
#     f1 = metric3.compute(predictions=predictions, references=labels, average="weighted")["f1"]
#     accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]
#     macro_precision = metric1.compute(predictions=predictions, references=labels, average="macro")["precision"]
#     macro_recall = metric2.compute(predictions=predictions, references=labels, average="macro")["recall"]
#     macro_f1 = metric3.compute(predictions=predictions, references=labels, average="macro")["f1"]
#     return {"accuracy":accuracy, "precision": precision, "recall": recall, "f1": f1, "macro_precision": macro_precision, "macro_recall": macro_recall, "macro_f1": macro_f1}

# trainer = Trainer(
#     model=promptModel,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
# )

In [58]:
def compute_metrics(allpreds,alllabels):
    """Compute accuracy plus weighted and macro precision / recall / F1.

    Metrics are loaded with ``evaluate.load`` instead of ``datasets.load_metric``,
    which is deprecated and no longer available in recent ``datasets`` releases.

    Args:
        allpreds: Predicted label ids, one per example.
        alllabels: Gold label ids, aligned with ``allpreds``.

    Returns:
        dict: Keys ``accuracy``, ``precision``, ``recall``, ``f1`` (weighted
        averages) and ``macro_precision``, ``macro_recall``, ``macro_f1``.
    """
    predictions, labels = allpreds, alllabels

    accuracy_metric = evaluate.load("accuracy")
    # Loaded once and reused for both averaging modes.
    averaged_metrics = {name: evaluate.load(name) for name in ("precision", "recall", "f1")}

    scores = {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    }
    # Weighted results keep the bare metric name; macro results get a prefix.
    for average, prefix in (("weighted", ""), ("macro", "macro_")):
        for name, metric in averaged_metrics.items():
            scores[prefix + name] = metric.compute(
                predictions=predictions, references=labels, average=average
            )[name]
    return scores

In [59]:
# Score the predictions collected by the inference loop against the gold labels.
compute_metrics(allpreds,alllabels)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.21745562130177515,
 'precision': 0.7025393649047711,
 'recall': 0.21745562130177515,
 'f1': 0.15368617613501517,
 'macro_precision': 0.3560314794077392,
 'macro_recall': 0.3551074247203537,
 'macro_f1': 0.17074883701908378}

In [60]:
# List every held-out sentence with its gold label and its predicted label.
for i, sentence in enumerate(X_test):
    gold_name = id2label[alllabels[i]]
    predicted_name = id2label[allpreds[i]]
    print(i, ":", sentence, "--->", gold_name, "----->", predicted_name)

0 : Well done chamod de silva.... ---> Positive -----> Positive
1 : aka dawas kiyak thiyenawada? ---> Neutral -----> Positive
2 : Data iwara unama kisima alert ekak nathuwa sallith ekka kapagena yanawa ammo dialog hodai 4G kiyala gattata mata oni! ---> Conflict -----> Positive
3 : Connection is very slow ---> Negative -----> Positive
4 : Wal baduwa gayesha perera ---> Neutral -----> Negative
5 : 4G sim ekak ganne kohomada? ---> Neutral -----> Positive
6 : apita signal full locatin eke idala 200m wath na tower ekata ---> Positive -----> Positive
7 : 4G Signal deehan okkotama kalin 😕 ---> Negative -----> Negative
8 : තාමත් නෑ ---> Neutral -----> Negative
9 : කොත්මලේට රෑට සිග්නල් දෙන්නේ කවද්ද.. ---> Neutral -----> Negative
10 : 3. ආනන්ද ස්වාමින් වහන්සේ ---> Neutral -----> Positive
11 : She finds cooking is a meditation n always entertain me with her own recepies. ---> Positive -----> Positive
12 : Hutch සිම් නම් ඕනෙම නෑ රෙද්ද.මුන්ට call ගත්තට answer නෑ ---> Negative -----> Negative
13 : G

In [61]:
# Inspect how the tokenizer splits a Latin-script sentence into sub-word pieces.
tokens=tokenizer.tokenize("Mama cricket matches balanna asai")
print(tokens)

['▁Mama', '▁cri', 'cket', '▁match', 'es', '▁balan', 'na', '▁as', 'ai']


In [62]:
# Map the sub-word pieces from the previous cell to their vocabulary ids.
tokenizer.convert_tokens_to_ids(tokens)

[22991, 13625, 27853, 14858, 90, 84581, 76, 237, 508]

In [63]:
# Round-trip check: decode the ids shown above back into text.
tokenizer.decode([22991, 13625, 27853, 14858, 90, 84581, 76, 237, 508])

'Mama cricket matches balanna asai'

In [64]:
# Same inspection for a sentence that mixes Sinhala script with English words.
tokens=tokenizer.tokenize("මම cricket matches බලන්න ආසයි")
print(tokens)

['▁මම', '▁cri', 'cket', '▁match', 'es', '▁බලන්න', '▁ආස', 'යි']
