## Arapet training example

General configuration (task name, data paths, label set, base model, and output directory):

In [1]:
from preproccess import simple_clean
from transformers import AutoTokenizer, AutoModelForMaskedLM
%config Completer.use_jedi = False
import pickle,os
import pandas as pd
import torch
from tqdm import tqdm 
# Expose only GPU 0 to CUDA; a comma-separated list such as "0,1,2,3" exposes several.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Seed used when shuffling the unlabeled pool.
seed = 42

# Task identifier and the pretrained checkpoint used for the tokenizer and the training CLI.
task_name = "arabic_multilabel"
base_model_name = "UBC-NLP/MARBERT"

# Filesystem layout: raw pickles, PET-formatted CSV files, and trained model output.
data_dir = "data/raw_data"
pet_data = "data/few_shot"
output_model_dir = "models/personal_sentiment"

# Name of the label column and the label values.
label_column = "S2P_label"
label_names = ["no_ref", "neutral", "positive", "negative"]

### Data preprocessing

In [4]:
# Load the tokenizer of the base checkpoint; clean_data uses it to count tokens per text.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

def clean_data(df,columns_to_clean=None,extra_tokens_num=0):
    """Clean text columns and record their token lengths.

    For every column in ``columns_to_clean`` two columns are added:
    ``clean_<col>`` (the output of ``simple_clean``) and ``len_clean_<col>``
    (the number of tokens the module-level ``tokenizer`` produces for the
    cleaned text). ``len_combined`` holds the per-row sum of those lengths
    plus ``extra_tokens_num``.

    Args:
        df: DataFrame holding the text columns. It is modified in place.
        columns_to_clean: Column name or iterable of column names to clean.
            ``None`` (the default) means "no columns".
        extra_tokens_num: Constant added to every row's ``len_combined``.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    # The None default used to be iterated directly, which raised TypeError.
    if columns_to_clean is None:
        columns_to_clean = []
    elif isinstance(columns_to_clean, str):
        # A bare string would otherwise be iterated character by character.
        columns_to_clean = [columns_to_clean]

    df["len_combined"] = 0
    for col_name in columns_to_clean:
        clean_text_list = []
        clean_text_len = []
        for text in tqdm(list(df[col_name])):
            clean_text = simple_clean(text)
            clean_text_list.append(clean_text)
            clean_text_len.append(len(tokenizer.tokenize(clean_text)))
        df[f"clean_{col_name}"] = clean_text_list
        df[f"len_clean_{col_name}"] = clean_text_len
        df["len_combined"] = df["len_combined"] + df[f"len_clean_{col_name}"]
    df["len_combined"] = df["len_combined"] + extra_tokens_num
    print("Max len_combined: ",df["len_combined"].max())
    return df

def MakeForPET(data,save_to,maping=None,cut_size=False,save=True,
               min_post_len=5,max_post_len=85,max_combined_len=143):
    """Clean a post/response DataFrame and optionally write it as a PET CSV file.

    The ``main_post_ar`` and ``response_post_ar`` columns are cleaned with
    ``clean_data`` on a copy of ``data``; the input frame is left untouched.

    Args:
        data: DataFrame with ``main_post_ar``, ``response_post_ar`` and the
            label column named by ``label_column``.
        save_to: File name (relative to ``pet_data``) of the CSV to write.
        maping: Optional mapping applied to the label column.
        cut_size: When true, keep only rows whose token lengths fall inside
            the bounds below.
        save: When true, write the cleaned texts and the label to
            ``{pet_data}/{save_to}`` without index or header.
        min_post_len: Exclusive lower bound on each cleaned text's token count.
        max_post_len: Exclusive upper bound on each cleaned text's token count.
        max_combined_len: Exclusive upper bound on ``len_combined``.

    Returns:
        The cleaned (and possibly filtered) DataFrame.
    """
    columns_to_clean = ['main_post_ar','response_post_ar']
    length_columns = ['len_combined','len_clean_main_post_ar','len_clean_response_post_ar']
    df = clean_data(data.copy(),columns_to_clean)
    if maping:
        df[label_column] = df[label_column].map(maping)
    if cut_size:
        print("cutting size...")
        # Length statistics before filtering; the print below shows them after.
        print(df.describe()[length_columns])
        keep = (
            (df.len_clean_main_post_ar < max_post_len)
            & (df.len_clean_response_post_ar < max_post_len)
            & (df.len_clean_main_post_ar > min_post_len)
            & (df.len_clean_response_post_ar > min_post_len)
            & (df.len_combined < max_combined_len)
        )
        df = df.loc[keep].copy()
    print(df.describe()[length_columns])
    if save:
        saving_path = f"{pet_data}/{save_to}"
        # to_csv does not create missing directories, so make sure the target exists.
        os.makedirs(os.path.dirname(saving_path) or ".", exist_ok=True)
        print("save to: ",saving_path)
        df[["clean_main_post_ar","clean_response_post_ar",label_column]].to_csv(saving_path,index=False,header=False)
    print("--"*30,"\n")
    return df

def MakeLabel(file_name=None):
    """Load ``{data_dir}/{file_name}.pickle`` and attach the label column.

    When the loaded frame has a ``Sent2Person_1`` column, it is copied into the
    column named by ``label_column`` and rows without a label are dropped.
    Otherwise every row gets the dummy label ``"unlabeled"``.

    Args:
        file_name: Base name (without ``.pickle``) of the file under ``data_dir``.

    Returns:
        The loaded DataFrame with the label column set.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'{data_dir}/{file_name}.pickle','rb') as f:
        df = pickle.load(f)
    try:
        df[label_column] = df["Sent2Person_1"].copy()
    except KeyError:
        # Only a missing source column means "unlabeled"; other errors must surface
        # instead of silently replacing real labels with the dummy value.
        print("unlabeled data! inserting dummy labels")
        df[label_column] = "unlabeled"
    else:
        df = df.dropna(subset=[label_column])
    return df

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
# Load the labeled splits and the frames that feed the unlabeled pool.
train_df = MakeLabel("train_df_200")
test_df = MakeLabel("test_df_1589")
df_silver = MakeLabel("df_silver")
unl_df = MakeLabel("full_df_unlabeled")
# Keep only the rows whose `data` column is "unlabeled".
unl_df = unl_df.loc[unl_df.data=="unlabeled"]
# Add the silver set to the pool, shuffle it reproducibly, and overwrite its labels with the dummy value.
unl_df = pd.concat([df_silver,unl_df]).sample(frac=1,random_state=seed)#.drop(label_column,axis=1)
unl_df[label_column] = "unlabeled"

# Write the PET input files under pet_data.
train_pet = MakeForPET(train_df,"train.csv")    
test_pet = MakeForPET(test_df,"test.csv")
# NOTE(review): dev.csv is built from test_df, so the dev and test files are identical — confirm this is intended.
dev_pet = MakeForPET(test_df,"dev.csv")
# Only the unlabeled pool is length-filtered (cut_size=True).
unl_pet = MakeForPET(unl_df,"unlabeled.csv",cut_size=True)

unlabeled data! inserting dummy labels


In [45]:
# Label distribution of the test split.
test_pet[label_column].value_counts()

no_ref      462
negative    423
positive    410
neutral     294
Name: S2P_label, dtype: int64

### Train model

In [None]:
# Run iPET training through cli.py; notebook variables are interpolated with $name.
# NOTE(review): --data_dir receives data_dir ("data/raw_data"), while MakeForPET wrote the
# train/test/dev/unlabeled CSV files under pet_data ("data/few_shot") — confirm which directory cli.py expects.
!python3 cli.py \
--method ipet \
--pattern_ids 0 1 2 \
--ipet_scale_factor 5 \
--split_examples_evenly \
--data_dir $data_dir \
--model_type bert \
--model_name_or_path $base_model_name \
--pet_max_seq_length 160 \
--sc_repetitions 2 \
--task_name $task_name \
--output_dir $output_model_dir \
--overwrite_output_dir \
--pet_num_train_epochs 3 \
--sc_num_train_epochs 2 \
--pet_repetitions 3 \
--ipet_generations 3 \
--pet_per_gpu_train_batch_size 4 \
--pet_per_gpu_eval_batch_size 64 \
--pet_per_gpu_unlabeled_batch_size 64 \
--sc_per_gpu_train_batch_size 4 \
--sc_per_gpu_eval_batch_size 64 \
--sc_per_gpu_unlabeled_batch_size 64 \
--warmup_steps 0 \
--logging_steps 100 \
--do_train \
--do_eval

### Evaluate

In [1]:
%config Completer.use_jedi = False
import pickle,os
import pandas as pd

In [2]:
#!pip install jsonlines
import pandas as pd
import jsonlines
from sklearn.metrics import classification_report
from pathlib import Path


# NOTE(review): the execution counter restarts at In [1] in this section, so after a
# kernel restart output_model_dir must be redefined (rerun the configuration cell) first.
model_dir = output_model_dir # "models/personal_sentiment"
eval_df = pd.read_csv("data/few_shot/test.csv",header=None)
eval_df.columns = ["sentence1","sentence2","label"]
labels = list(eval_df.label.unique())
print(labels)


# When enabled, 'no_ref' is merged into 'neutral' so scores are reported on three classes.
map_3_class = True
map_4to3 = {'no_ref':"neutral",'neutral':"neutral",'positive':"positive",'negative':"negative"}

# Gold labels are derived once and eval_df is left untouched, so rerunning the cell
# or switching map_3_class never works on already-mapped labels.
gold_labels = eval_df['label'].map(map_4to3) if map_3_class else eval_df['label']
y_true = list(gold_labels)

# Other generation folders that can be inspected: "", "g0/", "g1/", "g2/", "final/".
for g in ["final/"]:
    print(g,"\n","---"*30)
    for pattern in [0,1,2,3]:
        for iteration in [0,1,2,3]:
            final = f"{model_dir}/{g}p{pattern}-i{iteration}/predictions.jsonl"
            # Not every pattern/iteration combination exists; skip the missing ones.
            if not Path(final).is_file():
                continue
            with jsonlines.open(final) as f:
                predictions = [line['label'] for line in f.iter()]
            print(f"iteration: {g}p{pattern}-i{iteration}")
            print(len(predictions))
            if len(predictions) != len(y_true):
                raise ValueError(
                    f"{final} holds {len(predictions)} predictions but the evaluation set has {len(y_true)} rows"
                )
            y_pred = pd.Series(predictions)
            if map_3_class:
                y_pred = y_pred.map(map_4to3)
            print("\n iteration no. ",iteration,"\n")
            print(classification_report(y_true=y_true, y_pred=list(y_pred)))

['no_ref', 'positive', 'negative', 'neutral']
final/ 
 ------------------------------------------------------------------------------------------
iteration: final/p0-i0
1589

 iteration no.  0 

              precision    recall  f1-score   support

    negative       0.89      0.78      0.83       423
     neutral       0.77      0.61      0.68       756
    positive       0.54      0.83      0.65       410

    accuracy                           0.71      1589
   macro avg       0.73      0.74      0.72      1589
weighted avg       0.74      0.71      0.71      1589

iteration: final/p0-i1
1589

 iteration no.  1 

              precision    recall  f1-score   support

    negative       0.89      0.78      0.83       423
     neutral       0.77      0.61      0.68       756
    positive       0.54      0.83      0.65       410

    accuracy                           0.71      1589
   macro avg       0.73      0.74      0.72      1589
weighted avg       0.74      0.71      0.71      

## Arapet Inference

In [None]:
# on Colab
"""
!git clone https://github.com/idc-dsi/Arapet.git
%cd Arapet
!pip install -r requirements.txt
"""

In [1]:
from preproccess import simple_clean
from transformers import AutoTokenizer, AutoModelForMaskedLM
%config Completer.use_jedi = False
import pickle,os
import pandas as pd
from tqdm import tqdm
import sys
import torch
from pet import InputExample
from pet.ArapetModel import InitArapetModel
# Inference is pinned to the CPU by default (the auto-detected device used to be
# overwritten unconditionally). Set FORCE_CPU = False to use a GPU when one is available.
FORCE_CPU = True
device = "cuda" if (torch.cuda.is_available() and not FORCE_CPU) else "cpu"
model = InitArapetModel()
model.model.to(device)
# Predicted class index -> reported label; indices 0 and 1 share the same reported label.
label_map = {0:"no_personal_sentiment",1:"no_personal_sentiment",2:"good_personal",3:"bad_personal"}
#os.environ["CUDA_VISIBLE_DEVICES"]='0' #"0,1,2,3"

In [2]:
def predict_instance(main_post=None,response_post=None):
    """Predict the label for a post/response pair (or a single text).

    Both texts are cleaned with ``simple_clean`` and wrapped in one
    ``InputExample``. When only one text is given, it is used as ``text_a``
    with ``text_b=None`` and a warning is printed.

    Args:
        main_post: Text of the main post, or ``None``/empty.
        response_post: Text of the response, or ``None``/empty.

    Returns:
        The ``label_map`` entry for the argmax of the model's logits.

    Raises:
        ValueError: If neither ``main_post`` nor ``response_post`` is given.
    """
    # Without any text there is nothing to clean or classify; fail with a clear message.
    if not main_post and not response_post:
        raise ValueError("predict_instance needs at least one of 'main_post' or 'response_post'")

    if main_post and response_post:
        data = [InputExample(guid=0,text_a=simple_clean(main_post),text_b=simple_clean(response_post))]
    else:
        text = main_post or response_post
        print("Warning! for best performance please insert both 'main_post' and 'response_post'!")
        data = [InputExample(guid=0,text_a=simple_clean(text),text_b=None)]

    # int() turns the argmax result (an array or tensor scalar) into a plain dict key.
    prediction = int(model.eval(data, device=device)['logits'].argmax())
    return label_map[prediction]

In [3]:
post = "هل تمتلك صديق منذ 5 سنوات ؟"
response = "عندي 3 أصدقاء من ال2003 . وصديق من ال 2009"

# The recorded run of this cell returned 'no_personal_sentiment'.
predict_instance(post,response)

2021-12-11 20:41:29,616 - INFO - wrapper - Writing example 0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2021-12-11 20:41:29,619 - INFO - wrapper - --- Example 0 ---
2021-12-11 20:41:29,619 - INFO - wrapper - input_ids         = ['[CLS]', 'هل', 'تمتلك', 'صديق', 'منذ', '5', 'سنوات', '?', '[SEP]', 'عندي', '3', 'اصدقاء', 'من', 'ال20', '##0', '##3', '.', 'وصديق', 'من', 'ا', '##ل', '2009', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

'no_personal_sentiment'

In [4]:
post = "خرّبوا البلد وشلّوه، وعملوا شعبويّة وقّفت كل المشاريع المنيحة والمنتجة؛ اليوم عم يشوفوا وين في شي محل منتعش ومزدهر، بدّهم يؤذوه."
response = "@Gebran_Bassil في مراية امامك انت عمتحكي ... انت شو عملت للبلد ؟ كهربا ما في  سدود وهمية ..... ما خلوني وما وما وما .... فاشل كبير انت ارحم الناس من هيك كلام ...الناس عارفتك على حقيقتك انك انسان فاشل  وسبب مباشر من ازمة لبنان .يجب ان تحاكم"

# The recorded run of this cell returned 'bad_personal'.
predict_instance(post,response)

2021-12-11 20:41:29,777 - INFO - wrapper - Writing example 0
2021-12-11 20:41:29,779 - INFO - wrapper - --- Example 0 ---
2021-12-11 20:41:29,780 - INFO - wrapper - input_ids         = ['[CLS]', 'خربوا', 'البلد', 'وش', '##لوه', '،', 'وعملوا', 'شعب', '##وية', 'وقفت', 'كل', 'المشاريع', 'المني', '##حة', 'والمنت', '##جة', '؛', 'اليوم', 'عم', 'يشوفوا', 'وين', 'في', 'شي', 'محل', 'منت', '##عش', 'ومز', '##ده', '##ر', '،', 'بدهم', 'يوذ', '##وه', '.', '[SEP]', 'في', 'مراية', 'امامك', 'انت', 'عمت', '##حكي', '.', '.', '.', 'انت', 'شو', 'عملت', 'للبلد', '?', 'كهربا', 'ما', 'في', 'سد', '##ود', 'وهمية', '.', '.', '.', 'ما', 'خلوني', 'وما', 'وما', 'وما', '.', '.', '.', 'فاشل', 'كبير', 'انت', 'ارحم', 'الناس', 'من', 'هيك', 'كلام', '.', '.', '.', 'الناس', 'عارف', '##تك', 'على', 'حقيقتك', 'انك', 'انسان', 'فاشل', 'وسبب', 'مباشر', 'من', 'ازمة', 'لبنان', '.', 'يجب', 'ان', 'تحاكم', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

'bad_personal'