## Evaluation metrics

In [159]:
import numpy as np
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,recall_score,precision_score
import torch
from tqdm import tqdm
from nltk import word_tokenize
import nltk
from nltk.translate import meteor
from nltk.translate.bleu_score import SmoothingFunction

In [160]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import numpy as np

import re
import io

def processText(text):
    text = re.sub(r"\S*https?:\S*", "", text)
    #text = re.sub('<user>','',text)
    #text = re.sub('<url>','',text)
    text = re.sub('<.*?>','',text)
    text = re.sub(r'[.!"\/<\*>!@#$%^&*]', r'', text)
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", '', text)
    text = re.sub(' +', ' ', text)
    _RE_COMBINE_WHITESPACE = re.compile(r"(?a:\s+)")
    _RE_STRIP_WHITESPACE = re.compile(r"(?a:^\s+|\s+$)")
    text = _RE_COMBINE_WHITESPACE.sub(" ", text)
    text = _RE_STRIP_WHITESPACE.sub("", text)
    text = text.strip()
    return text


def remove_emojis(text):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f" 
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', text)

def preprocess(text):
    text = remove_emojis(text)
    text = processText(text)
    return text

In [161]:
def hate_refrences(data,test_set):          ###############returns pair of <hate,refrences>  
    hate  = []
    reply = []
    refrences = []
    for sample in data:
        ht , rep = sample[0] , sample[1]
        hate.append(ht)
        reply.append(rep)
    hate = list(set(hate))
    mp={}
    for ht_i in hate:
        refs = []
        for sample in data:
            ht_j , rep =  sample[0] , sample[1]
            if ht_j == ht_i:
                refs.append(rep)
        mp[ht_i] = refs
        refrences.append(refs)
    hate = list(set([x[0] for x in test_set]))
    refs = [mp[ht_i] for ht_i in hate]
    return hate,refs             # a given hate instance and refrences(replies) for metrics evaluation


# In[7]:


def training_corpus(train_set):    # returns training corpus
    replies = []
    for sample in train_set:
        rep = sample[1]
        replies.append(rep)
    replies = list(set(replies))
    return replies                # returns the sentences used while training 




def evaluate(params, model, test_dataloader, device):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    for step, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader), desc="Evaluating"):
        inputs, labels = (batch[0], batch[0])
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1
        
    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    return perplexity


###################################### BLEU_SCORE , METEOR #######################################
def hate_refrences(data, test_set):          ###############returns pair of <hate,refrences>  
    hate  = []
    reply = []
    refrences = []
    for ind in data.index:
        ht , rep = data['input_text'][ind] , data['target_text'][ind]
        hate.append(ht)
        reply.append(rep)
    hate = list(set(hate))
    mp={}
    for ht_i in hate:
        refs = []
        for ind in data.index:
            ht_j , rep =  data['input_text'][ind] , data['target_text'][ind]
            if ht_j == ht_i:
                refs.append(rep)
        mp[ht_i] = refs
        refrences.append(refs)
    #hate = list(set([x[0] for x in test_set]))
    #refs = [mp[ht_i] for ht_i in hate]
    return hate, refrences   



############################################ JACCARD SIMILARITY #################################
def get_jaccard_sim(str1, str2):   
    if isinstance(str1, float) or isinstance(str2, float):
        return (-1)
    try:
        a = set(str1.split()) 
        b = set(str2.split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))
    except:
        print((str1))
        print(type(str2))
        return 0


############################################### NOVELTY #########################################
def get_novelty(sent, training_corpus):
    max_overlap = 0
    for instance in training_corpus:
        max_overlap = max(max_overlap,get_jaccard_sim(instance,sent))
    return 1-max_overlap

def avg_novelty(sentences,training_corpus):
    avg = 0
    for sent in sentences:
        avg += get_novelty(sent,training_corpus)
    avg = (avg/float(len(sentences)))
    return avg



############################################### DIVERSITY ########################################
def get_diversity(sentences):
    avg = 0.0
    for i in range(len(sentences)):
        max_overlap = 0
        for j in range(len(sentences)):
            if i!=j:
                max_overlap = max(max_overlap,get_jaccard_sim(sentences[i],sentences[j]))
        avg = avg + (1-max_overlap)
    avg = (avg/len(sentences))
    return avg, len(sentences)
    
def diversity_and_novelty(training_corpus, gen_replies):
    diversity = get_diversity(gen_replies)
    novelty   = 0#avg_novelty(gen_replies,training_corpus)
    return diversity,novelty

In [168]:
import pandas as pd

# data_train=data_final/Exp3A/Hindi/Bengali/bengali2hindi_train_pairs.csv
# data_val=data_final/Exp3A/Hindi/Bengali/bengali2hindi_val_pairs.csv
# data_test=data_final/Exp3A/Hindi/Bengali/hindi_test_pairs.csv
# data_pred=outputs/Exp3A/Hindi/Bengali/counter_mbart_bengali2hindi.csv
#HateAlert_Folder/JointDir/Saurabh/outputs/Exp3B/Bengali/English/bloom/counter_bloom_english2bengali_0.txt

df_train = pd.read_csv('/home/mithundas/HateAlert_Folder/JointDir/Saurabh/data_final/Exp4/joint_train_pairs.csv',
                       lineterminator='\n')
df_test = pd.read_csv('/home/mithundas/HateAlert_Folder/JointDir/Saurabh/data_final/Exp4/hindi_test_pairs.csv',
                      lineterminator='\n')
df_pred = pd.read_csv('/home/mithundas/HateAlert_Folder/JointDir/Saurabh/outputs/Exp4/counter_bloom_hindi_joint.csv',
                      lineterminator='\n')   
df_pred = df_pred.fillna('')


for ind in df_pred.index:
        df_pred['input_text'][ind] =  preprocess(df_pred['input_text'][ind])
        df_pred['predicted_text'][ind] =  preprocess(df_pred['predicted_text'][ind])
        
for ind in df_train.index:
        df_train['input_text'][ind] =  preprocess(df_train['input_text'][ind])
        df_train['target_text'][ind] =  preprocess(df_train['target_text'][ind])

for ind in df_test.index:
        df_test['input_text'][ind] =  preprocess(df_test['input_text'][ind])
        df_test['target_text'][ind] =  preprocess(df_test['target_text'][ind])

In [169]:
## Diversity Scores
print("Diversity Scores")
# print("Input train: ", get_diversity(df_train['input_text']))
# print("Target train: ", get_diversity(df_train['target_text']))

# print("Input test: ", get_diversity(df_test['input_text']))
# print("Target test: ", get_diversity(df_test['target_text']))

print("Input pred: ", get_diversity(df_pred['input_text']))
print("Predicted pred: ", get_diversity(df_pred['predicted_text']))


## Novelty Scores
# print("Novelty Scores")
# print(avg_novelty(df_train['input_text'], df_train['input_text']), avg_novelty(df_train['input_text'], df_train['target_text']))
# print(avg_novelty(df_train['input_text'], df_test['input_text']), avg_novelty(df_train['input_text'], df_test['target_text']))
# print(avg_novelty(df_train['input_text'], df_pred['input_text']), avg_novelty(df_train['input_text'], df_pred['predicted_text']))

# print(avg_novelty(df_train['target_text'], df_train['input_text']), avg_novelty(df_train['target_text'], df_train['target_text']))
# print(avg_novelty(df_train['target_text'], df_test['input_text']), avg_novelty(df_train['target_text'], df_test['target_text']))
# print(avg_novelty(df_train['target_text'], df_pred['input_text']), avg_novelty(df_train['target_text'], df_pred['predicted_text']))

# print(avg_novelty(df_test['input_text'], df_train['input_text']), avg_novelty(df_test['input_text'], df_train['target_text']))
# print(avg_novelty(df_test['input_text'], df_test['input_text']), avg_novelty(df_test['input_text'], df_test['target_text']))
# print(avg_novelty(df_test['input_text'], df_pred['input_text']), avg_novelty(df_test['input_text'], df_pred['predicted_text']))

# print(avg_novelty(df_test['target_text'], df_train['input_text']), avg_novelty(df_test['target_text'], df_train['target_text']))
# print(avg_novelty(df_test['target_text'], df_test['input_text']), avg_novelty(df_test['target_text'], df_test['target_text']))
# print(avg_novelty(df_test['target_text'], df_pred['input_text']), avg_novelty(df_test['target_text'], df_pred['predicted_text']))

# print(avg_novelty(df_pred['predicted_text'], df_train['input_text']), avg_novelty(df_pred['predicted_text'], df_train['target_text']))
# print(avg_novelty(df_pred['predicted_text'], df_test['input_text']), avg_novelty(df_pred['predicted_text'], df_test['target_text']))
# print(avg_novelty(df_pred['predicted_text'], df_pred['input_text']), avg_novelty(df_pred['predicted_text'], df_pred['predicted_text']))


print("Novelty Score between predicted and test counters : ",avg_novelty(df_pred['predicted_text'], df_test['target_text']))
#print(, avg_novelty(df_pred['predicted_text'], df_train['target_text']))

Diversity Scores
Input pred:  (0.0, 524)
Predicted pred:  (0.007134345666386647, 524)
Novelty Score between predicted and test counters :  0.593115636672502


In [170]:
## bleu and meteor scores
hate  = []
reply = []
refrences = []
for ind in df_train.index:
    ht , rep = df_train['input_text'][ind] , df_train['target_text'][ind]
    hate.append(ht)
    reply.append(rep)

for ind in df_test.index:
    ht , rep = df_test['input_text'][ind] , df_test['target_text'][ind]
    hate.append(ht)
    reply.append(rep)

hate = list(set(hate))
mp={}

for ht_i in hate:
    refs = []
    for ind in df_train.index:
        ht_j , rep =  df_train['input_text'][ind] , df_train['target_text'][ind]
        if ht_j == ht_i:
            refs.append(rep)
    for ind in df_test.index:
        ht_j , rep =  df_test['input_text'][ind] , df_test['target_text'][ind]
        if ht_j == ht_i:
            refs.append(rep)
    mp[ht_i] = refs
    refrences.append(refs)   

In [171]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.translate.meteor_score import meteor_score

[nltk_data] Downloading package punkt to /home/mithundas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mithundas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/mithundas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [172]:
bleu = bleu_2 = bleu_1 = bleu_3 = bleu_4 = meteor_ = 0.0

for ind in df_pred.index:
    hates = df_pred['input_text'][ind]
    counters = df_pred['predicted_text'][ind]
    ref = mp[hates]

    ref_list = []
    for i in range(len(ref)):
        ref_list.append(word_tokenize(ref[i]))
    bleu += nltk.translate.bleu_score.sentence_bleu(ref_list, word_tokenize(counters))
    bleu_1  += nltk.translate.bleu_score.sentence_bleu(ref_list, word_tokenize(counters), smoothing_function=SmoothingFunction().method2, weights=(1.0, 0, 0, 0))
    bleu_2  += nltk.translate.bleu_score.sentence_bleu(ref_list, word_tokenize(counters), smoothing_function=SmoothingFunction().method2, weights=(0.5, 0.5, 0, 0))
    bleu_3  += nltk.translate.bleu_score.sentence_bleu(ref_list, word_tokenize(counters), smoothing_function=SmoothingFunction().method2, weights=(0.33, 0.33, 0.33, 0))
    bleu_4  += nltk.translate.bleu_score.sentence_bleu(ref_list, word_tokenize(counters), smoothing_function=SmoothingFunction().method2, weights=(0.25, 0.25, 0.25, 0.25))
    meteor_ += meteor_score(ref_list, word_tokenize(counters))

bleu    /= len(df_pred)
bleu_2  /= len(df_pred)
bleu_1  /= len(df_pred)
bleu_3  /= len(df_pred)
bleu_4  /= len(df_pred)
meteor_ /= len(df_pred)

#print("Bleu Score ", bleu)

print("Bleu 1 : ", bleu_1)
print("Bleu 2 : ", bleu_2)
print("Bleu 3 : ", bleu_3)
print("Bleu 4 : ", bleu_4)
print("Meteor Score", meteor_)

Bleu 1 :  0.12589624856901427
Bleu 2 :  0.08969732664188472
Bleu 3 :  0.07599577814823956
Bleu 4 :  0.06905791954426893
Meteor Score 0.07370179672461374


In [173]:
def rec(str1, str2):
    match = 0.0
    tok1 = word_tokenize(str1)
    tok2 = word_tokenize(str2)
    if(len(tok1)==0 or len(tok2)==0):
        return -999
    for i in tok1:
        for j in tok2:
            if i == j:
                match += 1.0
                break;
    return match/len(tok1)

def rec2(str1, str2):
    match = 0.0
    tok1 = word_tokenize(str1)
    tok2 = word_tokenize(str2)
    for i in tok2:
        for j in tok1:
            if i == j:
                match += 1.0
                break;
    return match/len(tok2)

recall = 0.0

for ind in df_pred.index:
    recall2 = 0.0
    hates = df_pred['input_text'][ind]
    counters = df_pred['predicted_text'][ind]
    ref = mp[hates]

    for i in range(len(ref)):
        recall2 = max(recall2, rec(counters, ref[i]))
        #print(recall2)
    
    recall += recall2

recall    /= len(df_pred)



precision = 0.0

for ind in df_pred.index:
    recall2 = 0.0
    hates = df_pred['input_text'][ind]
    counters = df_pred['predicted_text'][ind]
    ref = mp[hates]

    for i in range(len(ref)):
        recall2 = max(recall2, rec2(counters, ref[i]))
        #print(recall2)
    
    precision += recall2

precision    /= len(df_pred)

print("Precision: ", precision)
print("Recall: ", recall)
print("F-score: ", 2*precision*recall/(precision+recall))

Precision:  0.11107068199723155
Recall:  0.2975581526972649
F-score:  0.1617604248541627


In [50]:
def get_diversity(sentences):
    avg = 0.0
    for i in range(len(sentences)):
        max_overlap = 0
        for j in range(len(sentences)):
            if i!=j and sentences[i]!=sentences[j]:
                #print(sentences[i])
                #print()
                #print(sentences[j])
                max_overlap = max(max_overlap,get_jaccard_sim(sentences[i],sentences[j]))
                #print(max_overlap)
        avg = avg + (1-max_overlap)
    print("Avg : ",avg)
    print("Len : ",len(sentences))
    avg = (avg/len(sentences))
    return avg, len(sentences)

In [51]:
## Diversity Scores
print("Diversity Scores")
print("Input train: ", get_diversity(df_train['input_text']))
print("Target train: ", get_diversity(df_train['target_text']))

print("Input test: ", get_diversity(df_test['input_text']))
print("Target test: ", get_diversity(df_test['target_text']))

print("Input pred: ", get_diversity(df_pred['input_text']))
print("Predicted pred: ", get_diversity(df_pred['predicted_text']))

Diversity Scores
Avg :  1429.3393995128574
Len :  1818
Input train:  (0.7862152912611977, 1818)
Avg :  1240.9902994597046
Len :  1818
Target train:  (0.6826129259954371, 1818)
Avg :  440.8900923786468
Len :  524
Input test:  (0.8413933060661198, 524)
Avg :  386.6260382663821
Len :  524
Target test:  (0.7378359508900422, 524)
Avg :  440.8900923786468
Len :  524
Input pred:  (0.8413933060661198, 524)
Avg :  88.9443055546406
Len :  524
Predicted pred:  (0.16974104113481028, 524)


In [None]:
Input train:  (0.7862152912611977, 1818)
Target train:  (0.6826129259954371, 1818)
Input test:  (0.8413933060661198, 524)
Target test:  (0.7378359508900422, 524)
Input pred:  (0.8413933060661198, 524)
Predicted pred:  (0.16974104113481028, 524)

In [1]:
import torch

In [2]:
base_path='/home/mithun-binny/HateAlert_Folder/JointDir/Saurabh/'

In [4]:
model = torch.load(base_path + 'saved_models/Exp2/mt5/mt5_hindi_large.pt')