In [1]:
from transformers import AutoTokenizer, TFAutoModel, AutoModel, TFBertForSequenceClassification, TFAutoModelForSequenceClassification, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
import tensorflow as tf
from models import *
import numpy as np
from tqdm import tqdm
import transformers
import shap
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
import scipy
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torchsummary import summary
import torch.nn.functional as F
import nltk
from nltk.corpus import wordnet
from happytransformer import HappyTextToText, TTSettings


# Wordnet

In [2]:
# Fetch the NLTK data packages needed for the WordNet lookups below
# (the recorded output shows both packages were already up to date).
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hubert\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Hubert\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Text-to-text model loaded from the "vennify/t5-base-grammar-correction" checkpoint;
# make_sugestion sends it prompts of the form "grammar: <sentence>".
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
# Generation settings; also bound as the default `args` of make_sugestion.
args = TTSettings(num_beams=5, min_length=1)

01/16/2022 12:26:31 - INFO - happytransformer.happy_transformer -   Using model: cpu


In [4]:
def get_synonyms(word):
    """Return the distinct WordNet lemma names for ``word``.

    Every synset that WordNet lists for ``word`` is visited and the names of
    its lemmas are collected, with underscores replaced by spaces.

    Parameters
    ----------
    word : str
        The word to look up.

    Returns
    -------
    list of str
        Unique lemma names in first-seen order; an empty list when WordNet
        returns no synsets for ``word``.
    """
    lemma_names = (
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is identical on every run. Going through a set would reorder the strings
    # differently in each interpreter session (string hash randomisation).
    return list(dict.fromkeys(lemma_names))

# Hate speech model

In [5]:
# Tokenizer and sequence-classification model for the "unitary/toxic-bert" checkpoint.
hate_tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")

hate_model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert")

# Text-classification pipeline over the same model and tokenizer, configured to return all scores.
# NOTE(review): `pred` is not referenced by the cells visible here — confirm it is still needed.
pred = transformers.pipeline("text-classification", model=hate_model, tokenizer=hate_tokenizer, return_all_scores=True)

# Sentence similarity

In [6]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Parameters
    ----------
    model_output : sequence
        Model output whose first element holds the token embeddings
        (assumed to be shaped (batch, tokens, dim) — TODO confirm).
    attention_mask : torch.Tensor
        Mask with one entry per token; positions with value 0 contribute
        nothing to the average.

    Returns
    -------
    torch.Tensor
        The masked sum over dimension 1 divided by the number of unmasked
        tokens. The divisor is clamped to at least 1e-9 so a fully masked
        sequence yields zeros rather than a division by zero.
    """
    embeddings = model_output[0]
    # Broadcast the mask across the embedding dimension.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = torch.sum(embeddings * mask, 1)
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / token_count

# Tokenizer and encoder for the 'sentence-transformers/all-mpnet-base-v2' checkpoint;
# both are bound as the defaults of eval_ss.
ss_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
ss_model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Functions

In [7]:
def eval_hate(sentence, hate_model=hate_model, hate_tokenizer=hate_tokenizer):
    """Score ``sentence`` with the toxicity classifier.

    Parameters
    ----------
    sentence : str
        Text to classify.
    hate_model, hate_tokenizer
        Sequence-classification model and its tokenizer; default to the
        module-level "unitary/toxic-bert" objects.

    Returns
    -------
    numpy.ndarray
        Sigmoid of the model logits for the single input, one value per
        output label.
    """
    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `scipy.special` submodule has been loaded.
    from scipy.special import expit

    # truncation=True keeps over-long inputs within the model's maximum
    # length instead of failing inside the forward pass.
    encoded = hate_tokenizer(sentence, return_tensors='pt', truncation=True)

    # Inference only — skip autograd bookkeeping, as eval_ss does.
    with torch.no_grad():
        logits = hate_model(**encoded).logits

    return expit(logits.numpy())[0]

def eval_ss(input_sentence1, input_sentence2, ss_model=ss_model, ss_tokenizer=ss_tokenizer):
    """Return the Euclidean distance between the embeddings of two sentences.

    Both sentences are encoded in one batch, their token embeddings are
    mean-pooled into a single vector each, and the L2 distance between the
    two vectors is returned. A smaller value means the sentences are closer.

    Parameters
    ----------
    input_sentence1, input_sentence2 : str
        The sentences to compare.
    ss_model, ss_tokenizer
        Encoder and tokenizer; default to the module-level objects.

    Returns
    -------
    numpy.ndarray
        Zero-dimensional array holding the distance.
    """
    # Tokenise the pair as one padded, truncated batch.
    batch = ss_tokenizer(
        [input_sentence1, input_sentence2],
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = ss_model(**batch)

    # Mean pooling: one vector per sentence.
    first, second = mean_pooling(outputs, batch['attention_mask'])

    difference = first - second
    return torch.sqrt(torch.sum(difference ** 2)).numpy()

In [8]:
def make_sugestion(input_sentence, word_idx,
                   labels=["toxic","severe_toxic","obscene","threat","insult","identity_hate"],
                  args=args):
    """Print less toxic rewrites of ``input_sentence`` with one word replaced.

    The word at position ``word_idx`` (after splitting on single spaces) is
    swapped for each of its WordNet synonyms, and once for the empty string.
    Every candidate is passed through the grammar-correction model, scored
    with ``eval_hate`` and compared with the original via ``eval_ss``.
    Candidates are printed in ascending order of the summed score change, so
    the largest overall reduction comes first.

    Parameters
    ----------
    input_sentence : str
        The sentence to rewrite.
    word_idx : int
        Index of the word to replace in ``input_sentence.split(' ')``.
    labels : sequence of str
        Names printed next to the classifier scores, in score order.
    args
        Generation settings forwarded to ``happy_tt.generate_text``.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    IndexError
        If ``word_idx`` is outside the split sentence.
    """
    split_sentence = input_sentence.split(' ')
    # Strip sentence punctuation so WordNet is queried with the bare word.
    bad_word = split_sentence[word_idx].translate(str.maketrans('', '', '!?.,'))

    print(f'original sentence: {input_sentence}')
    # Scored once and reused as the baseline for every candidate below.
    original_hate = eval_hate(input_sentence)
    for label, score in zip(labels, original_hate):
        # Format after scaling: rounding a float32 and then multiplying by
        # 100 printed values such as 83.99999737739563%.
        print(f'{label} score: {100 * float(score):.0f}%')

    print()
    print()

    candidates = get_synonyms(bad_word)
    candidates.append('')  # also try dropping the word altogether

    result_tab = []
    hate_tab = []
    ss_tab = []
    for candidate in candidates:
        words = split_sentence.copy()
        words[word_idx] = candidate
        new_sentence = ' '.join(words)

        result = happy_tt.generate_text(f"grammar: {new_sentence}", args=args).text
        result_tab.append(result)
        # Per-label change relative to the original sentence (negative = less toxic).
        hate_tab.append(eval_hate(result) - original_hate)
        ss_tab.append(eval_ss(input_sentence, result))

    hate_tab = np.array(hate_tab)
    ss_tab = np.array(ss_tab)
    # Most negative total change first; a stable sort keeps ties in candidate order.
    order = np.argsort(hate_tab.sum(axis=1), kind='stable')

    for k in order:
        print(result_tab[k])
        for label, change, baseline in zip(labels, hate_tab[k], original_hate):
            # Change expressed as a percentage of the original score.
            print(label, ": ", f"{100 * float(change / baseline):.0f}", "%")

        # The value printed here is the Euclidean distance from eval_ss:
        # lower means closer to the original sentence.
        print(f'Similaryty score: {float(ss_tab[k]):.2f}')
        print()
        


# Evaluation

In [10]:
# Suggest replacements for the word at index 2 ("hurt").
make_sugestion("I will hurt you", 2)

original sentence: I will hurt you
toxic score: 83.99999737739563%
severe_toxic score: 7.999999821186066%
obscene score: 5.000000074505806%
threat score: 75.99999904632568%
insult score: 7.000000029802322%
identity_hate score: 1.9999999552965164%


I will smart you.
toxic :  -100.0 %
severe_toxic :  -100.0 %
obscene :  -100.0 %
threat :  -100.0 %
insult :  -100.0 %
identity_hate :  -99.00000095367432 %
Similaryty score: 2.8299999237060547

I will distress you.
toxic :  -99.00000095367432 %
severe_toxic :  -100.0 %
obscene :  -99.00000095367432 %
threat :  -100.0 %
insult :  -99.00000095367432 %
identity_hate :  -98.00000190734863 %
Similaryty score: 2.5999999046325684

I will anguish you.
toxic :  -98.00000190734863 %
severe_toxic :  -100.0 %
obscene :  -99.00000095367432 %
threat :  -100.0 %
insult :  -99.00000095367432 %
identity_hate :  -99.00000095367432 %
Similaryty score: 2.5399999618530273

I will you.
toxic :  -98.00000190734863 %
severe_toxic :  -100.0 %
obscene :  -98.0000019

In [None]:
# Suggest replacements for the word at index 4 ("idiot").
make_sugestion("You are such an idiot", 4)

original sentence: You are such an idiot
toxic score: 99.00000095367432%
severe_toxic score: 3.999999910593033%
obscene score: 75.0%
threat score: 0.0%
insult score: 95.99999785423279%
identity_hate score: 0.9999999776482582%


