In [None]:
# !pip install transformers tqdm 

In [1]:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline, AutoModelForMaskedLM, AutoTokenizer
import re
from tqdm import tqdm
from itertools import permutations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import pandas as pd
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_candidates_setwise(Ci,Ak, wk):
    Ci_final = set()
    for cj in Ci:
        if cj in Ci_final:
            Ci_final.remove(cj)
        c0j = cj.split()
        c_temp = c0j.copy()
        for a in Ak:
            if type(a) == tuple:
                c0j[a[1]] = a[0]
                Ci_final.add(' '.join((c0j)))
                c0j = c_temp.copy()
                continue
            for i in range(len(a)):
                c0j[a[i][1]] = a[i][0]
                Ci_final.add(' '.join((c0j)))
            c0j = c_temp.copy()
    return Ci_final

def candidate_set_generation(xi, pi0, Ti, T0i):
    c0 = ' '.join(['<mask>'] * len(pi0))
    Ci = {c0}
    Tshared = Ti.intersection(T0i)
    for tk in Tshared:
        Wk = [word for word, pos in xi if pos == tk]
        Sk = [i for i, pos in enumerate(pi0) if pos[1] == tk]
        Ak = []
        if len(Wk)==1 and len(Sk)>1:
            for i in Sk:
                Ak.append((Wk[0],i))
        elif len(Sk)==1: 
            for word in Wk:
                Ak.append((word,Sk[0]))
        elif len(Sk)>len(Wk):
            perms = list(permutations(Sk,len(Wk)))
            for perm in perms:
                Ak.append(list(zip(Wk,perm)))
        Ci_tag = get_candidates_setwise(Ci,Ak,Wk)
        Ci = Ci.union(Ci_tag)
    return Ci

In [3]:
#Import similar sentences dataset
bad_sentences_similar_pos_tags = pd.read_csv('code/data/bad_sentences_similar_pos_tags.csv',header=0)

In [4]:
import ast
candidates_generated = {}
for i in tqdm(range(len(bad_sentences_similar_pos_tags))):
    candidate_sentences = None
    xi = bad_sentences_similar_pos_tags['bad_sentences'][i]

    if len(xi.split())>15:
        continue
    # print(f'xi {xi}')
    tag_xi = bad_sentences_similar_pos_tags['bad_sentences_pos_tags'][i]
    tag_xi = [s.strip() for s in tag_xi[1:-1].split(',')]
    # print(f'tag_xi {tag_xi}')
    xi_mod = [(word,tag) for word,tag in zip(xi.split()[:15],tag_xi[:15])]
    Ti = set(tag_xi[:15])
    list_of_similar_sentences = [s.strip() for s in bad_sentences_similar_pos_tags['similar_sentences'][i][1:-1].split(',')]
    list_of_similar_sentences_pos_tags = ast.literal_eval(bad_sentences_similar_pos_tags['pos_tags_similar_sentences'][i])
    # print(f'list_of_similar_sentences_pos_tags {list_of_similar_sentences_pos_tags}')
    for j in range(len(list_of_similar_sentences)):
        
        pi0 = list_of_similar_sentences[j]
        if len(pi0.split())>15:
            continue
        # print(f'pi0 {pi0}')
        try:
            tag_pi0 = list_of_similar_sentences_pos_tags[j]
        except:
            continue
        # print(f'tag_pi0 {tag_pi0}')
        try:
            tag_pi0 = [s.strip() for s in tag_pi0[1:-1].split(',')]
        except:
            continue
        pi0_mod = [(word,tag) for word,tag in zip(pi0.split(),tag_pi0)]
        T0i = set(tag_pi0)
        # print(f'xi_mod = {xi_mod} ')
        # print(f'pi0_mod = {pi0_mod}')
        Ci = candidate_set_generation(xi_mod, pi0_mod, Ti, T0i)
        # print(len(Ci))
        if candidate_sentences is None:
            candidate_sentences = Ci
        else:
            candidate_sentences = candidate_sentences.union(Ci)
    candidates_generated[i] = candidate_sentences


100%|██████████| 16764/16764 [00:08<00:00, 1906.65it/s]


In [16]:
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer

# Load the pre-trained DistilRoBERTa model and tokenizer
model_name = "distilroberta-base"
model = AutoModelForMaskedLM.from_pretrained(model_name)
model = model.to('cuda')

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Define the sentence with multiple masks

# Initialize the pipeline for masked language modeling
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=0
)

def mask_filled_sentences(sentences):
    mask_filled_sentences_list = []
    for sentence in tqdm(sentences):
        sentence=sentence
        num_masks = sentence.count(tokenizer.mask_token)
        if num_masks > len(sentence.split())-2:
            continue
        for i in range(num_masks):
            results = fill_mask(sentence, top_k=5)
            if i == num_masks - 1:
                sentence = results[0]['sequence']
            else:
                sentence = results[0][0]['sequence']
        mask_filled_sentences_list.append(sentence)
    return mask_filled_sentences_list

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/331M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu


def calculate_scores(bad_sentence, generated_sentences):
    # Create a CountVectorizer to convert sentences into BoW vectors
    vectorizer = CountVectorizer()

    # Convert original sentence and generated sentences to BoW vectors
    bow_original = vectorizer.fit_transform([bad_sentence])
    bow_generated = vectorizer.transform(generated_sentences)

    # Calculate cosine similarities between original sentence and generated sentences
    cos_similarities = cosine_similarity(bow_original, bow_generated)[0]
    
    # Preprocess the bad sentence and generated sentences
    bad_sentence_tokens = nltk.word_tokenize(bad_sentence.lower())
    generated_sentences_tokens = [nltk.word_tokenize(sent.lower()) for sent in generated_sentences]

    # Calculate BLEU score and toxicity score for each sentence
    scores = []
    for i, sent_tokens in enumerate(generated_sentences_tokens):
        # Calculate BLEU score
        bleu_score = sentence_bleu([bad_sentence_tokens], sent_tokens)
        
        # Calculate toxicity score using the 
        toxicity_score = 0

        # Sum the scores and append to the list
        total_score = cos_similarities[i] + bleu_score + toxicity_score

        scores.append(total_score)
        
    max_index = scores.index(max(scores))
    
    return scores, generated_sentences[max_index]


In [51]:
# generate text for each of the candidate sentences
generated_texts = {}
for key in (candidates_generated.keys()): 
    candidate_sentences = list(candidates_generated[key])
    mask_filled_sentences_list = mask_filled_sentences(candidate_sentences)
    if mask_filled_sentences_list is not None:
        generated_texts[key] = mask_filled_sentences_list

100%|██████████| 231/231 [00:17<00:00, 13.06it/s]
100%|██████████| 477/477 [00:57<00:00,  8.25it/s]
100%|██████████| 2440/2440 [04:22<00:00,  9.29it/s]
100%|██████████| 226/226 [00:15<00:00, 14.74it/s]
100%|██████████| 487/487 [01:01<00:00,  7.98it/s]
100%|██████████| 54/54 [00:04<00:00, 11.84it/s]
100%|██████████| 47/47 [00:02<00:00, 17.22it/s]
100%|██████████| 24/24 [00:02<00:00,  8.17it/s]
100%|██████████| 70/70 [00:07<00:00,  9.60it/s]
100%|██████████| 1383/1383 [03:10<00:00,  7.26it/s]
100%|██████████| 19/19 [00:05<00:00,  3.67it/s]
100%|██████████| 570/570 [01:11<00:00,  7.99it/s]
100%|██████████| 112/112 [00:09<00:00, 11.24it/s]
100%|██████████| 22/22 [00:00<00:00, 22.05it/s]
100%|██████████| 71/71 [00:02<00:00, 32.98it/s]
100%|██████████| 524/524 [00:48<00:00, 10.75it/s]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use Smoot

KeyboardInterrupt: ignored

In [22]:
generated_texts.keys()

dict_keys([0, 6, 8, 11, 18, 21, 23, 27, 29, 31, 33, 34, 36, 37, 39, 40, 41, 42, 47, 48, 49, 54, 58, 60, 61, 64, 67, 69, 75, 78, 83, 86, 90, 91, 93, 98, 102, 104, 105, 106, 107, 112, 116, 118, 122, 124, 129, 131, 133, 136, 140, 141, 142, 143, 146, 161, 162, 163, 165, 170, 173, 175, 177, 178, 183, 185, 186, 188, 189, 190, 191, 195, 197, 198, 200, 201])

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def calculate_scores(bad_sentence, generated_sentences):
    # Define the smoothing function
    smoothie = SmoothingFunction().method4

    # Create a CountVectorizer to convert sentences into BoW vectors
    vectorizer = CountVectorizer()

    # Convert original sentence and generated sentences to BoW vectors
    bow_original = vectorizer.fit_transform([bad_sentence])
    bow_generated = vectorizer.transform(generated_sentences)

    # Calculate cosine similarities between original sentence and generated sentences
    cos_similarities = cosine_similarity(bow_original, bow_generated)[0]
    
    # Preprocess the bad sentence and generated sentences
    bad_sentence_tokens = nltk.word_tokenize(bad_sentence.lower())
    generated_sentences_tokens = [nltk.word_tokenize(sent.lower()) for sent in generated_sentences]

    # Calculate BLEU score and toxicity score for each sentence
    scores = []
    for i, sent_tokens in enumerate(generated_sentences_tokens):
        # Calculate BLEU score
        bleu_score = sentence_bleu([bad_sentence_tokens], sent_tokens, smoothing_function=smoothie)
        
        # Calculate toxicity score using the 
        toxicity_score = 0

        # Sum the scores and append to the list
        total_score = cos_similarities[i] + bleu_score + toxicity_score

        scores.append(total_score)
        
    max_index = scores.index(max(scores))
    
    return scores, generated_sentences[max_index]

In [40]:
for i,idx in enumerate(generated_texts.keys()): 
    bad_sentence = bad_sentences_similar_pos_tags["bad_sentences"][idx]
    generated_sentences = generated_texts[idx]
    print(f'{i} bad sentence -------------------------------- {bad_sentence}')
    # Calculate BLEU score and toxicity score for each sentence
    scores,best_sentence = calculate_scores(bad_sentence, generated_sentences)
    print(f'Good sentence ------------------------------ {best_sentence}\n')


0 bad sentence -------------------------------- dude my employer uses svb were dead in the water right now
Good sentence ------------------------------ in the water right

1 bad sentence -------------------------------- here we go first turd to hit the fan here come the fireworks
Good sentence ------------------------------ To help the fan enjoy the content we offer!
Click here → turd

2 bad sentence -------------------------------- here comes the beginning of another fucking recession
Good sentence ------------------------------ Another recession here marks the beginning of the beginning of another recession

3 bad sentence -------------------------------- i wonder if anyones actually going to prison over this shit this timelol jk
Good sentence ------------------------------ I wonder if this shit is really actually funny."

4 bad sentence -------------------------------- yea work in advertising and todays been a shit show
Good sentence ------------------------------ yea in advertising