# Experiment with a dictionary-based model

In [1]:
import pandas as pd


# Requires the raw dataset on disk: run the 1.0-download-raw-data.ipynb notebook first.
raw_data_path = '../data/raw/filtered.tsv'
df = pd.read_csv(
    raw_data_path,
    sep='\t',     # the file is tab-separated
    index_col=0,  # the first column becomes the row index
)
df.head(15)

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.618866,0.230769,0.999492,0.000131
7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.720482,0.1875,0.96368,0.14871
8,"Briggs, what the hell's happening?","Briggs, what the hell is going on?",0.920373,0.0,0.159096,0.841071
9,"Another one simply had no clue what to do, so ...","another simply didn't know what to do, so when...",0.87754,0.101695,0.055371,0.930472


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [44]:
# Training pairs whose `lenght_diff` value is exactly zero
# ("lenght_diff" is the column name as spelled in the dataset).
train_data.loc[train_data['lenght_diff'] == 0.0]

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
258171,"Unable to do so, he lifted his spear to drive ...","he couldn't, so he raised his spear to stab th...",0.853247,0.0,0.007300,0.971280
377953,He's now with the renegade cutthroats and hund...,"now he's there with those renegade thugs, and ...",0.660654,0.0,0.008816,0.864869
127413,And did your soul burst into flames?,did your insides leak out in flames?,0.640929,0.0,0.001363,0.869302
25150,Assassin!,murderer!,0.918425,0.0,0.451359,0.999035
97010,"I'll take the blame for last night, but you ba...","last night is my fault, but you fucked her in ...",0.698537,0.0,0.004944,0.999083
...,...,...,...,...,...,...
561353,Ho-lee crap.,bloody hell.,0.677188,0.0,0.999175,0.485974
500186,Where's all this rebel-artist shit now?,and where's the artist-revel thing now?,0.735851,0.0,0.997290,0.000055
278167,You backstabbers thought you could count cards...,you hypocrites thought you'd be counting cards...,0.855054,0.0,0.027535,0.998347
87498,We've got four long arns to stay low. - Yes.,we have 4 long arns to unseat the attention.,0.607185,0.0,0.008188,0.567953


In [48]:
# Training pairs whose `similarity` value is above 0.94.
train_data.loc[train_data['similarity'] > 0.94]

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
222139,Killed him!,he killed him!,0.949454,0.200000,0.971677,0.007861
531990,"Nasty, cunning creatures.","nasty, cunning creature.",0.940734,0.038462,0.168051,0.990833
243573,"This monkey business is in your blood, under y...","this business is in your blood, under your skin.",0.942663,0.125000,0.979490,0.000264
164738,How the hell did they find this place?,how did they find this place?,0.946241,0.230769,0.742521,0.000038
324381,"Jimmy, Jimmy, what the hell are you doing, man?","Jimmy, what are you doing, man?",0.944909,0.333333,0.971490,0.000058
...,...,...,...,...,...,...
158823,Not until Judas is at your back.,not until Judas is behind your back.,0.940015,0.108108,0.028196,0.872542
65726,What are you waiting for?,what the hell are you waiting for?,0.946496,0.257143,0.000038,0.993094
400109,Keep your womb open.,keep your uterus open.,0.949700,0.086957,0.003472,0.995132
394070,She's depressed and miserable.,she's depressed and unhappy.,0.940012,0.064516,0.996192,0.001995


In [30]:
from tqdm import tqdm
import numpy as np


def get_toxic_text_with_threshold(data, toxicity_threshold=0.9):
    """Collect every text whose toxicity score exceeds the threshold.

    Args:
        data: DataFrame with the columns 'reference', 'translation',
            'ref_tox' and 'trn_tox'.
        toxicity_threshold: a text is kept only when its score is strictly
            greater than this value.

    Returns:
        A 1-D numpy array holding the selected 'reference' texts followed by
        the selected 'translation' texts.
    """
    # Each text column is filtered by its own toxicity score column.
    selected_references = data.loc[
        data['ref_tox'] > toxicity_threshold, 'reference'
    ].to_numpy()
    selected_translations = data.loc[
        data['trn_tox'] > toxicity_threshold, 'translation'
    ].to_numpy()

    return np.concatenate([selected_references, selected_translations])

In [37]:
# Keep only texts scored above a strict toxicity threshold.
toxicity_threshold = 0.99
toxic_texts = get_toxic_text_with_threshold(
    data=train_data,
    toxicity_threshold=toxicity_threshold,
)
print(f"{len(toxic_texts)} texts with toxicity threshold {toxicity_threshold}")

196046 texts with toxicity threshold 0.99


In [9]:
# Reference: PMLDL Lab 3 notebook
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re


def lower_text(text: str):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def remove_numbers(text: str):
    """
    Replace every run of digits with a single space.

    A space (rather than an empty string) is used so that words glued
    together by a number stay separated:
    "there is5dogs" -> "there is dogs", not "there isdogs".
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', text)

def remove_punctuation(text: str):
    """
    Replace every run of characters that is neither a lowercase ASCII letter
    nor whitespace with a single space.

    A space (rather than an empty string) is used so that words separated
    only by punctuation do not get glued together:
    "hello!nice to meet you" -> "hello nice to meet you",
    not "hellonice to meet you".

    Only a-z is kept, so uppercase letters are treated like punctuation;
    the text is expected to be lower-cased beforehand. The substitution can
    leave consecutive spaces behind.
    """
    # Note: no '|' inside the character class. Within [...] it is a literal
    # pipe, not an alternation, and would let '|' characters survive.
    text_nopunct = re.sub(r'[^a-z\s]+', ' ', text)
    return text_nopunct

def remove_multiple_spaces(text: str):
    """Collapse every run of whitespace into one space and trim both ends."""
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    text_no_doublespace = re.sub(r'\s+', ' ', text).strip()
    return text_no_doublespace

def remove_contracted_forms(text: str):
    """
    Delete every token of the form <word>'<word> (e.g. "i'll", "that's").

    The whole token is removed, not just the part after the apostrophe, and
    the surrounding whitespace is left in place. Only the plain ASCII
    apostrophe (') is recognised.
    """
    # Raw string avoids the invalid '\w' escape of a normal string literal;
    # the capture groups were never used, so they are dropped.
    text_no_contracted_forms = re.sub(r"\w+'\w+", '', text)
    return text_no_contracted_forms

def tokenize_text(text: str) -> list[str]:
    """Split `text` into tokens using NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens

def remove_stop_words(tokenized_text: list[str], stop_words=None) -> list[str]:
    """
    Drop stop words from a list of tokens, preserving token order.

    Args:
        tokenized_text: tokens to filter. The comparison is exact, so tokens
            should already be in the same case as the stop words.
        stop_words: optional iterable of words to drop. When omitted, NLTK's
            English stop-word list is used.

    Returns:
        A new list containing the tokens that are not stop words.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once instead of re-loading the stop-word list for
    # every token; a set also makes each membership test constant-time.
    stop_word_set = frozenset(stop_words)
    return [token for token in tokenized_text if token not in stop_word_set]
    

# One shared stemmer instance, reused by every stem_words() call.
stemmer = PorterStemmer()


def stem_words(tokenized_text: list[str]) -> list[str]:
    """Return the Porter stem of each token, in the same order."""
    return list(map(stemmer.stem, tokenized_text))

In [12]:
def clean(text):
    """
    Normalise raw text by applying the cleaning steps in a fixed order:
    lower-casing, number removal, contracted-form removal, punctuation
    removal and whitespace collapsing.
    """
    # Order matters: contracted forms must be removed while the apostrophe
    # is still present, i.e. before punctuation is stripped.
    pipeline = (
        lower_text,
        remove_numbers,
        remove_contracted_forms,
        remove_punctuation,
        remove_multiple_spaces,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

def tokenize_and_stem(text):
    """
    Tokenize `text`, drop stop words and stem what is left.

    Returns:
        A pair (tokens, stems): the stop-word-free tokens and their stems.
        Both lists have the same length and are aligned index by index.
    """
    content_tokens = remove_stop_words(tokenize_text(text))
    return content_tokens, stem_words(content_tokens)

In [13]:
# Maps a word from the more toxic sentence of a pair to its replacement in
# the less toxic one ('' means "remove the word").
tox_dict = {}


# Only the first 20 rows are inspected, so slice the frame before iterating:
# iterrows() builds a Series per row, and materialising the whole test split
# into a list just to keep 20 items is wasted work.
for _, row in test_data.head(20).iterrows():
    print(row.reference)
    print(row.translation)
    ref_cleaned = clean(row.reference)
    trn_cleaned = clean(row.translation)
    ref_tokenized, ref_stemmed = tokenize_and_stem(ref_cleaned)
    trn_tokenized, trn_stemmed = tokenize_and_stem(trn_cleaned)
    print(ref_tokenized)
    print(trn_tokenized)

    # Make the "ref" side the more toxic sentence of the pair.
    if row.ref_tox < row.trn_tox:
        ref_tokenized, trn_tokenized = trn_tokenized, ref_tokenized
        ref_stemmed, trn_stemmed = trn_stemmed, ref_stemmed

    # Blank out words that occur in both sentences (compared by stem).
    # index() returns the first occurrence, so a stem repeated on the
    # translation side only has its first token blanked.
    for i, w in enumerate(ref_stemmed):
        if w in trn_stemmed:
            j = trn_stemmed.index(w)
            ref_tokenized[i] = None
            trn_tokenized[j] = None

    # Walk the toxic side with a cursor `j` on the other side. A remaining
    # toxic word is paired with the word under the cursor, or mapped to ''
    # when the cursor is on a blanked slot or past the end.
    j = 0
    for w1 in ref_tokenized:
        w2 = trn_tokenized[j] if j < len(trn_tokenized) else None
        if w1 is not None:
            if w2 is not None:
                tox_dict[w1] = w2  # For replacing
                j += 1
            else:
                tox_dict[w1] = ''  # For removing
        else:
            j += 1
    print(ref_tokenized)
    print(trn_tokenized)
    print(tox_dict)
    print()

# print(tox_dict)

Listen, call off the butchers, and I'll tell you.
call out your butchers and I'll tell you.
['listen', 'call', 'butchers', 'tell']
['call', 'butchers', 'tell']
[None, None, None]
['listen', None, None, None]
{}

Who the fuck has been going through my stuff?!
who the hell was going through my stuff?
['fuck', 'going', 'stuff']
['hell', 'going', 'stuff']
['fuck', None, None]
['hell', None, None]
{'fuck': 'hell'}

She still might die . . .?
he can still die.
['still', 'might', 'die']
['still', 'die']
[None, None]
[None, 'might', None]
{'fuck': 'hell'}

Yeah, that's the fucker's name.
that's what his name was.
['yeah', 'name']
['name']
['yeah', None]
[None]
{'fuck': 'hell', 'yeah': ''}

I would take you on my shoulders, like, I'd strap you up, and I'd be like, let's go through hell.
I'd take you on your shoulders... I'd tie you up, and I'd say,
['would', 'take', 'shoulders', 'like', 'strap', 'like', 'go', 'hell']
['take', 'shoulders', 'tie', 'say']
['would', None, None, 'like', 'strap', 'li