In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


In [2]:
# Folder that holds the Jigsaw CSV files, relative to the notebook's working directory.
DATA_ROOT = Path("data", "jigsaw")

In [3]:
# Read the training split and the test split (in that order) from DATA_ROOT.
train, test = (
    pd.read_csv(DATA_ROOT.joinpath(csv_name))
    for csv_name in ("train_new_large_without_leaks.csv", "test_proced.csv")
)

In [4]:
# Binarise the continuous `target` score into a boolean `toxic` label (True when > 0.5).
# Bracket assignment is used on purpose: `train.toxic = ...` on a frame that has no
# `toxic` column only sets a plain Python attribute (pandas warns about it) and the
# label never becomes part of train.columns. `train.toxic` still works for readers.
train["toxic"] = train["target"] > 0.5

  """Entry point for launching an IPython kernel.


In [5]:
import spacy

# spaCy English model; in the code shown here only its tokenizer is used.
nlp = spacy.load('en_core_web_sm')


def tok(s):
    """Tokenize the string `s` with `nlp.tokenizer` and return the token texts as a list."""
    token_texts = []
    for token in nlp.tokenizer(s):
        token_texts.append(token.text)
    return token_texts

In [6]:
def MostIndicativeN(vectorizer, clf, N):
    """Print the N lowest- and N highest-weighted vocabulary features of a linear model.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` (scikit-learn >= 1.0)
        or the legacy ``get_feature_names()``.
    clf : object
        Fitted classifier; only the first row of ``clf.coef_`` is read.
    N : int
        Number of (coefficient, feature) pairs printed for each end of the ranking.

    Prints ``"Class 1 best: "`` followed by the N pairs with the smallest coefficients
    (ascending), then ``"Class 2 best: "`` followed by the N pairs with the largest
    coefficients (descending). Returns None.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer its replacement and fall back only on installations that lack it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Sorting (coef, name) tuples orders by coefficient first, feature name on ties.
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]  # last N entries, largest first

    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)
        

In [7]:
# Bag-of-words baseline: unigram counts over spaCy tokens feeding a logistic regression.
vectorizer = CountVectorizer(ngram_range=(1, 1), tokenizer=tok)
clf = LogisticRegression()
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('clf', clf)])

# Plain Python lists of the comment texts for both splits.
train1 = train['comment_text'].tolist()
test1 = test['comment_text'].tolist()

# Matching label lists taken from the `toxic` attribute of each frame.
labelsTest1 = test.toxic.tolist()
labelsTrain1 = train.toxic.tolist()

pipe.fit(train1, labelsTrain1)

# Show the ten most negative and ten most positive features of the fitted model.
MostIndicativeN(vectorizer, clf, 10)





Class 1 best: 
(-0.6055146956335433, 'cuntnlu')
(-0.5819076848671683, 'redirect')
(-0.5781602364093442, 'thanks')
(-0.44569571694290727, 'talk')
(-0.37282909639777817, 'thank')
(-0.35716070658040605, 'please')
(-0.3002855035075513, 'scrotumcan')
(-0.2947513122719908, 'scratch')
(-0.27648761225028434, 'at')
(-0.26095453811477454, 'may')
Class 2 best: 
(2.0121141421705597, 'fuck')
(1.8311746320367872, 'stupid')
(1.6570192184075108, 'fucking')
(1.4442572758859067, 'shit')
(1.2636729928692303, 'idiot')
(1.0693873315781715, 'ass')
(1.0541216769006476, 'suck')
(0.9631088047460871, 'penis')
(0.8659579934653944, 'bitch')
(0.846472173946417, 'asshole')


In [15]:
def transform(word, name='insert'):
    """Apply one named noising operation to `word` and return the noised string.

    Parameters
    ----------
    word : str
        The word to perturb.
    name : str, default 'insert'
        Which operation to apply: 'swap', 'insert', 'remove', 'homoglyph',
        'repeat_char' or 'distractor'. The default keeps the behaviour of the
        previously hard-coded choice, so ``transform(word)`` still inserts a letter;
        passing `name` removes the need to edit this function between runs.

    Raises
    ------
    ValueError
        If `name` is not one of the operations listed above.
    """
    # The helpers are looked up only when their branch is taken, so this cell can be
    # run before the cell that defines them.
    if "swap" == name:
        return swap(word)
    elif "insert" == name:
        return insert(word)
    elif "remove" == name:
        return remove(word)
    elif "homoglyph" == name:
        return homoglyph(word)
    elif "repeat_char" == name:
        return repeat_char(word)
    elif "distractor" == name:
        return distractor(word)
    raise ValueError("unknown transformation: %r" % (name,))
        

In [17]:

def swap(word):
    """Swap two adjacent interior characters of `word`, chosen at random.

    Words shorter than four characters are returned unchanged (and no random number
    is drawn). For longer words the first and last characters always stay in place.
    """
    if len(word) < 4:
        return word
    # pos ranges over 1 .. len(word) - 3, so pos + 1 is never the last character.
    pos = np.random.randint(1, len(word) - 2)
    head, tail = word[:pos], word[pos + 2:]
    return head + word[pos + 1] + word[pos] + tail

def remove(word):
    """Delete one randomly chosen character from `word`.

    Words of length two or less (including the empty string) are returned unchanged.
    The length check runs before the random draw: drawing first made
    ``np.random.randint(0, 0)`` raise ValueError for an empty word and consumed a
    random number for short words that are never modified.
    """
    if len(word) <= 2:
        return word
    s = np.random.randint(0, len(word))
    return word[:s] + word[s + 1:]

def insert(word):
    """Insert one random lowercase ASCII letter at a random position of `word`.

    The position may be anywhere from before the first character to after the last,
    so the result is always exactly one character longer than `word`.
    """
    # Draw the position first and the letter second.
    pos = np.random.randint(0, len(word) + 1)
    letter = chr(97 + np.random.randint(0, 26))
    return ''.join((word[:pos], letter, word[pos:]))


# Visually similar replacement for each supported character. The values are written as
# escapes because the previous raw literals had been mis-encoded into multi-character
# garbage; every replacement here is exactly one code point.
_HOMOGLYPHS = {
    '-': '\u02d7',      # modifier letter minus sign
    '9': '\u09ed',      # Bengali digit seven
    '8': '\u0222',      # Latin capital letter OU
    '7': '\U0001d7d5',  # mathematical bold digit seven
    '6': '\u0431',      # Cyrillic small letter be
    '5': '\u01bc',      # Latin capital letter tone five
    '4': '\u13ce',      # Cherokee letter se
    '3': '\u01b7',      # Latin capital letter ezh
    '2': '\u14bf',      # Canadian syllabics sayisi m
    '1': 'l',
    '0': 'O',
    "'": '`',
    'a': '\u0251',      # Latin small letter alpha
    'b': '\u042c',      # Cyrillic capital letter soft sign
    'c': '\u03f2',      # Greek lunate sigma symbol
    'd': '\u0501',      # Cyrillic small letter komi de
    'e': '\u0435',      # Cyrillic small letter ie
    'f': '\U0001d68f',  # mathematical monospace small f
    'g': '\u0261',      # Latin small letter script g
    'h': '\u0570',      # Armenian small letter ho
    'i': '\u0456',      # Cyrillic small letter Byelorussian-Ukrainian i
    'j': '\u03f3',      # Greek letter yot
    'k': '\U0001d48c',  # mathematical bold italic small k
    'l': '\u217c',      # small Roman numeral fifty
    'm': '\uff4d',      # fullwidth Latin small letter m
    'n': '\u0578',      # Armenian small letter vo
    'o': '\u043e',      # Cyrillic small letter o
    'p': '\u0440',      # Cyrillic small letter er
    'q': '\u051b',      # Cyrillic small letter qa
    'r': '\u2c85',      # Coptic small letter gamma
    's': '\u0455',      # Cyrillic small letter dze
    't': '\U0001d69d',  # mathematical monospace small t
    'u': '\u057d',      # Armenian small letter seh
    'v': '\u0475',      # Cyrillic small letter izhitsa
    'w': '\u051d',      # Cyrillic small letter we
    'x': '\u00d7',      # multiplication sign
    'y': '\u0443',      # Cyrillic small letter u
    'z': '\u1d22',      # Latin letter small capital z
}


def homoglyph(word):
    """Replace one randomly chosen character of `word` with a look-alike character.

    The lookup is case-sensitive: a character without an entry in the table (for
    example any uppercase letter) is left as it is, so the word may come back
    unchanged. The empty string is returned unchanged instead of raising.
    """
    if not word:
        # np.random.randint(0, 0) would raise ValueError.
        return word
    s = np.random.randint(0, len(word))
    return word[:s] + _HOMOGLYPHS.get(word[s], word[s]) + word[s + 1:]


def repeat_char(word):
    """Duplicate one randomly chosen character of `word` in place.

    The result is one character longer than `word`. The empty string is returned
    unchanged; previously it made ``np.random.randint(0, 0)`` raise ValueError.
    """
    if not word:
        return word
    s = np.random.randint(0, len(word))
    # word[:s + 1] ends with the chosen character and word[s:] starts with it again.
    return word[:s + 1] + word[s:]

def distractor(word):
    """Append a randomly chosen distractor word to `word`, separated by one space.

    The distractor is element [1] of a random entry of the module-level list
    `topClass1`, which must already be defined when this function is called.
    """
    chosen = topClass1[np.random.randint(0, len(topClass1))]
    return ' '.join((word, chosen[1]))

In [18]:
#to change only words indicative of toxicity

def transform_sentence(text):
    """Noise every word of `text` whose lowercase form is listed in `Top10000`.

    Words are the whitespace-delimited tokens of `text`. Each matching token is
    replaced by ``transform(token)``; all other tokens and the original whitespace are
    kept exactly as they are.

    This fixes two defects of the earlier loop: every replacement was applied to the
    original `text`, so only the last matching word ended up noised, and
    ``str.replace`` also rewrote the word wherever it occurred inside other words.

    Relies on the module-level names `Top10000` and `transform`.
    """
    import re  # local import so this cell does not depend on the imports cell

    def noise_token(match):
        word = match.group()
        return transform(word) if word.lower() in Top10000 else word

    # \S+ matches the same tokens that text.split() yields.
    return re.sub(r'\S+', noise_token, text)

In [77]:
#

def transform_sentence_2(text, prob=1):
    """Noise one randomly chosen whitespace-delimited word of `text`.

    Parameters
    ----------
    text : str
        Sentence to perturb.
    prob : float, default 1
        The chosen word is transformed when ``random.uniform(0, 1) < prob``. The
        default of 1 reproduces the previously hard-coded value (always transform).

    Returns
    -------
    str
        `text` with the chosen token replaced by ``transform(token)``, or `text`
        unchanged when it has no tokens or the probability check fails.

    Only the chosen occurrence is replaced. The earlier ``text.replace(word, ...)``
    also rewrote every other occurrence of the word, including inside longer words,
    and empty or whitespace-only text made ``np.random.randint(0, 0)`` raise.
    """
    import re  # local import so this cell does not depend on the imports cell

    tokens = list(re.finditer(r'\S+', text))
    if not tokens:
        return text

    # Same draw order as before: token index first, then the probability check.
    chosen = tokens[np.random.randint(0, len(tokens))]
    if random.uniform(0, 1) < prob:
        return text[:chosen.start()] + transform(chosen.group()) + text[chosen.end():]
    return text

In [19]:
# Rank every vocabulary feature by its weight in the fitted classifier and keep the
# N most negative (topClass1) and N most positive (topClass2) (coef, feature) pairs.
N = 10000
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
topClass1 = coefs_with_fns[:N]
topClass2 = coefs_with_fns[:-(N + 1):-1]  # last N entries, largest coefficient first

In [25]:
# Feature names of the topClass2 (coef, feature) pairs, in the same order.
# Taken straight from the tuples: going through np.array() converted every coefficient
# to a string, returned numpy string scalars, and failed on an empty list.
Top10000 = [feature for _, feature in topClass2]


In [22]:
# NOTE(review): this rebinds `train` to a different CSV than the one loaded earlier;
# cells above that use `train` will see this frame if they are re-run afterwards.
train = pd.read_csv(DATA_ROOT.joinpath('train.csv'))

In [None]:
# Noised copy of the test comments. The noise applied is whatever the `transform`
# definition currently live in the kernel does -- change that code accordingly first.
test2_insert = list(map(transform_sentence, test1))

In [None]:
# Noised copy of the test comments; the result depends on the `transform` definition
# that is live in the kernel when this cell runs.
test2_remove = list(map(transform_sentence, test1))

In [None]:
# Noised copy of the test comments; the result depends on the `transform` definition
# that is live in the kernel when this cell runs.
test2_swap = list(map(transform_sentence, test1))

In [None]:
# Noised copy of the test comments; the result depends on the `transform` definition
# that is live in the kernel when this cell runs.
test2_homoglyph = list(map(transform_sentence, test1))

In [None]:
# Noised copy of the test comments; the result depends on the `transform` definition
# that is live in the kernel when this cell runs.
test2_repeat = list(map(transform_sentence, test1))

In [None]:
# Save the test set with its comments replaced by the repeat-noised versions.
# Work on a copy: `test_2 = test` only aliased the frame, so the assignment below
# also overwrote the clean comments in `test` and made re-running earlier cells unsafe.
test_2 = test.copy()
# Assign the list positionally; wrapping it in pd.Series aligned it on index labels
# and would silently insert NaN on any index or length mismatch.
test_2['comment_text'] = test2_repeat
test_2.to_csv(DATA_ROOT / 'new_large/test_noised_repeat.csv', index = False)

In [29]:
# One noised variant of every Top10000 word per noising function, in list order.
Top10000_insert = list(map(insert, Top10000))
Top10000_remove = list(map(remove, Top10000))
Top10000_swap = list(map(swap, Top10000))
Top10000_homoglyph = list(map(homoglyph, Top10000))
Top10000_repeat = list(map(repeat_char, Top10000))



In [40]:
# (column name, word list) pairs: the original words followed by each noised variant.
noise = list(zip(
    ('original', 'insert', 'remove', 'swap', 'homoglyph', 'repeat'),
    (Top10000, Top10000_insert, Top10000_remove,
     Top10000_swap, Top10000_homoglyph, Top10000_repeat),
))

In [41]:
# One column per (name, words) pair in `noise`. DataFrame.from_items was deprecated in
# pandas 0.23 and removed in 1.0; a dict keeps insertion order, so the column order
# is the same as before.
noise_df = pd.DataFrame(dict(noise))
noise_df.head()

  """Entry point for launching an IPython kernel.


In [72]:
def transform(word, name=None):
    """Apply a noising operation to `word` and return the noised string.

    Parameters
    ----------
    word : str
        The word to perturb.
    name : str or None, default None
        Operation to apply: 'swap', 'insert', 'remove', 'homoglyph', 'repeat_char'
        or 'distractor'. When None, one of 'insert', 'remove', 'homoglyph' and
        'repeat_char' is picked uniformly at random.

    Raises
    ------
    ValueError
        If `name` is not one of the operations listed above.

    Previously `name` was bound to the whole list of candidates instead of the
    randomly selected entry, so no branch ever matched and the function returned None.
    """
    transformations = ['insert','remove','homoglyph','repeat_char']
    if name is None:
        s = np.random.randint(0,len(transformations))
        name = transformations[s]

    if "swap" == name:
        return swap(word)
    elif "insert" == name:
        return insert(word)
    elif "remove" == name:
        return remove(word)
    elif "homoglyph" == name:
        return homoglyph(word)
    elif "repeat_char" == name:
        return repeat_char(word)
    elif "distractor" == name:
        return distractor(word)
    raise ValueError("unknown transformation: %r" % (name,))
        

In [None]:
# Noised copy of the test comments, produced with the `transform` definition that is
# live in the kernel when this cell runs.
test2_transformed = list(map(transform_sentence, test1))


In [None]:
# Save the test set with its comments replaced by `test2_transformed`.
# This cell used to be a verbatim copy of the repeat-noise export: it wrote
# `test2_repeat` to test_noised_repeat.csv a second time and `test2_transformed` was
# never saved. NOTE(review): the output file name below is new -- confirm it.
# Work on a copy so the clean `test` frame is not overwritten through an alias.
test_2 = test.copy()
# Positional assignment; a pd.Series would be aligned on index labels instead.
test_2['comment_text'] = test2_transformed
test_2.to_csv(DATA_ROOT / 'new_large/test_noised_transformed.csv', index = False)