# setups

In [1]:
!pip3 install tensorflow_text>=2.0.0rc0

In [2]:
!pip3 install sentencepiece



# data load

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import nltk
import re
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.keras.layers import GRU, Conv1D, MaxPooling1D, Flatten, BatchNormalization, GlobalAveragePooling1D, AveragePooling1D, Average, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping

In [5]:
train =  pd.read_csv("/content/drive/MyDrive/Parrot_teamproject/train.csv")
test =  pd.read_csv("/content/drive/MyDrive/Parrot_teamproject/test.csv")
test_labels =  pd.read_csv("/content/drive/MyDrive/Parrot_teamproject/test_labels.csv")
submission =  pd.read_csv("/content/drive/MyDrive/Parrot_teamproject/sample_submission.csv")

In [6]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
train_x = train['comment_text']
test_x = test['comment_text']
train_y = train[labels].values

In [8]:
print('훈련용 코멘트 : {}'.format(len(train_x)))
print('테스트용 코멘트 : {}'.format(len(test_x)))
num_classes = 6
print('카테고리 : {}'.format(num_classes))

훈련용 코멘트 : 159571
테스트용 코멘트 : 153164
카테고리 : 6


# data preprocessing

## re & lemmatization

In [9]:
train_x = list(train_x)

In [10]:
import string

result = string.punctuation
print(result)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [11]:
def clean_text(text):
    output = ""
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    text = re.sub(r"its\s","it is",text) 
    text = re.sub(r"arent","are not",text)
    text = re.sub(r"couldnt","could not",text)
    text = re.sub(r"didnt","did not",text)
    text = re.sub(r"doesnt","does not",text)
    text = re.sub(r"dont","do not",text)
    text = re.sub(r"hadnt","had not",text)
    text = re.sub(r"hasnt","has not",text)
    text = re.sub(r"havent","have not",text)
    text = re.sub(r"isnt","is not",text)

    text = re.sub(r"mustnt","must not",text)
    text = re.sub(r"shadnt","shall not",text)
    text = re.sub(r"werent","were not",text)
    text = re.sub(r"wheres","where is",text)
    text = re.sub(r"whod","who would",text)
    text = re.sub(r"wont","will not",text)
    text = re.sub(r"wouldnt","would not",text)
    text = re.sub(r"whats", "what is", text)

    text = re.sub(r"\ve", " have", text)
    text = re.sub(r"cant", "can not", text)
    text = re.sub(r"lets","let us",text)
    text = re.sub(r"mightnt","might not",text)
    text = re.sub(r"im", "i am", text)
    
    for word in text:
      output = output + "" + word
    return str(output.strip())

In [12]:
train_texts = [] 

for line in tqdm(train_x, total=train.shape[0]): 
    train_texts.append(clean_text(line))

100%|██████████| 159571/159571 [00:22<00:00, 7004.77it/s]


In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()

def lemma(text, lemmatization=True):
  output=""
  if lemmatization:
    text=text.split(" ")
    for word in text:
       word1 = l.lemmatize(word, pos = "n")
       word2 = l.lemmatize(word1, pos = "v")
       word3 = l.lemmatize(word2, pos = "a")
       word4 = l.lemmatize(word3, pos = "r")
       output=output + " " + word4
  else:
    output=text
  
  return str(output)

In [15]:
train_x_lemma = []

for line in tqdm(train_texts, total=train.shape[0]): 
    train_x_lemma.append(lemma(line))

100%|██████████| 159571/159571 [01:59<00:00, 1339.73it/s]


In [16]:
print('Cleaned data:', train_texts[1], train_y[1])
print('Length of cleaned cleaned data:', len(train_texts[1]))
print('Lemmatized data:', train_x_lemma[1], train_y[1])
print('Length of lemmatized data:', len(train_x_lemma[1]))

Cleaned data: daww he matches this background colour i am seemingly stuck with thanks  talk  january   utc [0 0 0 0 0 0]
Length of cleaned cleaned data: 92
Lemmatized data:  daww he match this background colour i be seemingly stick with thank  talk  january   utc [0 0 0 0 0 0]
Length of lemmatized data: 90


In [17]:
test_x = list(test_x)

In [18]:
texts_test = [] 

for line in tqdm(test_x, total=test.shape[0]): 
    texts_test.append(clean_text(line))

100%|██████████| 153164/153164 [00:19<00:00, 7890.60it/s]


In [19]:
test_x_lemma = [] 

for line in tqdm(texts_test, total=test.shape[0]): 
    test_x_lemma.append(lemma(line))

100%|██████████| 153164/153164 [01:45<00:00, 1450.45it/s]


## sentencepiece tokenize

https://lsjsj92.tistory.com/600

https://wikidocs.net/86657

tokenizing의 또다른 방법입니다. train data를 기반으로 저만의 vocabulary를 만드는 거라고 보시면 될 것 같습니다.

sentencepiece tokenizing을 하기 위해서는 훈련 데이터를 텍스트 파일로 분리하고, 읽어오면서 fit을 진행하기 때문에 번거롭지만 제 디렉토리에 분리해서 파일을 만들었습니다!

In [20]:
import pickle

In [21]:
filePath_train = '/content/drive/MyDrive/Parrot/toxic_train.txt'
trainList = train_x_lemma
with open(filePath_train, 'wb') as lf_train:
    pickle.dump(trainList, lf_train)

In [22]:
filePath_test = '/content/drive/MyDrive/Parrot/toxic_test.txt'
testList = test_x_lemma
with open(filePath_test, 'wb') as lf_test:
    pickle.dump(testList, lf_test)

In [23]:
with open(filePath_train, 'rb') as lf_train:
    readList = pickle.load(lf_train)
    print(readList[1])

 daww he match this background colour i be seemingly stick with thank  talk  january   utc


In [24]:
with open(filePath_test, 'rb') as lf_test:
    readList = pickle.load(lf_test)
    print(readList[1])

 from rfc   the title be fine a it be i amo




> RuntimeError: Internal: /sentencepiece/python/bundled/sentencepiece/src/trainer_interface.cc(579) [(trainer_spec_.vocab_size()) == (model_proto->pieces_size())] Vocabulary size too high (30000). Please set it to a value <= 10261.

벌써 망한 조짐이 보이죠... 사이즈가 이렇게 작아도 좋은 결과를 얻을 수 있나..?



In [None]:
def write_cipher_text(texts, filename='/content/drive/MyDrive/Parrot/spm_train.txt'):
    with open(filename, 'wb') as f:
        for text in texts:
            f.write(text + "\n")

In [28]:
input_file = '/content/drive/MyDrive/Parrot/spm_train.txt'
vocab_size = 25000
prefix = '/content/drive/MyDrive/Parrot/sentencepiece/toxic'

In [29]:
templates = '--input={} --model_prefix={} --vocab_size={}'
cmd = templates.format(input_file, prefix, vocab_size)
cmd

'--input=/content/drive/MyDrive/Parrot/spm_train.txt --model_prefix=/content/drive/MyDrive/Parrot/sentencepiece/toxic --vocab_size=25000'

In [32]:
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load("/content/drive/MyDrive/Parrot/sentencepiece/toxic.model")

True

In [33]:
for t in train_x_lemma[:5]:
  print(t)
  print(sp.encode_as_pieces(t))
  print(sp.encode_as_ids(t), '\n')

 explanationwhy the edit ismade under my username hardcore metallica fan be revert they be not vandalism just closure on some gas after i vote at new york doll fac and please do not remove the template from the talk page since i be retire
['▁explanation', 'why', '▁the', '▁edit', '▁is', 'made', '▁under', '▁my', '▁username', '▁hard', 'core', '▁metallic', 'a', '▁fan', '▁be', '▁revert', '▁they', '▁be', '▁not', '▁vandalism', '▁just', '▁clo', 'sure', '▁on', '▁some', '▁gas', '▁after', '▁i', '▁vote', '▁at', '▁new', '▁york', '▁doll', '▁fac', '▁and', '▁please', '▁do', '▁not', '▁remove', '▁the', '▁template', '▁from', '▁the', '▁talk', '▁page', '▁since', '▁i', '▁be', '▁retire']
[1182, 3719, 4, 48, 94, 2686, 279, 37, 1654, 432, 2902, 3996, 82, 1082, 5, 186, 63, 5, 17, 308, 60, 2865, 1911, 19, 49, 4216, 193, 8, 888, 41, 163, 1080, 6775, 2420, 10, 59, 21, 17, 117, 4, 420, 42, 4, 62, 33, 198, 8, 5, 3977] 

 daww he match this background colour i be seemingly stick with thank  talk  january   utc
['▁daw

In [34]:
def encode_ciphertext(ciphertext):
    sp = spm.SentencePieceProcessor()
    sp.Load('/content/drive/MyDrive/Parrot/sentencepiece/toxic.model')
    encodedtext = []
    for text in ciphertext:
        encodedtext.append(sp.encode_as_ids(text))
    return encodedtext

train_encoded = encode_ciphertext(train_x_lemma)
test_encoded = encode_ciphertext(test_x_lemma)

In [35]:
train_encoded[1]

[7212,
 245,
 57,
 1730,
 216,
 23,
 1168,
 3,
 3471,
 8,
 5,
 168,
 1524,
 1474,
 27,
 101,
 62,
 1346,
 460]

In [36]:
print('train dataset의 최대 길이 : {}'.format(max(len(l) for l in train_encoded)))
print('train dataset의 평균 길이 : {}'.format(sum(map(len, train_encoded))/len(train_encoded)))

train dataset의 최대 길이 : 4935
train dataset의 평균 길이 : 82.62205538600372


In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 200
X_train = pad_sequences(train_encoded, maxlen=max_len)
X_test = pad_sequences(test_encoded, maxlen=max_len)

In [38]:
from numpy import array, asarray, zeros

embeddings_dictionary = {}

glove_file = open('/content/drive/MyDrive/Parrot_teamproject/glove.6B.100d.txt.zip (Unzipped Files)/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

print('Found %s numbers of vector' % len(embeddings_dictionary))

Found 400000 numbers of vector


In [43]:
vocabs = [[sp.id_to_piece(id), id] for id in range(sp.get_piece_size())]

In [45]:
embedding_matrix = zeros((vocab_size, 100))
for word, index in vocabs:
  if index < vocab_size:
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# modeling

In [None]:
def model_CNN(num_classes):
    inp = Input(shape = (max_len, ))
    layer = Embedding(vocab_size, 128)(inp)
    #CNNlayer1
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(5)(layer)
    #CNNlayer2
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(3)(layer)
    #CNNlayer3
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(3)(layer)
    #CNNlayer4
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(3)(layer)
    #Dense
    layer = Flatten()(layer)
    layer = Dense(64, activation='relu')(layer)
    layer = Dropout(0.4)(layer)
    layer = Dense(num_classes, activation = 'softmax')(layer)
    model = Model(inputs = inp, outputs = layer)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    return model

In [None]:
model1 = model_CNN(num_classes)
model1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 128)          1313408   
_________________________________________________________________
conv1d (Conv1D)              (None, 150, 64)           41024     
_________________________________________________________________
batch_normalization (BatchNo (None, 150, 64)           256       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 30, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 30, 64)            20544     
_________________________________________________________________
batch_normalization_1 (Batch (None, 30, 64)            256   

In [None]:
early_stopping = EarlyStopping(monitor = 'val_accuracy', mode='max', patience = 3)
hist1 = model1.fit(X_train, train_y, batch_size = 64, epochs = 10, validation_split=0.20, shuffle=True, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [None]:
def model_GRU(num_classes):
    inp = Input(shape = (max_len, ))
    layer = Embedding(vocab_size, 128)(inp)
    layer = Conv1D(32, 5, padding='same', activation='relu')(layer)
    layer = MaxPooling1D(3)(layer)
    later = Conv1D(32, 5, padding='same', activation='relu')(layer)
    layer = GRU(32, dropout=0.1, recurrent_dropout=0.5)(layer)
    layer = Dense(num_classes, activation = 'sigmoid')(layer)
    model = Model(inputs = inp, outputs = layer)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    return model

In [None]:
model2 = model_GRU(num_classes)
model2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 128)          1313408   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 150, 32)           20512     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 50, 32)            0         
_________________________________________________________________
gru (GRU)                    (None, 32)                6336      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 198       
Total params: 1,340,454
Trainable params: 1,340,454
Non-trainable params: 0
_________________________________________________

In [None]:
hist1 = model1.fit(X_train, train_y, batch_size = 128, epochs = 10, validation_split=0.20, shuffle=True, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [56]:
def model_LSTM(num_classes):
    inp = Input(shape = (max_len, ))
    layer = Embedding(vocab_size,
                      100,
                      weights = [embedding_matrix],
                      trainable=False)(inp)
    layer = Bidirectional(LSTM(50, return_sequences = True))(layer)
    layer = GlobalMaxPool1D()(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(50, activation = 'relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(num_classes, activation = 'sigmoid')(layer)
    model = Model(inputs = inp, outputs = layer)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    return model

In [57]:
model3 = model_LSTM(num_classes)
model3.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 200, 100)          2500000   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200, 100)          60400     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 100)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_7 (Dropout)          (None, 50)                0   

In [58]:
early_stop = EarlyStopping(monitor = 'val_loss', mode='min', patience = 3)
hist3 = model3.fit(X_train, train_y, batch_size = 128, epochs = 10, validation_split=0.2, callbacks = [early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# submit

In [None]:
# Prediction for test data
predict_1 = model1.predict(X_test)
predict_2 = model2.predict(X_test)
predict_3 = model3.predict(X_test)

In [59]:
predict = model3.predict(X_test)

In [None]:
sub_c =  pd.read_csv("/content/drive/MyDrive/Parrot_teamproject/sample_submission.csv")
sub_g =  pd.read_csv("/content/drive/MyDrive/Parrot_teamproject/sample_submission.csv")
sub_b =  pd.read_csv("/content/drive/MyDrive/Parrot_teamproject/sample_submission.csv")

In [None]:
sub_c.iloc[:,1:] = predict_1
sub_g.iloc[:,1:] = predict_2
sub_b.iloc[:,1:] = predict_3

In [60]:
submission.iloc[:,1:] = predict

In [None]:
sub_c.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.741677,0.0003477016,0.202912,8.7e-05,0.054339,0.000637
1,0000247867823ef7,0.533714,3.156565e-07,0.135329,0.001275,0.259541,0.070141
2,00013b17ad220c46,0.204971,1.267591e-07,0.243006,0.000983,0.402604,0.148435
3,00017563c3f7919a,0.02854,5.103792e-10,0.142128,8.8e-05,0.537328,0.291916
4,00017695ad8997eb,0.919607,3.356389e-07,0.019754,0.000658,0.052983,0.006997


In [None]:
sub_g.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.499684,0.509525,0.505492,0.495593,0.520382,0.504229
1,0000247867823ef7,0.497373,0.507183,0.507371,0.495529,0.517859,0.50879
2,00013b17ad220c46,0.497155,0.502145,0.500323,0.491992,0.515688,0.499497
3,00017563c3f7919a,0.497568,0.499332,0.501475,0.490672,0.513177,0.505808
4,00017695ad8997eb,0.501078,0.502554,0.505121,0.491919,0.51842,0.507503


In [None]:
sub_b.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.991108,0.4603746,0.933894,0.1777964,0.82613,0.3377025
1,0000247867823ef7,0.000227,2.532966e-09,8e-06,5.774232e-07,4e-06,4.850141e-07
2,00013b17ad220c46,0.000128,2.104631e-08,1e-05,2.881492e-06,4e-06,1.477917e-06
3,00017563c3f7919a,0.000438,2.614215e-08,3.1e-05,2.100759e-06,1.2e-05,2.360645e-06
4,00017695ad8997eb,0.000319,1.131876e-08,1e-05,2.253467e-06,6e-06,1.403541e-06


In [None]:
sub_g.to_csv('/content/drive/MyDrive/Parrot/toxic_sentence_0408_1.csv', index=False)

In [None]:
sub_g.to_csv('/content/drive/MyDrive/Parrot/toxic_sentence_0408_2.csv', index=False)

In [None]:
sub_b.to_csv('/content/drive/MyDrive/Parrot/toxic_sentence_0408_3.csv', index=False)

In [None]:
model3.save('/content/drive/MyDrive/Parrot/toxic_sentence_bilstm.h5')

In [61]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.942909,0.23434,0.834452,0.020928,0.754692,0.084436
1,0000247867823ef7,0.164702,0.021672,0.102759,0.005784,0.091936,0.015664
2,00013b17ad220c46,0.034804,0.000746,0.012428,0.000169,0.010702,0.001283
3,00017563c3f7919a,0.098834,0.004651,0.045107,0.001039,0.041532,0.004829
4,00017695ad8997eb,0.164702,0.021672,0.102759,0.005784,0.091936,0.015664


In [62]:
submission.to_csv('/content/drive/MyDrive/Parrot/toxic_setencepiece_2.csv', index=False)