In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
import re
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.externals import joblib

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.keras.layers import GRU, Conv1D, MaxPooling1D, Flatten, BatchNormalization, GlobalAveragePooling1D, AveragePooling1D, Average, GlobalMaxPooling1D



In [4]:
train = pd.read_csv('/content/drive/MyDrive/Parrot_teamproject/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Parrot_teamproject/test.csv')
test_labels = pd.read_csv('/content/drive/MyDrive/Parrot_teamproject/test_labels.csv')

# Preprocessing - Text Cleaning

### 소윤님

**clean_text2 function**
* 줄임표현 분해('re, 've, ...)
* 이메일주소, 인터넷 주소 제거
* 기타 문자열 아닌 표현 제거
* '-' 제거

In [None]:
X = train['comment_text']
y = train[train.columns[2:]].values

test_X = test['comment_text']
test_labels = test_labels['id']

In [None]:
def clean_text2(text):
    text = text.lower() 
    text = re.sub(r"it's\s","it is ",text) 
    text = re.sub(r"aren't","are not ",text)
    text = re.sub(r"couldn't","could not ",text)
    text = re.sub(r"didn't","did not ",text)
    text = re.sub(r"doen't","does not ",text)
    text = re.sub(r"don't","do not ",text)
    text = re.sub(r"hadn't","had not ",text)
    text = re.sub(r"hasn't","has not ",text)
    text = re.sub(r"haven't","have not ",text)
    text = re.sub(r"isn't","is not ",text)

    text = re.sub(r"arent","are not ",text)
    text = re.sub(r"couldnt","could not ",text)
    text = re.sub(r"didnt","did not ",text)
    text = re.sub(r"doesnt","does not ",text)
    text = re.sub(r"dont","do not ",text)
    text = re.sub(r"hadnt","had not ",text)
    text = re.sub(r"hasnt","has not ",text)
    text = re.sub(r"havent","have not ",text)
    text = re.sub(r"isnt","is not ",text)

    text = re.sub(r"\\n"," ",text)

    text = re.sub(r"mustn't","must not ",text)
    text = re.sub(r"shadn't","shall not ",text)
    text = re.sub(r"weren't","were not ",text)
    text = re.sub(r"where's","where is ",text)
    text = re.sub(r"who'd","who would ",text)
    text = re.sub(r"won't","will not ",text)
    text = re.sub(r"wouldn't","would not ",text)
    text = re.sub(r"what's", "what is ", text)

    text = re.sub(r"mustnt","must not ",text)
    text = re.sub(r"shadnt","shall not ",text)
    text = re.sub(r"werent","were not " ,text)
    text = re.sub(r"wheres","where is ",text)
    text = re.sub(r"whod","who would ",text)
    text = re.sub(r"wont","will not ",text)
    text = re.sub(r"wouldnt","would not ",text)
    text = re.sub(r"whats", "what is ", text)

    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"let's","let us ",text)
    text = re.sub(r"mightn't","might not ",text)
    text = re.sub(r"i'm", "i am ", text)

    text = re.sub(r"cant", "can not ", text)
    text = re.sub(r"lets","let us ",text)
    text = re.sub(r"mightnt","might not ",text)
    text = re.sub(r"im\s", "i am ", text)

    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"tryin","trying",text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text= re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-Z.!?']", ' ', text)
    text = re.sub(r"-"," ",text)
    text = text.strip(' ')
    return text

In [None]:
X= X.map(lambda com : clean_text2(com))
test_X= test_X.map(lambda com : clean_text2(com))

### 상희님

**clean_text1 function**
* stopwords 제거
* 개행 이스케이프(\n) 제거
* 문자나 숫자, whitespace 이외의 것들 제외. punctuation 제거 목적으로 실행함.
* 공백 기준으로 split해 문자열 분리

#### cleansing

In [None]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
train_x = train['comment_text']
test_x = test['comment_text']
train_y = train[labels].values

In [None]:
print('훈련용 코멘트 : {}'.format(len(train_x)))
print('테스트용 코멘트 : {}'.format(len(test_x)))
num_classes = 6
print('카테고리 : {}'.format(num_classes))

훈련용 코멘트 : 159571
테스트용 코멘트 : 153164
카테고리 : 6


In [None]:
train_x = list(train_x)

In [None]:
import string

result = string.punctuation
print(result)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
def clean_text1(text):
    output = ""
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    text = re.sub(r"its\s","it is ",text) 
    text = re.sub(r"arent","are not ",text)
    text = re.sub(r"couldnt","could not ",text)
    text = re.sub(r"didnt","did not ",text)
    text = re.sub(r"doesnt","does not ",text)
    text = re.sub(r"dont","do not ",text)
    text = re.sub(r"hadnt","had not ",text)
    text = re.sub(r"hasnt","has not ",text)
    text = re.sub(r"havent","have not ",text)
    text = re.sub(r"isnt","is not ",text)

    text = re.sub(r"mustnt","must not ",text)
    text = re.sub(r"shadnt","shall not ",text)
    text = re.sub(r"werent","were not ",text)
    text = re.sub(r"wheres","where is ",text)
    text = re.sub(r"whod","who would ",text)
    text = re.sub(r"wont","will not ",text)
    text = re.sub(r"wouldnt","would not ",text)
    text = re.sub(r"whats", "what is ", text)

    text = re.sub(r"\ve", " have ", text)
    text = re.sub(r"cant", "can not ", text)
    text = re.sub(r"lets","let us ",text)
    text = re.sub(r"mightnt","might not ",text)
    text = re.sub(r"im", "i am ", text)
    
    for word in text:
      output = output + "" + word
    return str(output.strip())

In [None]:
train_texts = [] 

for line in tqdm(train_x, total=train.shape[0]): 
    train_texts.append(clean_text1(line))

HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))




In [None]:
print('Original data:', train_x[1], train_y[1])
print('Length of original data:', len(train_x[1]))
print('Cleaned data:', train_texts[1], train_y[1])
print('Length of cleaned data:', len(train_texts[1]))

Original data: D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC) [0 0 0 0 0 0]
Length of original data: 112
Cleaned data: daww he matches this background colour i am  seemingly stuck with thanks  talk  january   utc [0 0 0 0 0 0]
Length of cleaned data: 93


#### lematization

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()

def lemma(text, lemmatization=True):
  output=""
  if lemmatization:
    text=text.split(" ")
    for word in text:
       word1 = l.lemmatize(word, pos = "n")
       word2 = l.lemmatize(word1, pos = "v")
       word3 = l.lemmatize(word2, pos = "a")
       word4 = l.lemmatize(word3, pos = "r")
       output=output + " " + word4
  else:
    output=text
  
  return str(output)

In [None]:
train_x_lemma = []

for line in tqdm(train_texts, total=train.shape[0]): 
    train_x_lemma.append(lemma(line))

HBox(children=(FloatProgress(value=0.0, max=159571.0), HTML(value='')))




In [None]:
print('Cleaned data:', train_texts[1], train_y[1])
print('Length of cleaned cleaned data:', len(train_texts[1]))
print('Lemmatized data:', train_x_lemma[1], train_y[1])
print('Length of lemmatized data:', len(train_x_lemma[1]))

Cleaned data: daww he matches this background colour i am  seemingly stuck with thanks  talk  january   utc [0 0 0 0 0 0]
Length of cleaned cleaned data: 93
Lemmatized data:  daww he match this background colour i be  seemingly stick with thank  talk  january   utc [0 0 0 0 0 0]
Length of lemmatized data: 91


### 수정님

**clean_text3 function**
전처리는 수정하지 못했습니다ㅠ
* 줄임표현 분해
* 연속적 공백 제거
* 단어 길이 1인 단어 제거

In [5]:
import re

def clean_text3(sen):
    
    # 글자 모두 lower
    sentence = sen.lower()

    # is 줄임표현 분해
    sentence = re.sub(r'it\'s', 'it is', sentence)
    sentence = re.sub(r'that\'s', 'that is', sentence)
    sentence = re.sub(r'he\'s', 'he is', sentence)
    sentence = re.sub(r'she\'s', 'she is', sentence)
    sentence = re.sub(r'here\'s', 'here is', sentence)
    sentence = re.sub(r'there\'s', 'there is', sentence)

    #기타 줄임표현 분해... 노가다,,
    sentence = re.sub(r"\'ve", " have ", sentence)
    sentence = re.sub(r"can't", "can not ", sentence)
    sentence = re.sub(r"n't", " not ", sentence)
    sentence = re.sub(r"i'm", "i am ", sentence)
    sentence = re.sub(r"\'re", " are ", sentence)
    sentence = re.sub(r"\'d", " would ", sentence)
    sentence = re.sub(r"\'ll", " will ", sentence)
    sentence = re.sub(r"\'scuse", " excuse ", sentence)
    sentence = re.sub('\W', ' ', sentence)  

    # 알파벳, ' 빼고 다 공백으로 바꿈
    sentence = re.sub('[^a-z\']', ' ', sentence)

    # 단어길이 1인 단어 제거
    sentence = re.sub(r'\W*\b\w{1}\b', ' ', sentence)

    # 공백 연속적으로 나타날 경우 공백 하나로 바꿈
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [6]:
X = []
sentences = list(train["comment_text"])
for sen in sentences:
    X.append(clean_text3(sen))

y = train[train.columns[2:]].values

In [11]:
X[0]

'explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after voted at new york dolls fac and please do not remove the template from the talk page since am retired now '

# Preprocessing - Text Augmentation

### 소윤님

In [None]:
!pip install textaugment

Collecting textaugment
  Downloading https://files.pythonhosted.org/packages/2c/63/9960414280dba3d9eba332502231d69fdc8ba664a4bd3d46842ba8cf0ef2/textaugment-1.3.4-py3-none-any.whl
Collecting googletrans
  Downloading https://files.pythonhosted.org/packages/71/3a/3b19effdd4c03958b90f40fe01c93de6d5280e03843cc5adf6956bfc9512/googletrans-3.0.0.tar.gz
Collecting httpx==0.13.3
[?25l  Downloading https://files.pythonhosted.org/packages/54/b4/698b284c6aed4d7c2b4fe3ba5df1fcf6093612423797e76fbb24890dd22f/httpx-0.13.3-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 3.3MB/s 
Collecting hstspreload
[?25l  Downloading https://files.pythonhosted.org/packages/dd/50/606213e12fb49c5eb667df0936223dcaf461f94e215ea60244b2b1e9b039/hstspreload-2020.12.22-py3-none-any.whl (994kB)
[K     |████████████████████████████████| 1.0MB 10.1MB/s 
[?25hCollecting rfc3986<2,>=1.3
  Downloading https://files.pythonhosted.org/packages/78/be/7b8b99fd74ff5684225f50dd0e865393d2265656ef3b4ba9eaaaffe6

In [None]:
from textaugment import EDA
import nltk
nltk.download('stopwords')
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
toxic__X = train[train['toxic'] + train['threat'] + train['severe_toxic'] 
                 + train['obscene'] + train['insult'] + train['identity_hate'] > 0]

In [None]:
toxic__X

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
159494,fef4cf7ba0012866,"""\n\n our previous conversation \n\nyou fuckin...",1,0,1,0,1,1
159514,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1,0,0,0,1,0
159541,ffa33d3122b599d6,Your absurd edits \n\nYour absurd edits on gre...,1,0,1,0,1,0
159546,ffb47123b2d82762,"""\n\nHey listen don't you ever!!!! Delete my e...",1,0,0,0,1,0


In [None]:
toxic_X = toxic__X['comment_text']

In [None]:
toxic_label = toxic__X.drop(columns = ["id","comment_text"])
toxic_id = toxic__X['id']

#### text cleaning

In [None]:
toxic_X= toxic_X.map(lambda com : clean_text2(com))

#### synonym replacement

In [None]:
from textaugment import EDA
e = EDA()

X_sr = toxic_X.map(lambda x: e.synonym_replacement(x))


In [None]:
toxic_X[43]

'fuck your filthy mother in the ass dry'

In [None]:
X_sr[43]

'fuck your filthy mother in the keister dry'

#### random deletion

In [None]:
X_rd = toxic_X.map(lambda x: e.random_deletion(x))


In [None]:
toxic_X[43]

'fuck your filthy mother in the ass dry'

In [None]:
X_rd[43]

'fuck your mother in the ass'

#### random swap

In [None]:
X_rs = toxic_X.map(lambda x: e.random_swap(x))

In [None]:
toxic_X[43]

'fuck your filthy mother in the ass dry'

In [None]:
X_rs[43]

'fuck your filthy in mother the ass dry'

#### random insertion

In [None]:
X_ri = toxic_X.map(lambda x: e.random_insertion(x))

In [None]:
toxic_X[43]

'fuck your filthy mother in the ass dry'

In [None]:
X_ri[43]

'fuck your filthy bed mother in the ass dry'

#### non-toxic data + augmented toxic data

In [None]:
not_toxic__X = train[train['toxic'] + train['threat'] + train['severe_toxic'] 
                 + train['obscene'] + train['insult'] + train['identity_hate'] == 0]
not_toxic_label = not_toxic__X.drop(["id","comment_text"],axis=1)
not_toxic_id = not_toxic__X['id']

In [None]:
not_toxic__X

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [None]:
not_toxic_comment = not_toxic__X['comment_text']
not_toxic_comment = not_toxic_comment.map(lambda com : clean_text2(com))

In [None]:
not_toxic_comment = pd.DataFrame(not_toxic_comment)
not_toxic = pd.concat([not_toxic_id,not_toxic_comment,not_toxic_label],axis=1)
not_toxic

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not tryingg to edit war it...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can not make any real suggestions on im...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your vi...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


In [None]:
toxic_X = pd.DataFrame(toxic_X)
toxic_X = pd.concat([toxic_id,toxic_X,toxic_label], axis=1)
toxic_X.reset_index(drop=True)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0002bcb3da6cb337,cocksucker before you piss around on my work,1,1,1,0,1,0
1,0005c987bdfc9d4b,hey what is it talk what is it an exclusive gr...,1,0,0,0,0,0
2,0007e25b2121310b,bye do not look come or think of comming back ...,1,0,0,0,0,0
3,001810bf8c45bf5f,you are gay or antisemmitian archangel white t...,1,0,1,0,1,1
4,00190820581d90ce,fuck your filthy mother in the ass dry,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
16220,fef4cf7ba0012866,our previous conversation you fucking shit eat...,1,0,1,0,1,1
16221,ff39a2895fc3b40e,you are a mischievious pubic hair,1,0,0,0,1,0
16222,ffa33d3122b599d6,your absurd edits your absurd edits on great w...,1,0,1,0,1,0
16223,ffb47123b2d82762,hey listen do not you ever delete my edits eve...,1,0,0,0,1,0


In [None]:
final_train = pd.concat([not_toxic,X_rd,X_sr,toxic_X], ignore_index= True)
final_train

Unnamed: 0,0,comment_text,id,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,,explanation why the edits made under my userna...,0000997932d777bf,0.0,0.0,0.0,0.0,0.0,0.0
1,,d aww he matches this background colour i am s...,000103f0d9cfb60f,0.0,0.0,0.0,0.0,0.0,0.0
2,,hey man i am really not tryingg to edit war it...,000113f07ec002fd,0.0,0.0,0.0,0.0,0.0,0.0
3,,more i can not make any real suggestions on im...,0001b41b1c6bb37e,0.0,0.0,0.0,0.0,0.0,0.0
4,,you sir are my hero any chance you remember wh...,0001d958c54c6e35,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
192016,,our previous conversation you fucking shit eat...,fef4cf7ba0012866,1.0,1.0,1.0,0.0,0.0,1.0
192017,,you are a mischievious pubic hair,ff39a2895fc3b40e,0.0,1.0,0.0,0.0,0.0,1.0
192018,,your absurd edits your absurd edits on great w...,ffa33d3122b599d6,0.0,1.0,1.0,0.0,0.0,1.0
192019,,hey listen do not you ever delete my edits eve...,ffb47123b2d82762,0.0,1.0,0.0,0.0,0.0,1.0


In [None]:
X = final_train['comment_text']
y = final_train[train.columns[2:]].values

test_X = test['comment_text']
#test_labels = test_labels['id']

### final train load

In [None]:
final_train = pd.read_csv('/content/drive/MyDrive/Parrot_teamproject/final_train')

# Tokenization

### 상희님

In [None]:
maxlen = 200
max_words = 25000
embeding_dim = 100

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_words)

# Fit the tokenizer on the comments 
tokenizer.fit_on_texts(train_x_lemma)

# Get the word index of the top 20000 words from the dataset
word_idx = tokenizer.word_index

# Convert the string sentence to a sequence of their numerical values
sequences = tokenizer.texts_to_sequences(train_x_lemma)

# Pad the sequences to make them of uniform length
padded_sequences = pad_sequences(sequences, maxlen = maxlen, padding = 'post')

In [None]:
print('Vocabulary size:', len(word_idx))
print("\n\nThe tokenized sequence:\n")
print(sequences[1])
print("\n\nThe padded sequence:\n")
print(padded_sequences[1])

Vocabulary size: 245971


The tokenized sequence:

[49, 955, 15, 1255, 2210, 5, 1, 3425, 834, 20, 70, 40, 893, 328]


The padded sequence:

[  49  955   15 1255 2210    5    1 3425  834   20   70   40  893  328
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   

In [None]:
print('Shape of data tensor:', padded_sequences.shape)
print('Shape of label tensor:', train_y.shape)

Shape of data tensor: (159571, 200)
Shape of label tensor: (159571, 6)


In [None]:
# Shuffling data
indices = np.arange(padded_sequences.shape[0])
np.random.shuffle(indices)
train_x = padded_sequences[indices]
train_y = train_y[indices]

In [None]:
print('Tokenized random sentences: \n', train_x[1])
print('One hot label: \n', train_y[1])

Tokenized random sentences: 
 [  796   900     6    18    11     8    16     1   420    12    31    41
     1   116    13   124    21     8     1   194    12     2   124   135
    50   989    29    74    19   110    32  1897    13    70     8   796
   900     6   861    18    11     8    16     1   420    12    31   861
 22894    41     1   116    13   124    21     8     1   194    12     2
   124   135    50   989    29    74    19   110    32  1897    13 22894
    70     8     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0   

### 소윤님

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

t = Tokenizer()
t.fit_on_texts(X)
t.fit_on_texts(test_X)

X_encoded = t.texts_to_sequences(X)
test_X_encoded = t.texts_to_sequences(test_X)

X = pad_sequences(X_encoded, maxlen=200, padding='post')
test_X = pad_sequences(test_X_encoded, maxlen=200, padding='post')

In [None]:
max_words = 3500000
maxlen = 200
embeding_dim = 128
vocab_size = len(t.word_index) + 1
num_classes = 6

### 수정님

In [None]:
X = []
sentences = list(train_data["comment_text"])
for sen in sentences:
    X.append(preprocess_text(sen))

y = train_data[train_data.columns[2:]].values

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#토큰화
tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(X)

In [None]:
X = tokenizer.texts_to_sequences(X)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

In [None]:
X = pad_sequences(X, padding='post', maxlen=maxlen) #뒤쪽 패딩
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen) #뒤쪽 패딩

# Visualization

In [None]:
def acc_loss_plot(hist):
    fig, loss_ax = plt.subplots()
    acc_ax = loss_ax.twinx()

    loss_ax.plot(hist.history['loss'], 'y', label='train loss')
    loss_ax.plot(hist.history['val_loss'], 'r', label='val loss')
    loss_ax.set_xlabel('epoch')
    loss_ax.set_ylabel('loss')
    loss_ax.legend(loc = 'upper left')

    acc_ax.plot(hist.history['accuracy'], 'b', label='train acc')
    acc_ax.plot(hist.history['val_accuracy'], 'g', label='val acc')
    acc_ax.set_ylabel('accuracy')
    acc_ax.legend(loc='upper right')

    plt.show()

# Glove

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open('/content/drive/MyDrive/glove.6B.100d.txt.zip (Unzipped Files)/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Base Models

### CNN

In [None]:
def model_CNN(num_classes):
    inp = Input(shape = (maxlen, ))
    layer = Embedding(max_words, 
                      embedding_dim,
                      weights = [embedding_matrix],
                      trainable=False)(inp)
    #CNNlayer1
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(5)(layer)
    #CNNlayer2
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(3)(layer)
    #CNNlayer3
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(3)(layer)
    #CNNlayer4
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(3)(layer)
    #Dense
    layer = Flatten()(layer)
    layer = Dense(64, activation='relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(num_classes, activation = 'sigmoid')(layer)
    model = Model(inputs = inp, outputs = layer)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    return model

In [None]:
model_cnn = model_CNN(num_classes)
model_cnn.summary()

In [None]:
early_stopping = EarlyStopping(monitor = 'val_accuracy', mode='max', patience = 3)
hist_cnn = model_cnn.fit(train_x_lemma, train_y, batch_size = 64, epochs = 10, validation_split=0.2, callbacks = [early_stopping])

In [None]:
acc_loss_plot(hist_cnn)

### NBSVM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
import re, string
re_tok = re.compile(f'([{string.punctuation}@“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [None]:
n = train_data.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
trn_term_doc = vec.fit_transform(train['comment_text'])
test_term_doc = vec.transform(test['comment_text'])

In [None]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [None]:
x = trn_term_doc
test_x = test_term_doc

In [None]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
preds = np.zeros((len(test), len(train.columns[2:])))

for i, j in enumerate(train.columns[2:]):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

###**코드 돌릴 시간 부족하시면 아래 NBSVM파일 로드해서 쓰셔도 됩니다**

In [None]:
model_nbsvm =  joblib.load('/content/drive/MyDrive/Parrot_teamproject/이수정/nbsvm.pkl') 

### BiLSTM

In [None]:
vocab_size = len(word_idx)+1

In [None]:
model = Sequential()
model.add(Input(shape=(maxlen,)))
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix]))
model.add(Bidirectional(LSTM(50, dropout=0.1, recurrent_dropout=0.1, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(6, activation="sigmoid"))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor = 'val_accuracy', mode='max', patience = 2)

In [None]:
hist_BiLSTM = model.fit(train_x, train_y, batch_size=128, epochs=10, verbose=1, validation_split=0.2, shuffle = True, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


### GRU

In [None]:
def model_GRU(num_classes):
    inp = Input(shape = (200, ))
    layer = Embedding(25000, 128)(inp)
    layer = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(3)(layer)
    later = Conv1D(64, 5, padding='same', activation='relu')(layer)
    layer = GRU(128, dropout=0.2, recurrent_dropout=0.2)(layer)
    layer = Dense(64, activation='relu')(layer)
    layer = Dropout(0.3)(layer)
    layer = Dense(num_classes, activation = 'sigmoid')(layer)
    model = Model(inputs = inp, outputs = layer)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
    return model

In [None]:
model_gru = model_GRU(6)
model_gru.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor = 'val_accuracy', mode='max', patience = 2)

In [None]:
hist_gru = model_gru.fit(train_x, train_y, batch_size=128, epochs=10, verbose=1, validation_split=0.2, shuffle = True, callbacks=[early_stopping])

# 최종

### BiLSTM 예측

In [None]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_test = model.predict([X_test], batch_size=1024, verbose=1)

sample_submission = pd.read_csv(f'/content/drive/MyDrive/Parrot_teamproject/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission_bilstm1_fin.csv', index=False)

### NBSVM 예측

In [None]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

submid = pd.DataFrame({'id': sample["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = train.columns[2:])], axis=1)
submission.to_csv('submission_nbsvm.csv', index=False)

### BiLSTM - NBSVM CSV Ensemble

In [None]:
f_bilstm1 = '/content/submission_biltm1_fin.csv'
f_nbsvm = '/content/submission_nbsvm.csv'

In [None]:
p_bilstm1 = pd.read_csv(f_bilstm1)
p_nbsvm = pd.read_csv(f_nbsvm)

In [None]:
p_res = p_bilstm1.copy()
p_res[list_classes] = (p_bilstm1[list_classes] + p_nbsvm) / 2

In [None]:
p_res.to_csv('submission_bilstm2.csv', index = False)