In [7]:
# https://www.kaggle.com/code/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert/notebook

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN, Embedding
from keras.layers.core import Dense, Activation, Dropout
from tensorflow.keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [2]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [3]:
train = pd.read_csv("E:/Kaggle_Practice/Deep Learning For NLP_Zero To Transformers and BERT/jigsaw-toxic-comment-train.csv")
validation = pd.read_csv("E:/Kaggle_Practice/Deep Learning For NLP_Zero To Transformers and BERT/validation.csv")
test = pd.read_csv("E:/Kaggle_Practice/Deep Learning For NLP_Zero To Transformers and BERT/test.csv")

In [4]:
validation.comment_text.values

array(['Este usuario ni siquiera llega al rango de    hereje   . Por lo tanto debería ser quemado en la barbacoa para purificar su alma y nuestro aparato digestivo mediante su ingestión.    Skipe linkin 22px   Honor, valor, leltad.      17:48 13 mar 2008 (UTC)',
       'Il testo di questa voce pare esser scopiazzato direttamente da qui. Immagino possano esserci problemi di copyright, nel fare cio .',
       'Vale. Sólo expongo mi pasado. Todo tiempo pasado fue mejor, ni mucho menos, yo no quisiera retroceder 31 años a nivel particular. Las volveria a pasar putas.Fernando ',
       ...,
       'olum sız manyakmısınz siz adam sıze sanal yıldız vermıs bilmem nerenız kalkmıs bedava kole dıye calıstırıyorlar sızı siz hala uyuyuyn olmayan bi odul aldım dıye o odul birgun gercek olursa uzerine oturursun saygılarla',
       'El mapa del reinado de Alhaken esta ligeramente exagerado en la zona del pirineo aragonés.Jaca por ejemplo fue tomada en el 780 (¡200 años antes de este reinado!)Tal vez v

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223549 entries, 0 to 223548
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             223549 non-null  object
 1   comment_text   223549 non-null  object
 2   toxic          223549 non-null  int64 
 3   severe_toxic   223549 non-null  int64 
 4   obscene        223549 non-null  int64 
 5   threat         223549 non-null  int64 
 6   insult         223549 non-null  int64 
 7   identity_hate  223549 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 13.6+ MB


In [6]:
train.shape

(223549, 8)

In [4]:
train.drop(['severe_toxic','obscene','threat','insult','identity_hate'],axis=1,inplace=True)

In [5]:
train = train.loc[:12000,:] # loc[행,열] 즉 행이 지금 총 223548개 있는데 그중에 12000까지만 사용하고, 열은 다 사용한다.(:)
train.shape

(12001, 3)

In [9]:
train.head()

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


In [None]:
# We will check the maximum number of words that can be present in a comment , this will help us in padding later

In [6]:
train['comment_text'].apply(lambda x:len(str(x).split())).max() 

1403

In [None]:
# Writing a function for getting auc score for validation

In [7]:
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [None]:
# Data Preparation

In [8]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.comment_text.values, train.toxic.values, 
                                                  stratify=train.toxic.values, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [None]:
# Simple RNN

In [9]:
# using keras tokenizer here
from tensorflow.keras.preprocessing.sequence import pad_sequences

token = text.Tokenizer(num_words=None)
max_len = 1500

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain) #texts_to_sequences()을 이용하여 text 문장을 숫자로 이루어진 리스트로 만든다
xvalid_seq = token.texts_to_sequences(xvalid)

#zero pad the sequences
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len) #길이를 max_len(1500)맞추고 빈자리는 0으로 채운다
xvalid_pad = pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index # word_index 속성은 단어와 숫자의 키-값 쌍을 포함하는 딕셔너리를 반환. 반환할때 모든 알파벳은 소문자로 반환됨.

In [16]:
xtrain_seq

[[664,
  65,
  7,
  19,
  2262,
  14102,
  5,
  2262,
  20439,
  6071,
  4,
  71,
  32,
  20440,
  6620,
  39,
  6,
  664,
  65,
  11,
  8,
  20441,
  1502,
  38,
  6072],
 [12,
  1,
  1008,
  7,
  23,
  857,
  35,
  1503,
  10,
  1,
  1676,
  7,
  23,
  14,
  1813,
  25,
  7301,
  7,
  23,
  4420,
  9,
  7,
  61,
  900,
  10,
  1,
  1676,
  2,
  64,
  73,
  11,
  23,
  48,
  4,
  7,
  88,
  37,
  41,
  61,
  19,
  55,
  56,
  245,
  571,
  1,
  1676,
  22,
  41,
  23,
  53,
  245,
  35,
  14103,
  1,
  1676,
  23,
  14,
  5,
  1442,
  11,
  23,
  857,
  7,
  221,
  7,
  61,
  19,
  55,
  119,
  29,
  427,
  4,
  78,
  47,
  19,
  55,
  1,
  194,
  303,
  7,
  1584,
  9,
  41,
  23,
  5,
  2313,
  12,
  2928,
  4,
  13,
  61,
  2598,
  17,
  303,
  12,
  57,
  315,
  2,
  16,
  2114,
  3,
  533,
  3813,
  2,
  1,
  1676,
  94,
  84,
  41,
  8,
  344,
  5,
  2464,
  359,
  12,
  1503,
  10,
  1,
  1676,
  17,
  7,
  69,
  11132,
  2114,
  4684,
  4,
  11133,
  34,
  1,
  683,
  572,
  1

In [17]:
xtrain_pad

array([[   0,    0,    0, ..., 1502,   38, 6072],
       [   0,    0,    0, ...,    3,  101,  548],
       [   0,    0,    0, ...,  359,   11,   38],
       ...,
       [   0,    0,    0, ...,    4,   45, 1576],
       [   0,    0,    0, ...,    9,   77, 5989],
       [   0,    0,    0, ...,  166,  202,  514]])

In [18]:
word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'and': 4,
 'a': 5,
 'you': 6,
 'i': 7,
 'is': 8,
 'that': 9,
 'in': 10,
 'it': 11,
 'for': 12,
 'this': 13,
 'not': 14,
 'on': 15,
 'be': 16,
 'as': 17,
 'are': 18,
 'have': 19,
 'with': 20,
 'your': 21,
 'if': 22,
 'was': 23,
 'article': 24,
 'or': 25,
 'but': 26,
 'page': 27,
 'my': 28,
 'an': 29,
 'wikipedia': 30,
 'by': 31,
 'from': 32,
 'do': 33,
 'at': 34,
 'about': 35,
 'me': 36,
 'so': 37,
 'talk': 38,
 'can': 39,
 'what': 40,
 'there': 41,
 'all': 42,
 'has': 43,
 'no': 44,
 'will': 45,
 'one': 46,
 'would': 47,
 'like': 48,
 'please': 49,
 'he': 50,
 'just': 51,
 'they': 52,
 'any': 53,
 'which': 54,
 'been': 55,
 'more': 56,
 'other': 57,
 'we': 58,
 "don't": 59,
 'his': 60,
 'should': 61,
 'some': 62,
 'here': 63,
 'see': 64,
 'who': 65,
 'also': 66,
 'because': 67,
 'know': 68,
 'am': 69,
 'think': 70,
 "i'm": 71,
 'edit': 72,
 'how': 73,
 'up': 74,
 'why': 75,
 'out': 76,
 "it's": 77,
 'then': 78,
 'people': 79,
 'use': 80,
 'only': 81,
 'w

In [10]:
%%time
with strategy.scope():
    # A simpleRNN without any pretrained embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len)) # model.add 라고 하면 새로운 은닉층이 만들어진다. model.add가 3개 있으므로 3개의 층을 가진 모델이다.
                     # Embedding(입력개수, 출력개수, 단어 시퀀수 수)  
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid')) # 이 맨 마지막층은 결과를 출력 하는 층이다.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # model.compile 부분은 앞서 지정한 모델이 효과적으로 구현될 수 있게 여러 가지 환경을 설정해 주면서 컴파일하는 부분
    # 어떤 오차 함수를 (loss) 사용할지를 정해야 한다. 그다음 최적화 함수도 정의한다.
    # 마지막으로 metrics() 함수는 모델이 컴파일될 때 모델 수행 결과를 나타내게끔 설정하는 부분
    
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: total: 1.36 s
Wall time: 491 ms


In [None]:
# 위의 결과 Output Shape (batch_size, units())

In [None]:
# 모델 학습

In [22]:
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync) #Multiplying by Strategy to run on TPU's

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2064b651d90>

In [23]:
scores = model.predict(xvalid_pad)
print("Auc: %.2f%%" % (roc_auc(scores,yvalid)))

Auc: 0.79%


In [25]:
scores_model = []
scores_model.append({'Model': 'SimpleRNN','AUC_Score': roc_auc(scores,yvalid)})
scores_model

[{'Model': 'SimpleRNN', 'AUC_Score': 0.7861227401124219}]

In [None]:
# 훈련 데이터가 적은 상황이라면 케라스의 Embedding()으로 해당 문제를 풀기에 최적화 된 임베딩 벡터값을 얻는 것이 쉽지 않다. 
# 이 경우 해당 문제에 특화된 것은 아니지만 보다 많은 훈련 데이터로 이미 Word2Vec이나 GloVe, fasttext 등으로 학습되어져 있는 임베딩 벡터들을 사용하는 것이 성능의 개선을 가져올 수 있습니다

In [12]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('E:/Kaggle_Practice/Deep Learning For NLP_Zero To Transformers and BERT/glove.840B.300d.txt','r',encoding='utf-8')
for line in tqdm(f):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray([float(val) for val in values[1:]])
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196018it [05:04, 7212.04it/s]


Found 2196017 word vectors.


## LSTM

In [None]:
# Simple RNN's were certainly better than classical ML algorithms and gave state of the art results, but it failed to capture long term dependencies that is present in sentences . 
# So in 1998-99 LSTM's were introduced to counter to these drawbacks.