In [1]:
import os
#os.chdir('E:\Kaggle\jigsaw-multilingual-toxic-comment-classification\split data')

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, GRU, SimpleRNN
from keras.layers import Dense, Activation, Dropout
from keras.layers import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras_preprocessing import sequence,text
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

#### Configuring TPU's

In [2]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [35]:
df = pd.read_csv('../Cleaning/Clean_Dataset.csv')

##### Data Preparation

In [36]:
df = df[['Stop_Words_Text','indonlp_sentiment']]

In [40]:
def sentiment(text):
    if text=='negative':
        return 0
    elif text=='neutral':
        return 1
    else:
        return 2
    
df['indonlp_sentiment'] = df['indonlp_sentiment'].apply(sentiment)

In [41]:
df.head()

Unnamed: 0,Stop_Words_Text,indonlp_sentiment
0,cina as bersaing mengembangkan teknologi terba...,1
1,katanya metaverse down melulu deh mas zuck,0
2,mungkin facebook berencana metaverse menyiapka...,2
3,bersaing perusahaan kelas dunia mencapai sekto...,1
4,staff mah lulusan kampus terbaik attitude krit...,0


In [42]:
#split dataset into train and test sets with 80/20 ratio
train, test = train_test_split(df, test_size=0.2, random_state=42)

#split the train set into train and validation with 80/20 ratio
train, validation = train_test_split(train, test_size=0.2, random_state=42)

#save the dataset to csv files
'''
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)
validation.to_csv('validation.csv', index=False)
'''

"\ntrain.to_csv('train.csv', index=False)\ntest.to_csv('test.csv', index=False)\nvalidation.to_csv('validation.csv', index=False)\n"

In [44]:
train['Stop_Words_Text'].apply(lambda x:len(str(x).split())).max()

79

In [45]:
#function to get auc score for validation
def roc_auc(predictions, target):
    '''
    this methods returns AUC score when given Predictions and Labels
    '''
    
    fpr, tpr, therholds = metrics.roc_curve(target, predictions, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

###### Data Preparation

In [46]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.Stop_Words_Text.values, train.sentiment.values,
                                                  stratify=train.sentiment.values, random_state=42, 
                                                  test_size=0.2, shuffle=True)

AttributeError: 'DataFrame' object has no attribute 'sentiment'

#### Simple RNN

In [None]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 1500

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

#zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [47]:
%time
with strategy.scope():
    #A SimpleRNN without any pretrained embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                        300, input_length=max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
model.summary()

CPU times: total: 0 ns
Wall time: 0 ns
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1500, 300)         10686300  
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100)               40100     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 10,726,501
Trainable params: 10,726,501
Non-trainable params: 0
_________________________________________________________________


In [48]:
model.fit(xtrain_pad, ytrain, epochs=1, batch_size=64*strategy.num_replicas_in_sync)



KeyboardInterrupt: 

In [None]:
scores = model.predict(xvalid_pad)
print('Auc: %.2f%%' % (roc_auc(scores,yvalid)))

In [None]:
scores_model = []
scores_model.append({'Model': 'SimpleRNN','AUC_Score': roc_auc(scores,yvalid)})

In [None]:
xtrain_seq[:1]

### LSTM

In [None]:
#create an embedding matrix for the words we have in the dataset
