In [None]:
# Install the Kaggle command-line client; a later cell calls `kaggle datasets download`.
!pip install kaggle

In [None]:
# Ask the user to upload a file through Colab's upload widget. The following
# cell copies `kaggle.json` from the working directory, so that is the file to
# pick here.
from google.colab import files

# Bind the result to a name instead of leaving the call as the cell's last
# expression: the returned mapping holds the uploaded file contents, and echoing
# it would write the API credentials into the notebook's saved output.
uploaded = files.upload()

In [None]:
# Copy the uploaded kaggle.json into ~/.kaggle/ (created if missing) and
# restrict it to owner read/write only, since it holds API credentials.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the hate-speech tweet dataset with the Kaggle CLI. A later cell
# opens `twitter-sentiment-analysis-hatred-speech.zip` from the working directory.
!kaggle datasets download -d arkhoshghalb/twitter-sentiment-analysis-hatred-speech

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from keras.utils.np_utils import to_categorical
import re

In [None]:
import zipfile

# Unpack the downloaded archive into ./files. The context manager closes the
# archive even if extraction fails part-way.
with zipfile.ZipFile('twitter-sentiment-analysis-hatred-speech.zip', 'r') as zip_ref:
    zip_ref.extractall('files')

# Read the training split from the directory we just extracted into. A relative
# path is used so extraction and loading always refer to the same location
# (the original read from the absolute '/content/files', which only matches
# when the working directory is /content).
data_frame = pd.read_csv('files/train.csv')
data_frame

In [None]:
# Count how many rows carry each label value.
# Comparing on the string form makes the count work whether the column was
# parsed as integers (0/1) or as strings ('0'/'1').
label_counts = data_frame['label'].astype(str).value_counts()
# NOTE(review): the 0 -> "negative", 1 -> "positive" naming is kept from the
# original cell; confirm it against the dataset's label definition.
negative = int(label_counts.get('0', 0))
positive = int(label_counts.get('1', 0))

# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`.
sns.countplot(x=data_frame['label'])
print('Positive reviews are {},and negative reviews are {} of total {} '.format(positive,negative,len(data_frame)))

In [None]:
# Hold out 20% of the tweets for evaluation.
# - random_state makes the split reproducible across kernel restarts.
# - stratify keeps the label proportions identical in both splits, which also
#   guarantees every label seen at test time was seen when fitting the encoder.
# Note: the arrays returned here hold the tweet text as it is at this point;
# later modifications of `data_frame.tweet` do not propagate to them.
training_reviews, testing_reviews, training_labels, testing_labels = train_test_split(
    data_frame['tweet'].values,
    data_frame['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=data_frame['label'].values,
)

# Fit the encoder on the training labels only and reuse that mapping for the
# test labels. Refitting on the test split could assign different integer codes.
le = LabelEncoder()
training_labels = le.fit_transform(training_labels)
testing_labels = le.transform(testing_labels)

In [None]:
def hapus_spesial_char(tweets):
    """Return `tweets` with every character dropped except letters and spaces.

    A character is kept when `str.isalpha()` is true for it or when it is a
    plain space; digits, punctuation and other whitespace are removed.
    """
    return "".join(
        symbol for symbol in tweets if symbol == " " or symbol.isalpha()
    )
# Matches either an HTML-style tag (`<...>`) or any single character from the
# listed set of symbols / mis-encoded bytes. The original pattern concatenated
# the tag expression and the symbols, so it only matched a tag immediately
# followed by that exact character sequence, i.e. practically never.
_TAG_OR_NOISE_RE = re.compile(r"<[^>]+>|[#@ðº¦±©â¤ïªï;â&¬ë_µì°]")


def hapus_tag(text):
    """Replace HTML-style tags and known noise characters in `text` with a space.

    Args:
        text: the string to clean.

    Returns:
        A new string where each tag and each noise character has been
        substituted by a single space.
    """
    return _TAG_OR_NOISE_RE.sub(" ", text)
def hapus_no(text):
    """Replace every run of ASCII digits in `text` with a single space.

    The original character class was written with an en dash (`0–9`) instead
    of a hyphen, so it matched only the characters `0`, `–` and `9` rather than
    the range 0-9. The surrounding `"".join(...)` was a no-op on a string and
    has been dropped.

    Args:
        text: the string to clean.

    Returns:
        A new string with each digit run substituted by one space.
    """
    return re.sub(r"[0-9]+", " ", text)
def _clean_tweet(text):
    """Run the three cleaning steps in order: tags, then digits, then non-letters."""
    return hapus_spesial_char(hapus_no(hapus_tag(text)))


data_frame.tweet = data_frame.tweet.apply(_clean_tweet)

# The train/test arrays were taken from the data frame before this cell runs,
# so cleaning only `data_frame.tweet` would leave the text that is actually
# tokenised and fed to the model untouched. Apply the same cleaning to them.
training_reviews = np.array([_clean_tweet(tweet) for tweet in training_reviews], dtype=object)
testing_reviews = np.array([_clean_tweet(tweet) for tweet in testing_reviews], dtype=object)

data_frame.head()

In [None]:
# Build the vocabulary from the training tweets only; words not seen there are
# mapped to the '<OOV>' token when converting text to sequences.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(training_reviews)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Convert each tweet into a list of integer word ids.
training_sequence = tokenizer.texts_to_sequences(training_reviews)
testing_sequence = tokenizer.texts_to_sequences(testing_reviews)

# Both splits share the same padding settings: fixed length 500, zeros added
# at the front, overly long sequences cut at the end.
pad_options = {'maxlen': 500, 'truncating': 'post', 'padding': 'pre'}
train_pad_sequence = pad_sequences(training_sequence, **pad_options)
test_pad_sequence = pad_sequences(testing_sequence, **pad_options)

print('Total Unique Words : {}'.format(len(word_index)))

In [None]:
# Fetch the GloVe 6B archive. `-nc` (no-clobber) skips the download when
# the file is already present, so re-running the notebook does not fetch it
# again or leave numbered duplicates (glove.6B.zip.1, ...).
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
# Extract the GloVe vectors. `-n` never overwrites existing files, so a re-run
# does not stop at unzip's interactive "replace?" prompt.
!unzip -n glove*.zip
!ls

In [None]:
# Load the 200-dimensional GloVe vectors into a {word: vector} dictionary.
# Each line of the file is "<word> <v1> <v2> ... <v200>".
embedded_words = {}
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (the vocabulary contains non-ASCII tokens).
with open('/content/glove.6B.200d.txt', encoding='utf-8') as file:
    for line in file:
        # Split off the word; everything after the first whitespace is the vector.
        words, coeff = line.split(maxsplit=1)
        embedded_words[words] = np.array(coeff.split(), dtype=float)

In [None]:
# One row per tokenizer index (plus row 0), 200 columns to match the GloVe
# vectors. Rows for words without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 200))
for token, row in word_index.items():
    vector = embedded_words.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Architecture: frozen pretrained embeddings -> bidirectional LSTM -> dropout
# -> two ReLU dense layers -> dropout -> single sigmoid unit.
layers = tf.keras.layers

# The embedding layer is initialised from `embedding_matrix` and not trained.
glove_embedding = layers.Embedding(
    len(word_index) + 1,
    200,
    weights=[embedding_matrix],
    input_length=500,
    trainable=False,
)

model = tf.keras.Sequential([
    glove_embedding,
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation=tf.nn.sigmoid),
])
model.summary()

In [None]:
# Use the callback from the same Keras implementation the model was built with
# (tf.keras) rather than the standalone `keras` package.
from tensorflow.keras.callbacks import EarlyStopping

# Stop when validation loss has not improved for 2 consecutive epochs and roll
# back to the best weights seen. The original patience of 5 could never
# trigger within a 5-epoch run, so the callback had no effect.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=2,
    restore_best_weights=True,
)

model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the held-out test split doubles as the validation set here, so
# the reported validation metrics also drive early stopping.
history = model.fit(
    train_pad_sequence,
    training_labels,
    epochs=5,
    callbacks=[es],
    validation_data=(test_pad_sequence, testing_labels),
)

In [None]:
# Pull the per-epoch metrics recorded by `model.fit`.
recorded = history.history
acc, val_acc = recorded['accuracy'], recorded['val_accuracy']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(len(acc))

# One figure per metric: training curve in red, validation curve in blue.
curves = [
    (acc, val_acc, 'Training accuracy', 'Validation accuracy', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_values, val_values, train_label, val_label, title) in enumerate(curves):
    if position > 0:
        # Start a new figure so the second metric is not drawn over the first.
        plt.figure()
    plt.plot(epochs, train_values, 'r', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(title)
    plt.legend(loc=0)

plt.show()

In [None]:
# Report the highest accuracy reached in any epoch for each split.
best_training_accuracy = max(acc)
best_validation_accuracy = max(val_acc)
print('Training Accuracy: {}'.format(best_training_accuracy))
print('Validation Accuracy: {}'.format(best_validation_accuracy))