Install required packages

In [None]:
!pip install Sastrawi --quiet
!pip install tensorflow --quiet

Import required packages

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re, io, json
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Indonesian Stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

Load dataset

In [None]:
# Load the raw dataset, drop rows without tweet text and keep only the text
# and the hate-speech label.
# The trailing .copy() makes `data` an independent frame, so the column
# assignments in the preprocessing cells do not raise SettingWithCopyWarning.
data = (
    pd.read_csv('./utf8_dataset.csv')
    .dropna(subset=['Tweet'])
    .loc[:, ['Tweet', 'HS']]
    .copy()
)
data.head(10)

Check dataset details

In [None]:
# Report the total number of rows and how many fall into each label.
hs_labels = data['HS']
hate_count = np.sum(hs_labels == 1)
non_hate_count = np.sum(hs_labels == 0)

print(hs_labels.size, "Total")
print(hate_count, "Hate speech")
print(non_hate_count, "Non hate speech")

## Preprocessing

### Make everything lowercase

In [None]:
# Lowercase every tweet so later token matching is case-insensitive.
data['Tweet'] = data['Tweet'].map(str.lower)
data['Tweet'].head(10)

### Remove known unwanted words

In [None]:
# Replace escaped (backslash + t/n/r) and literal tab/newline/carriage-return
# characters with a space. The result is assigned back explicitly: an in-place
# replace on the selected column is a chained update that is not guaranteed to
# reach `data`.
data['Tweet'] = data['Tweet'].str.replace(r'\\[tnr]|[\t\n\r]', ' ', regex=True)

# Remove the RT / USER / URL placeholder tokens (already lowercased above).
# Word boundaries restrict the match to whole tokens; a plain substring
# replacement would also strip "rt", "user" and "url" from inside ordinary
# words (for example "partai" would become "paai").
data['Tweet'] = data['Tweet'].str.replace(r'\b(?:rt|user|url)\b', ' ', regex=True)

data['Tweet'].head(10)

### Remove non-alphabetic characters

In [None]:
# Turn every character that is not an ASCII letter into a space.
non_letter_pattern = '[^A-Za-z]'
data['Tweet'] = data['Tweet'].str.replace(non_letter_pattern, ' ', regex=True)
data['Tweet'].head(10)

### Remove words shorter than 3 characters

In [None]:
# Keep only the words that have at least three characters.
data['Tweet'] = data['Tweet'].map(
    lambda text: ' '.join(token for token in text.split() if len(token) >= 3)
)
print(data['Tweet'].head(10))

### Reformat texts

In [None]:
# Collapse runs of whitespace into single spaces.
tweet_tokens = data['Tweet'].str.split()
data['Tweet'] = tweet_tokens.str.join(' ')

# Drop leading and trailing whitespace.
data['Tweet'] = data['Tweet'].str.strip()

data['Tweet'].head(10)

### Load and replace alay words

In [None]:
# Table of alay words; the `alay` and `replacement` columns are read by
# replace_alay().
alay_words = pd.read_csv(filepath_or_buffer='alay.csv')
alay_words.head(10)

In [None]:
def replace_alay(tweet, lookup=None):
    """Replace every alay word in ``tweet`` with its listed replacement.

    Args:
        tweet: Text whose whitespace-separated words are looked up.
        lookup: Optional mapping from alay word to replacement. When omitted,
            the mapping is built from the module-level ``alay_words``
            DataFrame (columns ``alay`` and ``replacement``) and cached, so
            the table is scanned once instead of once per word. The cache is
            rebuilt when ``alay_words`` is rebound to a different object; it
            does not detect in-place edits of the same DataFrame.

    Returns:
        The words of ``tweet`` joined by single spaces, with each word that
        appears in the mapping replaced by the string form of its replacement.
    """
    if lookup is None:
        cached = getattr(replace_alay, '_lookup_cache', None)
        if cached is None or cached[0] is not alay_words:
            # Walk the table backwards so that, for a duplicated alay word,
            # the first row wins (the same row a filtered `.values[0]` picks).
            pairs = zip(alay_words['alay'].values[::-1],
                        alay_words['replacement'].values[::-1])
            cached = (alay_words, {alay: str(fixed) for alay, fixed in pairs})
            replace_alay._lookup_cache = cached
        lookup = cached[1]

    return ' '.join(str(lookup.get(word, word)) for word in tweet.split())

# Normalise the alay words of every tweet.
data['Tweet'] = data['Tweet'].map(replace_alay)
data['Tweet'].head(10)

### Load and remove stopwords

In [None]:
# Read the stopword list one word per line.
# Plain file reading is used instead of pd.read_csv(..., sep="\n"): current
# pandas rejects a newline separator, and the default header inference would
# consume the first line as a column name instead of keeping it as a word.
# NOTE(review): assumes stopwords.txt has no header line — confirm.
with open('stopwords.txt', encoding='utf-8-sig') as stopword_file:
    indonesian_stopwords = [line.strip() for line in stopword_file if line.strip()]
indonesian_stopwords[:10]

In [None]:
def remove_stopwords(tweet, stopwords=None):
    """Drop every stopword from ``tweet``.

    Args:
        tweet: Text whose whitespace-separated words are filtered.
        stopwords: Optional collection of words to remove. Defaults to the
            module-level ``indonesian_stopwords``.

    Returns:
        The remaining words joined by single spaces.
    """
    if stopwords is None:
        stopwords = indonesian_stopwords
    # A set gives constant-time membership tests; scanning a list for every
    # word is linear in the size of the stopword list.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    return ' '.join(word for word in tweet.split() if word not in stopwords)

# Strip the stopwords from every tweet.
data['Tweet'] = data['Tweet'].map(remove_stopwords)

data['Tweet'].head(10)

In [None]:
# Show the preprocessed tweet stored under index label 3.
data.loc[3, 'Tweet']

### Stem using Indonesian stemmer

Stemming is slow: a full run was measured at around 1 hour and 40 minutes, so be patient.

In [None]:
# Build the Sastrawi stemmer once and run it over every tweet.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

stemmed_tweets = data['Tweet'].map(stemmer.stem)
data['Tweet'] = stemmed_tweets

In [None]:
# Preview the first ten stemmed tweets.
data['Tweet'].iloc[:10]

### Tokenize the words

In [None]:
# dropna() returns a new frame rather than modifying `data`, so the result
# must be assigned for rows with missing values to actually be removed.
data = data.dropna()

# Vocabulary size: only the `max_features` most frequent words are kept.
max_features = 2000
tokenizer = Tokenizer(lower=False, num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['Tweet'].values)

# Convert each tweet to a sequence of word indices, then pad all sequences to
# the length of the longest one.
X = tokenizer.texts_to_sequences(data['Tweet'].values)
X = pad_sequences(X)

X[:3]

## Training
### Initialize LSTM network

In [None]:
embed_dim = 128
lstm_out = 196
# Length of the padded input sequences fed to the embedding layer.
sequence_length = X.shape[1]

# Embedding -> spatial dropout -> LSTM -> 2-way softmax.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=sequence_length),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output).
model.summary()
print(sequence_length)

### Split dataset for training and testing

In [None]:
# One-hot encode the label. An explicit numeric dtype is requested because
# recent pandas versions return boolean dummy columns by default, while the
# categorical cross-entropy targets should be numeric.
Y = pd.get_dummies(data['HS'], dtype='float32').values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

### Declare checkpoint to save the model as a file

In [None]:
# Destination file of the saved model.
model_path = 'models/LSTM_twitter_sentiment_analysis_latest.h5'

# Monitor the `accuracy` metric and, with save_best_only, keep only the best
# model seen so far.
checkpoint = ModelCheckpoint(filepath=model_path, monitor='accuracy', verbose=1, save_best_only=True)

### Start training with 15 epochs

In [None]:
# Train on the training split; the checkpoint callback writes the model file.
batch_size = 128
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=15,
    callbacks=[checkpoint],
    verbose=1,
)

### Measure score and accuracy

In [None]:
# Predicted class = index of the highest softmax output for each test row.
predict_x = model.predict(X_test)
classes_x = predict_x.argmax(axis=1)

# Side-by-side frame of the one-hot ground truth and the predicted class.
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred': classes_x})
print(df_test.head())

# Collapse the one-hot ground truth to class indices as well.
df_test['true'] = [np.argmax(one_hot) for one_hot in df_test['true']]

true_classes = df_test.true
predicted_classes = df_test.pred
print('confusion matrix', confusion_matrix(true_classes, predicted_classes))
print(classification_report(true_classes, predicted_classes))

## Testing

### Load saved model

In [None]:
# Restore the model that the checkpoint callback wrote to `model_path`.
loaded_model = load_model(filepath=model_path)

### Accept input


In [None]:
# Sample input text to classify; it is preprocessed and tokenized in the
# following cells before being passed to the loaded model.
tweet = "itu cebong ngapain demo di monas, mending tiduran dirumah"

### Run preprocessing on the input


In [None]:
# Apply the same text cleaning that the training tweets received before the
# dictionary-based steps. Without it, tokens that still carry capitals or
# punctuation (e.g. "monas,") cannot match the alay table or the stopword list.
tweet = tweet.lower()
tweet = re.sub(r'\\[tnr]|[\t\n\r]', ' ', tweet)
tweet = re.sub('[^A-Za-z]', ' ', tweet)
tweet = ' '.join(word for word in tweet.split() if len(word) > 2)

# Dictionary-based normalisation, stopword removal and stemming.
tweet = replace_alay(tweet)
tweet = remove_stopwords(tweet)
tweet = stemmer.stem(tweet)

tweet

### Tokenize inputs

In [None]:
# Encode the tweet with the tokenizer fitted on the training data.
tokenized_word = tokenizer.texts_to_sequences([tweet])

# Pad to the sequence length the loaded model expects. Reading it from the
# model avoids a hard-coded length that silently goes stale whenever the
# longest training sequence changes.
input_length = loaded_model.input_shape[1]
tokenized_word = pad_sequences(tokenized_word, maxlen=input_length, dtype='int32', value=0)

print(tokenized_word)

### Run prediction

In [None]:
# Softmax output for the single input row.
sentiment = loaded_model.predict(tokenized_word, batch_size=1)[0]

# Report the winning class together with its score.
predicted_class = np.argmax(sentiment)
if predicted_class == 0:
    print("Not a hate speech,", sentiment[0], 'sure')
elif predicted_class == 1:
    print("Hate speech,", sentiment[1], 'sure')