Install required packages

In [None]:
!pip install Sastrawi --quiet
!pip install tensorflow --quiet

Import required packages

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re, io, json
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Indonesian Stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

Load dataset

In [None]:
# Download the labelled tweets, discard rows without tweet text, and keep only
# the text column ('Tweet') and the hate-speech label column ('HS').
data = (
    pd.read_csv('https://raw.githubusercontent.com/kawishbit/svm-hate-speech-id/main/utf8_dataset.csv')
    .dropna(subset=['Tweet'], how='all')
    [['Tweet', 'HS']]
)
data.head(10)

Unnamed: 0,Tweet,HS
0,- disaat semua cowok berusaha melacak perhatia...,1
1,RT USER: USER siapa yang telat ngasih tau elu?...,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1
5,USER Ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...,1
6,deklarasi pilkada 2018 aman dan anti hoax warg...,0
7,Gue baru aja kelar re-watch Aldnoah Zero!!! pa...,0
8,Nah admin belanja satu lagi port terbaik nak m...,0
9,USER Enak lg klo smbil ngewe',0


Check dataset details

In [None]:
# Report the dataset size and how many rows carry each label (1 = hate speech).
print(f"{data['HS'].size} Total")
print(f"{(data['HS'] == 1).sum()} Hate speech")
print(f"{(data['HS'] == 0).sum()} Non hate speech")

13169 Total
5561 Hate speech
7608 Non hate speech


## Preprocessing

### Make everything lowercase

In [None]:
# Lower-case every tweet so that later matching is case-insensitive.
data['Tweet'] = data['Tweet'].map(str.lower)
data['Tweet'].head(10)

0    - disaat semua cowok berusaha melacak perhatia...
1    rt user: user siapa yang telat ngasih tau elu?...
2    41. kadang aku berfikir, kenapa aku tetap perc...
3    user user aku itu aku\n\nku tau matamu sipit t...
4    user user kaum cebong kapir udah keliatan dong...
5    user ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...
6    deklarasi pilkada 2018 aman dan anti hoax warg...
7    gue baru aja kelar re-watch aldnoah zero!!! pa...
8    nah admin belanja satu lagi port terbaik nak m...
9                        user enak lg klo smbil ngewe'
Name: Tweet, dtype: object

### Remove known unwanted words

In [None]:
# The result is assigned back to the column instead of calling an in-place
# `replace` on `data['Tweet']`, which is chained assignment and is not
# guaranteed to update `data`.
data['Tweet'] = (
    data['Tweet']
    # Escaped (\t, \n, \r written out as two characters) and real
    # tab / newline / carriage-return characters all become a single space.
    .str.replace(r"\\t|\\n|\\r|\t|\n|\r", " ", regex=True)
    # Drop the RT / USER / URL markers. The tweets were lower-cased in the
    # previous step, so the patterns are lower-case. The \b word boundaries
    # make sure only stand-alone tokens are removed; a plain substring
    # replace also mangles ordinary words (e.g. "port" -> "po").
    .str.replace(r"\b(?:rt|user|url)\b", "", regex=True)
)

data['Tweet'].head(10)

0    - disaat semua cowok berusaha melacak perhatia...
1     :  siapa yang telat ngasih tau elu?edan sarap...
2    41. kadang aku berfikir, kenapa aku tetap perc...
3      aku itu aku  ku tau matamu sipit tapi diliat...
4      kaum cebong kapir udah keliatan dongoknya da...
5     ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x9f\x...
6    deklarasi pilkada 2018 aman dan anti hoax warg...
7    gue baru aja kelar re-watch aldnoah zero!!! pa...
8    nah admin belanja satu lagi po terbaik nak mak...
9                             enak lg klo smbil ngewe'
Name: Tweet, dtype: object

### Remove non-alphabetic characters

In [None]:
# Every character that is not an ASCII letter (digits, punctuation, leftover
# escape fragments, ...) is turned into a space.
data['Tweet'] = data['Tweet'].replace(to_replace=r'[^A-Za-z]', value=' ', regex=True)
data['Tweet'].head(10)

0      disaat semua cowok berusaha melacak perhatia...
1        siapa yang telat ngasih tau elu edan sarap...
2        kadang aku berfikir  kenapa aku tetap perc...
3      aku itu aku  ku tau matamu sipit tapi diliat...
4      kaum cebong kapir udah keliatan dongoknya da...
5     ya bani taplak dkk  xf  x f x   x   xf  x f x...
6    deklarasi pilkada      aman dan anti hoax warg...
7    gue baru aja kelar re watch aldnoah zero    pa...
8    nah admin belanja satu lagi po terbaik nak mak...
9                             enak lg klo smbil ngewe 
Name: Tweet, dtype: object

### Remove words that are shorter than 3 characters

In [None]:
# Keep only the words that are at least three characters long.
data['Tweet'] = data['Tweet'].map(
    lambda tweet: ' '.join(filter(lambda word: len(word) >= 3, tweet.split()))
)
print(data['Tweet'].head(10))

0    disaat semua cowok berusaha melacak perhatian ...
1    siapa yang telat ngasih tau elu edan sarap gue...
2    kadang aku berfikir kenapa aku tetap percaya p...
3    aku itu aku tau matamu sipit tapi diliat dari ...
4    kaum cebong kapir udah keliatan dongoknya dari...
5                                      bani taplak dkk
6    deklarasi pilkada aman dan anti hoax warga duk...
7    gue baru aja kelar watch aldnoah zero paling k...
8    nah admin belanja satu lagi terbaik nak makan ...
9                                 enak klo smbil ngewe
Name: Tweet, dtype: object


### Reformat texts

In [None]:
# Collapse runs of whitespace into single spaces, then trim both ends.
data['Tweet'] = (
    data['Tweet']
    .map(lambda tweet: ' '.join(tweet.split()))
    .str.strip()
)

data['Tweet'].head(10)

0    disaat semua cowok berusaha melacak perhatian ...
1    siapa yang telat ngasih tau elu edan sarap gue...
2    kadang aku berfikir kenapa aku tetap percaya p...
3    aku itu aku tau matamu sipit tapi diliat dari ...
4    kaum cebong kapir udah keliatan dongoknya dari...
5                                      bani taplak dkk
6    deklarasi pilkada aman dan anti hoax warga duk...
7    gue baru aja kelar watch aldnoah zero paling k...
8    nah admin belanja satu lagi terbaik nak makan ...
9                                 enak klo smbil ngewe
Name: Tweet, dtype: object

### Load and replace alay words

In [None]:
# Table that maps informal ("alay") spellings to their standard replacement.
alay_words = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/kawishbit/svm-hate-speech-id/main/alay.csv'
)
alay_words.head(n=10)

Unnamed: 0,alay,replacement
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali
5,aamiin,amin
6,aamiinn,amin
7,aamin,amin
8,aammiin,amin
9,abis,habis


In [None]:
# Single-slot cache: the frame the lookup was built from, and the lookup itself.
_alay_lookup_cache = {'frame': None, 'lookup': None}


def _alay_lookup():
    """Return a dict mapping each alay word to its replacement, built from `alay_words`."""
    # Rebuild only when `alay_words` has been rebound to a different frame.
    if _alay_lookup_cache['frame'] is not alay_words:
        # keep='first' so a duplicated alay word resolves to its first row,
        # which is what a row-by-row search would return.
        first_matches = alay_words.drop_duplicates(subset='alay', keep='first')
        _alay_lookup_cache['lookup'] = dict(
            zip(first_matches['alay'], first_matches['replacement'])
        )
        _alay_lookup_cache['frame'] = alay_words
    return _alay_lookup_cache['lookup']


def replace_alay(tweet, lookup=None):
    """Replace every informal ("alay") word of a tweet with its standard form.

    The tweet is split on whitespace; each word that has an entry in the
    lookup is substituted by the (stringified) replacement, every other word
    is kept unchanged. The words are joined back with single spaces.

    Args:
        tweet: The text to normalise.
        lookup: Optional mapping ``alay word -> replacement``. When omitted,
            a mapping built once from the global `alay_words` frame (columns
            'alay' and 'replacement') is used, so each word costs one dict
            lookup instead of a scan over the whole frame.

    Returns:
        The normalised text.
    """
    if lookup is None:
        lookup = _alay_lookup()

    return ' '.join(str(lookup.get(word, word)) for word in tweet.split())

# Normalise informal spellings in every tweet.
data['Tweet'] = data['Tweet'].map(replace_alay)
data['Tweet'].head(10)

0    di saat semua cowok berusaha melacak perhatian...
1    siapa yang telat memberi tau kamu edan sarap g...
2    kadang aku berpikir kenapa aku tetap percaya p...
3    aku itu aku tau matamu sipit tapi dilihat dari...
4    kaum cebong kafir sudah kelihatan dongoknya da...
5                          bani taplak dan kawan kawan
6    deklarasi pilihan kepala daerah aman dan anti ...
7    gue baru saja selesai watch aldnoah zero palin...
8    nah admin belanja satu lagi terbaik nak makan ...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Load and remove stopwords

In [None]:
# The file is a plain list with one stopword per line.
# - sep="\n" is rejected by recent pandas versions, so a tab separator is used
#   instead (assumes no line contains a tab, so each line stays one field).
# - header=None keeps the first line as data; with the default header handling
#   the first stopword is silently consumed as the column name.
#   NOTE(review): assumes the file has no header row - confirm.
# - dtype=str / keep_default_na=False stop pandas from turning words such as
#   "NA" or "null" into missing values.
indonesian_stopwords = pd.read_csv(
    'https://raw.githubusercontent.com/kawishbit/svm-hate-speech-id/main/stopwords.txt',
    sep='\t',
    header=None,
    names=['stopword'],
    dtype=str,
    keep_default_na=False,
)
indonesian_stopwords = indonesian_stopwords['stopword'].tolist()
indonesian_stopwords[:10]

['adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri']

In [None]:
def remove_stopwords(tweet, stopwords=None):
    """Remove stopwords from a tweet.

    The tweet is split on whitespace, every word found in the stopword
    collection is dropped, and the remaining words are joined back with
    single spaces.

    Args:
        tweet: The text to filter.
        stopwords: Optional iterable of words to remove. When omitted, the
            global `indonesian_stopwords` list is used.

    Returns:
        The text without stopwords (an empty string if nothing is left).
    """
    if stopwords is None:
        stopwords = indonesian_stopwords

    # Membership tests on a set are O(1); scanning the list for every word
    # costs a full pass over the stopword list each time.
    stopword_set = set(stopwords)

    return ' '.join(word for word in tweet.split() if word not in stopword_set)

# Strip the stopwords from every tweet.
data['Tweet'] = data['Tweet'].map(remove_stopwords)

data['Tweet'].head(10)

0    cowok berusaha melacak perhatian gue lantas re...
1    telat tau edan sarap gue bergaul cigax jifla c...
2    kadang berpikir percaya tuhan jatuh berkali ka...
3                                     tau matamu sipit
4               kaum cebong kafir dongoknya dungu haha
5                              bani taplak kawan kawan
6    deklarasi pilihan kepala daerah aman anti hoak...
7    gue selesai watch aldnoah zero kampret karakte...
8    admin belanja terbaik nak makan ais kepal milo...
9                                           enak ngewe
Name: Tweet, dtype: object

In [None]:
# Spot-check a single cleaned tweet (the row with index label 3).
data.loc[3, 'Tweet']

'tau matamu sipit'

### Stem using Indonesian stemmer

This step takes quite some time (measured at around 1 hour and 40 minutes), so be patient.

In [None]:
# Create the Sastrawi stemmer once; the same instance is reused for every
# tweet here and for the sample input in the testing section.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Stem each tweet as a whole string.
data['Tweet'] = data['Tweet'].map(stemmer.stem)

In [None]:
# Preview the first ten stemmed tweets.
data['Tweet'].iloc[:10]

0    cowok usaha lacak perhati gue lantas remeh per...
1    telat tau edan sarap gue gaul cigax jifla cal ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                       tau mata sipit
4                  kaum cebong kafir dongok dungu haha
5                              bani taplak kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue selesai watch aldnoah zero kampret karakte...
8    admin belanja baik nak makan ais kepal milo ai...
9                                           enak ngewe
Name: Tweet, dtype: object

### Tokenize the words

In [None]:
# `dropna` returns a new frame; without the assignment the call has no effect.
data = data.dropna()

# Size of the vocabulary handed to the tokenizer (and, later, the embedding).
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, lower=False, split=' ')
tokenizer.fit_on_texts(data['Tweet'].values)

# Convert each tweet into a list of word indices, then pad all of them to a
# common length (no `maxlen` is given, so the length is derived from the data).
X = tokenizer.texts_to_sequences(data['Tweet'].values)
X = pad_sequences(X)

X[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,  298,  133,  541,    7,
        1911,  541,    7,   67,  115,  298,  178],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 1796,   34,  407,  344,    7, 1589],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  542,
          76,  120,  165,  457,  105,  105,  542,  165,  182,    4,  434,
         894, 1590,  924,    9,   27,   29, 1691]], dtype=int32)

## Training
### Initialize LSTM network

In [None]:
embed_dim = 128   # size of each word embedding vector
lstm_out = 196    # number of LSTM units

# Embedding -> spatial dropout -> LSTM -> 2-way softmax
# (one output per class: not hate speech / hate speech).
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Sequence length the model expects; inputs at prediction time must be padded
# to this same length.
print(X.shape[1])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 40, 128)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None
40


### Split dataset for training and testing

In [None]:
# One-hot encode the label column, then hold out 20% of the rows for testing.
# The fixed random_state keeps the split reproducible.
Y = pd.get_dummies(data['HS']).to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(10535, 40) (10535, 2)
(2634, 40) (2634, 2)


### Declare checkpoint to save the model as a file

In [None]:
# File the best model seen so far is written to after each epoch.
model_path = 'models/LSTM_twitter_sentiment_analysis_latest.h5'

# NOTE(review): 'accuracy' is the accuracy on the training data, so "best"
# here means best training fit - confirm this is intended rather than a
# validation metric.
checkpoint = ModelCheckpoint(
    filepath=model_path,
    monitor='accuracy',
    verbose=1,
    save_best_only=True,
)

### Start training with 15 epochs

In [None]:
# Train for 15 epochs; the checkpoint callback saves the model whenever the
# monitored metric improves.
batch_size = 128
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=15,
    verbose=1,
    callbacks=[checkpoint],
)

Epoch 1/15
Epoch 1: accuracy improved from -inf to 0.71096, saving model to models/LSTM_twitter_sentiment_analysis_latest.h5
Epoch 2/15
Epoch 2: accuracy improved from 0.71096 to 0.83711, saving model to models/LSTM_twitter_sentiment_analysis_latest.h5
Epoch 3/15
Epoch 3: accuracy improved from 0.83711 to 0.86198, saving model to models/LSTM_twitter_sentiment_analysis_latest.h5
Epoch 4/15
Epoch 4: accuracy improved from 0.86198 to 0.87451, saving model to models/LSTM_twitter_sentiment_analysis_latest.h5
Epoch 5/15
Epoch 5: accuracy improved from 0.87451 to 0.88135, saving model to models/LSTM_twitter_sentiment_analysis_latest.h5
Epoch 6/15
Epoch 6: accuracy improved from 0.88135 to 0.88790, saving model to models/LSTM_twitter_sentiment_analysis_latest.h5
Epoch 7/15
Epoch 7: accuracy improved from 0.88790 to 0.89853, saving model to models/LSTM_twitter_sentiment_analysis_latest.h5
Epoch 8/15
Epoch 8: accuracy improved from 0.89853 to 0.90717, saving model to models/LSTM_twitter_sentimen

<keras.callbacks.History at 0x7fe4d78ea070>

### Measure score and accuracy

In [None]:
# Class probabilities for the held-out tweets; the predicted class is the
# index of the larger probability.
predict_x = model.predict(X_test)
classes_x = predict_x.argmax(axis=1)

# 'true' still holds the one-hot rows at this point, 'pred' the class index.
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred': classes_x})
print(df_test.head())

# Turn the one-hot rows into class indices so both columns are comparable.
df_test['true'] = df_test['true'].map(np.argmax)

print('confusion matrix', confusion_matrix(df_test['true'], df_test['pred']))
print(classification_report(df_test['true'], df_test['pred']))

     true  pred
0  [0, 1]     1
1  [0, 1]     0
2  [1, 0]     0
3  [1, 0]     0
4  [1, 0]     0
confusion matrix [[1303  213]
 [ 287  831]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84      1516
           1       0.80      0.74      0.77      1118

    accuracy                           0.81      2634
   macro avg       0.81      0.80      0.80      2634
weighted avg       0.81      0.81      0.81      2634



## Testing

### Load saved model

In [None]:
# Restore the model from the file written by the checkpoint callback.
loaded_model = load_model(filepath=model_path)

### Accept input


In [None]:
# Sample input to classify; replace it with any text.
tweet = "itu cebong ngapain demo di monas, mending tiduran dirumah"

### Run preprocessing on the input


In [None]:
# Apply the same clean-up the training tweets went through before the
# dictionary-based steps; otherwise upper-case letters, punctuation attached
# to a word (e.g. "monas,") or very short words reach the alay / stopword
# lookups in a form the training data never contained.
tweet = tweet.lower()
tweet = re.sub(r"\\t|\\n|\\r|\t|\n|\r", " ", tweet)      # escaped and real tab/newline/CR
tweet = re.sub(r"\b(?:rt|user|url)\b", "", tweet)        # stand-alone RT / USER / URL markers
tweet = re.sub(r"[^A-Za-z]", " ", tweet)                 # anything that is not a letter
tweet = ' '.join(word for word in tweet.split() if len(word) > 2)  # drop words shorter than 3 chars

tweet = replace_alay(tweet)
tweet = remove_stopwords(tweet)
tweet = stemmer.stem(tweet)

tweet

'cebong demo monas mending tidur rumah'

### Tokenize inputs

In [None]:
tokenized_word = tokenizer.texts_to_sequences([tweet])

# Pad to the sequence length the model was built with instead of a hard-coded
# number: the network expects 40 tokens per input (see the model summary), and
# a different length makes predict() fail with a ValueError.
tokenized_word = pad_sequences(
    tokenized_word,
    maxlen=loaded_model.input_shape[1],
    dtype='int32',
    value=0,
)

print(tokenized_word)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  20 287 476
  439 150]]


### Run prediction

In [None]:
# Probabilities for the single input: index 0 = not hate speech, index 1 = hate speech.
sentiment = loaded_model.predict(tokenized_word, batch_size=1)[0]

# Report the more likely class together with its probability.
if np.argmax(sentiment) == 1:
    print("Hate speech,", sentiment[1], 'sure')
elif np.argmax(sentiment) == 0:
    print("Not a hate speech,", sentiment[0], 'sure')

ValueError: ignored