Install required packages

In [19]:
!pip install Sastrawi --quiet
!pip install tensorflow --quiet

Import required packages

In [20]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re, io, json
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Indonesian Stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

Load dataset

In [21]:
data = pd.read_csv('./utf8_dataset.csv')
data.dropna(subset=['Tweet'], how='all', inplace=True)
data = data[['Tweet','HS']]
data.head(10)

Unnamed: 0,Tweet,HS
0,- disaat semua cowok berusaha melacak perhatia...,1
1,RT USER: USER siapa yang telat ngasih tau elu?...,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1
5,USER Ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...,1
6,deklarasi pilkada 2018 aman dan anti hoax warg...,0
7,Gue baru aja kelar re-watch Aldnoah Zero!!! pa...,0
8,Nah admin belanja satu lagi port terbaik nak m...,0
9,USER Enak lg klo smbil ngewe',0


Check dataset details

In [61]:
print(data['HS'].size, "Total")
print(np.sum(data['HS'] == 1), "Hate speech")
print(np.sum(data['HS'] == 0), "Non hate speech")

13116 Total
5553 Hate speech
7563 Non hate speech


## Preprocessing

### Make everything lowercase

In [25]:
data['Tweet'] = data['Tweet'].apply(lambda tweet: tweet.lower())
data['Tweet'].head(10)

0    di saat cowok usaha lacak perhati gue kamu lan...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja po baik nak makan ais kepal milo...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Remove known unwanted words

In [26]:
# Remove \n \t \r
data['Tweet'].replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex=True, inplace=True)

# Remove RT
data['Tweet'] = data['Tweet'].str.replace('rt', '')

# Remove USER
data['Tweet'] = data['Tweet'].str.replace('user', '')

# Remove URL
data['Tweet'] = data['Tweet'].str.replace('url', '')

data['Tweet'].head(10)

0    di saat cowok usaha lacak perhati gue kamu lan...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja po baik nak makan ais kepal milo...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Remove non-alphabets characters

In [27]:
data['Tweet'] = data['Tweet'].replace({'[^A-Za-z]': ' '}, regex = True)
data['Tweet'].head(10)

0    di saat cowok usaha lacak perhati gue kamu lan...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja po baik nak makan ais kepal milo...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Remove words that is less than 3 characters

In [28]:
data['Tweet'] = data['Tweet'].apply(lambda tweet: ' '.join([w for w in tweet.split() if len(w) > 2]))
print(data['Tweet'].head(10));

0    saat cowok usaha lacak perhati gue kamu lantas...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja baik nak makan ais kepal milo ai...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object


### Reformat texts

In [29]:
# Remove excess spaces
data['Tweet'] = data['Tweet'].apply(lambda tweet: ' '.join(tweet.split()))

# Trim
data['Tweet'] = data['Tweet'].str.strip()

data['Tweet'].head(10)

0    saat cowok usaha lacak perhati gue kamu lantas...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja baik nak makan ais kepal milo ai...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Load and replace alay words

In [30]:
alay_words = pd.read_csv('alay.csv')
alay_words.head(10)

Unnamed: 0,alay,replacement
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali
5,aamiin,amin
6,aamiinn,amin
7,aamin,amin
8,aammiin,amin
9,abis,habis


In [31]:
def replace_alay(tweet):
    output = []
    words = tweet.split()
    for word in words:
      row = alay_words[alay_words.alay == word]
      if row.empty:
        output.append(word)
      else:
        output.append(str(row['replacement'].values[0]))

    return ' '.join(output)

data['Tweet'] = data['Tweet'].apply(lambda tweet: replace_alay(tweet))
data['Tweet'].head(10)

0    saat cowok usaha lacak perhati gue kamu lantas...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4       kaum cebong kafir sudah lihat dungu dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja baik nak makan ais kepal milo ai...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Load and remove stopwords

In [32]:
indonesian_stopwords = pd.read_csv('stopwords.txt', sep="\n")
indonesian_stopwords = indonesian_stopwords.iloc[:, 0].values.tolist()
indonesian_stopwords[:10]

['adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri']

In [33]:
def remove_stopwords(tweet):
    output = []
    words = tweet.split()
    for word in words:
      if word not in indonesian_stopwords:
        output.append(word)

    return ' '.join(output)

data['Tweet'] = data['Tweet'].apply(lambda tweet: remove_stopwords(tweet))

data['Tweet'].head(10)

0    cowok usaha lacak perhati gue lantas remeh per...
1    telat tau edan sarap gue gaul cigax jifla cal ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4             kaum cebong kafir lihat dungu dungu haha
5                              bani taplak kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue selesai watch aldnoah zero kampret karakte...
8    admin belanja nak makan ais kepal milo ais kep...
9                                           enak ngewe
Name: Tweet, dtype: object

In [34]:
data['Tweet'][3]

'tau mata sipit lihat'

### Stem using Indonesian stemmer

It took quite some time, measured to be around 1 hour and 40 minutes, so be patient

In [48]:
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

data['Tweet'] = data['Tweet'].apply(lambda tweet: stemmer.stem(tweet))

In [36]:
data['Tweet'].head(10)

0    cowok usaha lacak perhati gue lantas remeh per...
1    telat tau edan sarap gue gaul cigax jifla cal ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4             kaum cebong kafir lihat dungu dungu haha
5                              bani taplak kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue selesai watch aldnoah zero kampret karakte...
8    admin belanja nak makan ais kepal milo ais kep...
9                                           enak ngewe
Name: Tweet, dtype: object

### Tokenize the words

In [37]:
data.dropna()

max_features = 2000
tokenizer = Tokenizer(lower=False, num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['Tweet'].values)

X = tokenizer.texts_to_sequences(data['Tweet'].values)
X = pad_sequences(X)

X[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,  283,  132,  518,    7, 1806,  518,
           7,   68,  113,  283,  175],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1696,
          34,  389,  327,    7, 1495],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,  519,   78,  119,
         162,  435,  104,  104,  519,  162,  179,    4,  413,  853, 1496,
         880,    9,   27,   29, 1594]])

## Training
### Initialize LSTM network

In [40]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())
print(X.shape[1])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 38, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 38, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None
38


### Split dataset for training and testing

In [41]:
Y = pd.get_dummies(data['HS']).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(10492, 38) (10492, 2)
(2624, 38) (2624, 2)


### Declare checkpoint to save the model as a file

In [42]:
model_path = 'models/LSTM_twitter_sentiment_analysis.h5'
checkpoint = ModelCheckpoint(
    model_path,
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

### Start training with 15 epoch

In [43]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1, callbacks=[checkpoint])

Epoch 1/15
Epoch 00001: accuracy improved from -inf to 0.71426, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epoch 2/15
Epoch 00002: accuracy improved from 0.71426 to 0.84140, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epoch 3/15
Epoch 00003: accuracy improved from 0.84140 to 0.86094, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epoch 4/15
Epoch 00004: accuracy improved from 0.86094 to 0.87019, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epoch 5/15
Epoch 00005: accuracy improved from 0.87019 to 0.88124, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epoch 6/15
Epoch 00006: accuracy improved from 0.88124 to 0.88944, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epoch 7/15
Epoch 00007: accuracy improved from 0.88944 to 0.89792, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epoch 8/15
Epoch 00008: accuracy improved from 0.89792 to 0.90278, saving model to models\LSTM_twitter_sentiment_analysis.h5
Epo

<keras.callbacks.History at 0x1f891683be0>

### Measure score and accuracy

In [44]:
predict_x = model.predict(X_test)
classes_x = np.argmax(predict_x, axis=1)

df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred': classes_x})
print(df_test.head())

df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))

print('confusion matrix', confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

     true  pred
0  [0, 1]     0
1  [1, 0]     0
2  [0, 1]     1
3  [0, 1]     0
4  [1, 0]     0
confusion matrix [[1229  278]
 [ 229  888]]
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1507
           1       0.76      0.79      0.78      1117

    accuracy                           0.81      2624
   macro avg       0.80      0.81      0.80      2624
weighted avg       0.81      0.81      0.81      2624



## Testing

### Load saved model

In [45]:
loaded_model = load_model(model_path)

### Accept input


In [46]:
tweet = "jokowi presiden goblog cebong anjing"

### Run preprocessing on the input


In [49]:
tweet = replace_alay(tweet)
tweet = remove_stopwords(tweet)
tweet = stemmer.stem(tweet)

tweet

'jokowi presiden goblok cebong anjing'

### Tokenize inputs

In [59]:
tokenized_word = tokenizer.texts_to_sequences([tweet])
tokenized_word = pad_sequences(tokenized_word, maxlen=38, dtype='int32', value=0)

print(tokenized_word)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  5  2 69 20 48]]


### Run prediction

In [60]:
sentiment = loaded_model.predict(tokenized_word,batch_size=1)[0]

if(np.argmax(sentiment) == 0):
    print("Not a hate speech,", sentiment[0], 'sure')
elif (np.argmax(sentiment) == 1):
    print("Hate speech,", sentiment[1], 'sure')

Hate speech, 0.9927407 sure
