Install required packages

In [1]:
!pip install Sastrawi --quiet
!pip install tensorflow --quiet

Import required packages

In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re, io, json
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Indonesian Stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

Load dataset

In [3]:
data = pd.read_csv('./clean_dataset.csv')
data.dropna(subset=['Tweet'], how='all', inplace=True)
data = data[['Tweet','HS']]
data.head(10)

Unnamed: 0,Tweet,HS
0,di saat cowok usaha lacak perhati gue kamu lan...,1
1,telat beri tau kamu edan sarap gue gaul cigax ...,0
2,kadang pikir percaya tuhan jatuh kali kali kad...,0
3,tau mata sipit lihat,0
4,kaum cebong kafir sudah lihat dongok dungu haha,1
5,bani taplak dan kawan kawan,1
6,deklarasi pilih kepala daerah aman anti hoaks ...,0
7,gue saja selesai watch aldnoah zero kampret me...,0
8,admin belanja po baik nak makan ais kepal milo...,0
9,enak kalau sambil ngewe,0


Check dataset details

In [4]:
print(data['HS'].size, "Total")
print(np.sum(data['HS'] == 1), "Hate speech")
print(np.sum(data['HS'] == 0), "Non hate speech")

13116 Total
5553 Hate speech
7563 Non hate speech


## Preprocessing

### Make everything lowercase

In [5]:
data['Tweet'] = data['Tweet'].apply(lambda tweet: tweet.lower())
data['Tweet'].head(10)

0    di saat cowok usaha lacak perhati gue kamu lan...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja po baik nak makan ais kepal milo...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Remove known unwanted words

In [6]:
# Remove \n \t \r
data['Tweet'].replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex=True, inplace=True)

# Remove RT
data['Tweet'] = data['Tweet'].str.replace('rt', '')

# Remove USER
data['Tweet'] = data['Tweet'].str.replace('user', '')

# Remove URL
data['Tweet'] = data['Tweet'].str.replace('url', '')

data['Tweet'].head(10)

0    di saat cowok usaha lacak perhati gue kamu lan...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja po baik nak makan ais kepal milo...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Remove non-alphabets characters

In [7]:
data['Tweet'] = data['Tweet'].replace({'[^A-Za-z]': ' '}, regex = True)
data['Tweet'].head(10)

0    di saat cowok usaha lacak perhati gue kamu lan...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja po baik nak makan ais kepal milo...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Remove words that is less than 3 characters

In [8]:
data['Tweet'] = data['Tweet'].apply(lambda tweet: ' '.join([w for w in tweet.split() if len(w) > 2]))
print(data['Tweet'].head(10));

0    saat cowok usaha lacak perhati gue kamu lantas...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja baik nak makan ais kepal milo ai...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object


### Reformat texts

In [9]:
# Remove excess spaces
data['Tweet'] = data['Tweet'].apply(lambda tweet: ' '.join(tweet.split()))

# Trim
data['Tweet'] = data['Tweet'].str.strip()

data['Tweet'].head(10)

0    saat cowok usaha lacak perhati gue kamu lantas...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4      kaum cebong kafir sudah lihat dongok dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja baik nak makan ais kepal milo ai...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Load and replace alay words

In [10]:
alay_words = pd.read_csv('./alay.csv')
alay_words.head(10)

Unnamed: 0,alay,replacement
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali
5,aamiin,amin
6,aamiinn,amin
7,aamin,amin
8,aammiin,amin
9,abis,habis


In [11]:
def replace_alay(tweet):
    output = []
    words = tweet.split()
    for word in words:
      row = alay_words[alay_words.alay == word]
      if row.empty:
        output.append(word)
      else:
        output.append(str(row['replacement'].values[0]))

    return ' '.join(output)

data['Tweet'] = data['Tweet'].apply(lambda tweet: replace_alay(tweet))
data['Tweet'].head(10)

0    saat cowok usaha lacak perhati gue kamu lantas...
1    telat beri tau kamu edan sarap gue gaul cigax ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4       kaum cebong kafir sudah lihat dungu dungu haha
5                          bani taplak dan kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue saja selesai watch aldnoah zero kampret me...
8    admin belanja baik nak makan ais kepal milo ai...
9                              enak kalau sambil ngewe
Name: Tweet, dtype: object

### Load and remove stopwords

In [12]:
indonesian_stopwords = pd.read_csv('./stopwords.txt')
indonesian_stopwords = indonesian_stopwords.iloc[:, 0].values.tolist()
indonesian_stopwords[:10]

['adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri']

In [13]:
def remove_stopwords(tweet):
    output = []
    words = tweet.split()
    for word in words:
      if word not in indonesian_stopwords:
        output.append(word)

    return ' '.join(output)

data['Tweet'] = data['Tweet'].apply(lambda tweet: remove_stopwords(tweet))

data['Tweet'].head(10)

0    cowok usaha lacak perhati gue lantas remeh per...
1    telat tau edan sarap gue gaul cigax jifla cal ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4             kaum cebong kafir lihat dungu dungu haha
5                              bani taplak kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue selesai watch aldnoah zero kampret karakte...
8    admin belanja nak makan ais kepal milo ais kep...
9                                           enak ngewe
Name: Tweet, dtype: object

In [14]:
data['Tweet'][3]

'tau mata sipit lihat'

### Stem using Indonesian stemmer

It took quite some time, measured to be around 1 hour and 40 minutes, so be patient

In [15]:
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

data['Tweet'] = data['Tweet'].apply(lambda tweet: stemmer.stem(tweet))

In [16]:
data['Tweet'].head(10)

0    cowok usaha lacak perhati gue lantas remeh per...
1    telat tau edan sarap gue gaul cigax jifla cal ...
2    kadang pikir percaya tuhan jatuh kali kali kad...
3                                 tau mata sipit lihat
4             kaum cebong kafir lihat dungu dungu haha
5                              bani taplak kawan kawan
6    deklarasi pilih kepala daerah aman anti hoaks ...
7    gue selesai watch aldnoah zero kampret karakte...
8    admin belanja nak makan ais kepal milo ais kep...
9                                           enak ngewe
Name: Tweet, dtype: object

### Tokenize the words

In [17]:
data.dropna()

max_features = 2000
tokenizer = Tokenizer(lower=False, num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['Tweet'].values)

X = tokenizer.texts_to_sequences(data['Tweet'].values)
X = pad_sequences(X)

X[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,  283,  132,  518,    7, 1804,  518,
           7,   68,  113,  283,  175],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1694,
          34,  390,  327,    7, 1493],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,  519,   78,  119,
         162,  435,  104,  104,  519,  162,  179,    4,  413,  853, 1494,
         879,    9,   27,   29, 1592]])

### Split dataset for training and testing

In [18]:
Y = data['HS']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(10492, 38) (10492,)
(2624, 38) (2624,)


## Training
### Initialize SVM network

In [None]:
from sklearn import svm
# classify using support vector classifier
model = svm.SVC(kernel = 'linear', verbose=True, cache_size=1000, probability=True)
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)
model.score(X_test, y_test)

print(classification_report(y_test, y_pred))

[LibSVM]

## Testing

### Load saved model

In [None]:
loaded_model = load_model(model_path)

### Accept input


In [None]:
tweet = "itu cebong ngapain demo di monas, mending tiduran dirumah"

### Run preprocessing on the input


In [None]:
tweet = replace_alay(tweet)
tweet = remove_stopwords(tweet)
tweet = stemmer.stem(tweet)

tweet

### Tokenize inputs

In [None]:
tokenized_word = tokenizer.texts_to_sequences([tweet])
tokenized_word = pad_sequences(tokenized_word, maxlen=38, dtype='int32', value=0)

print(tokenized_word)

### Run prediction

In [None]:
sentiment = loaded_model.predict(tokenized_word,batch_size=1)[0]

if(np.argmax(sentiment) == 0):
    print("Not a hate speech,", sentiment[0], 'sure')
elif (np.argmax(sentiment) == 1):
    print("Hate speech,", sentiment[1], 'sure')