# Import Data

In [1]:
import pandas as pd

In [2]:
filename = 'MANDAT_LABELING.xlsx'

In [3]:
dataframe = pd.read_excel(filename)
dataframe.head()

Unnamed: 0,Tweet,LABEL
0,Masih mau legal kan miras!! Lanjut aja klo ...,NEGATIVE
1,@Lelaki_5unyi @PutraWadapi jawaban dr allah at...,NEGATIVE
2,@TofaTofa_id Miras dari dulu bukan nya legal? ...,POSITIVE
3,@pamanank_p79 betul daerah legal miras. moho...,NEGATIVE
4,@jokowi Kurangi mirasnya juga pak. Meski otori...,NEGATIVE


In [4]:
dataframe.describe()

Unnamed: 0,Tweet,LABEL
count,1106,1106
unique,1094,2
top,@HansjayaHans @IamYogas @ferrykoto @ghanieierf...,NEGATIVE
freq,2,934


In [5]:
tweet_positif = dataframe.loc[dataframe['LABEL'] == 'POSITIVE']
tweet_positif

Unnamed: 0,Tweet,LABEL
2,@TofaTofa_id Miras dari dulu bukan nya legal? ...,POSITIVE
9,@HansjayaHans @yoelism @IamYogas @ferrykoto @g...,POSITIVE
12,"@vinna_123 @ferrykoto Memang miras itu legal, ...",POSITIVE
15,"Soal Miras yg dibatasi Jokowi lewat Perppres, ...",POSITIVE
17,@hnurwahid @jokowi Miras legal di sana https:...,POSITIVE
...,...,...
1101,"Soal Miras yg dibatasi Jokowi lewat Perppres, ...",POSITIVE
1102,"@VIVAcoid Setuju di legalkan saja, lagian, org...",POSITIVE
1103,@hnurwahid @jokowi Miras legal di sana https:...,POSITIVE
1104,@NMoekijat lokal miras nya legal...,POSITIVE


In [6]:
tweet_negatif = dataframe.loc[dataframe['LABEL'] == 'NEGATIVE']
tweet_negatif

Unnamed: 0,Tweet,LABEL
0,Masih mau legal kan miras!! Lanjut aja klo ...,NEGATIVE
1,@Lelaki_5unyi @PutraWadapi jawaban dr allah at...,NEGATIVE
3,@pamanank_p79 betul daerah legal miras. moho...,NEGATIVE
4,@jokowi Kurangi mirasnya juga pak. Meski otori...,NEGATIVE
5,@Lelaki_5unyi epek dari legal miras bencana al...,NEGATIVE
...,...,...
1061,#MirasPangkalSejutaMaksiat #MirasIndukMaksiat...,NEGATIVE
1062,"Ya Allah, jadikanlah orang-orang yang beriman ...",NEGATIVE
1063,@GhurobaS Umat Islam dari kota Bangka Tengah h...,NEGATIVE
1064,"Jangan takut terhadap atauran rezim tirani, ta...",NEGATIVE


In [7]:
new_dataframe = pd.concat([tweet_positif.iloc[:100], tweet_negatif.iloc[:100]], ignore_index = True).sample(frac = 1).reset_index(drop = True)
new_dataframe

Unnamed: 0,Tweet,LABEL
0,",ðŸ˜‚ðŸ˜‚ðŸ˜‚ Hallo pak brilian....emang ada P...",POSITIVE
1,yo #PapuaTolakInvestasiMiras,NEGATIVE
2,@Dennysiregar7 Kepala Suku Pegunungan Papua Ny...,NEGATIVE
3,Koar2 agar MIRAS di Legal scepatnya datang mus...,NEGATIVE
4,Rt #MinahasaTolakInvetasiMiras #PapuaTolakInve...,NEGATIVE
...,...,...
195,Mungkin negara sedang kolaps butuh duit akibat...,POSITIVE
196,Kepala Suku Pegunungan Papua Nyatakan Perang T...,NEGATIVE
197,Miras udah legal dan industrinya udh banyak in...,POSITIVE
198,Masih mau legal kan miras!! Lanjut aja klo ...,NEGATIVE


# Pre-process Data

In [8]:
import numpy as np
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

### Case Folding

In [9]:
# tweets = list(new_dataframe.iloc[:,0].apply(lambda x: x.lower()))
# label = list(new_dataframe.iloc[:,1].apply(lambda y: int(y == 'POSITIVE')))
tweets = list(dataframe.iloc[:,0].apply(lambda x: x.lower()))
label = list(dataframe.iloc[:,1].apply(lambda y: int(y == 'POSITIVE')))

for i in range(5):
    print('tweet: {}\nlabel: {}\n==========='.format(tweets[i], label[i]))

tweet: masih mau legal kan miras!!    lanjut aja klo msh jadi hamba dunia,   tunggu azab dari allah swt_
label: 0
tweet: @lelaki_5unyi @putrawadapi jawaban dr allah atas ke arogan an gub ntt ini dng me legal kn n mn dirikn bumd miras  #bebaskanibhrstanpasyarat  #bebaskanibhrstanpasyarat
label: 0
tweet: @tofatofa_id miras dari dulu bukan nya legal?  apa yg mau di legalkan lagi?
label: 1
tweet: @pamanank_p79 betul daerah legal miras.   mohon doa dari teman2 sekalian karena yang minum miras tidak semua..
label: 0
tweet: @jokowi kurangi mirasnya juga pak. meski otoritas disana miras legal tp kan itu wilayah msh punya allah bukan punya indonesiaðÿ¤­
label: 0


### Tokenizing & Filtering

Removed token:
- stopwords
- username
- hashtag
- URL
- number
- too long words ( >25 characters )
- too short words ( <3 characters )
- symbols

In [10]:
STOPWORDS = set(stopwords.words('indonesian'))
PUNCTUATIONS = '[,./<>?;\':\"!@#$%^&*()_+=|\\\[\]{}-]'

class DisqualifiedToken(Exception):
    pass

def _is_qualified_token(token: str):
    try:
        if token in STOPWORDS: raise DisqualifiedToken('stopwords')
        if token[:1] == '@': raise DisqualifiedToken('username')
        if token[:1] == '#': raise DisqualifiedToken('hashtag')
        if len(token) >= 4:
            if token[:4] in ['http', 'www.']: raise DisqualifiedToken('url')
        token = re.sub(PUNCTUATIONS, '', token)
        if token == '': raise DisqualifiedToken('symbol')
        if len(token) < 3: raise DisqualifiedToken('short words')
        if len(token) > 25: raise DisqualifiedToken('long words')
        if token.isdigit(): raise DisqualifiedToken('number')
    except DisqualifiedToken:
        return False
    else:
        return True

In [11]:
documents = []
for tweet in tweets:
    document = [ re.sub(PUNCTUATIONS, ' ', token) for token in str(tweet).split() if _is_qualified_token(token) ]
    documents.append(' '.join(document).split())
    
for i in range(99, 110):
    print(documents[i])

['hem']
['udah', 'kali', 'perhatiin', 'kebijakannya', 'aneh', 'aneh', 'larangan', 'pesta', 'pernikahan', 'presidennya', 'kondangan', 'legalitas', 'miras', 'protes', 'wakilnya', 'akrab', 'gimana', 'sih', 'ampe', 'komunikasi', 'kebijakan', 'nyeleneh']
['yaa', 'kalo', 'orang', 'orang', 'udah', 'dikasih', 'kepercayaan', 'mah', 'silakan', 'sekelas', 'miras', 'manfaatnya', 'aja', 'ilegal', 'ganja', 'manfaatnya', 'bisa', 'dijadikan', 'pembenaran', 'setau', 'gue', 'skrg', 'koar', 'legalit']
['legalitas', 'ganja', 'bercermin', 'miras', 'kmrnðÿ˜œ']
['legalitas', 'investasi', 'miras', 'indonesia', 'gimana', 'anda', 'via']
['zaman', 'skrg', 'sih', 'dia', 'keturunan', 'umar', 'bin', 'khattab', 'nyaðÿ¤\xadðÿ¤\xadðÿ¤\xad', 'bingung', 'umar', 'bin', 'khattab', 'rasulullah', 'miras', 'diharamkan', 'ini', 'kmaren', 'keturunan', 'umar', 'bin', 'khattab', 'teken', 'legalitas', 'miras', 'ðÿ¤\xadðÿ¤\xadðÿ¤\xadðÿ¤']
['suruh', 'gubernurmu', 'hentikan', 'kejahatan', 'pabrik', 'legalitas', 'miras']
['tdk', 'sem

In [None]:
label

### Stemming

In [12]:
STEMMER = StemmerFactory().create_stemmer()

In [49]:
for i, document in enumerate(documents):
    documents[i] = [ STEMMER.stem(str(token)) for token in document ]
    
for i in range(99,110):
    print(documents[i])

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',

### Text to Number Representation

In [14]:
vocab = set([token for document in documents for token in document])
print(vocab)

{'koar2', '', 'permendagri', 'bacot', 'jamsoatek', 'alhamdulilah', 'dumai', 'tinggi', 'sambut', 'alias', 'jiwasraya', 'hikmah', 'via', 'jati', 'hbs an', 'dongo', 'klik', 'umkm', 'konsumsi', 'moralitas', 'dpd', 'apa2', 'searching', 'kutip', 'cuan', 'kusumardani', 'marak', 'lokal', 'ring', 'berpa', 'scr', 'mahrus', 'kaji', 'distribusi', 'naikin', 'fitnah', 'wahyudi', 'apakh', 'tipu2', 'model', 'istilah', 'dng', 'honda', 'konsumen', 'koh', 'milih', 'terus', 'alih', 'elu', 'kakak', 'order', 'nyimak', 'halu', 'pembeci', 'prolegnas', 'taut', 'makelar', 'maksiattt', 'mafia2', 'control', 'mana', 'maksud', 'anggur', 'lahadalia', 'is', 'pan', 'nulis', 'sampean', 'bbrp', 'musibah', 'ngamuk', 'disupport', 'ingkar', 'suryo', 'bela', 'dahulu', 'tokoh2nya', 'nih', 'ceramah', 'babak', 'bkpm', 'kiyai', 'heboh', 'yusuf', 'caption', 'perda', 'milu', 'pancasilais', 'sejarah', 'penguasaha', 'surga', 'a', 'kalap', 'bin', 'ephesias', 'hahaha', 'utk', 'ber', 'wni', 'eksekusi', 'batal', 'jangan2', 'berhu', 'uc

In [15]:
for i, document in enumerate(documents):
    documents[i] = [ int(token in document) for token in vocab ]
    
for i in range(9,110):
    print(documents[i])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
print([ list(vocab)[i] for i, x in enumerate(documents[109]) if x == 1 ])

# Init Model

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [36]:
del text_classifier

In [37]:
n_tree = 200
random = 0
test_ratio = 0.2

In [38]:
text_classifier = RandomForestClassifier(n_estimators=n_tree, random_state=random, verbose = 3)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(documents, label, test_size=test_ratio, random_state=random)

In [31]:
print('train data: {}\ntest data: {}'.format(len(X_train), len(X_test)))

train data: 884
test data: 222


In [21]:
y_test

[0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0]

# Train Model

In [40]:
text_classifier.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 200
building tree 2 of 200
building tree 3 of 200
building tree 4 of 200
building tree 5 of 200
building tree 6 of 200
building tree 7 of 200
building tree 8 of 200
building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200
building tree 34 of 200
building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
b

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    6.9s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=3,
                       warm_start=False)

# Test Model

In [43]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [44]:
predictions = text_classifier.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.0s finished


In [45]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[178   1]
 [ 34   9]]
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       179
           1       0.90      0.21      0.34        43

    accuracy                           0.84       222
   macro avg       0.87      0.60      0.63       222
weighted avg       0.85      0.84      0.80       222

0.8423423423423423


In [None]:
sum(y_test)

In [46]:
xx = text_classifier.predict(documents)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.2s finished


In [47]:
print(confusion_matrix(label,xx))
print(classification_report(label,xx))
print(accuracy_score(label, xx))

[[933   1]
 [ 42 130]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       934
           1       0.99      0.76      0.86       172

    accuracy                           0.96      1106
   macro avg       0.97      0.88      0.92      1106
weighted avg       0.96      0.96      0.96      1106

0.961121157323689


# Visualize Result

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns