# Import Data

In [1]:
import pandas as pd

In [2]:
filename = 'MANDAT_LABELING.xlsx'

In [3]:
dataframe = pd.read_excel(filename)
dataframe.head()

Unnamed: 0,Tweet,LABEL
0,Masih mau legal kan miras!! Lanjut aja klo ...,NEGATIVE
1,@Lelaki_5unyi @PutraWadapi jawaban dr allah at...,NEGATIVE
2,@TofaTofa_id Miras dari dulu bukan nya legal? ...,POSITIVE
3,@pamanank_p79 betul daerah legal miras. moho...,NEGATIVE
4,@jokowi Kurangi mirasnya juga pak. Meski otori...,NEGATIVE


In [4]:
dataframe.describe()

Unnamed: 0,Tweet,LABEL
count,1106,1106
unique,1094,2
top,Miras di Indonesia itu bukan dilarang tapi dia...,NEGATIVE
freq,2,934


In [5]:
tweet_positif = dataframe.loc[dataframe['LABEL'] == 'POSITIVE']
tweet_positif

Unnamed: 0,Tweet,LABEL
2,@TofaTofa_id Miras dari dulu bukan nya legal? ...,POSITIVE
9,@HansjayaHans @yoelism @IamYogas @ferrykoto @g...,POSITIVE
12,"@vinna_123 @ferrykoto Memang miras itu legal, ...",POSITIVE
15,"Soal Miras yg dibatasi Jokowi lewat Perppres, ...",POSITIVE
17,@hnurwahid @jokowi Miras legal di sana https:...,POSITIVE
...,...,...
1101,"Soal Miras yg dibatasi Jokowi lewat Perppres, ...",POSITIVE
1102,"@VIVAcoid Setuju di legalkan saja, lagian, org...",POSITIVE
1103,@hnurwahid @jokowi Miras legal di sana https:...,POSITIVE
1104,@NMoekijat lokal miras nya legal...,POSITIVE


In [6]:
tweet_negatif = dataframe.loc[dataframe['LABEL'] == 'NEGATIVE']
tweet_negatif

Unnamed: 0,Tweet,LABEL
0,Masih mau legal kan miras!! Lanjut aja klo ...,NEGATIVE
1,@Lelaki_5unyi @PutraWadapi jawaban dr allah at...,NEGATIVE
3,@pamanank_p79 betul daerah legal miras. moho...,NEGATIVE
4,@jokowi Kurangi mirasnya juga pak. Meski otori...,NEGATIVE
5,@Lelaki_5unyi epek dari legal miras bencana al...,NEGATIVE
...,...,...
1061,#MirasPangkalSejutaMaksiat #MirasIndukMaksiat...,NEGATIVE
1062,"Ya Allah, jadikanlah orang-orang yang beriman ...",NEGATIVE
1063,@GhurobaS Umat Islam dari kota Bangka Tengah h...,NEGATIVE
1064,"Jangan takut terhadap atauran rezim tirani, ta...",NEGATIVE


In [7]:
new_dataframe = pd.concat([tweet_positif.iloc[:100], tweet_negatif.iloc[:100]], ignore_index = True).sample(frac = 1).reset_index(drop = True)
new_dataframe

Unnamed: 0,Tweet,LABEL
0,"Ingat Drun, biar tidak guoblok bahwa Perpres N...",POSITIVE
1,Ketua PDIP Jatim Sepakat Perpres Investasi Mir...,POSITIVE
2,"Resmi Masuk Prolegnas, Fahira Idris: Tanpa UU,...",POSITIVE
3,Jangan biarkan miras membunuh saudara-saudara ...,NEGATIVE
4,@HansjayaHans @IamYogas @ferrykoto @ghanieierf...,NEGATIVE
...,...,...
195,@jayapuraupdate @MCAOps @lengkaanias1 #PapuaTo...,NEGATIVE
196,@Fahriza18904354 @NOTASLIMBOY Terlepas dari ha...,NEGATIVE
197,Polres Kotim Terapkan KUHP dan UU Perlindungan...,POSITIVE
198,@AisyahH25891622 @CNNIndonesia @Kiyai_MarufAmi...,POSITIVE


# Pre-process Data

In [8]:
import numpy as np
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

### Case Folding

In [9]:
# tweets = list(new_dataframe.iloc[:,0].apply(lambda x: x.lower()))
# label = list(new_dataframe.iloc[:,1].apply(lambda y: int(y == 'POSITIVE')))
tweets = list(dataframe.iloc[:,0].apply(lambda x: x.lower()))
label = list(dataframe.iloc[:,1].apply(lambda y: int(y == 'POSITIVE')))

for i in range(5):
    print('tweet: {}\nlabel: {}\n==========='.format(tweets[i], label[i]))

tweet: masih mau legal kan miras!!    lanjut aja klo msh jadi hamba dunia,   tunggu azab dari allah swt_
label: 0
tweet: @lelaki_5unyi @putrawadapi jawaban dr allah atas ke arogan an gub ntt ini dng me legal kn n mn dirikn bumd miras  #bebaskanibhrstanpasyarat  #bebaskanibhrstanpasyarat
label: 0
tweet: @tofatofa_id miras dari dulu bukan nya legal?  apa yg mau di legalkan lagi?
label: 1
tweet: @pamanank_p79 betul daerah legal miras.   mohon doa dari teman2 sekalian karena yang minum miras tidak semua..
label: 0
tweet: @jokowi kurangi mirasnya juga pak. meski otoritas disana miras legal tp kan itu wilayah msh punya allah bukan punya indonesiaðÿ¤­
label: 0


### Tokenizing & Filtering

Removed token:
- stopwords
- username
- hashtag
- URL
- number
- too long words ( >25 characters )
- too short words ( <3 characters )
- symbols

In [10]:
STOPWORDS = set(stopwords.words('indonesian'))
PUNCTUATIONS = '[,./<>?;\':\"!@#$%^&*()_+=|\\\[\]{}-]'

class DisqualifiedToken(Exception):
    pass

def _is_qualified_token(token: str):
    try:
        if token in STOPWORDS: raise DisqualifiedToken('stopwords')
        if token[:1] == '@': raise DisqualifiedToken('username')
        if token[:1] == '#': raise DisqualifiedToken('hashtag')
        if len(token) >= 4:
            if token[:4] in ['http', 'www.']: raise DisqualifiedToken('url')
        token = re.sub(PUNCTUATIONS, '', token)
        if token == '': raise DisqualifiedToken('symbol')
        if len(token) < 3: raise DisqualifiedToken('short words')
        if len(token) > 25: raise DisqualifiedToken('long words')
        if token.isdigit(): raise DisqualifiedToken('number')
    except DisqualifiedToken:
        return False
    else:
        return True

In [11]:
documents = []
for tweet in tweets:
    document = [ re.sub(PUNCTUATIONS, ' ', token) for token in str(tweet).split() if _is_qualified_token(token) ]
    documents.append(' '.join(document).split())
    
for i in range(99, 110):
    print(documents[i])

['hem']
['udah', 'kali', 'perhatiin', 'kebijakannya', 'aneh', 'aneh', 'larangan', 'pesta', 'pernikahan', 'presidennya', 'kondangan', 'legalitas', 'miras', 'protes', 'wakilnya', 'akrab', 'gimana', 'sih', 'ampe', 'komunikasi', 'kebijakan', 'nyeleneh']
['yaa', 'kalo', 'orang', 'orang', 'udah', 'dikasih', 'kepercayaan', 'mah', 'silakan', 'sekelas', 'miras', 'manfaatnya', 'aja', 'ilegal', 'ganja', 'manfaatnya', 'bisa', 'dijadikan', 'pembenaran', 'setau', 'gue', 'skrg', 'koar', 'legalit']
['legalitas', 'ganja', 'bercermin', 'miras', 'kmrnðÿ˜œ']
['legalitas', 'investasi', 'miras', 'indonesia', 'gimana', 'anda', 'via']
['zaman', 'skrg', 'sih', 'dia', 'keturunan', 'umar', 'bin', 'khattab', 'nyaðÿ¤\xadðÿ¤\xadðÿ¤\xad', 'bingung', 'umar', 'bin', 'khattab', 'rasulullah', 'miras', 'diharamkan', 'ini', 'kmaren', 'keturunan', 'umar', 'bin', 'khattab', 'teken', 'legalitas', 'miras', 'ðÿ¤\xadðÿ¤\xadðÿ¤\xadðÿ¤']
['suruh', 'gubernurmu', 'hentikan', 'kejahatan', 'pabrik', 'legalitas', 'miras']
['tdk', 'sem

In [12]:
label

[0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,


### Stemming

In [13]:
STEMMER = StemmerFactory().create_stemmer()

In [14]:
for i, document in enumerate(documents):
    documents[i] = [ STEMMER.stem(token) for token in document ]
    
for i in range(99,110):
    print(documents[i])

['hem']
['udah', 'kali', 'perhatiin', 'bijak', 'aneh', 'aneh', 'larang', 'pesta', 'nikah', 'presiden', 'kondangan', 'legalitas', 'miras', 'protes', 'wakil', 'akrab', 'gimana', 'sih', 'ampe', 'komunikasi', 'bijak', 'nyeleneh']
['yaa', 'kalo', 'orang', 'orang', 'udah', 'kasih', 'percaya', 'mah', 'sila', 'kelas', 'miras', 'manfaat', 'aja', 'ilegal', 'ganja', 'manfaat', 'bisa', 'jadi', 'benar', 'tau', 'gue', 'skrg', 'koar', 'legalit']
['legalitas', 'ganja', 'cermin', 'miras', 'kmrn']
['legalitas', 'investasi', 'miras', 'indonesia', 'gimana', 'anda', 'via']
['zaman', 'skrg', 'sih', 'dia', 'turun', 'umar', 'bin', 'khattab', 'nya', 'bingung', 'umar', 'bin', 'khattab', 'rasulullah', 'miras', 'haram', 'ini', 'kmaren', 'turun', 'umar', 'bin', 'khattab', 'teken', 'legalitas', 'miras', '']
['suruh', 'gubernur', 'henti', 'jahat', 'pabrik', 'legalitas', 'miras']
['tdk', 'semua', 'tapi', 'inti', 'lepas', 'buruk', 'krn', 'pegang', 'orang', 'orang', 'jahat', 'tdk', 'amanah', 'ingat', 'ususlan', 'putus'

In [15]:
for x in dataframe.iloc[99:110,0]:
    print(x)

Hem.... #DaruratUUMiras #DaruratUUMiras
Udah beberapa kali gw perhatiin kebijakannya aneh aneh, mulai dari larangan pesta pernikahan eh presidennya kondangan, legalitas miras eh di protes wakilnya (ga akrab ya ?, Apa gimana sih ampe ga ada komunikasi), dan beberapa kebijakan yang cukup 'Nyeleneh'
@tubirfess Yaa kalo orang orang udah bisa dikasih kepercayaan mah silakan ae, sekelas miras yang tak jelas manfaatnya aja banyak yang ilegal, apalagi kalau ganja yang jelas besar manfaatnya bisa" Malah dijadikan pembenaran. Setau gue skrg yg koar" Legalit
@tubirfess Legalitas ganja ya? bercermin ke yg miras kmrnðŸ˜Œ
Legalitas Investasi Miras di Indonesia. Gimana menurut anda?   #ForumKaskus via @KASKUS  https://t.co/XBDdG0EEbY
@____Bella_Peter Zaman skrg ada sih,dia keturunan umar bin khattab kata nyaðŸ¤­ðŸ¤­ðŸ¤­ Tp gw bingung,Umar bin khattab minta pd Rasulullah supaya miras diharamkan,ini yg kmaren katanya keturunan Umar bin khattab malah teken legalitas miras.ðŸ¤­ðŸ¤­ðŸ¤­ðŸ¤
@IndopratamaC S

### Text to Number Representation

In [16]:
vocab = set([token for document in documents for token in document])
print(vocab)

{'', 'juwaini', 'dstnya', 'gak', 'niat', 'ajjh', 'ruu', 'ilegal', 'amanah', 'bungkam', 'pea', 'tentang', 'oplos', 'asumsi', 'kumpul', 'larut', 'gampang', 'goblokkk', 'sepuh', 'belum', 'judi', 'botol', 'kes', 'pundi', 'order', 'wiih', 'musik', 'hijrah', 'allahumma', 'tanah', 'dibiarin', 'jgn2', 'kaffah', 'mayoristas', 'mop', 'amal', 'tunai', 'basa', 'lahadalia', 'teman2', 'braayy', 'hikmah', 'diijinka', 'legilitas', 'kuda', 'pki', 'syrup', 'dongo', 'lukas', 'yuliar', 'kss', 'mohamad', 'lu', 'kalang', 'undang2', 'mukmin', 'cape', 'eksploitasi', 'kojor', 'epek', 'payah', 'curi', 'sni', 'conclusi', 'seru', 'proses', 'solidaritas', 'dipraktekkn', 'elang', 'sakit', '2beer', 'buzzerp', 'pinggir', 'yusuf', 'sok', 'bijaksana', 'bani', 'rasa', 'cukai', 'khaeroni', 'iblis', 'kaji', 'daftar', 'petisi', 'dekat2', 'pic', 'gembor', 'laut tag', 'se', 'terbit', 'tum', 'republik', 'diplintir', 'gitu', 'nafsu', 'perhatiin', 'ngedribble', 'fals', 'mulai', 'susul', 'thread', 'konsen', 'bang', 'homoseksual'

In [17]:
for i, document in enumerate(documents):
    documents[i] = [ int(token in document) for token in vocab ]
    
for i in range(9,110):
    print(documents[i])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [18]:
print([ list(vocab)[i] for i, x in enumerate(documents[109]) if x == 1 ])

['bani', 'teken', 'teriak', 'dukung', 'dia', 'maling', 'legalitas', 'dprd', 'ciu', 'batal', 'dki', 'miras', 'bos', 'yah', 'nya', 'jkw', 'itu', 'nang', 'mui', 'investasi']


# Init Model

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [20]:
# n_tree = 200
random = 0
test_ratio = 0.2

In [21]:
text_classifier = MultinomialNB()

In [22]:
X_train, X_test, y_train, y_test = train_test_split(documents, label, test_size=test_ratio, random_state=random)

In [23]:
print('train data: {}\ntest data: {}'.format(len(X_train), len(X_test)))

train data: 884
test data: 222


In [24]:
y_test

[0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0]

# Train Model

In [25]:
text_classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Test Model

In [26]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [27]:
predictions = text_classifier.predict(X_test)

In [28]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[157  22]
 [ 30  13]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       179
           1       0.37      0.30      0.33        43

   micro avg       0.77      0.77      0.77       222
   macro avg       0.61      0.59      0.60       222
weighted avg       0.75      0.77      0.76       222

0.7657657657657657


In [None]:
sum(y_test)

# Visualize Result

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
xx = text_classifier.predict(documents)

In [30]:
print(confusion_matrix(label,xx))
print(classification_report(label,xx))
print(accuracy_score(label, xx))

[[908  26]
 [ 67 105]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       934
           1       0.80      0.61      0.69       172

   micro avg       0.92      0.92      0.92      1106
   macro avg       0.87      0.79      0.82      1106
weighted avg       0.91      0.92      0.91      1106

0.9159132007233273
