Dataset yang digunakan dapat diunduh di: https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia atau dengan menggunakan ***git clone*** seperti contoh di bawah ini. Folder hasil _clone_ akan tersimpan di dalam folder tempat file project ini disimpan.

In [None]:
# Fetch the dataset repository into the current working directory; the CSV read later lives inside this folder.
!git clone https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia

## Install Package

**Requirement Package**:

```
1. nltk : https://www.nltk.org/
2. Sastrawi: https://github.com/sastrawi/sastrawi
3. numpy: https://numpy.org/
4. pandas: https://pandas.pydata.org/
5. sklearn: https://scikit-learn.org/stable/

```

# Import Package

In [None]:
#!pip install Sastrawi
#nltk.download('stopwords')
#nltk.download('punkt')

In [1]:
import numpy as np
import pandas as pd
import re
import pickle
from string import punctuation
import os
import json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sklearn.model_selection import train_test_split

# ```{Utils}```

In [2]:
# Lazily-filled cache so the stemmer and the stopword collection are built once,
# not on every call to process_tweet.
_TWEET_TOOLS = {}


def _get_tweet_tools():
    """Return the shared (stemmer, stopword set) pair, creating it on first use."""
    if not _TWEET_TOOLS:
        _TWEET_TOOLS['stemmer'] = StemmerFactory().create_stemmer()
        # Sastrawi stopwords combined with the NLTK Indonesian list; a set gives
        # constant-time membership tests with the same result as the list.
        _TWEET_TOOLS['stopword'] = set(
            StopWordRemoverFactory().get_stop_words() + stopwords.words('indonesian')
        )
    return _TWEET_TOOLS['stemmer'], _TWEET_TOOLS['stopword']


def process_tweet(tweet):
    """Clean, tokenize, filter and stem one piece of text.

    Steps, in order:
      1. remove `$`-prefixed words, a leading "RT" marker, URLs (through the
         end of their line) and `#` characters (the hashtag word is kept);
      2. tokenize with NLTK `word_tokenize`;
      3. drop tokens found in the stopword set or inside `string.punctuation`;
      4. stem every remaining token with the Sastrawi stemmer.

    Args:
        tweet (str): raw text.

    Returns:
        list[str]: stemmed tokens, in their original order.

    Notes:
        * `@` mentions are NOT removed by this function.
        * Filtering runs on the raw token before stemming, and
          `word not in punctuation` is a substring test. Tokens such as a run
          of dots therefore pass the filter and can stem to an empty string.
        * NOTE(review): the stopword test is case-sensitive, so capitalised
          stopwords appear to pass through - confirm whether that is intended.
          Behaviour is left unchanged so tokens keep matching previously saved
          frequency dictionaries.
    """
    stemmer, stopword = _get_tweet_tools()

    # strip noise: $-prefixed words, retweet marker, hyperlinks, hash signs
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    # split the text into word tokens
    tweet_tokens = word_tokenize(tweet)

    # keep non-stopword, non-punctuation tokens and stem them
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if (word not in stopword and word not in punctuation)
    ]

    return tweets_clean

In [3]:
def build_freqs(tweets, ys):
    """Count how often each processed token occurs together with each label.

    Args:
        tweets: iterable of raw texts.
        ys: labels aligned with `tweets`; any array-like that squeezes to one
            dimension (the notebook passes an array of shape (1, n)).

    Returns:
        dict: maps `(token, label)` to the number of occurrences of `token`
        in texts carrying `label`.
    """
    # atleast_1d keeps a single label iterable: squeezing a one-element input
    # gives a 0-d array whose tolist() would be a bare scalar.
    yslist = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

# Processing data

### Import data

In [4]:
# Load the Instagram cyberbullying comments; the path is relative to the cloned dataset folder.
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/dataset_komentar_instagram_cyberbullying.csv")

In [5]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,Id,Sentiment,Instagram Comment Text
0,1,negative,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1,2,negative,Geblek lo tata...cowo bgt dibela2in balikan......
2,3,negative,Kmrn termewek2 skr lengket lg duhhh kok labil ...
3,4,negative,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
4,5,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [6]:
# Check how many comments each sentiment class has
df.Sentiment.value_counts()

positive    200
negative    200
Name: Sentiment, dtype: int64

In [7]:
# Encode the labels in place: negative -> 0, positive -> 1.
# NOTE(review): the column probably keeps its object dtype after these assignments - confirm if a numeric dtype is needed.
df.loc[(df.Sentiment == 'negative'),'Sentiment']=0
df.loc[(df.Sentiment == 'positive'),'Sentiment']=1

In [8]:
# Features: the comment text as a one-column DataFrame; target: the encoded sentiment label.
X = pd.DataFrame(df['Instagram Comment Text'])
y = df.Sentiment

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Convert the split pandas objects into plain containers:
#   X_*: list of comment strings,
#   y_*: the label list wrapped in one extra list, i.e. an array of shape (1, n_samples).
# NOTE: this cell rebinds its own inputs, so it cannot be re-run without re-running
# the split cell first (a list has no `.values`).
X_train = X_train.values.squeeze().tolist()
X_test = X_test.values.squeeze().tolist()
y_train = np.array([y_train.values.squeeze().tolist()])
y_test = np.array([y_test.values.squeeze().tolist()])

In [11]:
# Inspect the first training comment
X_train[0]

'Intinya kalau kesel dengan ATT nya, gausah ke anaknya juga. Kasian buat perkembangan psikis anak kedepannya. Itu orang bener bener tolol, skrg seandainya dia punya anak, terus anaknya dikatain sama orang yang benci sama dia, gimana perasaan dia ? Benci sama seseorang boleh, tapi harus tau batesnya ?? toh namanya manusia, gaakan semua jadi penyuka, pasti ada haters ??'

### Build Freqs

Cell bisa dijalankan atau langsung saja muat file `data/freqs_utf8.json` (lihat bagian *load data*).

In [None]:
# Load the cached frequency dictionary when it exists; otherwise build it from the
# training data and cache it. This keeps `freqs` defined on a fresh kernel
# (Restart & Run All) without recomputing it on every run.
# NOTE: the file holds a pickle despite its .json extension; only unpickle files you trust.
FREQS_PATH = 'data/freqs_utf8.json'

if os.path.exists(FREQS_PATH):
    with open(FREQS_PATH, 'rb') as fp:
        freqs = pickle.load(fp)
else:
    freqs = build_freqs(X_train, y_train)
    os.makedirs(os.path.dirname(FREQS_PATH), exist_ok=True)
    with open(FREQS_PATH, 'wb') as fp:
        pickle.dump(freqs, fp)

In [None]:
# Inspect the frequency dictionary: its type and number of (token, label) entries
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

### save data

In [None]:
# os.makedirs(name="data", exist_ok=True)

In [None]:
#with open('data/freqs_utf8.json', 'wb') as fp:
    #pickle.dump(freqs, fp)

### load data

In [12]:
# Load the saved frequency dictionary.
# NOTE: despite the .json extension the file is read with pickle, not json.
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open('data/freqs_utf8.json', 'rb') as f:
    freqs = pickle.load(f)

### Process tweet

In [13]:
# Try the preprocessing function on one training sample.
# The label is read from y_train (shape (1, n_samples)) instead of being hard-coded
# as "positive", which is wrong whenever the chosen sample is a negative comment.
sample_tweet = X_train[1]
print('Contoh sample tweet (label %s): \n' % y_train[0, 1], sample_tweet)
print('\nContoh yang sudah di proses: \n', process_tweet(sample_tweet))

Contoh sample positive tweet: 
  <USERNAME> yaa ampun ini upil naruto,, kata2nya makin mencerminkan klo dia yg sebenarnya pecun.????pengalaman banget lu yaa. Kesian. Gua jadi prihatin. Mati aj deh lu. Ngotorin dunia manusia macam kau.??

Contoh yang sudah di proses: 
 ['username', 'yaa', 'ampun', 'upil', 'naruto', 'kata2nya', 'cermin', 'klo', 'yg', 'pecun', 'alam', 'banget', 'lu', 'yaa', 'kesi', 'gua', 'prihatin', 'mati', 'aj', 'deh', 'lu', 'ngotorin', 'dunia', 'manusia', 'kau']


# Logistic Algorithm

### Sigmoid function

In [14]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)).

    Args:
        z: a scalar or a NumPy array; arrays are processed element-wise.

    Returns:
        The sigmoid of `z`, with the same shape as `z`.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [15]:
# Sanity check: the sigmoid of 0 must be exactly 0.5
print('SUCCESS!' if sigmoid(0) == 0.5 else 'Oops!')

SUCCESS!


### Cost function

In [16]:
def linreg_cost_func(x, y, m, h, eps=1e-15):
    """Binary cross-entropy (logistic regression) cost.

    Computes J = -1/m * (y^T log(h) + (1 - y)^T log(1 - h)).

    Args:
        x: unused; kept so existing callers keep working.
        y: true labels (0 or 1), shaped like `h`.
        m: number of samples used to average the cost.
        h: predicted probabilities.
        eps: predictions are clipped into [eps, 1 - eps] before taking the
            logarithm, so a prediction that saturates to exactly 0 or 1 no
            longer produces log(0) = -inf and a nan cost.

    Returns:
        The cost; a (1, 1) array when `y` and `h` are column vectors.
    """
    # keep probabilities strictly inside (0, 1) so both logarithms stay finite
    h = np.clip(h, eps, 1 - eps)

    J = -1/m * (np.dot(y.T, np.log(h)) + (np.dot((1-y).T, np.log(1-h))))

    return J

### Gradient descent

In [17]:
def gradientDescent(x, y, theta, alpha, num_iters):
    """Fit logistic-regression weights with batch gradient descent.

    Args:
        x: feature matrix with one row per sample (m = x.shape[0]).
        y: labels; the notebook passes a column vector of shape (m, 1).
        theta: initial weights; the notebook passes shape (n_features, 1).
        alpha: learning rate.
        num_iters: number of update steps.

    Returns:
        tuple: `(J, theta)` where `theta` holds the weights after the last
        update and `J` is a Python float with the cost measured at the start
        of the last iteration, i.e. before the final update. With
        `num_iters <= 0` the cost of the unchanged `theta` is returned.
    """
    m = x.shape[0]
    J = None

    for i in range(0, num_iters):

        # forward pass
        z = np.dot(x, theta)
        h = sigmoid(z)

        # cost of the current weights
        J = linreg_cost_func(x, y, m, h)

        # gradient step
        theta = theta - ((alpha/m) * (np.dot(x.T, h-y)))

        print(f'\rIterasi: {i+1}/{num_iters}', end='')

    if J is None:
        # no iteration ran: report the cost of the initial weights rather than
        # failing on an unbound variable
        J = linreg_cost_func(x, y, m, sigmoid(np.dot(x, theta)))

    # squeeze first: float() on an array with ndim > 0 is deprecated in recent NumPy
    J = float(np.squeeze(J))
    return J, theta

In [18]:
# Test the function on a small synthetic problem (fixed seed for reproducibility)
np.random.seed(1)
# 10 samples: a bias column of ones plus two random features scaled by 2000
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# binary labels: 1.0 where a uniform random draw exceeds 0.35
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)


# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"\nCost setelah training adalah {tmp_J:.8f}.")

Iterasi: 700/700
Cost setelah training adalah 0.67094970.


# Extract Features

In [19]:
def extract_features(tweet, freqs):
    """Turn one text into the feature row used by the classifier.

    Args:
        tweet (str): raw text; it is tokenized, filtered and stemmed by
            `process_tweet`.
        freqs (dict): maps `(token, label)` to a count.

    Returns:
        numpy.ndarray: shape (1, 3) holding
        `[bias = 1, sum of label-1 counts, sum of label-0 counts]`,
        summed over every token of the text (repeated tokens count again).
    """
    tokens = process_tweet(tweet)

    # column 0 is the bias term; columns 1 and 2 accumulate the counts
    features = np.zeros((1, 3))
    features[0, 0] = 1

    for token in tokens:
        # column 1 <- positive label (1), column 2 <- negative label (0)
        for column, label in ((1, 1.0), (2, 0.0)):
            features[0, column] += freqs.get((token, label), 0)

    assert features.shape == (1, 3)
    return features

In [20]:
# test fungsi
tmp1 = extract_features(X_train[5], freqs)
print(tmp1)

[[  1. 528. 659.]]


In [21]:
# Words missing from freqs contribute nothing, so only the bias term is non-zero
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


# Training model

### extract features

In [22]:
# Build the training feature matrix: one (bias, positive count, negative count) row per comment.
# NOTE: this rebinds `X`, which earlier held the raw text DataFrame.
n_train = len(X_train)
X = np.zeros((n_train, 3))
for position, comment in enumerate(X_train, start=1):
    X[position - 1, :] = extract_features(comment, freqs)
    print(f'\rExtract: {position}/{n_train}', end=' ')

Extract: 320/320 

In [23]:
# Transpose the (1, n_samples) label array into a column vector for training
Y = y_train.T

### Config

In [24]:
# Create the output folder for the saved config and weights (no error if it already exists)
os.makedirs(name="model", exist_ok=True)

In [39]:
# Training configuration passed to gradientDescent
config = {
    'input': X,                  # training feature matrix
    'output':Y,                  # training labels as a column vector
    'alpha':1e-5,                # learning rate
    'theta':np.zeros((3, 1)),    # initial weights: bias, positive count, negative count
    'num_iters':3000             # number of gradient-descent steps
}

In [40]:
# Persist the training configuration (including the training arrays) as a pickle.
# This runs before training, so the file holds the initial weights, not the trained ones.
with open('model/model3.pickle', 'wb') as fp:
    pickle.dump(config, fp)

### train model

In [41]:
# Apply gradient descent
J, theta = gradientDescent(config['input'], config['output'], config['theta'], config['alpha'], config['num_iters'])
print(f"\nCost setelah training adalah {J:.8f}.")

Iterasi: 3000/3000
Cost setelah training adalah 0.20258613.


In [42]:
# Learned weights, in feature order: bias, positive count, negative count
theta

array([[ 0.00042596],
       [ 0.06527373],
       [-0.05839513]])

### save bobot

In [43]:
# Save the trained weights as a pickle file
with open('model/theta03.wt', 'wb') as fp:
    pickle.dump(theta, fp)

# Test Logistic Regression

### load bobot

In [44]:
# Load the trained weights back from disk.
# pickle.load can execute arbitrary code, so only load weight files you trust.
with open('model/theta03.wt', 'rb') as f:
    theta = pickle.load(f)

In [45]:
# Confirm the loaded weights match the ones that were saved
theta

array([[ 0.00042596],
       [ 0.06527373],
       [-0.05839513]])

In [46]:
def predict_tweet(tweet, freqs, theta):
    """Predict the probability that a text belongs to the positive class.

    Args:
        tweet (str): raw text.
        freqs (dict): maps `(token, label)` to a count.
        theta: weight vector; the notebook uses shape (3, 1).

    Returns:
        numpy.ndarray: sigmoid of the feature row times `theta`; shape (1, 1)
        when `theta` has shape (3, 1).
    """
    features = extract_features(tweet, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

In [47]:
# Try the classifier on a few short comments.
# predict_tweet returns a one-element array; .item() extracts the Python float so
# '%f' does not rely on the deprecated implicit array-to-scalar conversion.
for tweet in ['bangsat lu', 'mantap sihh ini', 'apaan lo?', 'congrats yah k']:
    probability = predict_tweet(tweet, freqs, theta).item()
    print( '%s -> %f' % (tweet, probability))

bangsat lu -> 0.321922
mantap sihh ini -> 0.516419
apaan lo? -> 0.216825
congrats yah k -> 0.443671


In [48]:
# Feel free to check the sentiment of your own tweet below.
# The cell displays the predicted probability of the positive class.
my_tweet = 'busuk'
predict_tweet(my_tweet, freqs, theta)

array([[0.48551176]])

# Check performance

In [49]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Measure classification accuracy on a labelled set of texts.

    Args:
        test_x: list of raw texts.
        test_y: true labels; squeezed before comparison (the notebook passes
            an array of shape (1, n)).
        freqs (dict): maps `(token, label)` to a count.
        theta: trained weight vector.

    Returns:
        Fraction of texts whose thresholded prediction equals the label.
    """
    # a probability above 0.5 is classified as 1, anything else as 0
    y_hat = [1 if predict_tweet(tweet, freqs, theta) > 0.5 else 0 for tweet in test_x]

    matches = (y_hat == np.squeeze(test_y))
    return matches.sum() / len(test_x)

In [50]:
# Evaluate the trained weights on the held-out test split
tmp_accuracy = test_logistic_regression(X_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9000


# Error Analysis

In [51]:
# Turn the labels into a column vector of shape (n_samples, 1).
# reshape(-1, 1) gives the same result as transposing the (1, n_samples) array but,
# unlike `.T`, re-running this cell does not flip the orientation back.
y_test = y_test.reshape(-1, 1)

In [52]:
# Error analysis: print every test comment whose predicted class differs from its label.
# np.ravel makes the loop independent of whether y_test is a row or a column vector,
# and .item() yields plain floats so '%d' / '%f' never receive arrays.
print('Label Predicted Tweet')
for x, y in zip(X_test, np.ravel(y_test)):
    y_hat = predict_tweet(x, freqs, theta).item()
    if y != (y_hat > 0.5):
        # preprocess once and reuse the tokens for both printed lines
        tokens = process_tweet(x)
        print('tweet:', x)
        print('proses tweet:', tokens)
        print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(tokens).encode('ascii', 'ignore')))
        print('\n')

Label Predicted Tweet
tweet: Anyiennnnggg.. Suaranya ancur banget, lebih merdu tukang gorengan
proses tweet: ['anyiennnnggg', 'suara', 'ancur', 'banget', 'merdu', 'tukang', 'goreng']
0	0.84613034	b'anyiennnnggg suara ancur banget merdu tukang goreng'


tweet: Berasa kaya mau manggung kk  <USERNAME> ?? semoga lancar dan cepat punya momongan biar Ada Princess Kecil MoMo atau gak prince ??
proses tweet: ['asa', 'kaya', 'manggung', 'kk', 'username', 'moga', 'lancar', 'cepat', 'momong', 'biar', 'ada', 'princess', 'kecil', 'momo', 'gak', 'prince']
1	0.21405383	b'asa kaya manggung kk username moga lancar cepat momong biar ada princess kecil momo gak prince'


tweet: Noraaak abiiis... Baru kebeli emas az pamer aplg mainan berlian ciiinnnnn.... ??
proses tweet: ['noraaak', 'abiiis', '', 'baru', 'kebel', 'emas', 'az', 'pamer', 'aplg', 'main', 'berlian', 'ciiinnnnn', '']
0	0.80596540	b'noraaak abiiis  baru kebel emas az pamer aplg main berlian ciiinnnnn '


tweet: Hahhahaha dan sesungguhnya allah

# Predict own tweet

In [58]:
# Try a prediction on your own text: show the processed tokens, the predicted
# probability, and the class obtained with a 0.5 threshold.
my_tweet = """
apaan lo
"""
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['lo']
[[0.21682477]]
Negative sentiment
