Dataset yang digunakan dapat didownload di: https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia atau menggunakan ***git clone*** seperti contoh dibawah ini. Folder yang di _clone_ tersimpan ke dalam folder tempat file project ini disimpan.

In [1]:
#!git clone https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia

## Install Package

**Requirement Package**:

```
1. nltk : https://www.nltk.org/
2. Sastrawi: https://github.com/sastrawi/sastrawi
3. numpy: https://numpy.org/
4. pandas: https://pandas.pydata.org/
5. sklearn: https://scikit-learn.org/stable/

```

# Import Package

In [2]:
import numpy as np
import pandas as pd
import re
import pickle
from string import punctuation
import os
import json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sklearn.model_selection import train_test_split

# ```{Utils}```

In [3]:
def process_tweet(tweet):
    
    # kumpulan stemming
    factory_stem = StemmerFactory()
    stemmer = factory_stem.create_stemmer()

    # kumpulan stopwords
    factory_stopwords = StopWordRemoverFactory()
    stopword = factory_stopwords.get_stop_words() + stopwords.words('indonesian')
  
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    # tokenizer word
    tweet_tokens = word_tokenize(tweet)
    
    # membersihkan word
    tweets_clean = [stemmer.stem(word) for word in tweet_tokens if (word not in stopword and word not in punctuation)]
  
    return tweets_clean

In [4]:
def build_freqs(tweets, ys):
    
    yslist = np.squeeze(ys).tolist()
    
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

# Processing data

### Import data

In [5]:
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/dataset_komentar_instagram_cyberbullying.csv")

In [6]:
df.head()

Unnamed: 0,Id,Sentiment,Instagram Comment Text
0,1,negative,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1,2,negative,Geblek lo tata...cowo bgt dibela2in balikan......
2,3,negative,Kmrn termewek2 skr lengket lg duhhh kok labil ...
3,4,negative,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
4,5,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [7]:
df.Sentiment.value_counts()

positive    200
negative    200
Name: Sentiment, dtype: int64

In [8]:
df['Instagram Comment Text']

0       <USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1      Geblek lo tata...cowo bgt dibela2in balikan......
2      Kmrn termewek2 skr lengket lg duhhh kok labil ...
3      Intinya kalau kesel dengan ATT nya, gausah ke ...
4      hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...
                             ...                        
395    Bangga sama suami yg selalu ingat istri disela...
396    Apaoun pekerjaannya yg penting halal u tuk men...
397    Gojek itu mayoritas pegangguran yang lama gak ...
398     <USERNAME> aslinya cantik dan ayu loh mbak kr...
399     <USERNAME> suami saya seumuran sama saya mba,...
Name: Instagram Comment Text, Length: 400, dtype: object

In [9]:
df.loc[(df.Sentiment == 'negative'),'Sentiment']=0
df.loc[(df.Sentiment == 'positive'),'Sentiment']=1

In [10]:
X = pd.DataFrame(df['Instagram Comment Text'])
y = df.Sentiment

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train = X_train.values.squeeze().tolist()
X_test = X_test.values.squeeze().tolist()
y_train = np.array([y_train.values.squeeze().tolist()])
y_test = np.array([y_test.values.squeeze().tolist()])

In [13]:
X_train[0]

'Intinya kalau kesel dengan ATT nya, gausah ke anaknya juga. Kasian buat perkembangan psikis anak kedepannya. Itu orang bener bener tolol, skrg seandainya dia punya anak, terus anaknya dikatain sama orang yang benci sama dia, gimana perasaan dia ? Benci sama seseorang boleh, tapi harus tau batesnya ?? toh namanya manusia, gaakan semua jadi penyuka, pasti ada haters ??'

### Build Freqs

Cell bisa dijalankan atau langsung saja import file `freqs.json`

In [45]:
freqs = build_freqs(X_train, y_train)
freqs

{('inti', 0): 1,
 ('kesel', 0): 1,
 ('att', 0): 8,
 ('nya', 0): 41,
 ('gausah', 0): 1,
 ('anak', 0): 22,
 ('kasi', 0): 4,
 ('kembang', 0): 1,
 ('psikis', 0): 1,
 ('depan', 0): 2,
 ('itu', 0): 2,
 ('orang', 0): 26,
 ('bener', 0): 6,
 ('tolol', 0): 4,
 ('skrg', 0): 9,
 ('anda', 0): 2,
 ('dikatain', 0): 1,
 ('benci', 0): 6,
 ('gimana', 0): 1,
 ('asa', 0): 2,
 ('tau', 0): 16,
 ('batesnya', 0): 1,
 ('nama', 0): 8,
 ('manusia', 0): 11,
 ('gaakan', 0): 1,
 ('suka', 0): 10,
 ('haters', 0): 2,
 ('username', 0): 54,
 ('yaa', 0): 4,
 ('ampun', 0): 3,
 ('upil', 0): 2,
 ('naruto', 0): 1,
 ('kata2nya', 0): 1,
 ('cermin', 0): 2,
 ('klo', 0): 8,
 ('yg', 0): 88,
 ('pecun', 0): 2,
 ('alam', 0): 2,
 ('banget', 0): 15,
 ('lu', 0): 14,
 ('kesi', 0): 1,
 ('gua', 0): 6,
 ('prihatin', 0): 1,
 ('mati', 0): 2,
 ('aj', 0): 8,
 ('deh', 0): 9,
 ('ngotorin', 0): 1,
 ('dunia', 0): 5,
 ('kau', 0): 3,
 ('yg', 1): 80,
 ('komen', 1): 3,
 ('si', 1): 6,
 ('mbak', 1): 17,
 ('ga', 1): 36,
 ('sedih', 1): 5,
 ('ketawa2', 1): 

In [46]:
# check output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 2774


In [47]:
os.makedirs(name="data", exist_ok=True)

In [48]:
with open('data/freqs_utf8.json', 'wb', encoding='utf-8') as fp:
    pickle.dump(freqs, fp)

### load data

In [23]:
with open('data/freqs.json', 'rb') as f:
    freqs = pickle.load(f)

In [24]:
freqs

{('inti', 0): 1,
 ('kesel', 0): 1,
 ('att', 0): 8,
 ('nya', 0): 41,
 ('gausah', 0): 1,
 ('anak', 0): 22,
 ('kasi', 0): 4,
 ('kembang', 0): 1,
 ('psikis', 0): 1,
 ('depan', 0): 2,
 ('itu', 0): 2,
 ('orang', 0): 26,
 ('bener', 0): 6,
 ('tolol', 0): 4,
 ('skrg', 0): 9,
 ('anda', 0): 2,
 ('dikatain', 0): 1,
 ('benci', 0): 6,
 ('gimana', 0): 1,
 ('asa', 0): 2,
 ('tau', 0): 16,
 ('batesnya', 0): 1,
 ('nama', 0): 8,
 ('manusia', 0): 11,
 ('gaakan', 0): 1,
 ('suka', 0): 10,
 ('haters', 0): 2,
 ('username', 0): 54,
 ('yaa', 0): 4,
 ('ampun', 0): 3,
 ('upil', 0): 2,
 ('naruto', 0): 1,
 ('kata2nya', 0): 1,
 ('cermin', 0): 2,
 ('klo', 0): 8,
 ('yg', 0): 88,
 ('pecun', 0): 2,
 ('alam', 0): 2,
 ('banget', 0): 15,
 ('lu', 0): 14,
 ('kesi', 0): 1,
 ('gua', 0): 6,
 ('prihatin', 0): 1,
 ('mati', 0): 2,
 ('aj', 0): 8,
 ('deh', 0): 9,
 ('ngotorin', 0): 1,
 ('dunia', 0): 5,
 ('kau', 0): 3,
 ('yg', 1): 80,
 ('komen', 1): 3,
 ('si', 1): 6,
 ('mbak', 1): 17,
 ('ga', 1): 36,
 ('sedih', 1): 5,
 ('ketawa2', 1): 

### Process tweet

In [25]:
# test proses fungsi
print('Contoh sample positive tweet: \n', X_train[1])
print('\nContoh yang sudah di proses: \n', process_tweet(X_train[1]))

Contoh sample positive tweet: 
  <USERNAME> yaa ampun ini upil naruto,, kata2nya makin mencerminkan klo dia yg sebenarnya pecun.????pengalaman banget lu yaa. Kesian. Gua jadi prihatin. Mati aj deh lu. Ngotorin dunia manusia macam kau.??

Contoh yang sudah di proses: 
 ['username', 'yaa', 'ampun', 'upil', 'naruto', 'kata2nya', 'cermin', 'klo', 'yg', 'pecun', 'alam', 'banget', 'lu', 'yaa', 'kesi', 'gua', 'prihatin', 'mati', 'aj', 'deh', 'lu', 'ngotorin', 'dunia', 'manusia', 'kau']


# Logistic Algorithm

### Sigmoid function

In [26]:
def sigmoid(z):
    
    h = 1/(1+np.exp(-z))
    
    return h

In [27]:
# Testing fungsi 
if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

SUCCESS!


### Cost function

In [28]:
def linreg_cost_func(x, y, m, h):
    
    # calculate the cost function
    J = -1/m * (np.dot(y.T, np.log(h)) + (np.dot((1-y).T, np.log(1-h))))

    return J

### Gradient descent

In [29]:
def gradientDescent(x, y, theta, alpha, num_iters):
    
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # feed forward
        z = np.dot(x, theta)
        h = sigmoid(z)

        # hitung cost
        J = linreg_cost_func(x, y, m, h)
        
        # update weight
        theta = theta - ((alpha/m) * (np.dot(x.T, h-y)))
        
    ### END CODE HERE ###
    J = float(J)
    return J, theta

In [30]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent

tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


# Extract Features

In [31]:
def extract_features(tweet, freqs):
    
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
        
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0.0),0)
        
    ### END CODE HERE ###
    assert(x.shape == (1, 3))
    return x

In [32]:
# Check your function

# test 1
# test on training data
tmp1 = extract_features(X_train[5], freqs)
print(tmp1)

[[  1. 528. 659.]]


In [33]:
# test 2:
# check for when the words are not in the freqs dictionary
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


# Training model

In [34]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
    X[i, :]= extract_features(X_train[i], freqs)
    print(f'\rExtract: {i}/{len(X_train)}', end=' ')

Extract: 319/320 

In [95]:
Y.shape

(1, 320)

In [71]:
# training labels corresponding to 
Y = np.array([y_train])

In [102]:
# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

(320, 1)


ValueError: shapes (320,1) and (320,1) not aligned: 1 (dim 1) != 320 (dim 0)