In [1]:
!git clone https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia

Cloning into 'Dataset-Sentimen-Analisis-Bahasa-Indonesia'...
remote: Enumerating objects: 169, done.[K
remote: Counting objects: 100% (46/46), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 169 (delta 24), reused 0 (delta 0), pack-reused 123[K
Receiving objects: 100% (169/169), 164.89 KiB | 20.61 MiB/s, done.
Resolving deltas: 100% (80/80), done.


## Install Package

In [2]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l[K     |█▋                              | 10 kB 23.4 MB/s eta 0:00:01[K     |███▏                            | 20 kB 28.1 MB/s eta 0:00:01[K     |████▊                           | 30 kB 30.5 MB/s eta 0:00:01[K     |██████▎                         | 40 kB 32.6 MB/s eta 0:00:01[K     |███████▉                        | 51 kB 32.5 MB/s eta 0:00:01[K     |█████████▍                      | 61 kB 32.9 MB/s eta 0:00:01[K     |███████████                     | 71 kB 33.4 MB/s eta 0:00:01[K     |████████████▌                   | 81 kB 34.8 MB/s eta 0:00:01[K     |██████████████                  | 92 kB 36.1 MB/s eta 0:00:01[K     |███████████████▋                | 102 kB 36.9 MB/s eta 0:00:01[K     |█████████████████▏              | 112 kB 36.9 MB/s eta 0:00:01[K     |██████████████████▊             | 122 kB 36.9 MB/s eta 0:00:01[K     |████████████████████▎           | 133 kB 36.9 MB/s

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Import Package

In [5]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import word_tokenize
from string import punctuation
import re
from sklearn.model_selection import train_test_split
import pickle

# Utils

In [6]:
def process_tweet(tweet):

  # kumpulan stemming
  factory_stem = StemmerFactory()
  stemmer = factory_stem.create_stemmer()

  # kumpulan stopwords
  factory_stopwords = StopWordRemoverFactory()
  stopword = factory_stopwords.get_stop_words() + stopwords.words('indonesian')
  
  # remove stock market tickers like $GE
  tweet = re.sub(r'\$\w*', '', tweet)
  # remove old style retweet text "RT"
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  # remove hyperlinks
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
  # remove hashtags
  # only removing the hash # sign from the word
  tweet = re.sub(r'#', '', tweet)

  tweet_tokens = word_tokenize(tweet)

  tweets_clean = [stemmer.stem(word) for word in tweet_tokens if (word not in stopword and word not in punctuation)]
  
  return tweets_clean

In [7]:
def build_freqs(tweets, ys):

  yslist = np.squeeze(ys).tolist()

  freqs = {}
  for y, tweet in zip(yslist, tweets):
    for word in process_tweet(tweet):
      pair = (word, y)
      if pair in freqs:
        freqs[pair] += 1
      else:
        freqs[pair] = 1
  
  return freqs

# Processing data

## Import data

In [8]:
df = pd.read_csv("Dataset-Sentimen-Analisis-Bahasa-Indonesia/dataset_komentar_instagram_cyberbullying.csv")

In [9]:
df.head()

Unnamed: 0,Id,Sentiment,Instagram Comment Text
0,1,negative,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1,2,negative,Geblek lo tata...cowo bgt dibela2in balikan......
2,3,negative,Kmrn termewek2 skr lengket lg duhhh kok labil ...
3,4,negative,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
4,5,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [10]:
df.Sentiment.value_counts()

positive    200
negative    200
Name: Sentiment, dtype: int64

In [11]:
df['Instagram Comment Text']

0       <USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1      Geblek lo tata...cowo bgt dibela2in balikan......
2      Kmrn termewek2 skr lengket lg duhhh kok labil ...
3      Intinya kalau kesel dengan ATT nya, gausah ke ...
4      hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...
                             ...                        
395    Bangga sama suami yg selalu ingat istri disela...
396    Apaoun pekerjaannya yg penting halal u tuk men...
397    Gojek itu mayoritas pegangguran yang lama gak ...
398     <USERNAME> aslinya cantik dan ayu loh mbak kr...
399     <USERNAME> suami saya seumuran sama saya mba,...
Name: Instagram Comment Text, Length: 400, dtype: object

In [12]:
df.loc[(df.Sentiment == 'negative'),'Sentiment']=0
df.loc[(df.Sentiment == 'positive'),'Sentiment']=1

In [13]:
X = pd.DataFrame(df['Instagram Comment Text'])
y = df.Sentiment

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train = X_train.values.squeeze().tolist()
X_test = X_test.values.squeeze().tolist()
y_train = y_train.values.squeeze().tolist()
y_test = y_test.values.squeeze().tolist()

## Build Freqs

In [16]:
freqs = build_freqs(X_train, y_train)
freqs

{('inti', 0): 1,
 ('kesel', 0): 1,
 ('att', 0): 8,
 ('nya', 0): 41,
 ('gausah', 0): 1,
 ('anak', 0): 22,
 ('kasi', 0): 4,
 ('kembang', 0): 1,
 ('psikis', 0): 1,
 ('depan', 0): 2,
 ('itu', 0): 2,
 ('orang', 0): 26,
 ('bener', 0): 6,
 ('tolol', 0): 4,
 ('skrg', 0): 9,
 ('anda', 0): 2,
 ('dikatain', 0): 1,
 ('benci', 0): 6,
 ('gimana', 0): 1,
 ('asa', 0): 2,
 ('tau', 0): 16,
 ('batesnya', 0): 1,
 ('nama', 0): 8,
 ('manusia', 0): 11,
 ('gaakan', 0): 1,
 ('suka', 0): 10,
 ('haters', 0): 2,
 ('username', 0): 54,
 ('yaa', 0): 4,
 ('ampun', 0): 3,
 ('upil', 0): 2,
 ('naruto', 0): 1,
 ('kata2nya', 0): 1,
 ('cermin', 0): 2,
 ('klo', 0): 8,
 ('yg', 0): 88,
 ('pecun', 0): 2,
 ('alam', 0): 2,
 ('banget', 0): 15,
 ('lu', 0): 14,
 ('kesi', 0): 1,
 ('gua', 0): 6,
 ('prihatin', 0): 1,
 ('mati', 0): 2,
 ('aj', 0): 8,
 ('deh', 0): 9,
 ('ngotorin', 0): 1,
 ('dunia', 0): 5,
 ('kau', 0): 3,
 ('yg', 1): 80,
 ('komen', 1): 3,
 ('si', 1): 6,
 ('mbak', 1): 17,
 ('ga', 1): 36,
 ('sedih', 1): 5,
 ('ketawa2', 1): 

In [17]:
with open('freqs.json', 'wb') as fp:
    pickle.dump(freqs, fp)

# Logistic Algorithm

### Sigmoid function

In [18]:
def sigmoid(z):

  h = 1/(1+np.exp(-z))

  return h

### Cost function

In [36]:
def linreg_cost_func(x, y, m, h):

  # calculate the cost function
  J = -1/m * (np.dot(y.T, np.log(h)) + (np.dot((1-y).T, np.log(1-h))))

  return J

### Gradient descent

In [37]:
def gradientDescent(x, y, theta, alpha, num_iters):

    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # feed forward
        z = np.dot(x, theta)
        h = sigmoid(z)

        # hitung cost
        J = linreg_cost_func(x, y, m, h)
        
        # update weight
        theta = theta - ((alpha/m) * (np.dot(x.T, h-y)))
        
    ### END CODE HERE ###
    J = float(J)
    return J, theta

# Extract Features

In [21]:
def extract_features(tweet, freqs):
    
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
        
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0.0),0)
        
    ### END CODE HERE ###
    assert(x.shape == (1, 3))
    return x

# Training model

In [45]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
    X[i, :]= extract_features(X_train[i], freqs)

# training labels corresponding to 
Y = np.array(y_train)

KeyboardInterrupt: ignored

In [39]:
# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

AttributeError: ignored