# Import Package

In [1]:
#!pip install Sastrawi
#!pip install nltk
#import nltk 
#nltk.download('stopwords')
#nltk.download('punkt')

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l[K     |█▋                              | 10 kB 24.7 MB/s eta 0:00:01[K     |███▏                            | 20 kB 27.6 MB/s eta 0:00:01[K     |████▊                           | 30 kB 12.2 MB/s eta 0:00:01[K     |██████▎                         | 40 kB 9.5 MB/s eta 0:00:01[K     |███████▉                        | 51 kB 5.2 MB/s eta 0:00:01[K     |█████████▍                      | 61 kB 5.6 MB/s eta 0:00:01[K     |███████████                     | 71 kB 6.0 MB/s eta 0:00:01[K     |████████████▌                   | 81 kB 6.8 MB/s eta 0:00:01[K     |██████████████                  | 92 kB 6.5 MB/s eta 0:00:01[K     |███████████████▋                | 102 kB 5.4 MB/s eta 0:00:01[K     |█████████████████▏              | 112 kB 5.4 MB/s eta 0:00:01[K     |██████████████████▊             | 122 kB 5.4 MB/s eta 0:00:01[K     |████████████████████▎           | 133 kB 5.4 MB/s eta 0:00:

True

In [23]:
import numpy as np
import pandas as pd
import re
import pickle
from string import punctuation
import os
import json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sklearn.model_selection import train_test_split

# Utils



In [3]:
def process_tweet(tweet):
    
    # kumpulan stemming
    factory_stem = StemmerFactory()
    stemmer = factory_stem.create_stemmer()

    # kumpulan stopwords
    factory_stopwords = StopWordRemoverFactory()
    stopword = factory_stopwords.get_stop_words() + stopwords.words('indonesian')
  
    # menghapus kata-kata yang tidak penting seperti @, #
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    
    # tokenizer word
    tweet_tokens = word_tokenize(tweet)
    
    # membersihkan word
    tweets_clean = [stemmer.stem(word) for word in tweet_tokens if (word not in stopword and word not in punctuation)]
  
    return tweets_clean

In [4]:
def build_freqs(tweets, ys):
    
    yslist = np.squeeze(ys).tolist()
    
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

# Processing Data

## Import Data

In [40]:
df = pd.read_csv("data/sentiment-news-bahasa-v3.csv")
df.head()

Unnamed: 0,label,text
0,Positive,Apa yang penting adalah membetulkan semula ali...
1,Neutral,ROS mesti merujuk perlembagaan
2,Negative,"Akibatnya, DAP memperoleh kejayaan yang besar ..."
3,Negative,"Memang ada kegagalan dalam syarikat ini, kelem..."
4,Neutral,Mungkin pertumbuhan dalam KDNK (Keluaran Dalam...


In [41]:
df.label.value_counts()

Positive    2109
Negative     993
Neutral      746
negative       1
Name: label, dtype: int64

In [42]:
df = df.drop(df[df.label == 'negative'].index)
df = df.drop(df[df.label == 'Neutral'].index)

In [43]:
df.label.value_counts(normalize=True)

Positive    0.679884
Negative    0.320116
Name: label, dtype: float64

In [44]:
df.label.value_counts()

Positive    2109
Negative     993
Name: label, dtype: int64

## Split Dataset

In [45]:
df.loc[(df.label == 'Positive'),'label']=0
df.loc[(df.label == 'Negative'),'label']=1

In [46]:
X = pd.DataFrame(df['text'])
y = df.label

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
X_train = X_train.values.squeeze().tolist()
X_test = X_test.values.squeeze().tolist()
y_train = np.array([y_train.values.squeeze().tolist()])
y_test = np.array([y_test.values.squeeze().tolist()])

In [49]:
X_train[0]

'Instrumennya ataupun vehicle antara yang paling penting adalah PNB sebab itulah hari ini bila kita sambut 40 tahun PNB, kita juga sambut kejayaan dasar-dasar induk kerajaan untuk tentukan masyarakat yang lebih adil dan saksama (kekayaan) dan kemudian untuk melihat kepada sebuah masyarakat lebih kukuh dan bersatu padu'

## Build freqs

In [22]:
# freqs = build_freqs(X_train, y_train)

In [25]:
# with open('freqs_news.json', 'wb') as fp:
    # pickle.dump(freqs, fp)

In [50]:
# check output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 6401


## Process Tweet

In [51]:
# test proses fungsi
print('Contoh sample positive tweet: \n', X_train[1])
print('\nContoh yang sudah di proses: \n', process_tweet(X_train[1]))

Contoh sample positive tweet: 
 Kesedaran dan pelaporan orang awam melalui saluran yang sedia ada boleh mencegah kejadian ini daripada berulang serta keselamatan dan nyawa seorang kanak-kanak yang mempunyai masa depan tidak disia-siakan melalui perbuatan yang tidak bertanggungjawab

Contoh yang sudah di proses: 
 ['dar', 'lapor', 'orang', 'awam', 'salur', 'sedia', 'cegah', 'jadi', 'ulang', 'selamat', 'nyawa', 'kanak-kanak', 'sia', 'buat', 'bertanggungjawab']


# Logistic Algorithm

## Sigmoid Function

In [52]:
def sigmoid(z):
    
    h = 1/(1+np.exp(-z))
    
    return h

In [53]:
# Testing fungsi 
if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

SUCCESS!


## Cost Function

In [54]:
def linreg_cost_func(x, y, m, h):
    
    # calculate fungsi cost
    J = -1/m * (np.dot(y.T, np.log(h)) + (np.dot((1-y).T, np.log(1-h))))

    return J

## Gradient Descent

In [55]:
def gradientDescent(x, y, theta, alpha, num_iters):
    
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # feed forward
        z = np.dot(x, theta)
        h = sigmoid(z)

        # hitung cost
        J = linreg_cost_func(x, y, m, h)
        
        # update weight
        theta = theta - ((alpha/m) * (np.dot(x.T, h-y)))
        
        print(f'\rIterasi: {i+1}/{num_iters}', end='')

    J = float(J)
    return J, theta

In [56]:
# testing fungsi
np.random.seed(1)
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)


# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 2000)
print(f"\nCost setelah training adalah {tmp_J:.8f}.")

Iterasi: 2000/2000
Cost setelah training adalah 0.65476423.


# Extract Features

In [57]:
def extract_features(tweet, freqs):
    
    # process_tweet tokenizes, stems, dan hapus stopwords
    word_l = process_tweet(tweet)
    
    # inisiasi x dengan nilai 0 dalam bentuk matriks 1x3 
    x = np.zeros((1, 3)) 
    
    # inisiasi bias dengan 1
    x[0,0] = 1 
        
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1.0),0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0.0),0)
        
    assert(x.shape == (1, 3))
    return x

In [58]:
# test fungsi
tmp1 = extract_features(X_train[5], freqs)
print(tmp1)

[[  1. 166. 332.]]


In [60]:
tmp2 = extract_features('Semua maklumat akan disalurkan bukan hanya kepada Aseanapol (Polis Asean) tetapi kepada Interpol (Polis Antarabangsa) supaya senarai suspek atau sebarang data yang diperoleh dapat menarik perhatian antarabangsa', freqs)
print(tmp2)

[[  1.  72. 242.]]


# Training Model

## Extract Features

In [61]:
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
    X[i, :]= extract_features(X_train[i], freqs)
    print(f'\rExtract: {i+1}/{len(X_train)}', end=' ')

Extract: 2481/2481 

In [62]:
Y = y_train.T

In [63]:
config = {
    'input': X,
    'output':Y,
    'alpha':1e-8,
    'theta':np.zeros((3, 1)),
    'num_iters':2000
}

In [65]:
with open('model01.pickle', 'wb') as fp:
    pickle.dump(config, fp)

## Train model

In [66]:
# Apply gradient descent
J, theta = gradientDescent(config['input'], config['output'], config['theta'], config['alpha'], config['num_iters'])
print(f"\nCost setelah training adalah {J:.8f}.")

Iterasi: 2000/2000
Cost setelah training adalah 0.64063183.


In [67]:
theta

array([[-2.86649639e-06],
       [-2.50912085e-04],
       [-9.71441190e-04]])

In [68]:
with open('theta01t.wt', 'wb') as fp:
    pickle.dump(theta, fp)

# Test Logistic Regression

In [69]:
def predict_tweet(tweet, freqs, theta):

    # extract features dari tweet dan simpan di dalam x
    x = extract_features(tweet, freqs)
    
    # buat prediksi menggunakan x dan bobot (theta)
    y_pred = sigmoid(np.dot(x, theta))
    
    return y_pred

In [71]:
# Run this cell to test your function
for tweet in ['Tidak adil jika kita hanya menjalankan']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

Tidak adil jika kita hanya menjalankan -> 0.475902


# Check Perfomance

In [72]:
def test_logistic_regression(test_x, test_y, freqs, theta):

    
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)

    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)
    
    return accuracy

In [73]:
tmp_accuracy = test_logistic_regression(X_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.6473


# Predict on Tweet

In [78]:
X_test[5]

'semakin rendah jumlah pengundi melaksanakan tanggungjawab mereka untuk mengundi, semakin tidak jelas jumlah majoriti'

In [79]:
# coba predik
my_tweet = """
semakin rendah jumlah pengundi melaksanakan tanggungjawab mereka untuk mengundi, 
semakin tidak jelas jumlah majoriti
"""
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['rendah', 'undi', 'laksana', 'tanggungjawab', 'undi', 'majoriti']
[[0.4295413]]
Negative sentiment
