Dataset yang digunakan dapat diunduh di: https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia atau diperoleh dengan ***git clone*** seperti contoh di bawah ini. Folder hasil _clone_ akan tersimpan di dalam folder tempat file project ini berada.

In [1]:
#!git clone https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia

## Install Package

**Required Packages**:

```
1. nltk : https://www.nltk.org/
2. Sastrawi: https://github.com/sastrawi/sastrawi
3. numpy: https://numpy.org/
4. pandas: https://pandas.pydata.org/
5. sklearn: https://scikit-learn.org/stable/
6. jcopml: https://pypi.org/project/jcopml/
```

# Import Package

In [2]:
# One-time setup: uncomment and run if Sastrawi or the NLTK corpora are missing.
#%pip install Sastrawi
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

In [135]:
import numpy as np
import pandas as pd
import re
import pickle
from string import punctuation
import os
import json

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

# Indonesian stop-word list: Sastrawi's built-in list followed by NLTK's
# 'indonesian' list. The two lists are simply concatenated, so words present
# in both sources appear twice.
factory_stopwords = StopWordRemoverFactory()
sw_indo = [
    *factory_stopwords.get_stop_words(),
    *stopwords.words('indonesian'),
]

# Import Data

In [136]:
# Read the labelled Instagram-comment dataset (path is relative to the
# notebook's working directory) and preview the first rows.
csv_path = "data/dataset_komentar_instagram_cyberbullying.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Id,Sentiment,Instagram Comment Text
0,1,negative,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1,2,negative,Geblek lo tata...cowo bgt dibela2in balikan......
2,3,negative,Kmrn termewek2 skr lengket lg duhhh kok labil ...
3,4,negative,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
4,5,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [137]:
# Encode the sentiment labels as integers: negative -> 0, positive -> 1.
#
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on already-encoded data is a no-op. The explicit cast
# to int64 guarantees a numeric label column (assigning ints into the string
# column via .loc leaves it with object dtype, which scikit-learn classifiers
# reject as an "unknown" label type) and raises immediately if an unexpected
# label or a missing value is present.
label_to_int = {"negative": 0, "positive": 1}
df["Sentiment"] = (
    df["Sentiment"]
    .map(lambda label: label_to_int.get(label, label))
    .astype("int64")
)

In [138]:
# Preview the first five rows again to check the encoded Sentiment column.
df.head(5)

Unnamed: 0,Id,Sentiment,Instagram Comment Text
0,1,0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1,2,0,Geblek lo tata...cowo bgt dibela2in balikan......
2,3,0,Kmrn termewek2 skr lengket lg duhhh kok labil ...
3,4,0,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
4,5,0,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


# Dataset Splitting

In [90]:
# Features are the raw comment strings; the target is the Sentiment column.
X = df["Instagram Comment Text"]
y = df["Sentiment"]

# 80/20 split, stratified on the label so both parts keep the class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the shape of each part as a quick check of the split sizes.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((320,), (80,), (320,), (80,))

# Training

In [93]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

In [156]:
# Text-classification pipeline: TF-IDF over 1- to 3-grams (NLTK tokenizer,
# combined Indonesian stop-word list) feeding an SVM capped at 500 iterations.
pipeline = Pipeline(
    steps=[
        (
            'prep',
            TfidfVectorizer(
                tokenizer=word_tokenize,
                stop_words=sw_indo,
                ngram_range=(1, 3),
            ),
        ),
        ('algo', SVC(max_iter=500)),
    ]
)

# Randomized hyper-parameter search: 50 candidates drawn from jcopml's SVM
# search space, 3-fold cross-validation, all available cores, fixed seed.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rsp.svm_params,
    cv=3,
    n_iter=50,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

# Report the best parameters, then train score, best CV score and test score.
print(model.best_params_)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score, model.best_score_, test_score)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   44.0s finished


{'algo__C': 98.77700294007921, 'algo__gamma': 0.01879466824163846}
1.0 0.8187562452242404 0.875


# Sanity Check

In [218]:
# Pick the training comment whose index label is 9 (label-based lookup, so it
# must be part of the training split), lowercase it and wrap it in a list
# because the model expects an iterable of documents.
sample_comment = X_train.loc[9]
text = [sample_comment.lower()]
text

['anyiennnnggg.. suaranya ancur banget, lebih merdu tukang gorengan']

In [219]:
# Classify the single sample comment held in `text`; the result is an array
# with one encoded label (0 = negative, 1 = positive).
model.predict(text)

array([0], dtype=int64)

In [159]:
# save_model(model, "model_best_svm.pkl")

Model is pickled as model/model_best_svm.pkl


# Error Analysis

In [222]:
# Convert the held-out split to plain containers for error analysis:
#   X_test -> list of comment strings
#   y_test -> integer array of shape (1, n_samples)
# list() and np.asarray(...).reshape(1, -1) accept both the original pandas
# objects and the already-converted ones, so this cell can be re-run safely
# (calling .tolist() a second time would fail because a list has no such method).
X_test = list(X_test)
y_test = np.asarray(y_test, dtype=int).reshape(1, -1)

In [223]:
# Error analysis: print every held-out comment that the tuned model
# misclassifies, as "<true label>\t<predicted label>\t<comment>".
#
# The loop reads the test split directly (X_test / y_test). np.ravel flattens
# the labels, so it works whether y_test is still a 1-D Series/array or has
# been reshaped to (1, n_samples). Dedicated loop-variable names are used so
# the global feature/label variables `X` and `y` are not overwritten.
comments = [comment.lower() for comment in X_test]
truths = np.ravel(y_test)

# Predict all comments in one batch instead of one call per comment, then
# reduce the predictions to 0/1 flags (positive label -> 1).
predicted_flags = (model.predict(comments) > 0).astype(int)

print('Truth Predicted Tweet')
for comment, truth, predicted in zip(comments, truths, predicted_flags):
    if truth != predicted:
        # Non-ASCII characters (e.g. emoji) are dropped before printing.
        print('%d\t%0.2f\t%s' % (truth, predicted, comment.encode('ascii', 'ignore')))

Truth Predicted Tweet
0	1.00	b'jelek,lecek,bantet ??????'
0	1.00	b'semoga pelakor2 kena karma dan semoga dapat karma yg meninggalkan istrinya yg bwrjuang dg dia dr nol tp setelah sukses selingkuh dan sok jd penguasa ke istri ya..'
1	0.00	b'kasian anaknya,, jgn sampek anaknya rusak jugak kek emaknya yaa'
1	0.00	b'beruntungnya bella.. orang2 jadi fokus ama komen lakinya. coba kalo ngga, pasti model bajunya yang diceramahin ama netizen ??'
0	1.00	b'yg gk becus tuh loh jedun...hidup nya cuma bisanya ngerusak kbhgiaan org lain .mau hidup serba mewah tp g mau krja mlh morotin laki org..karma psti akn mnghampirimu????????'
0	1.00	b'geblek lo tata...cowo bgt dibela2in balikan...hadeww...ntar ditinggal lg nyalahin tuh cowo...padahal kitenya yg oon.'
1	0.00	b'yg komen kenapa si mbak ini ga sedih malah ketawa2.. ya iyalah klo kita dah disakiti sm pasangan smpe berlarut2 ngapain juga sih kita sedih ampe depresi segala.. serahin sm allah aja..bawa happy aja mski sakit.. lagian si mba senyum2 di waj