In [1]:
# bag of words

## Preparation
Load TSV from IndoNLP's Github

In [2]:
URL = {
    'TRAIN' : 'https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv',
    'VALID' : 'https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv',
    'TEST' : 'https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/test_preprocess.tsv',
    'TEST_MASKED' : 'https://raw.githubusercontent.com/IndoNLP/indonlu/refs/heads/master/dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'


}

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df_train = pd.read_csv(URL['TRAIN'], sep='\t', header=None)
df_valid = pd.read_csv(URL['VALID'], sep='\t', header=None)
df_test = pd.read_csv(URL['TEST'], sep='\t', header=None)

In [4]:
df_train.head()

Unnamed: 0,0,1
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


## Early preprocessing

In [5]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [6]:
import string
import re
# using sastrawi stemmer for Bahasa
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def preprocess(text : string):
    # lowering the text
    text = text.lower()

    # stemming using sastrawi ->
    """
    The process is very slow, so I skip this step.
    Assumed that the prosa dataset is already stemmed.
    """
    # factory = StemmerFactory()
    # stemmer = factory.create_stemmer()
    # text = stemmer.stem(text)

     # removing limited punctuation, including !, (), {}, [], ?, dan :
    text = re.sub(r"[,!\'\?:\(\){}\[\]]", '', text)

    return text

# Processing pipeline for Dataset
For each set (either training, valid, or test), create the pipeline for change it into np.array format (Bag of Words)

In [7]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [8]:
#
def train_preprocessing(df : pd.DataFrame) -> np.array:
    X_train = df[0].apply(preprocess)
    y_train = df[1]

    pipeline = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('label_encoder', LabelEncoder())
    ])

    # Fit and transform the data
    X_transformed = pipeline['vectorizer'].fit_transform(X_train).toarray() # Bag of Words matrix
    X_labels = pipeline['vectorizer'].get_feature_names_out()
    y_transformed = pipeline['label_encoder'].fit_transform(y_train)

    return X_transformed, X_labels, y_transformed, pipeline


In [9]:
# Valid preprocessing
def valid_preprocessing(df: pd.DataFrame, pipeline: Pipeline):
    X_valid = df[0].apply(preprocess)
    Y_valid = df[1]

    # Transform the validation data using the trained vectorizer
    X_valid_transformed = pipeline['vectorizer'].transform(X_valid).toarray() # Bag of Words matrix
    Y_valid_transformed = pipeline['label_encoder'].transform(Y_valid)

    return X_valid_transformed, Y_valid_transformed

In [10]:
X_train, X_labels, y_train, pipeline = train_preprocessing(df_train)


In [11]:
X_train.shape

(11000, 17272)

In [12]:
X_valid, y_valid = valid_preprocessing(df_valid, pipeline)

In [13]:
X_valid.shape

(1260, 17272)

In [14]:
y_valid

array([1, 0, 2, ..., 0, 0, 2])

In [15]:
df_valid[1]

Unnamed: 0,1
0,neutral
1,negative
2,positive
3,positive
4,negative
...,...
1255,negative
1256,negative
1257,negative
1258,negative


#### Rule of Labeling
- negative : 0
- neutral : 1
- positive : 2

# Modeling

In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
# create description of model performance
def model_performance(y_true, y_pred):
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print("F1 Score: ", f1_score(y_true, y_pred, average='weighted'))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

### Naive Bayes (GaussianNB)

In [17]:
# Gaussian NB
from sklearn.naive_bayes import GaussianNB


In [18]:
# Create model
model = GaussianNB()
# Fit model
model.fit(X_train, y_train)

In [19]:
# Evaluate X_valid
y_pred = model.predict(X_valid)
model_performance(y_valid, y_pred)

Accuracy:  0.6515873015873016
F1 Score:  0.6604198809782014
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.72      0.60       394
           1       0.46      0.50      0.48       131
           2       0.83      0.64      0.72       735

    accuracy                           0.65      1260
   macro avg       0.60      0.62      0.60      1260
weighted avg       0.70      0.65      0.66      1260

Confusion Matrix:
 [[285  39  70]
 [ 41  66  24]
 [227  38 470]]


In [20]:
# from sklearn.decomposition import PCA
# # dimensional reduction up to 500 dimension (for SVC classifier)
# def PCA_pipeline(X_train, X_test):
#     pca = PCA(n_components=500)
#     X_train_pca = pca.fit_transform(X_train)
#     X_test_pca = pca.transform(X_test)
#     return X_train_pca, X_test_pca


In [21]:
# X_train_pca, X_valid_pca = PCA_pipeline(X_train, X_valid)

In [22]:
# SVC, too long
# from sklearn.svm import SVC
# model = SVC()
# model.fit(X_train_pca, y_train)

### SVM(Support Vector Machine) with Linear Kernel

In [23]:
# Linear SVC
from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train, y_train)

In [24]:
# evaluate X_valid
y_pred = model.predict(X_valid)
model_performance(y_valid, y_pred)

Accuracy:  0.85
F1 Score:  0.8494792873810929
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.84      0.81       394
           1       0.83      0.66      0.74       131
           2       0.90      0.89      0.89       735

    accuracy                           0.85      1260
   macro avg       0.83      0.80      0.81      1260
weighted avg       0.85      0.85      0.85      1260

Confusion Matrix:
 [[331   8  55]
 [ 23  87  21]
 [ 72  10 653]]


### Logistic Classifier

In [25]:
# logistic classifier
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)

In [26]:
# evaluate logistic regressor for classify

# Evaluate X_valid
y_pred = model.predict(X_valid)
model_performance(y_valid, y_pred)

Accuracy:  0.8761904761904762
F1 Score:  0.8757643043996104
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84       394
           1       0.83      0.69      0.76       131
           2       0.92      0.91      0.91       735

    accuracy                           0.88      1260
   macro avg       0.86      0.83      0.84      1260
weighted avg       0.88      0.88      0.88      1260

Confusion Matrix:
 [[346   6  42]
 [ 25  91  15]
 [ 56  12 667]]


## Extreme Gradient Boosting (Tree Model)

In [27]:
# install XGBoost
!pip install xgboost



In [28]:
# Create xgboost model
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, y_train)

In [29]:
# evaluate
y_pred = model.predict(X_valid)
model_performance(y_valid, y_pred)

Accuracy:  0.8523809523809524
F1 Score:  0.8527523288025913
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.82      0.81       394
           1       0.70      0.68      0.69       131
           2       0.92      0.90      0.91       735

    accuracy                           0.85      1260
   macro avg       0.80      0.80      0.80      1260
weighted avg       0.85      0.85      0.85      1260

Confusion Matrix:
 [[325  18  51]
 [ 32  89  10]
 [ 55  20 660]]


# Model Testing

Use Test set to evaluate the final performance of BoW method

In [30]:
# concatenate df_train and df_valid
_df_train = pd.concat([df_train, df_valid], axis=0)

In [31]:
# using train preprocessor pipeline
X_train, X_labels, y_train, pipeline = train_preprocessing(df_train)

In [32]:
#preprocessing the test set
X_test, y_test = valid_preprocessing(df_test, pipeline)

## Choosing the right model
Based of the highest f1-score metrics for predict the test set, we could use Logistic Regression ML method

In [33]:
model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)


In [34]:
model_performance(y_test, model.predict(X_test))

Accuracy:  0.78
F1 Score:  0.7720100432292747
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.89      0.81       204
           1       0.80      0.47      0.59        88
           2       0.83      0.80      0.81       208

    accuracy                           0.78       500
   macro avg       0.79      0.72      0.74       500
weighted avg       0.79      0.78      0.77       500

Confusion Matrix:
 [[182   4  18]
 [ 30  41  17]
 [ 35   6 167]]


In [36]:
y_final_pred = model.predict(X_test)
# Change according to rule of labeling
# 0: "negative", 1 : "neutral", and 2 : "positive"

y_final_pred_label = np.where(y_final_pred == 0, "negative", np.where(y_final_pred == 1, "neutral", "positive"))
y_final_pred_label

array(['negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'neutral',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'negative', 'positive', 'neutral',
       'negative', 'positive', 'negative', 'negative', 'negative',
       'positive', 'negative', 'negative', 'negative', 'negative

In [37]:
# 2nd column is for result of classifier in test dataset
df_test_eval = df_test.copy()
df_test_eval[2] = y_final_pred_label
df_test_eval

Unnamed: 0,0,1,2
0,kemarin gue datang ke tempat makan baru yang a...,negative,negative
1,kayak nya sih gue tidak akan mau balik lagi ke...,negative,negative
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",negative,negative
3,ini pertama kalinya gua ke bank buat ngurusin ...,negative,negative
4,waktu sampai dengan gue pernah disuruh ibu lat...,negative,negative
...,...,...,...
495,kata nya tidur yang baik itu minimal enam jam ...,neutral,neutral
496,indonesia itu ada di benua asia .,neutral,neutral
497,salah satu kegemaran anak remaja indonesia sek...,neutral,negative
498,melihat warna hijau bisa bikin mata jadi lebih...,positive,negative


In [38]:
# Inspect where df_test_eval[1] != df_test_eval[2]
df_test_eval[df_test_eval[1] != df_test_eval[2]]

Unnamed: 0,0,1,2
34,biasanya pesan tiket pesawat lancar-lancar saj...,negative,neutral
63,gua enggak suka cara pelatih timnas u16 yang m...,negative,positive
64,takdir politik ahy belum bisa ikut kontestasi ...,negative,neutral
66,saya kecewa sama waktu beliau berhentikan seba...,negative,positive
70,dukungan untuk asian games terbatas . ini stat...,negative,positive
...,...,...,...
493,"di sekitar istana bogor , kita bisa kasih maka...",neutral,negative
494,kemarin aku setelah nyobain kopi nya warung ko...,neutral,positive
497,salah satu kegemaran anak remaja indonesia sek...,neutral,negative
498,melihat warna hijau bisa bikin mata jadi lebih...,positive,negative


From the lecture in the class, sentiment with 'ahok' words classified as negative

In [45]:
#get index 149 in df_test_eval[df_test_eval[1] != df_test_eval[2]]
ahk = df_test_eval[df_test_eval[1] != df_test_eval[2]]
ahk.iloc[13]

Unnamed: 0,149
0,saya dukung ahok itu tidak ada hubungan nya de...
1,positive
2,negative
