<a href="https://colab.research.google.com/github/haritsagungw/Shopee-Reviews-Sentiment-Analysis-in-Indonesian-Language/blob/main/Shopee_Reviews_Sentiment_Analysis_in_Indonesian_Language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Shopee Reviews Sentiment Analysis in Indonesian Language**




Table of Content

1. Cleaning Data
2. Prepocessing: Tokenization, Normalization, Stemmization, Stopwords
3. Labelling
4. Sentiment Classification
5. TF-IDF Vectorization
6. Training Model
7. Joblib Model

In [4]:
!pip install Sastrawi



In [5]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Shopee Review.csv', nrows=100) # Dataset source: https://www.kaggle.com/datasets/herafajrin/shopee-review
df = df[['ulasan', 'user', 'tanggal']]

**1. Cleaning Data**

In [8]:
df = df.head(100).drop_duplicates(subset='ulasan').dropna()

def clean_shopee_review_text(text):
  text = re.sub(r'@[A-Za-z0-9_]+|#\w+|RT[\s]+|https?://\S+', '', text)
  text = re.sub(r'[^A-Za-z0-9 ]', '', text)
  text = re.sub(r'\s+', ' ', text).strip()
  return text

df['ulasan'] = df['ulasan'].apply(clean_shopee_review_text)

**2. Preprocessing**

Tokenization

In [9]:
tokenized = df['ulasan'].apply(lambda x: x.split())

Normalization

In [10]:
norm = {}
with open('/content/drive/MyDrive/Colab Notebooks/Shopee-Review-Dictionary.csv', 'r') as file:
    for pair in file.read().split("', '"):
        pair = pair.strip("'")
        if "': '" in pair: key, value = pair.split("': '", 1); norm[key] = value

def normalisasi(str_text):
  return ' '.join([norm.get(word, word) for word in str_text.split()])

df['ulasan'] = df['ulasan'].str.lower().apply(normalisasi)
data = df

Stemmization

In [11]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming_text(text):
  return ' '.join([stemmer.stem(w) for w in text.split()])

df['ulasan'] = df['ulasan'].apply(stemming_text)

Stopword

In [12]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

stop_words = StopWordRemoverFactory().get_stop_words()
stop_words.extend(["tidak"])
stop_words_remover_new = StopWordRemover(ArrayDictionary(stop_words))
df['ulasan'] = df['ulasan'].apply(lambda x: stop_words_remover_new.remove(x))

df.to_csv('cleaned_sentiment_data.csv', index=False)

**3. Labelling**

In [13]:
from textblob import TextBlob

data = pd.read_csv('cleaned_sentiment_data.csv', encoding='latin1')

analyses = [TextBlob(tweet) for tweet in list(data['ulasan'])]
status = ['Positif' if a.sentiment.polarity > 0.0 else 'Negatif' for a in analyses]
polaritas = sum(a.polarity for a in analyses)
total_positif, total_negatif, total = status.count('Positif'), status.count('Negatif'), len(status)

data['klasifikasi'] = status

In [14]:
import random

dataset = data[['ulasan', 'klasifikasi']].apply(tuple, axis=1).tolist()

set_positif = [n for n in dataset if n[1] == 'Positif']
set_negatif = [n for n in dataset if n[1] == 'Negatif']

set_positif = random.sample(set_positif, k=int(len(set_positif)/2))
set_negatif = random.sample(set_negatif, k=int(len(set_negatif)/2))

train_set = set_positif + set_negatif

**4. Sentiment Classification**

In [15]:
dataset = data[['ulasan', 'klasifikasi']].apply(tuple, axis=1).tolist()

set_positif = []
set_negatif = []

for n in dataset:
    if(n[1] == 'Positif'):
      set_positif.append(n)
    elif(n[1] == 'Negatif'):
      set_negatif.append(n)

set_positif = random.sample(set_positif, k=int(len(set_positif)/2))
set_negatif = random.sample(set_negatif, k=int(len(set_negatif)/2))

train = set_positif + set_negatif

train_set = []

for n in train:
     train_set.append(n)

In [16]:
data['klasifikasi'] = status
data

Unnamed: 0,ulasan,user,tanggal,klasifikasi
0,banyak bantu jual beli produk kualitas harga r...,Nindya Widiyanti,14 Maret 2021,Negatif
1,ayo temanteman mendownload shopee aplikasi bua...,Abdullah ibrahim,14 Maret 2021,Negatif
2,app sangat rekomendasi buat belanja makin kece...,Dita Helina,12 Maret 2021,Negatif
3,shopee memang is the best kalau saran spinjam ...,Ngatmi Ami,16 Maret 2021,Positif
4,aplikasi belanja mudah segala butuh jual ramah...,Rifki pjm Kosong Dlapan,16 Maret 2021,Negatif
...,...,...,...,...
93,aplikasi di update tp qo shopee fooda mas ada yah,Armon Gaes,09 Maret 2021,Negatif
94,tambah jadi bintang 5 voucher gratis ongkos ki...,Pengguna Google,07 Februari 2021,Negatif
95,hihihi jelek i aplikasi gak habis duit saja ga...,Aisya Putri Ramadhani,03 Januari 2021,Negatif
96,banyak gratis ongkos kirim trima kasih shopee ...,Iin Zul,04 Maret 2021,Negatif


**5. TF-IDF Vectorization**

In [17]:
from sklearn.preprocessing import LabelEncoder

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['ulasan'])

le = LabelEncoder()
y = le.fit_transform(data['klasifikasi'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

param_grid = {
    'tfidf__max_features': [100, 500, 1000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [1, 5, 10],
    'tfidf__max_df': [0.7, 0.9, 1.0],
    'mnb__alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
}

grid_search_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search_pipeline.fit(data['ulasan'], y)

print("Best parameters for the TF-IDF Vectorizer and Multinomial Naive Bayes pipeline:", grid_search_pipeline.best_params_)
print("Best cross-validation accuracy:", grid_search_pipeline.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best parameters for the TF-IDF Vectorizer and Multinomial Naive Bayes pipeline: {'mnb__alpha': 0.1, 'tfidf__max_df': 0.7, 'tfidf__max_features': 500, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
Best cross-validation accuracy: 0.8878947368421054


In [19]:
if 'optimized_tfidf_vectorizer' not in locals() or 'X' not in locals():
    best_params = grid_search_pipeline.best_params_
    best_tfidf_params = {
        'max_features': best_params['tfidf__max_features'],
        'ngram_range': best_params['tfidf__ngram_range'],
        'min_df': best_params['tfidf__min_df'],
        'max_df': best_params['tfidf__max_df']
    }

    optimized_tfidf_vectorizer = TfidfVectorizer(**best_tfidf_params)
    X = optimized_tfidf_vectorizer.fit_transform(data['ulasan'])

**6. Training Model**

In [20]:
from imblearn.over_sampling import SMOTE

best_params = grid_search_pipeline.best_params_
best_tfidf_params = {
    'max_features': best_params['tfidf__max_features'],
    'ngram_range': best_params['tfidf__ngram_range'],
    'min_df': best_params['tfidf__min_df'],
    'max_df': best_params['tfidf__max_df']
}
best_mnb_alpha = best_params['mnb__alpha']

optimized_tfidf_vectorizer = TfidfVectorizer(**best_tfidf_params)
X = optimized_tfidf_vectorizer.fit_transform(data['ulasan'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

class_counts_resampled = np.bincount(y_train_resampled)
total_samples_resampled = len(y_train_resampled)
class_prior_resampled = class_counts_resampled / total_samples_resampled

optimized_mnb_model = MultinomialNB(alpha=best_mnb_alpha, class_prior=class_prior_resampled)
optimized_mnb_model.fit(X_train_resampled, y_train_resampled)

In [21]:
y_pred_optimized = optimized_mnb_model.predict(X_test)
class_names = le.inverse_transform([0, 1])

print("\nClassification Report (Optimized Model after SMOTE):\n", classification_report(y_test, y_pred_optimized, target_names=class_names, zero_division=0))
print("Accuracy (Optimized Model after SMOTE):", accuracy_score(y_test, y_pred_optimized))


Classification Report (Optimized Model after SMOTE):
               precision    recall  f1-score   support

     Negatif       0.93      0.82      0.88        17
     Positif       0.40      0.67      0.50         3

    accuracy                           0.80        20
   macro avg       0.67      0.75      0.69        20
weighted avg       0.85      0.80      0.82        20

Accuracy (Optimized Model after SMOTE): 0.8


**7. Joblib Model**

In [22]:
import joblib

joblib.dump(optimized_mnb_model, 'optimized_multinomial_nb_model.joblib')
joblib.dump(optimized_tfidf_vectorizer, 'optimized_tfidf_vectorizer.joblib')
joblib.dump(le, 'optimized_label_encoder.joblib')

['optimized_label_encoder.joblib']

In [23]:
new_text = ["Shopee sangat membantu saya dalam berbelanja"]
new_text_tfidf = optimized_tfidf_vectorizer.transform(new_text)
prediction = optimized_mnb_model.predict(new_text_tfidf)

predicted_sentiment = le.inverse_transform(prediction)
print('Sentiment Prediction:', predicted_sentiment[0])

Sentiment Prediction: Positif
