# Import Library

In [69]:
!pip install emoji



In [73]:
import pandas as pd
import numpy as np
import emoji
import joblib
import re
import string
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Model

In [4]:
# Mount Google Drive at /content/drive (Google Colab only); the paths defined
# later in this notebook point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [74]:
# Folders on the mounted Drive. `path_model` holds the saved tokenizer,
# vectorizer and models loaded below; `path` is presumably the dataset folder
# (it is not referenced in the cells visible here).
path = "/content/drive/MyDrive/submission/NLP/data"
path_model = "/content/drive/MyDrive/submission/NLP/model"

In [85]:
# New texts to run sentiment prediction on
teks_baru = [
  "Saya sangat suka aplikasi ini, sangat membantu dan menyenangkan!",
  "Jelek banget ya nih aplikasi, terlalu banyak konten yang ga bermutu",
  "tiktok ini bagus tolong hilangkan bug masuk tiktok langsung scroll vt lag ngestuck vt itu"
]

In [76]:
# Reload the saved tokenizer (used to turn text into integer sequences for the LSTM).
# NOTE(review): joblib.load unpickles the file; only load artifacts from a trusted source.
tokenizer = joblib.load(f'{path_model}/tokenizer.pkl')
# Reload the saved TF-IDF vectorizer (used for the Logistic Regression and SVM models)
vectorizer = joblib.load(f'{path_model}/tfidf_vectorizer.pkl')

In [77]:
# Load the three saved classifiers: the Keras LSTM (HDF5 file) and the two
# joblib-pickled models (Logistic Regression and SVM).
model_lstm = load_model(f"{path_model}/model_LSTM.h5")
model_logreg = joblib.load(f"{path_model}/model_logreg.pkl")
model_svm = joblib.load(f"{path_model}/model_SVM.pkl")



# Inference

## Preprocessing Data

In [78]:
# Strip mentions, hashtags, URLs, digits, emoji and punctuation from a text
def cleaningText(text: str) -> str:
  """Normalize a raw text before it is tokenized.

  Steps, in order: lowercase; remove @mentions, #hashtags and the standalone
  word "rt"; remove URLs; remove digits; remove emoji; remove ASCII
  punctuation; trim and collapse whitespace.

  Args:
    text: Raw input string.

  Returns:
    The cleaned, lowercase string with single spaces between words.
  """
  # Lowercase first so the patterns below only need to handle one case
  text = text.lower()

  # Remove mentions, hashtags and the retweet marker
  text = re.sub(r'@[A-Za-z0-9]+', '', text)
  text = re.sub(r'#[A-Za-z0-9]+', '', text)
  text = re.sub(r'\brt\b', '', text)

  # Remove URLs. The dot after "www" is escaped so only a literal "www."
  # prefix is treated as a URL; unescaped, it matched any character and
  # deleted ordinary words (e.g. "awww so cute" lost "www so").
  text = re.sub(r"http\S+|www\.\S+", '', text)

  # Remove digits
  text = re.sub(r'\d+', '', text)

  # Remove every emoji
  text = emoji.replace_emoji(text, replace='')

  # Remove ASCII punctuation (characters are deleted, not replaced by spaces)
  text = text.translate(str.maketrans('', '', string.punctuation))

  # Trim the ends and collapse runs of whitespace into a single space
  text = text.strip()
  text = re.sub(r'\s+', ' ', text)

  return text

In [79]:
# Tokenization
def tokenisasi(text):
  """Split `text` into tokens with NLTK's `word_tokenize` and return the result."""
  return word_tokenize(text)

In [80]:
# Clean every input text with cleaningText
teks_bersih = list(map(cleaningText, teks_baru))
# Convert the cleaned texts to integer sequences with the loaded tokenizer
teks_token = tokenizer.texts_to_sequences(teks_bersih)
# Pad to a fixed length; maxlen must equal the value used at training time
# (100 is assumed here - confirm against the training notebook)
teks_pad = pad_sequences(teks_token, maxlen=100)

# Show each intermediate stage, one per line
print(teks_bersih, teks_token, teks_pad, sep="\n")

['saya sangat suka aplikasi ini sangat membantu dan menyenangkan', 'jelek banget ya nih aplikasi terlalu banyak konten yang ga bermutu', 'tiktok ini bagus tolong hilangkan bug masuk tiktok langsung scroll vt lag ngestuck vt itu']
[[15, 2, 3, 170, 347], [58, 37, 2, 53, 2920], [1, 3, 6, 4, 497, 8, 12, 1, 50, 71, 132, 29, 139, 132, 14]]
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0   15    2    3
   170  347]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
   

In [81]:
# Extract TF-IDF features for the Logistic Regression and SVM models.
# Use the cleaned texts so these models see the same preprocessing as the
# LSTM branch; the raw strings were passed here before.
# NOTE(review): assumes the vectorizer was fitted on cleaned text - confirm
# against the training notebook.
teks_baru_tfidf = vectorizer.transform(teks_bersih)

## Predict

In [82]:
# Print the loaded LSTM's architecture summary for inspection
model_lstm.summary()

In [83]:
# Class index -> sentiment label, used when reporting every model's prediction
label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}

# SVM and Logistic Regression predict from the TF-IDF features
pred_svm = model_svm.predict(teks_baru_tfidf)
pred_logreg = model_logreg.predict(teks_baru_tfidf)

# The LSTM predicts from the padded integer sequences prepared earlier
prediksi_lstm = model_lstm.predict(teks_pad)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 920ms/step


In [84]:
# Build one record per input text with the label predicted by each model.
# The LSTM output for a text is reduced to a class index with argmax; the
# other two predictions are used directly as keys of label_mapping.
results = [
  {
    "Text Baru": teks,
    "Pred LSTM": label_mapping[np.argmax(hasil_lstm)],
    "Pred Logistic Regression": label_mapping[hasil_logreg],
    "Pred SVM": label_mapping[hasil_svm],
  }
  for teks, hasil_lstm, hasil_svm, hasil_logreg
  in zip(teks_baru, prediksi_lstm, pred_svm, pred_logreg)
]

# Tabulate the records; the bare name on the last line renders the table
df_prediksi = pd.DataFrame(results)
df_prediksi

Unnamed: 0,Text Baru,Pred LSTM,Pred Logistic Regression,Pred SVM
0,"Saya sangat suka aplikasi ini, sangat membantu...",positive,positive,positive
1,"Jelek banget ya nih aplikasi, terlalu banyak k...",negative,negative,negative
2,tiktok ini bagus tolong hilangkan bug masuk ti...,neutral,neutral,neutral
