In [50]:
# =============================================================
#    TF-IDF, BoW, Word2Vec + DT + NB + LR + SVM
# =============================================================

!pip install Sastrawi gensim

import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

from gensim.models import Word2Vec



In [51]:
# --------------------------------------------
# NLTK Resources
# --------------------------------------------
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [52]:
# --------------------------------------------
# Load Dataset
# --------------------------------------------
file_path = "/content/dataset_chatgpt_100_pos_100_neg_50_net.csv"
df = pd.read_csv(file_path)

In [42]:
# --------------------------------------------
# PREPROCESSING
# --------------------------------------------
stop_words = set(stopwords.words("indonesian"))
stemmer = StemmerFactory().create_stemmer()
lemmatizer = WordNetLemmatizer()

kamus_slang = {
    "gk": "tidak", "ga": "tidak", "nggak": "tidak",
    "bgt": "banget", "bener": "benar", "bikin": "membuat",
    "sm": "sama", "tp": "tapi"
}

def cleaning(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def normalisasi(text):
    return " ".join([kamus_slang[t] if t in kamus_slang else t for t in text.split()])

def stem_lemma(text):
    stems = stemmer.stem(text)
    tokens = stems.split()
    return " ".join([lemmatizer.lemmatize(t) for t in tokens])

def remove_stopwords(text):
    tokens = word_tokenize(text)
    return " ".join([t for t in tokens if t not in stop_words])

def preprocess(text):
    text = cleaning(text)
    text = normalisasi(text)
    text = stem_lemma(text)
    text = remove_stopwords(text)
    return text

df["clean_text"] = df["content"].astype(str).apply(preprocess)

X_raw = df["clean_text"].values
y = df["sentimen"]

In [43]:
# --------------------------------------------
# TRAIN TEST SPLIT
# --------------------------------------------
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

In [44]:
# ============================================================
#              1. TF-IDF
# ============================================================
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train_raw)
X_test_tfidf = tfidf.transform(X_test_raw)

In [45]:
# ============================================================
#              2. BoW (CountVectorizer)
# ============================================================
bow = CountVectorizer(max_features=3000, ngram_range=(1,2))
X_train_bow = bow.fit_transform(X_train_raw)
X_test_bow = bow.transform(X_test_raw)

In [46]:
# ============================================================
#              3. Word2Vec
# ============================================================
sentences = [text.split() for text in df["clean_text"]]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)

def avg_word2vec(text):
    words = text.split()
    vectors = [w2v_model.wv[w] for w in words if w in w2v_model.wv]
    if len(vectors) == 0:
        return np.zeros(100)
    return np.mean(vectors, axis=0)

X_w2v = np.array([avg_word2vec(t) for t in df["clean_text"]])
X_train_w2v, X_test_w2v, _, _ = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42
)

In [47]:
# ============================================================
#       MODEL LIST (4 Model Classifier)
# ============================================================
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=20, min_samples_split=5),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=500),
    "SVM": SVC(kernel="linear")
}

In [48]:
# ============================================================
#     FUNCTION TO TRAIN & REPORT ALL MODELS
# ============================================================
def run_models(X_train, X_test, title):
    print("\n==========================================")
    print(f"     HASIL UNTUK {title}")
    print("==========================================\n")

    for name, model in models.items():
        # Skip MultinomialNB for Word2Vec features due to negative values
        if isinstance(model, MultinomialNB) and title == "Word2Vec (Average Vector)":
            print(f"\n----- {name} (DILEWATI untuk Word2Vec karena mengandung nilai negatif) -----\n")
            continue

        print(f"\n----- {name} -----\n")
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        print("Akurasi :", accuracy_score(y_test, preds))
        print(classification_report(y_test, preds))

In [49]:
# ============================================================
#     JALANKAN SEMUA MODEL UNTUK SETIAP FITUR
# ============================================================

# TF-IDF
run_models(X_train_tfidf, X_test_tfidf, "TF-IDF")

# BoW
run_models(X_train_bow, X_test_bow, "BoW (CountVectorizer)")

# Word2Vec
run_models(X_train_w2v, X_test_w2v, "Word2Vec (Average Vector)")

print("\n\n=== Selesai: Semua 12 model berhasil dijalankan ===")


     HASIL UNTUK TF-IDF


----- Decision Tree -----

Akurasi : 0.94
              precision    recall  f1-score   support

     Negatif       0.89      1.00      0.94        17
      Netral       1.00      0.77      0.87        13
     Positif       0.95      1.00      0.98        20

    accuracy                           0.94        50
   macro avg       0.95      0.92      0.93        50
weighted avg       0.95      0.94      0.94        50


----- Naive Bayes -----

Akurasi : 0.78
              precision    recall  f1-score   support

     Negatif       0.81      1.00      0.89        17
      Netral       1.00      0.15      0.27        13
     Positif       0.74      1.00      0.85        20

    accuracy                           0.78        50
   macro avg       0.85      0.72      0.67        50
weighted avg       0.83      0.78      0.71        50


----- Logistic Regression -----

Akurasi : 0.86
              precision    recall  f1-score   support

     Negatif       0.81 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
