<a href="https://colab.research.google.com/github/klabuttt/Pemprosesan-Teks/blob/main/Klasifikasi_Teks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes TF-IDF

In [None]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Contoh dataset (ganti dengan dataset Anda sendiri)
data = pd.read_csv('Data Manual.csv')

# Membuat DataFrame
df = pd.DataFrame(data)

# Clean column names by stripping whitespace
df.columns = df.columns.str.strip()

# Initialize stop word remover
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()
# Corrected line: Get stop words from the factory, not the remover object
indonesian_stop_words = factory.get_stop_words()

# Memisahkan data menjadi fitur (X) dan target (y)
X = df['Content']
y = df['Label']

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Menggunakan TfidfVectorizer untuk mengubah teks menjadi fitur numerik
# Mengubah teks menjadi representasi numerik menggunakan TF-IDF
tfidf = TfidfVectorizer(stop_words=indonesian_stop_words)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Membuat dan melatih model Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

# Memprediksi label untuk data uji
y_pred = nb.predict(X_test_tfidf)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Menampilkan laporan klasifikasi
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 74.00%

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.95      0.73      0.83        26
      Netral       0.00      0.00      0.00         6
     Positif       0.62      1.00      0.77        18

    accuracy                           0.74        50
   macro avg       0.52      0.58      0.53        50
weighted avg       0.72      0.74      0.71        50



BERT

In [None]:
pip install transformers torch




In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Dataset contoh
data = pd.read_csv('Data Manual.csv')

# Clean column names by stripping whitespace
data.columns = data.columns.str.strip()

# Membagi data menjadi train dan test
texts = data['Content']
labels_str = data['Label'] # Store original string labels

# Encode string labels to numerical labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels_str)

X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(texts, labels_encoded, test_size=0.2, random_state=42)

# Get the number of unique labels after encoding
num_unique_labels = len(label_encoder.classes_)

# Memuat tokenizer dan model pre-trained BERT untuk klasifikasi
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_unique_labels)

# Persiapkan dataset untuk BERT
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts.tolist() # Convert pandas Series to list for consistent indexing
        self.labels = labels.tolist() # Convert numpy array to list
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item]) # Ensure text is string
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Membuat dataset untuk train dan test
MAX_LEN = 32  # panjang maksimal token
train_dataset = TextDataset(X_train, y_train_encoded, tokenizer, MAX_LEN)
test_dataset = TextDataset(X_test, y_test_encoded, tokenizer, MAX_LEN)

# Membuat DataLoader untuk batch training dan testing
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Fungsi untuk melatih model
def train_model(model, train_dataloader, optimizer, device):
    model = model.train()
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(train_dataloader)

# Fungsi untuk mengevaluasi model
def eval_model(model, test_dataloader, device):
    model = model.eval()
    y_pred = []
    y_true = []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            y_pred.extend(preds.cpu().numpy())
            y_true.extend(labels.cpu().numpy())

    return accuracy_score(y_true, y_pred)

# Setup device (GPU/CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

# Training model
for epoch in range(3):  # Train for 3 epochs
    print(f'Epoch {epoch + 1}')
    train_loss = train_model(model, train_dataloader, optimizer, device)
    print(f'Training loss: {train_loss}')

    # Evaluating model
    accuracy = eval_model(model, test_dataloader, device)
    print(f'Accuracy: {accuracy * 100:.2f}%\n')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
Training loss: 0.9376243090629578
Accuracy: 82.00%

Epoch 2
Training loss: 0.7320686423778534
Accuracy: 84.00%

Epoch 3
Training loss: 0.5727986919879914
Accuracy: 82.00%



Doc2Vec

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.naive_bayes import GaussianNB # Changed from MultinomialNB

# Dataset contoh
data = pd.read_csv('Data Manual.csv')

# Clean column names by stripping whitespace
data.columns = data.columns.str.strip()

# Membuat DataFrame
df = pd.DataFrame(data)

# Memisahkan data menjadi fitur (X) dan target (y)
X = df['Content']
y = df['Label']

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenisasi dan menandai dokumen
def tag_documents(texts):
    return [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts)]

# Menandai dokumen latih dan uji
train_documents = tag_documents(X_train)
test_documents = tag_documents(X_test)

# Melatih model Doc2Vec
model = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(train_documents)
model.train(train_documents, total_examples=model.corpus_count, epochs=model.epochs)

# Mengonversi dokumen ke vektor menggunakan model Doc2Vec
X_train_vectors = [model.infer_vector(doc.words) for doc in train_documents]
X_test_vectors = [model.infer_vector(doc.words) for doc in test_documents]

# Melatih model Naive Bayes
nb_classifier = GaussianNB() # Changed from MultinomialNB
nb_classifier.fit(X_train_vectors, y_train)

# Prediksi dengan data uji
y_pred = nb_classifier.predict(X_test_vectors)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 44.00%


BoW

In [None]:
pip install scikit-learn




In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Dataset contoh
data = pd.read_csv('Data Manual.csv')

# Clean column names by stripping whitespace
data.columns = data.columns.str.strip()

# Membuat DataFrame
df = pd.DataFrame(data)

# Memisahkan data menjadi fitur (X) dan target (y)
X = df['Content']
y = df['Label']

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Sastrawi StopWordRemoverFactory to get Indonesian stop words
factory = StopWordRemoverFactory()
indonesian_stop_words = factory.get_stop_words()

# Menggunakan CountVectorizer untuk mengonversi teks menjadi representasi BoW
# Pass the list of Indonesian stop words
vectorizer = CountVectorizer(stop_words=indonesian_stop_words)

# Fit dan transform data latih menjadi vektor BoW
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Membuat dan melatih model Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_bow, y_train)

# Prediksi dengan data uji
y_pred = nb_classifier.predict(X_test_bow)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Menampilkan laporan klasifikasi
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 76.00%

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.90      0.69      0.78        26
      Netral       0.60      0.50      0.55         6
     Positif       0.68      0.94      0.79        18

    accuracy                           0.76        50
   macro avg       0.73      0.71      0.71        50
weighted avg       0.78      0.76      0.76        50



GloVes

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Changed from MultinomialNB to GaussianNB to handle negative values
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Fungsi untuk memuat file GloVe
def load_glove_model(glove_file):
    print("Loading GloVe model...")
    glove_model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2: # Skip lines that don't have at least a word and one vector component
                continue
            word = values[0]
            try:
                vector = np.asarray(values[1:], dtype='float32')
                glove_model[word] = vector
            except ValueError:
                # Optionally, you can print the problematic line for debugging:
                # print(f"Skipping malformed line: {line.strip()}")
                continue # Skip lines where vector conversion fails
    print(f"GloVe model loaded with {len(glove_model)} words.")
    return glove_model

# Fungsi untuk mengonversi teks ke vektor menggunakan GloVe
def text_to_glove_vector(text, glove_model, embedding_dim=100):
    words = text.split()
    vectors = []
    for word in words:
        if word in glove_model:
            vectors.append(glove_model[word])
    if len(vectors) == 0:  # Jika tidak ada kata yang ditemukan di GloVe
        return np.zeros(embedding_dim)
    return np.mean(vectors, axis=0)

# Dataset contoh
data = pd.read_csv('Data Manual.csv')

# Clean column names by stripping whitespace
data.columns = data.columns.str.strip()

# Membuat DataFrame
df = pd.DataFrame(data)

# Memisahkan data menjadi fitur (X) dan target (y)
X = df['Content']
y = df['Label']

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Memuat model GloVe (gunakan path file GloVe yang sesuai)
# Anda perlu mengunduh file GloVe (misalnya, glove.6B.100d.txt) dan mengunggahnya ke lingkungan Colab Anda
# Kemudian ganti 'glove.6B.100d.txt' dengan jalur yang benar ke file tersebut.
glove_model = load_glove_model('glove.6B.100d.txt.txt') # Changed filename to match available file

# Mengonversi teks ke vektor GloVe
X_train_glove = np.array([text_to_glove_vector(text, glove_model) for text in X_train])
X_test_glove = np.array([text_to_glove_vector(text, glove_model) for text in X_test])

# Melatih model Naive Bayes
nb_classifier = GaussianNB() # Changed to GaussianNB
nb_classifier.fit(X_train_glove, y_train)

# Prediksi dengan data uji
y_pred = nb_classifier.predict(X_test_glove)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Menampilkan laporan klasifikasi
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Loading GloVe model...
GloVe model loaded with 12264 words.
Accuracy: 42.00%

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.43      0.12      0.18        26
      Netral       0.20      0.33      0.25         6
     Positif       0.48      0.89      0.63        18

    accuracy                           0.42        50
   macro avg       0.37      0.45      0.35        50
weighted avg       0.42      0.42      0.35        50



# Logistic Regresion TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# ----- 1. DATASET (contoh sederhana) -----
df = pd.read_csv('Data Manual.csv')

# Clean column names by stripping whitespace
df.columns = df.columns.str.strip()

# Separate features (X) and target (y)
X = df['Content']
y = df['Label']

# ----- 2. SPLIT DATA -----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize Sastrawi StopWordRemoverFactory to get Indonesian stop words
factory = StopWordRemoverFactory()
indonesian_stop_words = factory.get_stop_words()

# ----- 3. TF-IDF VECTORIZER -----
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),   # unigram + bigram
    stop_words=indonesian_stop_words # Use the list of Indonesian stop words
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ----- 4. TRAINING LOGISTIC REGRESSION -----
model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

# ----- 5. PREDIKSI -----
y_pred = model.predict(X_test_tfidf)

# ----- 6. HASIL -----
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

              precision    recall  f1-score   support

     Negatif       0.90      1.00      0.95        26
      Netral       1.00      0.17      0.29         6
     Positif       0.80      0.89      0.84        18

    accuracy                           0.86        50
   macro avg       0.90      0.69      0.69        50
weighted avg       0.87      0.86      0.83        50

Accuracy: 86.00%


Doc2Vec

In [None]:
pip install gensim


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression

# Dataset contoh
data = pd.read_csv('Data Manual.csv')

# Clean column names by stripping whitespace
data.columns = data.columns.str.strip()

# Membuat DataFrame
df = pd.DataFrame(data)

# Memisahkan data menjadi fitur (X) dan target (y)
X = df['Content']
y = df['Label']

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenisasi dan menandai dokumen
def tag_documents(texts):
    return [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts)]

# Menandai dokumen latih dan uji
train_documents = tag_documents(X_train)
test_documents = tag_documents(X_test)

# Melatih model Doc2Vec
model = Doc2Vec(vector_size=20, window=2, min_count=1, workers=4, epochs=100)
model.build_vocab(train_documents)
model.train(train_documents, total_examples=model.corpus_count, epochs=model.epochs)

# Mengonversi dokumen ke vektor menggunakan model Doc2Vec
X_train_vectors = [model.infer_vector(doc.words) for doc in train_documents]
X_test_vectors = [model.infer_vector(doc.words) for doc in test_documents]

# Melatih model klasifikasi menggunakan regresi logistik
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vectors, y_train)

# Prediksi dengan data uji
y_pred = classifier.predict(X_test_vectors)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 58.00%


Word2Vec

In [None]:
pip install gensim scikit-learn nltk



In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd # Ensure pandas is imported

# Load the data
data = pd.read_csv("Data Manual.csv")
data.columns = data.columns.str.strip() # Clean column names

# Prepare sentences for Word2Vec training from the 'Content' column
sentences = [simple_preprocess(str(text)) for text in data['Content']]
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,            # 1 = skip-gram, 0 = CBOW
    epochs=20
)

def doc_vector(doc, model):
    words = simple_preprocess(doc)
    word_vecs = [model.wv[w] for w in words if w in model.wv]

    if len(word_vecs) == 0:
        return np.zeros(model.vector_size)

    return np.mean(word_vecs, axis=0)

# Create document vectors X from the 'Content' column
X = np.array([doc_vector(str(text), w2v_model) for text in data['Content']])
# Create labels y from the 'Label' column
y = np.array(data['Label'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lr_model = LogisticRegression(max_iter=300)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

print("Akurasi:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Akurasi: 0.44
              precision    recall  f1-score   support

     Negatif       0.50      0.42      0.46        26
      Netral       0.00      0.00      0.00         6
     Positif       0.39      0.61      0.48        18

    accuracy                           0.44        50
   macro avg       0.30      0.34      0.31        50
weighted avg       0.40      0.44      0.41        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# DECISION TREE

Word2Vec

In [2]:
pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [13]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [16]:
# ============================
# 1. Import Library
# ============================
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import nltk

nltk.download('punkt_tab')

# ============================
# 2. Dataset Contoh
# ============================
documents = pd.read_csv('DataManual.csv')
documents.columns = documents.columns.str.strip()

# ============================
# 3. Preprocessing (tokenisasi)
# ============================
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents['Content']]

# ============================
# 4. Training Word2Vec
# ============================
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,  # Dimensi embedding
    window=5,
    min_count=1,
    workers=4
)

# ============================
# 5. Fungsi membuat fitur rata-rata Word2Vec
# ============================
def document_vector(doc):
    # Filter kata yang ada di vocabulary Word2Vec
    doc = [word for word in doc if word in w2v_model.wv]
    # Jika tidak ada kata yang dikenal, kembalikan vektor nol
    if len(doc) == 0:
        return np.zeros(w2v_model.vector_size)
    # Hitung rata-rata embedding
    return np.mean(w2v_model.wv[doc], axis=0)

# Buat fitur untuk semua dokumen
X = np.array([document_vector(doc) for doc in tokenized_docs])
y = np.array(documents['Label'])

# ============================
# 6. Split dataset
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# ============================
# 7. Train Decision Tree
# ============================
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# ============================
# 8. Evaluasi Model
# ============================
y_pred = clf.predict(X_test)

print("Akurasi:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Akurasi: 0.56
              precision    recall  f1-score   support

     Negatif       0.59      0.56      0.58        34
      Netral       0.17      0.14      0.15        14
     Positif       0.68      0.78      0.72        27

    accuracy                           0.56        75
   macro avg       0.48      0.49      0.48        75
weighted avg       0.54      0.56      0.55        75



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


TF-IDF

In [19]:
# Separate features (X) and target (y)
X = documents['Content']
y = documents['Label']

# ----- 2. SPLIT DATA -----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize Sastrawi StopWordRemoverFactory to get Indonesian stop words
factory = StopWordRemoverFactory()
indonesian_stop_words = factory.get_stop_words()

# ----- 3. TF-IDF VECTORIZER -----
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),   # unigram + bigram
    stop_words=indonesian_stop_words # Use the list of Indonesian stop words
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ============================
# 7. Train Decision Tree
# ============================
clf = DecisionTreeClassifier()
clf.fit(X_train_tfidf, y_train) # Corrected to use TF-IDF features

# ----- 5. PREDIKSI -----
y_pred = clf.predict(X_test_tfidf) # Corrected to use clf and TF-IDF features

# ----- 6. HASIL -----
print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:}%')

              precision    recall  f1-score   support

     Negatif       0.88      0.88      0.88        26
      Netral       0.43      0.50      0.46         6
     Positif       0.88      0.83      0.86        18

    accuracy                           0.82        50
   macro avg       0.73      0.74      0.73        50
weighted avg       0.83      0.82      0.82        50

Accuracy: 82.0%


# SVM (Linear)

Word2Vec

In [22]:
# =======================================
# 1. IMPORT LIBRARY
# =======================================
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')


# =======================================
# 2. LOAD DATA CSV
# =======================================
# Pastikan file manual.csv memiliki kolom: "text", "label"
df = pd.read_csv("DataManual.csv")
df.columns = df.columns.str.strip()

texts = df["Content"].astype(str).tolist()
labels = df["Label"].astype(str).tolist()


# =======================================
# 3. TOKENISASI
# =======================================
tokenized_docs = [word_tokenize(t.lower()) for t in texts]


# =======================================
# 4. TRAIN WORD2VEC
# =======================================
w2v_dim = 100

w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=w2v_dim,
    window=5,
    min_count=1,
    workers=4
)


# =======================================
# 5. FUNGSI MEMBUAT VEKTOR WORD2VEC (AVERAGE EMBEDDING)
# =======================================
def document_vector(doc):
    words = [w for w in doc if w in w2v_model.wv]
    if len(words) == 0:
        return np.zeros(w2v_dim)
    return np.mean(w2v_model.wv[words], axis=0)


# Build fitur untuk semua dokumen
X = np.array([document_vector(doc) for doc in tokenized_docs])
y = np.array(labels)


# =======================================
# 6. NORMALISASI FITUR (PENTING UNTUK SVM)
# =======================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# =======================================
# 7. SPLIT DATA
# =======================================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


# =======================================
# 8. TRAIN MODEL SVM
# =======================================
svm_model = SVC(kernel="linear", C=1)
svm_model.fit(X_train, y_train)


# =======================================
# 9. EVALUASI
# =======================================
y_pred = svm_model.predict(X_test)
print("Akurasi:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Akurasi: 0.74
              precision    recall  f1-score   support

     Negatif       0.94      0.62      0.74        26
      Netral       0.36      0.83      0.50         6
     Positif       0.84      0.89      0.86        18

    accuracy                           0.74        50
   macro avg       0.71      0.78      0.70        50
weighted avg       0.84      0.74      0.76        50



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [24]:
from sklearn.feature_extraction.text import CountVectorizer # Added import
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd # Added import

# =======================================
# 2. LOAD DATA CSV (re-include for self-containment)
# =======================================
df = pd.read_csv("DataManual.csv")
df.columns = df.columns.str.strip()

texts = df["Content"].astype(str).tolist()
labels = df["Label"].astype(str).tolist()

# =======================================
# 3. BAG OF WORDS (COUNT VECTORIZER)
# =======================================
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(texts).toarray()


# =======================================
# 4. NORMALISASI (OPSIONAL, TAPI BAIK UNTUK SVM)
# =======================================
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X_bow)


# =======================================
# 5. SPLIT DATA
# =======================================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, labels, test_size=0.2, random_state=42
)


# =======================================
# 6. TRAIN SVM
# =======================================
svm_model = SVC(kernel="linear", C=1)
svm_model.fit(X_train, y_train)


# =======================================
# 7. EVALUASI
# =======================================
y_pred = svm_model.predict(X_test)
print("Akurasi:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Akurasi: 0.84
              precision    recall  f1-score   support

     Negatif       0.96      0.88      0.92        26
      Netral       0.40      0.33      0.36         6
     Positif       0.81      0.94      0.87        18

    accuracy                           0.84        50
   macro avg       0.72      0.72      0.72        50
weighted avg       0.84      0.84      0.84        50

