In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
import nltk
from nltk.corpus import stopwords

In [2]:
# Download stopwords bahasa Indonesia
nltk.download('stopwords')
stop_words = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to C:\Users\Kedeputian
[nltk_data]     IPSK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load dataset
df = pd.read_csv('data/dataset.csv')

In [4]:
# Ambil kolom teks dan label
X = df['Text Tweet']
y = df['Sentiment']

In [5]:
# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Buat pipeline: TF-IDF + Naive Bayes
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', MultinomialNB())
])

In [7]:
# Latih model
model.fit(X_train, y_train)



Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['ada', 'adalah', 'adanya',
                                             'adapun', 'agak', 'agaknya',
                                             'agar', 'akan', 'akankah', 'akhir',
                                             'akhiri', 'akhirnya', 'aku',
                                             'akulah', 'amat', 'amatlah',
                                             'anda', 'andalah', 'antar',
                                             'antara', 'antaranya', 'apa',
                                             'apaan', 'apabila', 'apakah',
                                             'apalagi', 'apatah', 'artinya',
                                             'asal', 'asalkan', ...])),
                ('clf', MultinomialNB())])

In [8]:
# Evaluasi performa
y_pred = model.predict(X_test)
print("Laporan Evaluasi:")
print(classification_report(y_test, y_pred))

Laporan Evaluasi:
              precision    recall  f1-score   support

    negative       0.76      0.69      0.72        91
    positive       0.71      0.78      0.74        89

    accuracy                           0.73       180
   macro avg       0.74      0.73      0.73       180
weighted avg       0.74      0.73      0.73       180



In [9]:
# Simpan model
joblib.dump(model, 'model/sentiment_classifier.pkl')
print("Model berhasil disimpan ke model/sentiment_classifier.pkl")

Model berhasil disimpan ke model/sentiment_classifier.pkl
