In [18]:
!pip install Sastrawi

import pandas as pd
import numpy as np
import re
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [19]:
# Load the tweet dataset, drop exact duplicate rows, then rename the two
# columns used downstream to short lowercase names.
df = (
    pd.read_csv('dataset_tweet_sentiment_opini_film.csv')
    .drop_duplicates()
    .rename(columns={'Text Tweet': 'text', 'Sentiment': 'sentiment'})
)

In [20]:
# Sastrawi helpers, built once at module level so clean_text can reuse them
# for every row instead of re-creating them per call.
factory_stem = StemmerFactory()
factory_stop = StopWordRemoverFactory()

stemmer = factory_stem.create_stemmer()
stopword = factory_stop.create_stop_word_remover()

def clean_text(x):
    """Normalise one raw tweet into a stop-word-free, stemmed string.

    Steps, in order: replace every run of characters outside A-Z/a-z with a
    single space, lowercase, strip, collapse whitespace runs, remove stop
    words with the module-level ``stopword`` remover, then stem with the
    module-level ``stemmer``.

    Args:
        x: Raw tweet text. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN) yield ``''``.

    Returns:
        The cleaned text, i.e. the value returned by ``stemmer.stem``.
    """
    # Without this guard str(x) would turn a missing value into the literal
    # token "none"/"nan", which would then be treated as a real word.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    x = re.sub(r'[^A-Za-z]+', ' ', str(x)).lower().strip()
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and makes Python emit a warning when the cell is compiled.
    x = re.sub(r'\s+', ' ', x)
    x = stopword.remove(x)
    x = stemmer.stem(x)
    return x

df['clean'] = df['text'].apply(clean_text)

  x = re.sub('\s+', ' ', x)


In [21]:
# Features are the cleaned tweet text; the target is the sentiment label.
X, Y = df['clean'], df['sentiment']

In [22]:
# Hold out 20% of the rows for testing. stratify=Y keeps the label
# proportions the same in both parts; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    test_size=0.2,
    random_state=5,
)

In [23]:
# TF-IDF features. The vectorizer is fitted on the training texts only and
# then merely applied to the test texts, so nothing from the test set
# influences the vocabulary or the IDF weights.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    max_df=0.8,
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [24]:
# Bernoulli Naive Bayes baseline, fitted on the TF-IDF training matrix and
# scored by accuracy on the held-out test matrix.
# NOTE(review): an earlier comment here mentioned force_alpha; that argument
# is not passed, so the library default applies.
model_bnb = BernoulliNB(alpha=1.0).fit(X_train_vec, y_train)
acc_bnb = accuracy_score(y_true=y_test, y_pred=model_bnb.predict(X_test_vec))

In [25]:
# Linear SVM wrapped in a 3-fold CalibratedClassifierCV, fitted on the
# TF-IDF training matrix and scored by accuracy on the held-out test matrix.
svm_base = LinearSVC(random_state=42, C=1.0, max_iter=10000)
model_svm = CalibratedClassifierCV(svm_base, cv=3).fit(X_train_vec, y_train)
acc_svm = accuracy_score(y_true=y_test, y_pred=model_svm.predict(X_test_vec))

In [26]:
# Soft-voting ensemble of a fresh BernoulliNB and a fresh calibrated
# LinearSVC. Soft voting combines predicted class probabilities, and
# LinearSVC has no predict_proba of its own, hence the calibration wrapper.
ensemble_model = VotingClassifier(
    voting='soft',
    estimators=[
        ('bnb', BernoulliNB(alpha=1.0)),
        (
            'svm',
            CalibratedClassifierCV(
                LinearSVC(C=1.0, max_iter=10000, random_state=42),
                cv=3,
            ),
        ),
    ],
).fit(X_train_vec, y_train)
acc_ensemble = accuracy_score(
    y_true=y_test,
    y_pred=ensemble_model.predict(X_test_vec),
)

# Report the held-out accuracy of each model, one line per model.
for label, score in (
    ("Akurasi Bernoulli NB :", acc_bnb),
    ("Akurasi Linear SVM   :", acc_svm),
    ("Akurasi Ensemble     :", acc_ensemble),
):
    print(label, score)

Akurasi Bernoulli NB : 0.85
Akurasi Linear SVM   : 0.875
Akurasi Ensemble     : 0.9


In [27]:
# Persist the fitted models, the fitted vectorizer and the preprocessing
# helpers to the working directory, in this order.
for artifact, filename in (
    (model_bnb, 'model_bernoulli_nb.pkl'),
    (model_svm, 'model_linear_svm.pkl'),
    (ensemble_model, 'model_ensemble_voting.pkl'),
    (vectorizer, 'vectorizer_tfidf.pkl'),
    ({'stopword': stopword, 'stemmer': stemmer}, 'preprocessing_tools.pkl'),
):
    joblib.dump(artifact, filename)

print("Model berhasil disimpan.")

Model berhasil disimpan.
