Import Library

In [None]:
import pandas as pd
import json
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

import joblib

Load sumber daya dan pendefinisian fungsi preprocessing untuk dataset pelatihan model

LINK REFERENSI
* stopwords : https://github.com/stopwords-iso/stopwords-id/blob/master/stopwords-id.txt
* kamus leksikon : https://github.com/onpilot/sentimen-bahasa/tree/master
* kamus bahasa indonesia : https://github.com/damzaky/kumpulan-kata-bahasa-indonesia-KBBI/blob/master/list_1.0.0.txt
* kamus slang : https://github.com/nasalsabila/kamus-alay/blob/master/colloquial-indonesian-lexicon.csv

In [None]:
# Build the stemmer once from the Sastrawi factory so the same instance can be
# passed to every preprocessing call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def load_file(file_path):
    """Read a UTF-8 text file and return its distinct lines as a set.

    Line terminators are removed by ``str.splitlines``; repeated lines collapse
    into a single set member.

    Args:
        file_path: Path of the text file to read.

    Returns:
        set[str]: The unique lines of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return set(contents.splitlines())
    
def load_lexicon(file_path):
    """Parse a UTF-8 JSON file and return the items of the parsed value as a set.

    Iterating a JSON array yields its elements; iterating a JSON object yields
    its keys.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        set: The unique items obtained by iterating the parsed JSON value.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        entries = json.load(handle)
    return set(entries)
    
# Slang dictionary (parsed JSON). A context manager closes the file handle
# deterministically instead of leaving an open handle for the garbage collector.
with open("txt/kamusSlang.json", "r", encoding="utf-8") as slang_file:
    slang_dict = json.load(slang_file)

# Word lists loaded as sets for fast membership tests.
stopwords = load_file('txt/stopwords-1.txt')
kamus_indonesia = load_file('txt/kamusIndonesia.txt')

# Positive / negative sentiment lexicons.
pos_lexicon = load_lexicon('leksikon/leksikon-pos.json')
neg_lexicon = load_lexicon('leksikon/leksikon-neg.json')

def preprocessing(text, slang_dict, stopwords, kamus_indonesia, stemmer):
    """Clean one raw text and return the retained, stemmed words joined by spaces.

    Steps, in order: lowercase; strip escape-sequence residue, URLs, mentions
    and hashtags; remove digits and ASCII punctuation; drop single-letter words;
    collapse whitespace; replace slang words; tokenise; stem; then keep only
    words that are not stopwords, are longer than three characters and appear
    in the dictionary.

    Args:
        text: Raw text. Any non-string value (for example NaN read from an
            empty CSV cell) is treated as empty text.
        slang_dict: Mapping from an informal word to its replacement.
        stopwords: Container of words to discard.
        kamus_indonesia: Container of accepted words; all other words are
            discarded.
        stemmer: Object exposing ``stem(word)`` that returns the stemmed word.

    Returns:
        str: The cleaned text, or an empty string when nothing is retained.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Case folding
    # Remove literal "\t", "\n", "\u" residue and stray backslashes, then URLs,
    # @mentions and #hashtags. In a raw string a regex class is written with a
    # single backslash (\S); a doubled one would match a literal backslash.
    text = re.sub(r"\\t|\\n|\\u|\\|https?://\S+|[@#][A-Za-z0-9_]+", " ", text)
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove ASCII punctuation
    text = re.sub(r"\b[a-zA-Z]\b", "", text)  # Remove single-letter words
    text = re.sub(r"\s+", " ", text).strip()  # Collapse whitespace runs
    # Normalisation: replace each word that has an entry in the slang dictionary
    text = ' '.join(slang_dict.get(word, word) for word in text.split())
    tokens = word_tokenize(text)  # Tokenise before stemming
    tokens = [stemmer.stem(word) for word in tokens]
    # Stopword removal plus dictionary/length filter
    kept = [
        word for word in tokens
        if word not in stopwords and len(word) > 3 and word in kamus_indonesia
    ]
    return ' '.join(kept)


Fungsi pelabelan sentimen (boleh diskip)

In [None]:
def hitung_sentimen(text, leksikon_positif, leksikon_negatif, prior_positif=0.5, prior_negatif=0.5):
    """Label a text as positive, negative or neutral using two word lexicons.

    Every whitespace-separated word found in a lexicon is weighted by
    ``1 / len(lexicon)``. The positive and negative weights of a word are
    normalised so they sum to one and are accumulated over the text. The
    accumulated shares are normalised again and multiplied by the priors; the
    larger product decides the label.

    Args:
        text: Text whose words are looked up in the lexicons.
        leksikon_positif: Sized container of positive words.
        leksikon_negatif: Sized container of negative words.
        prior_positif: Multiplier applied to the positive share.
        prior_negatif: Multiplier applied to the negative share.

    Returns:
        tuple: ``('Positif', 1)``, ``('Negatif', -1)`` or ``('Netral', 0)``;
        neutral is returned on a tie or when no word is found in either lexicon.
    """
    bobot_positif = 0
    bobot_negatif = 0

    for kata in text.split():
        # Per-word weight in each lexicon; zero when the word is absent.
        peluang_pos = 1 / len(leksikon_positif) if kata in leksikon_positif else 0
        peluang_neg = 1 / len(leksikon_negatif) if kata in leksikon_negatif else 0

        penyebut = peluang_pos + peluang_neg
        if not penyebut > 0:
            # Word is in neither lexicon, so it contributes nothing.
            continue

        # Normalised per-word shares added to the running totals.
        bobot_positif += peluang_pos / penyebut
        bobot_negatif += peluang_neg / penyebut

    total_bobot = bobot_positif + bobot_negatif
    if total_bobot > 0:
        peluang_positif = (bobot_positif / total_bobot) * prior_positif
        peluang_negatif = (bobot_negatif / total_bobot) * prior_negatif
    else:
        peluang_positif = peluang_negatif = 0

    # The larger prior-weighted share wins; anything else is neutral.
    if peluang_positif > peluang_negatif:
        return 'Positif', 1
    if peluang_negatif > peluang_positif:
        return 'Negatif', -1
    return 'Netral', 0

Pemanggilan fungsi preprocessing

In [None]:
# Load the crawled data and expose its "full_text" column as "teks"
# (chained rename instead of inplace=True keeps the cell re-runnable).
df = pd.read_csv('code_filter_crawling/crawling.csv').rename(columns={"full_text": "teks"})

# Empty CSV cells are read as NaN (a float); turn them into empty strings so the
# string operations inside preprocessing never receive a non-string value.
df['teks'] = df['teks'].fillna('').astype(str).apply(
    lambda x: preprocessing(x, slang_dict, stopwords, kamus_indonesia, stemmer)
)

# Persist the complete preprocessing result, including rows that became empty.
df.to_csv('preprocessing/preprocessing.csv', index=False)

# Keep only rows that still contain text. .copy() detaches the result from the
# original frame so later column assignments do not raise SettingWithCopyWarning.
df = df[df['teks'].str.strip().astype(bool)].copy()

Pemanggilan fungsi pelabelan (boleh diskip)

In [None]:
# Work on rows that contain text only; .copy() makes the frame independent so the
# column assignment below is not performed on a slice of another frame.
df = df[df['teks'].str.strip().astype(bool)].copy()

# Label every text once. Collecting the (label, score) tuples in a list and
# building a single DataFrame avoids creating one pd.Series per row and also
# works when the frame is empty.
hasil_label = [hitung_sentimen(teks, pos_lexicon, neg_lexicon) for teks in df['teks']]
df[['label', 'skor']] = pd.DataFrame(hasil_label, index=df.index, columns=['label', 'skor'])

df.to_csv('dataset_berlabel/dataset_berlabel.csv', index=False)

Pengecekan oversampling

In [None]:
import pandas as pd
from collections import Counter
df = pd.read_csv('dataset_berlabel/dataset_berlabel.csv')

# Step 1: count how many rows carry each label.
label_distribution = Counter(df['label'])
print("Distribusi Label:", label_distribution)

# Step 2: the dataset counts as balanced only when every label has the same count,
# i.e. the set of counts holds exactly one value.
is_balanced = len({*label_distribution.values()}) == 1
print(
    "Dataset sudah seimbang. Tidak perlu oversampling."
    if is_balanced
    else "Dataset tidak seimbang. Oversampling dapat dipertimbangkan dengan menggunakan SMOTE"
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Only the text and its label are needed from here on.
df = df[['teks', 'label']]

# Bar chart of the number of rows per label, annotated through the returned Axes.
ax = sns.countplot(data=df, x='label', color='green')
ax.set_title('Distribusi Label')
ax.set_xlabel('Label')
ax.set_ylabel('Jumlah label')
plt.show()

Pelatihan model logistic regression (tanpa SMOTE)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

data = pd.read_csv('dataset_berlabel/dataset_berlabel.csv')

X = data['teks']
y = data['label']

# Hold out 10% of the rows for testing; the seed is fixed for reproducibility.
# NOTE(review): the split is not stratified; with imbalanced labels consider
# stratify=y so the test fold keeps the label proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# TF-IDF vectorisation: the vocabulary is fitted on the training fold only and
# then reused for the test fold.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the Logistic Regression model.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict on both folds so training and testing accuracy can be compared.
y_train_pred = model.predict(X_train_tfidf)
y_test_pred = model.predict(X_test_tfidf)

# Accuracy on the training and testing folds ("Traning" typo fixed so the label
# matches the one printed by the SMOTE experiment).
print("Akurasi Training:", accuracy_score(y_train, y_train_pred))
print("Akurasi Testing:", accuracy_score(y_test, y_test_pred))

# Per-class report for the test fold.
print("Classification Report:\n", classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Macro-averaged precision, recall and F1 on the test fold, computed in that
# order by applying each scorer to the same labels and predictions.
precision, recall, f1 = (
    metrik(y_test, y_test_pred, average='macro')
    for metrik in (precision_score, recall_score, f1_score)
)

print(f"Presisi (keseluruhan): {precision:.2f}")
print(f"Recall (keseluruhan): {recall:.2f}")
print(f"F1-Score (keseluruhan): {f1:.2f}")


Pelatihan model logistic regression (menggunakan SMOTE) proporsi split data 70:30

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

df = pd.read_csv('dataset_berlabel/dataset_berlabel.csv')
X, y = df['teks'], df['label']

# 70:30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TF-IDF features: fitted on the training fold, applied unchanged to the test fold.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Oversample the training vectors only; the test fold is left as it is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# Fit Logistic Regression on the resampled training data (fit returns the estimator).
model = LogisticRegression().fit(X_train_smote, y_train_smote)

# Predictions for the resampled training set and for the untouched test fold.
y_train_pred = model.predict(X_train_smote)
y_pred = model.predict(X_test_tfidf)

# Accuracy on both sets, followed by the per-class report for the test fold.
print("Akurasi Training:", accuracy_score(y_train_smote, y_train_pred))
print("Akurasi Testing:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# # Persist the model and the vectorizer
# joblib.dump(model, "model/model_sentimen.pkl")
# joblib.dump(vectorizer, "model/vectorizer_sentimen.pkl")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Macro-averaged precision, recall and F1 on the test fold, computed in that
# order by applying each scorer to the same labels and predictions.
precision, recall, f1 = (
    metrik(y_test, y_pred, average='macro')
    for metrik in (precision_score, recall_score, f1_score)
)

print(f"Presisi (keseluruhan): {precision:.2f}")
print(f"Recall (keseluruhan): {recall:.2f}")
print(f"F1-Score (keseluruhan): {f1:.2f}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrices for the resampled training set and for the test fold.
cmtrain = confusion_matrix(y_train_smote, y_train_pred)
cmtest = confusion_matrix(y_test, y_pred)

# Render both matrices with identical styling: training first, then testing.
for _matriks, _judul in (
    (cmtrain, "Confusion Matrix Data Training"),
    (cmtest, "Confusion Matrix Data Testing"),
):
    disp = ConfusionMatrixDisplay(confusion_matrix=_matriks, display_labels=model.classes_)
    disp.plot(cmap="Greens")
    plt.xlabel("Label Prediksi")
    plt.ylabel("Label Aktual")
    plt.title(_judul)
    plt.show()

Model Naive Bayes

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from imblearn.over_sampling import SMOTE
# from sklearn.naive_bayes import MultinomialNB  # Ganti LogisticRegression dengan Naive Bayes
# from sklearn.metrics import accuracy_score, classification_report
# import joblib

# # Baca dataset
# df = pd.read_csv('dataset_berlabel/dataset_berlabel.csv')
# X = df['teks']
# y = df['label']

# # Bagi data menjadi training dan testing
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# # Vectorisasi teks menggunakan TF-IDF
# vectorizer = TfidfVectorizer()
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)

# # Lakukan SMOTE untuk menangani data tidak seimbang
# smote = SMOTE(random_state=42)
# X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# # Latih model Naive Bayes (Multinomial Naive Bayes)
# model = MultinomialNB()  # Ganti LogisticRegression dengan Naive Bayes
# model.fit(X_train_smote, y_train_smote)

# # Prediksi pada data training (untuk menghitung akurasi training)
# y_train_pred = model.predict(X_train_smote)

# # Prediksi pada data testing
# y_pred = model.predict(X_test_tfidf)

# # Simpan model dan vectorizer
# joblib.dump(model, "model/model_sentimen.pkl")
# joblib.dump(vectorizer, "model/vectorizer_sentimen.pkl")

# # Evaluasi model
# print("Akurasi Training:", accuracy_score(y_train_smote, y_train_pred))
# print("Akurasi Testing:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))