In [1]:
import pandas as pd
import json
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

import joblib

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(file.read().splitlines())
    
def load_lexicon(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(json.load(file))  
    
slang_dict = json.load(open("txt/kamusSlang.json", "r", encoding="utf-8"))
stopwords = load_file('txt/stopwords-1.txt')
kamus_indonesia = load_file('txt/kamusIndonesia.txt')
pos_lexicon = load_lexicon('leksikon/leksikon-pos.json')
neg_lexicon = load_lexicon('leksikon/leksikon-neg.json')

def preprocessing(text, slang_dict, stopwords, kamus_indonesia, stemmer):
    text = text.lower()  # Case folding
    text = re.sub(r"\\t|\\n|\\u|\\|http[s]?://\\S+|[@#][A-Za-z0-9_]+", " ", text)  # Menghapus karakter khusus
    text = re.sub(r"\\d+", "", text)  # Menghapus angka
    text = text.translate(str.maketrans("", "", string.punctuation))  # Menghapus tanda baca (pakai import string)
    text = re.sub(r"\\s+", ' ', text).strip()  # merapihkan spasi ganda
    text = re.sub(r"\b[a-zA-Z]\b", "", text) # Menghapus satu huruf (besar/kecil)
    text = ' '.join([slang_dict.get(word, word) for word in text.split()]) # Normalisasi (pemanfaatan kamus slang)
    text = word_tokenize(text) # Tokenisasi (sebelum stemming)
    text = [stemmer.stem(word) for word in text] # Stemming
    text = [word for word in text if word not in stopwords and len(word) > 3 and word in kamus_indonesia] # Stopwords & memilah kata
    text = ' '.join(text)
    return text

def hitung_sentimen(text, pos_lexicon, neg_lexicon):
    pos_count = sum(1 for word in text.split() if word in pos_lexicon)
    neg_count = sum(1 for word in text.split() if word in neg_lexicon)
    if pos_count > neg_count:
        return 'Positif', 1
    elif neg_count > pos_count:
        return 'Negatif', -1
    else:
        return 'Netral', 0

df = pd.read_csv('code_filter_crawling/crawling.csv')
df.rename(columns={"full_text" : "teks"}, inplace=True)
df['teks'] = df['teks'].apply(lambda x: preprocessing(x, slang_dict, stopwords, kamus_indonesia,stemmer))

df.to_csv('preprocessing/preprocessing.csv', index=False)
df = df[df['teks'].str.strip().astype(bool)]

df[['label' ,'skor']] = df['teks'].apply(lambda x: pd.Series(hitung_sentimen(x, pos_lexicon, neg_lexicon)))
df = df[df['teks'].str.strip().astype(bool)]
df.to_csv('dataset_berlabel/dataset_berlabel.csv', index=False)

X = df['teks']
y = df['label']
smote = SMOTE(random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)
model = LogisticRegression()
model.fit(X_train_smote, y_train_smote)

y_pred = model.predict(X_test_tfidf)

joblib.dump(model, "model/model_sentimen.pkl")
joblib.dump(vectorizer, "model/vectorizer_sentimen.pkl")

print("akurasi:", accuracy_score(y_test, y_pred))
print("classification report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


akurasi: 0.7860262008733624
classification report:
               precision    recall  f1-score   support

     Negatif       0.87      0.91      0.89       150
      Netral       0.49      0.50      0.49        38
     Positif       0.75      0.59      0.66        41

    accuracy                           0.79       229
   macro avg       0.70      0.67      0.68       229
weighted avg       0.78      0.79      0.78       229

