In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Make sure the NLTK resources this notebook relies on are downloaded:
# the 'punkt' / 'punkt_tab' tokenizer data and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

print("NLTK data (punkt, stopwords) downloaded. If not, please uncomment the download lines and run again.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK data (punkt, stopwords) downloaded. If not, please uncomment the download lines and run again.


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so that files stored there (such as
# the dataset CSV read later in this notebook) are reachable by file path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- 1. Data collection ---
# Load the tweet dataset from a CSV file under the mounted Google Drive folder.
tweet_csv_path = '/content/drive/MyDrive/Sentimen Analisis/Al_NLP/tweet.csv'
data = pd.read_csv(tweet_csv_path)
df = pd.DataFrame(data)

# Preview the first rows, then report how many rows were loaded.
first_rows = df.head()
print(first_rows)
row_count = len(df)
print(f"\nJumlah data: {row_count}")

   Unnamed: 0 sentimen                                              tweet
0           0  negatif  Kata @prabowo Indonesia tidak dihargai bangsa ...
1           1   netral  Batuan Langka, Tasbih Jokowi Hadiah dari Habib...
2           2   netral  Di era Jokowi, ekonomi Indonesia semakin baik....
3           3  positif  Bagi Sumatera Selatan, Asian Games berdampak p...
4           4  negatif  Negara kita ngutang buat bngun infrastruktur y...

Jumlah data: 1815


In [None]:
# --- 2. Text preprocessing ---
# Stemmer instance; for Indonesian text the stemming step may need adjusting
# or a dedicated library such as Sastrawi.
stemmer = PorterStemmer()
# Indonesian stop words from the NLTK corpus, held in a set for fast lookups.
stop_words_id = set(stopwords.words('indonesian'))

def preprocess_text(text):
    """Normalize one raw text into a space-separated string of tokens.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter or whitespace, tokenize with ``nltk.word_tokenize`` and drop the
    tokens found in ``stop_words_id``.

    Args:
        text: Raw text. Any non-string value (for example ``None`` or the
            float ``NaN`` that pandas uses for an empty CSV cell) is treated
            as empty text.

    Returns:
        str: The surviving tokens joined by single spaces; ``''`` when the
        input is not a string or no token survives the filtering.
    """
    # A missing value has no .lower(); map anything that is not a string to an
    # empty document instead of aborting the whole DataFrame.apply() call.
    if not isinstance(text, str):
        return ''

    text = text.lower()  # Case-fold so matching against the stop-word set is case-insensitive
    # NOTE(review): unwanted characters are deleted, not replaced by a space,
    # so "a,b" becomes "ab" -- confirm that fusing such words is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    kept_tokens = [token for token in tokens if token not in stop_words_id]
    # Stemming is deliberately not applied here; the notebook's own note
    # cautions against using the stemmer as-is on Indonesian text.
    return ' '.join(kept_tokens)

# Name of the column holding the raw tweet text, as listed by df.columns below.
TEXT_COLUMN = 'tweet'

# Show the available columns so the schema can be verified at a glance.
print(df.columns)

# Select the text column by name rather than by position, so this cell keeps
# working if columns are added, dropped or reordered upstream.
df['processed_text'] = df[TEXT_COLUMN].apply(preprocess_text)

print("\n--- Contoh Teks Setelah Preprocessing ---")
# Show raw and cleaned text side by side for a quick sanity check.
print(df[[TEXT_COLUMN, 'processed_text']].head())

Index(['Unnamed: 0', 'sentimen', 'tweet'], dtype='object')

--- Contoh Teks Setelah Preprocessing ---
                                               tweet  \
0  Kata @prabowo Indonesia tidak dihargai bangsa ...   
1  Batuan Langka, Tasbih Jokowi Hadiah dari Habib...   
2  Di era Jokowi, ekonomi Indonesia semakin baik....   
3  Bagi Sumatera Selatan, Asian Games berdampak p...   
4  Negara kita ngutang buat bngun infrastruktur y...   

                                      processed_text  
0  prabowo indonesia dihargai bangsa asing berita...  
1  batuan langka tasbih jokowi hadiah habib luthf...  
2  era jokowi ekonomi indonesia indonesiamaju jok...  
3  sumatera selatan asian games berdampak pd ekon...  
4  negara ngutang bngun infrastruktur udah dipake...  


In [None]:
# --- 3. Feature extraction (TF-IDF) ---
# Features are the cleaned tweets; labels come from the 'sentimen' column.
X, y = df['processed_text'], df['sentimen']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cap the vocabulary at 5000 terms to keep the feature matrix small.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

train_shape = X_train_tfidf.shape
print(f"\nJumlah fitur TF-IDF: {train_shape[1]}")
print(f"Bentuk X_train_tfidf: {train_shape}")


Jumlah fitur TF-IDF: 5000
Bentuk X_train_tfidf: (1452, 5000)


In [None]:
# --- 4. Model training (Multinomial Naive Bayes) ---
# Multinomial Naive Bayes is a common choice for text classification.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)

print("\n--- Model Terlatih ---")


--- Model Terlatih ---


In [None]:
# --- 5. Model evaluation ---
# Predict labels for the held-out test split.
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 report.
print("\n--- Laporan Klasifikasi ---")
report = classification_report(y_test, y_pred)
print(report)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi Model: {accuracy:.2f}")


--- Laporan Klasifikasi ---
              precision    recall  f1-score   support

     negatif       0.62      0.78      0.69       115
      netral       0.73      0.58      0.65       118
     positif       0.65      0.62      0.64       130

    accuracy                           0.66       363
   macro avg       0.67      0.66      0.66       363
weighted avg       0.67      0.66      0.66       363

Akurasi Model: 0.66


In [None]:
# --- 6. Penggunaan Model untuk Prediksi Teks Baru ---
def predict_sentiment(text_input, model, vectorizer, preprocessor):
    """Predict the label of a single piece of text.

    Args:
        text_input: The text to classify.
        model: Object exposing ``predict``; called with the vectorized text.
        vectorizer: Object exposing ``transform``; called with a one-element
            list holding the preprocessed text.
        preprocessor: Callable applied to ``text_input`` before vectorizing.

    Returns:
        The first element of what ``model.predict`` returns.
    """
    cleaned = preprocessor(text_input)
    # The vectorizer receives a batch, so wrap the single document in a list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

# Sample sentences for a quick qualitative check of the trained model.
new_texts = [
    "Saya sangat setuju dengan kebijakan baru ini, ini akan membawa perubahan positif.",
    "Pemerintah tidak becus mengurus masalah ini, sangat mengecewakan.",
    "Rapat kabinet hari ini membahas anggaran negara.",
    "Partai oposisi mengkritik keras kinerja pemerintah.",
    "Masyarakat menyambut baik program vaksinasi."
]

print("\n--- Prediksi Sentimen Teks Baru ---")

# Classify each sample with the fitted model and vectorizer and print the result.
for text in new_texts:
    sentiment = predict_sentiment(
        text,
        model=model,
        vectorizer=tfidf_vectorizer,
        preprocessor=preprocess_text,
    )
    print(f"Teks: '{text}' -> Sentimen: {sentiment}")


--- Prediksi Sentimen Teks Baru ---
Teks: 'Saya sangat setuju dengan kebijakan baru ini, ini akan membawa perubahan positif.' -> Sentimen: positif
Teks: 'Pemerintah tidak becus mengurus masalah ini, sangat mengecewakan.' -> Sentimen: negatif
Teks: 'Rapat kabinet hari ini membahas anggaran negara.' -> Sentimen: positif
Teks: 'Partai oposisi mengkritik keras kinerja pemerintah.' -> Sentimen: netral
Teks: 'Masyarakat menyambut baik program vaksinasi.' -> Sentimen: positif
