In [1]:
# Install jika belum ada
!pip install scikit-learn

# Import library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv('playstore_reviews.csv')
print("Jumlah data:", len(df))
df.head()

# Label sentimen dari skor rating
def label_sentiment(score):
    if score >= 4:
        return 'positif'
    elif score == 3:
        return 'netral'
    else:
        return 'negatif'

df['sentiment'] = df['rating'].apply(label_sentiment)

# Cek distribusi label
print("Distribusi sentimen:\n", df['sentiment'].value_counts())

# TF-IDF untuk ekstraksi fitur
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['review'].astype(str)).toarray()

# Encode label ke angka
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])  # 0=negatif, 1=netral, 2=positif

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Latih model Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluasi
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\n🎯 Akurasi Testing: {acc * 100:.2f}%")
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

# Inference (Prediksi Kalimat Baru)
def prediksi_sentimen(kalimat):
    vector = vectorizer.transform([kalimat]).toarray()
    hasil = model.predict(vector)
    return le.inverse_transform(hasil)[0]

# Contoh prediksi
contoh = "Aplikasi ini sangat jelek dan sering error"
hasil = prediksi_sentimen(contoh)
print("\n🧠 Inference Contoh:")
print(f"Kalimat: '{contoh}'")
print(f"Hasil Prediksi Sentimen: {hasil}")


Jumlah data: 3000
Distribusi sentimen:
 sentiment
positif    1845
negatif    1020
netral      135
Name: count, dtype: int64

🎯 Akurasi Testing: 98.33%

📊 Classification Report:
               precision    recall  f1-score   support

     negatif       1.00      0.99      0.99       204
      netral       1.00      0.74      0.85        27
     positif       0.97      1.00      0.99       369

    accuracy                           0.98       600
   macro avg       0.99      0.91      0.94       600
weighted avg       0.98      0.98      0.98       600


🧠 Inference Contoh:
Kalimat: 'Aplikasi ini sangat jelek dan sering error'
Hasil Prediksi Sentimen: positif
