# Import Library

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

  Sentiment                             Instagram Comment Text
0  negative   <USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1  negative  Geblek lo tata...cowo bgt dibela2in balikan......
2  negative  Kmrn termewek2 skr lengket lg duhhh kok labil ...
3  negative  Intinya kalau kesel dengan ATT nya, gausah ke ...
4  negative  hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Dataset dari Repository

---
https://github.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia



In [None]:
# Source CSV of Instagram comments with a 'Sentiment' label column.
url = 'https://raw.githubusercontent.com/rizalespe/Dataset-Sentimen-Analisis-Bahasa-Indonesia/master/dataset_komentar_instagram_cyberbullying.csv'
df = pd.read_csv(url)

# Drop incomplete rows BEFORE relabelling: the mapping below turns every value
# that is not 'positive' (including NaN) into the toxic label, so a dropna()
# placed after it could never remove rows with a missing sentiment.
# .copy() gives an independent frame so the column assignment below does not
# trigger SettingWithCopyWarning.
df = df[['Sentiment', 'Instagram Comment Text']].dropna().copy()
df['Sentiment'] = df['Sentiment'].apply(lambda x: "Kamu Baik Banget Bang" if x == 'positive' else "Toxic Banget Kamu Rek")

print(df.head())


               Sentiment                             Instagram Comment Text
0  Toxic Banget Kamu Rek   <USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1  Toxic Banget Kamu Rek  Geblek lo tata...cowo bgt dibela2in balikan......
2  Toxic Banget Kamu Rek  Kmrn termewek2 skr lengket lg duhhh kok labil ...
3  Toxic Banget Kamu Rek  Intinya kalau kesel dengan ATT nya, gausah ke ...
4  Toxic Banget Kamu Rek  hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...


# Hilangkan karakter khusus dan angka, ubah ke huruf kecil, lalu buang stop word

In [None]:
# Indonesian stop words from the NLTK corpus, held in a set for fast
# membership tests during token filtering.
indonesian_stopword_list = stopwords.words('indonesian')
stop_words = set(indonesian_stopword_list)

def preprocess_text(text, stop_word_set=None):
    """Normalize one comment before TF-IDF vectorization.

    Every character that is not an ASCII letter or whitespace is replaced by
    a space, the text is lowercased, and stop words are dropped.

    Parameters
    ----------
    text : str
        Raw comment text. Non-string values (e.g. NaN) yield an empty string.
    stop_word_set : collection of str, optional
        Lowercase words to remove. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stop_word_set is None:
        stop_word_set = stop_words
    # Flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional re.I|re.A would be read as "replace
    # at most 258 matches" instead of as flags.
    # Substituting a space (not '') keeps words that are separated only by
    # punctuation, e.g. "tata...cowo", from being fused into a single token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text, flags=re.ASCII)
    text = text.lower()
    # split() with no argument also discards leading/trailing/repeated whitespace.
    return ' '.join(word for word in text.split() if word not in stop_word_set)

# Clean every comment element-wise and preview the result.
cleaned_comments = df['Instagram Comment Text'].map(preprocess_text)
df['Instagram Comment Text'] = cleaned_comments
print(df.head())

               Sentiment                             Instagram Comment Text
0  Toxic Banget Kamu Rek  username tolol gak hubungan nya keguguran dgn ...
1  Toxic Banget Kamu Rek  geblek lo tatacowo bgt dibelain balikanhadewwn...
2  Toxic Banget Kamu Rek  kmrn termewek skr lengket lg duhhh labil bgt s...
3  Toxic Banget Kamu Rek  intinya kesel att nya gausah anaknya kasian pe...
4  Toxic Banget Kamu Rek  hadewwwww permpuan lgsakit jiwaknp yg jd peran...


# Membagi Dataset Menjadi Train dan Test

In [None]:
# Features are the cleaned comment texts; targets are the sentiment labels.
X = df['Instagram Comment Text']
y = df['Sentiment']

# Hold out 20% for evaluation. stratify=y keeps the class proportions the same
# in both splits, so the test metrics are not skewed by a lucky or unlucky draw
# on this small dataset; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Latih Model

In [None]:
# Learn the TF-IDF vocabulary (capped at 5000 terms) from the training
# comments only, then reuse that fitted vocabulary for the test comments.
vectorizer = TfidfVectorizer(
    max_features=5000,
)
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    vectorizer.transform(X_test),       # evaluated second: transform only
)


In [None]:
# Logistic-regression classifier with default hyperparameters,
# trained on the TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X=X_train_vec, y=y_train)


In [None]:
# Predict labels for the held-out comments.
y_pred = model.predict(X_test_vec)

# Accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Akurasi: {accuracy}')

# Confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix:', cm, sep='\n')

# Per-class precision / recall / F1
cr = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification Report:', cr, sep='\n')


Akurasi: 0.8875
Confusion Matrix:
[[35  1]
 [ 8 36]]
Classification Report:
                       precision    recall  f1-score   support

Kamu Baik Banget Bang       0.81      0.97      0.89        36
Toxic Banget Kamu Rek       0.97      0.82      0.89        44

             accuracy                           0.89        80
            macro avg       0.89      0.90      0.89        80
         weighted avg       0.90      0.89      0.89        80



In [None]:
# Two hand-written comments for a quick sanity check of the trained model.
tes_rek = ["Woi anjing kau tolol goblok setan babi",
           "Kamu sangat keren kafitra ganteng"]
# Apply the same cleaning used on the training data before vectorizing, so the
# model sees inference text in the same form as the text it was trained on.
tes_rek_clean = [preprocess_text(text) for text in tes_rek]
print(model.predict(vectorizer.transform(tes_rek_clean)))


['Toxic Banget Kamu Rek' 'Kamu Baik Banget Bang']
