In [99]:
import pandas as pd

## Evaluasi Pertama
Preprocessing data dilakukan dengan case folding, tokenization, filtering (remove slang words, remove stopwords), dan stemming.

In [100]:
# First evaluation set. Besides the raw and cleaned comment text, the preview
# shows a manual label column (`label_manual`) and one predicted sentiment
# code per model.
df = pd.read_csv(filepath_or_buffer='data/df_sentiment_code.csv')
df.head(n=5)

Unnamed: 0,ï»¿comment,clean,label_manual,sentimen,sentiment_indobert,sentiment_indobert_kotor,sentiment_roberta,sentiment_roberta_kotor
0,Izin bertnya apakah studi idenpenden mitra pro...,izin bertnya studi idenpenden mitra programing...,1,1,1,1,1,1
1,Dapat uang saku gaksih,uang saku gaksih,1,1,0,2,1,1
2,@rwrt1.0 kelas full english kak batch ini,kelas full english batch,1,2,1,1,1,1
3,"Saya, saya bang ga lolos msib",lolos msib,0,1,2,0,1,0
4,@syanasblaa lanjut dm kak,dm,1,1,1,1,0,1


In [101]:
# The first header is read back as 'ï»¿comment'; the three leading characters
# look like a UTF-8 byte-order mark that was decoded with a single-byte codec.
# Give the column its plain name so later cells can refer to (and join on) 'comment'.
df = df.rename({'ï»¿comment': 'comment'}, axis='columns')

In [102]:
from sklearn.metrics import accuracy_score

# Score every prediction column against the manually assigned labels.
# accuracy_score returns the fraction of rows whose prediction equals the label.
y_true = df['label_manual']
prediction_columns = (
    'sentimen',                  # reported below as TextBlob
    'sentiment_indobert',
    'sentiment_indobert_kotor',  # NOTE(review): 'kotor' presumably = predictions on uncleaned text; confirm
    'sentiment_roberta',
    'sentiment_roberta_kotor',
)
(acc_textblob,
 acc_indobert,
 acc_indobert_kotor,
 acc_roberta,
 acc_roberta_kotor) = (accuracy_score(y_true, df[col]) for col in prediction_columns)

# One line per model, in the same order as the columns above.
for caption, score in (
    ('Akurasi TextBlob: ', acc_textblob),
    ('Akurasi Indobert: ', acc_indobert),
    ('Akurasi Indobert Kotor: ', acc_indobert_kotor),
    ('Akurasi Roberta: ', acc_roberta),
    ('Akurasi Roberta Kotor: ', acc_roberta_kotor),
):
    print(caption, score)

Akurasi TextBlob:  0.513184584178499
Akurasi Indobert:  0.5496957403651116
Akurasi Indobert Kotor:  0.6632860040567952
Akurasi Roberta:  0.4969574036511156
Akurasi Roberta Kotor:  0.6166328600405679


## Evaluasi Kedua
Preprocessing data dilakukan dengan case folding, tokenization, dan filtering (remove slang words), tanpa stopword removal dan stemming.

In [103]:
# Second evaluation set (the "tanpa_stem" file). The preview shows the same
# prediction columns as the first set but no `label_manual` column.
df2 = pd.read_csv(filepath_or_buffer='data/df_sentiment_tanpa_stem_code.csv')
df2.head(n=5)

Unnamed: 0,comment,clean,sentimen,sentiment_indobert,sentiment_indobert_kotor,sentiment_roberta,sentiment_roberta_kotor
0,Izin bertnya apakah studi idenpenden mitra pro...,izin bertnya apakah studi idenpenden mitra pro...,1,1,1,1,1
1,Dapat uang saku gaksih,dapat uang saku gaksih,1,2,2,1,1
2,@rwrt1.0 kelas full english kak batch ini,kelas full english kak batch ini,2,1,1,1,1
3,"Saya, saya bang ga lolos msib",saya saya bang tidak lolos msib,1,0,0,0,0
4,@syanasblaa lanjut dm kak,lanjut dm kak,1,1,1,1,1


In [104]:
# Attach the manual labels to the second evaluation set by matching on the raw
# comment text. A left join keeps every row of df2; rows whose comment has no
# match in df end up with NaN in `label_manual`.
# NOTE(review): if the same comment text occurs more than once in df, the join
# repeats the matching df2 rows. Confirm `comment` is unique or that this is intended.
manual_labels = df[['comment', 'label_manual']]
merged_table = df2.merge(manual_labels, how='left', on='comment')
merged_table.head()

Unnamed: 0,comment,clean,sentimen,sentiment_indobert,sentiment_indobert_kotor,sentiment_roberta,sentiment_roberta_kotor,label_manual
0,Izin bertnya apakah studi idenpenden mitra pro...,izin bertnya apakah studi idenpenden mitra pro...,1,1,1,1,1,1.0
1,Dapat uang saku gaksih,dapat uang saku gaksih,1,2,2,1,1,1.0
2,@rwrt1.0 kelas full english kak batch ini,kelas full english kak batch ini,2,1,1,1,1,1.0
3,"Saya, saya bang ga lolos msib",saya saya bang tidak lolos msib,1,0,0,0,0,0.0
4,@syanasblaa lanjut dm kak,lanjut dm kak,1,1,1,1,1,1.0


In [105]:
# Keep only complete rows: a row is removed if ANY of its columns is missing.
# That covers comments left without a manual label by the join, and also rows
# missing any other field.
merged_table = merged_table.dropna(axis=0, how='any')

In [106]:
# After the left join `label_manual` is shown as float (1.0, 0.0, ...).
# Cast it back to int so it has the same type as the integer prediction codes.
merged_table = merged_table.astype({'label_manual': int})

In [107]:
from sklearn.metrics import accuracy_score

# Score every prediction column of the second evaluation set against the
# manual labels joined in from the first set.
# accuracy_score returns the fraction of rows whose prediction equals the label.
y_true = merged_table['label_manual']
prediction_columns = (
    'sentimen',                  # reported below as TextBlob
    'sentiment_indobert',
    'sentiment_indobert_kotor',
    'sentiment_roberta',
    'sentiment_roberta_kotor',
)
(acc_textblob,
 acc_indobert,
 acc_indobert_kotor,
 acc_roberta,
 acc_roberta_kotor) = (accuracy_score(y_true, merged_table[col]) for col in prediction_columns)

# One line per model, in the same order as the columns above.
for caption, score in (
    ('Akurasi TextBlob: ', acc_textblob),
    ('Akurasi Indobert: ', acc_indobert),
    ('Akurasi Indobert Kotor: ', acc_indobert_kotor),
    ('Akurasi Roberta: ', acc_roberta),
    ('Akurasi Roberta Kotor: ', acc_roberta_kotor),
):
    print(caption, score)

Akurasi TextBlob:  0.5128205128205128
Akurasi Indobert:  0.6410256410256411
Akurasi Indobert Kotor:  0.6780626780626781
Akurasi Roberta:  0.6096866096866097
Akurasi Roberta Kotor:  0.6438746438746439


## Evaluasi Pertama (Ulang)
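Evaluasi pertama diulang karena jumlah data pada evaluasi kedua berkurang: sebagian baris terhapus saat baris dengan missing value di-drop. Agar kedua evaluasi dapat dibandingkan, evaluasi pertama dihitung ulang pada subset data yang sama.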

In [108]:
# Re-run setup for the first evaluation: join df to df2 on the raw comment text
# and keep only fully populated rows. Columns present in both frames get the
# pandas default suffixes: `_x` for df (first evaluation) and `_y` for df2.
merged_table2 = df.merge(df2, how='left', on='comment').dropna(axis=0, how='any')
merged_table2.shape

(351, 14)

In [111]:
from sklearn.metrics import accuracy_score

# Re-score the first evaluation on the reduced row set. The `_x` suffix selects
# the prediction columns that came from df (the left frame of the merge).
# accuracy_score returns the fraction of rows whose prediction equals the label.
y_true = merged_table2['label_manual']
prediction_columns = (
    'sentimen_x',                  # reported below as TextBlob
    'sentiment_indobert_x',
    'sentiment_indobert_kotor_x',
    'sentiment_roberta_x',
    'sentiment_roberta_kotor_x',
)
(acc_textblob,
 acc_indobert,
 acc_indobert_kotor,
 acc_roberta,
 acc_roberta_kotor) = (accuracy_score(y_true, merged_table2[col]) for col in prediction_columns)

# One line per model, in the same order as the columns above.
for caption, score in (
    ('Akurasi TextBlob: ', acc_textblob),
    ('Akurasi Indobert: ', acc_indobert),
    ('Akurasi Indobert Kotor: ', acc_indobert_kotor),
    ('Akurasi Roberta: ', acc_roberta),
    ('Akurasi Roberta Kotor: ', acc_roberta_kotor),
):
    print(caption, score)

Akurasi TextBlob:  0.5498575498575499
Akurasi Indobert:  0.5555555555555556
Akurasi Indobert Kotor:  0.6780626780626781
Akurasi Roberta:  0.5242165242165242
Akurasi Roberta Kotor:  0.6438746438746439


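## Hide Me
Sel-sel di bawah ini dikomentari (tidak dijalankan); isinya kode untuk mengubah label sentimen dari teks menjadi angka (0/1/2) dan menyimpan hasilnya ke `data/df_sentiment_tanpa_stem_code.csv`.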

In [None]:
# df2 = pd.read_csv("data/df_sentiment_tanpa_stem.csv", encoding="latin-1", index_col=0)
# df2.sample(10)

In [None]:
# label_dict = {"Negatif": 0, "Netral": 1, "Positif": 2}

# # menggunakan map untuk mengubah label menjadi angka
# df2["sentimen"] = df2["sentimen"].map(label_dict)
# df2["sentiment_indobert"] = df2["sentiment_indobert"].map(label_dict)
# df2["sentiment_indobert_kotor"] = df2["sentiment_indobert_kotor"].map(label_dict)

In [None]:
# label_dict = {"negative": 0, "neutral": 1, "positive": 2}

# # menggunakan map untuk mengubah label menjadi angka
# df2["sentiment_roberta"] = df2["sentiment_roberta"].map(label_dict)
# df2["sentiment_roberta_kotor"] = df2["sentiment_roberta_kotor"].map(label_dict)

In [None]:
# df2.to_csv("data/df_sentiment_tanpa_stem_code.csv", index=False)