In [1]:
import pandas as pd
import requests
from io import StringIO

In [2]:
file_link = 'https://drive.google.com/file/d/1J-HpNBLEqrnGckM5bI2Dnm5hV6Vtr_FU/view?usp=drive_link'
file_link = 'https://drive.google.com/uc?id=' + file_link.split('/')[-2]

direct_download_link = file_link.replace('open', 'uc')

response = requests.get(direct_download_link)

csv_from_drive = StringIO(response.text)

In [3]:
df = pd.read_csv(csv_from_drive,
                 sep="\t",
                 names=["text", "sentiment"])

In [4]:
df.head(3)

Unnamed: 0,text,sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive


In [5]:
import re

def cleansing(sent):
    string = sent.lower()
    string = re.sub(r'[^a-zA-Z0-9]',' ',string)
    return string

In [6]:
df['text_clean'] = df['text'].apply(cleansing)

In [7]:
data_preprocessed = df['text_clean'].tolist()

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Ini adalah bagian/proses Vectorization, dimana text "diubah" menjadi vector/numerical
# Nama dari proses ini adalah Feature Extraction

count_vect = CountVectorizer()
count_vect.fit(data_preprocessed)

x = count_vect.transform(data_preprocessed)

In [10]:
# import objek countvectorizer yg berisi vectorization process dari whole training data
# supaya, sebelum dilakukan prediction atas data baru nantinya, data text baru tsb bisa diubah menjadi vector/vectorization

import pickle

pickle.dump(count_vect, open("feature.p", "wb"))

In [13]:
# selanjutnya, sebelum modeling, kita perlu split existing data menjadi data train dan data test

from sklearn.model_selection import train_test_split

classes = df['sentiment']

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, classes, test_size = 0.2)

In [15]:
# modeling menggunakan algoritma machine learning MLPClassifier (basic neural network)

from sklearn.neural_network import MLPClassifier

In [16]:
model = MLPClassifier()
model.fit(x_train, y_train)

In [17]:
# dump model ke dalam file pickle

pickle.dump(model, open("model.p","wb"))

In [18]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test)

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.79      0.79      0.79       707
     neutral       0.78      0.72      0.75       222
    positive       0.88      0.89      0.89      1271

    accuracy                           0.84      2200
   macro avg       0.82      0.80      0.81      2200
weighted avg       0.84      0.84      0.84      2200



In [20]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [21]:
import numpy as np
accuracies = []

y = classes

# Ini main part cross-validation
for iteration, data in enumerate(kf.split(x), start=1):

    data_train = x[data[0]]
    target_train = y[data[0]]

    data_test = x[data[1]]
    target_test = y[data[1]]

    clf = MLPClassifier()
    clf.fit(data_train, target_train)

    preds = clf.predict(data_test)

    accuracy = accuracy_score(target_test, preds)

    print("Training ke-", iteration)
    print(classification_report(target_test, preds))
    print("============================================")

    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)

print()
print()

print("Rata-rata Accuracy: ", average_accuracy)

Training ke- 1
              precision    recall  f1-score   support

    negative       0.78      0.78      0.78       680
     neutral       0.77      0.64      0.70       239
    positive       0.87      0.90      0.89      1281

    accuracy                           0.84      2200
   macro avg       0.81      0.78      0.79      2200
weighted avg       0.83      0.84      0.83      2200

Training ke- 2
              precision    recall  f1-score   support

    negative       0.78      0.76      0.77       706
     neutral       0.72      0.68      0.70       220
    positive       0.87      0.90      0.89      1274

    accuracy                           0.83      2200
   macro avg       0.79      0.78      0.79      2200
weighted avg       0.83      0.83      0.83      2200

Training ke- 3
              precision    recall  f1-score   support

    negative       0.80      0.80      0.80       682
     neutral       0.86      0.72      0.78       215
    positive       0.89      0