#### Mempersiapkan Library

In [31]:
import pandas as pd
import numpy as np
import re
import csv
from string import punctuation
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle

In [2]:
from tqdm import tqdm
from time import sleep

#### Mempersiapkan Dataset

In [3]:
# Load the tab-separated dataset: column 0 is the text, column 1 is its label.
tsv_file_path = "../train_preprocess.tsv.txt"

# newline='' is what the csv module expects for files it reads, so that it can
# do its own newline handling (including newlines embedded in quoted fields).
with open(tsv_file_path, encoding='utf-8', newline='') as tsvfile:
    tsv_reader = csv.reader(tsvfile, delimiter='\t')
    # csv.reader yields an empty list for a blank line; skip those instead of
    # crashing with IndexError. Rows with only one field still raise, so
    # genuinely malformed data is not silently dropped.
    rows = [(row[0], row[1]) for row in tsv_reader if row]

data_text = [text for text, _ in rows]
label = [sentiment for _, sentiment in rows]

df = pd.DataFrame({'data_text': data_text, 'label': label})

In [4]:
# Hold out 20% of the data for testing, then take 20% of the remainder for
# validation. The fixed seed keeps both splits reproducible.
split_options = dict(test_size=0.2, random_state=0)
train_data, test_data = train_test_split(df, **split_options)
train_data, val_data = train_test_split(train_data, **split_options)

In [5]:
# Report (rows, columns) for the train, test and validation splits, in that order.
for split in (train_data, test_data, val_data):
    print(split.shape)

(7040, 2)
(2200, 2)
(1760, 2)


In [6]:
# Show how many samples of each label ended up in every split.
named_splits = (
    ('train_data', train_data),
    ('test_data', test_data),
    ('val_data', val_data),
)
for split_name, split in named_splits:
    print(f'Komposisi Label {split_name}:')
    print(split['label'].value_counts())

Komposisi Label train_data:
positive    4134
negative    2185
neutral      721
Name: label, dtype: int64
Komposisi Label test_data:
positive    1272
negative     688
neutral      240
Name: label, dtype: int64
Komposisi Label val_data:
positive    1010
negative     563
neutral      187
Name: label, dtype: int64


#### Normalisasi Text

In [7]:
def lowercasing(paragraph):
    """Return ``paragraph`` converted to lowercase."""
    lowered = paragraph.lower()
    return lowered

In [8]:
# Character class matching every ASCII punctuation character. The characters
# must be escaped before being placed inside [...]: unescaped, the sequence
# `\]` in `string.punctuation` is read as an escaped `]`, so a literal
# backslash would never be matched (and therefore never removed).
_PUNCTUATION_RE = re.compile(f'[{re.escape(punctuation)}]')


def menghilangkan_tandabaca(paragraph):
    """Remove every ASCII punctuation character from ``paragraph``.

    Args:
        paragraph: The text to clean.

    Returns:
        ``paragraph`` with all characters of ``string.punctuation`` deleted.
        Other characters, including whitespace, are left untouched.
    """
    new_paragraph = _PUNCTUATION_RE.sub('', paragraph)
    return new_paragraph

In [9]:
def text_normalization(paragraph):
    """Normalize a text for vectorization.

    The text is lowercased, stripped of punctuation, and every run of
    consecutive spaces is squeezed into a single space.
    """
    cleaned = menghilangkan_tandabaca(lowercasing(paragraph))
    return re.sub(r"[ ]+", r' ', cleaned)

In [10]:
# Normalize the text column of every split with the same function.
for split in (train_data, test_data, val_data):
    split['data_text'] = split['data_text'].apply(text_normalization)

#### Feature Extraction and Label Encoding

In [11]:
# Count-based vectorizer that uses n-grams of length 1 to 3 as features.
cv = CountVectorizer(ngram_range=(1,3))

In [12]:
# Fit the vectorizer on the training split only, so the validation and test
# texts do not influence the learned feature space.
cv.fit(train_data['data_text'])

In [13]:
# Turn the text of every split into count vectors using the fitted vectorizer.
train_vector, test_vector, val_vector = (
    cv.transform(split['data_text'])
    for split in (train_data, test_data, val_data)
)

In [14]:
# Display the training texts as a quick sanity check of the normalization.
train_data['data_text']

4057     indihome gangguan terus bayar nya saja mahal t...
7811     tempat pemandangan nya luar biasa dan pelayana...
5046     bagaimana kalau netizen pada patungan menyewa ...
7404                                           tidak sehat
8055     ulah sendiri kok gubernur yang disalahkan dasa...
                               ...                        
10959                  saya cemburu tapi tidak bisa apaapa
3997     bubarkan saja dpr rakyat tidak butuh diwakili ...
4118     steak di sini selalu membuat ketagihan menu ke...
5646                                      adit kayak tarik
10675    restoran sumoamg padang dago adalah palung pop...
Name: data_text, Length: 7040, dtype: object

In [15]:
# Encoder used to map the string labels to integer class ids and back.
lb = LabelEncoder()

In [16]:
# Fit the encoder on the labels of the training split only.
lb.fit(train_data['label'])

In [17]:
# Encode the labels of every split with the encoder fitted on the training labels.
train_labels, test_labels, val_labels = (
    lb.transform(split['label'])
    for split in (train_data, test_data, val_data)
)

#### Model Training, Testing and Evaluation

In [18]:
# Small feed-forward network with two hidden layers (5 and 2 units), ReLU
# activations and the Adam optimizer, trained for at most 50 iterations.
# random_state pins the weight initialization and batch shuffling so that a
# fresh "Restart & Run All" reproduces the same model; 0 matches the seed
# already used for train_test_split in this notebook.
# verbose=True prints the training loss after every iteration.
model = MLPClassifier(hidden_layer_sizes=(5, 2),
                      max_iter=50,
                      activation='relu',
                      solver='adam',
                      random_state=0,
                      verbose=True)

In [19]:
# Train the network on the vectorized training texts and their encoded labels.
model.fit(train_vector, train_labels)

Iteration 1, loss = 1.06030739
Iteration 2, loss = 0.65394514
Iteration 3, loss = 0.49047345
Iteration 4, loss = 0.42255123
Iteration 5, loss = 0.39046582
Iteration 6, loss = 0.37119413
Iteration 7, loss = 0.35791732
Iteration 8, loss = 0.34763468
Iteration 9, loss = 0.33910458
Iteration 10, loss = 0.33195393
Iteration 11, loss = 0.32576785
Iteration 12, loss = 0.31989602
Iteration 13, loss = 0.31522437
Iteration 14, loss = 0.31003218
Iteration 15, loss = 0.30624320
Iteration 16, loss = 0.30219436
Iteration 17, loss = 0.29868010
Iteration 18, loss = 0.29618112
Iteration 19, loss = 0.29138587
Iteration 20, loss = 0.28803474
Iteration 21, loss = 0.28503368
Iteration 22, loss = 0.28203441
Iteration 23, loss = 0.28174344
Iteration 24, loss = 0.27658250
Iteration 25, loss = 0.27371196
Iteration 26, loss = 0.27145399
Iteration 27, loss = 0.26865467
Iteration 28, loss = 0.26603984
Iteration 29, loss = 0.26356775
Iteration 30, loss = 0.26115653
Iteration 31, loss = 0.26014901
Iteration 32, los



In [23]:
# Predict the encoded class ids for the validation split.
y_pred = model.predict(val_vector)

In [25]:
# Display the estimator's repr.
model

In [26]:
# Decode the validation predictions back to their string labels.
# The predictions are recomputed here rather than transforming `y_pred` in
# place: `y_pred = lb.inverse_transform(y_pred)` fails when this cell is run a
# second time, because `y_pred` then already holds label strings instead of
# encoded class ids. Recomputing makes the cell safe to re-run.
y_pred = lb.inverse_transform(model.predict(val_vector)).reshape(-1)

In [27]:
# Accuracy on the validation split, comparing string labels with string labels.
# NOTE(review): test_vector / test_labels are prepared earlier but are not
# evaluated in the cells visible here — confirm whether a final test-set
# evaluation is intended.
accuracy_score(y_pred=y_pred, y_true=val_data['label'])

0.7676136363636363

In [32]:
# Per-class precision, recall and F1 on the validation split.
print(classification_report(y_pred=y_pred, y_true=val_data['label']))

              precision    recall  f1-score   support

    negative       0.91      0.46      0.61       563
     neutral       0.38      0.71      0.50       187
    positive       0.85      0.95      0.90      1010

    accuracy                           0.77      1760
   macro avg       0.71      0.71      0.67      1760
weighted avg       0.82      0.77      0.76      1760



In [29]:
# Persist the fitted vectorizer, label encoder and model for later inference.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically; the bare `open(...)` calls left that to garbage collection.
# NOTE(review): the model file is a pickle despite its '.h5' extension (which
# normally denotes HDF5). The path is kept unchanged because code outside this
# notebook may load it by this name; it must be read back with pickle.load.
artifacts = (
    (cv, '../Pickle/count_vec.pkl'),
    (lb, '../Pickle/label_enc.pkl'),
    (model, '../Model/model_NN_sklearn.h5'),
)
for artifact, artifact_path in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)