In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from keras.models import Sequential
from keras import layers
from keras.layers import Embedding
from keras.optimizers import Adam

import seaborn as sns
sns.set(style = 'whitegrid')

from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import regularizers
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('../Cleaning/Clean_Dataset.csv', encoding='UTF-8')

In [3]:
train = df[['Clean_Text','Stop_Words_Text', 'Stemmed_Text', 'indonlp_sentiment', 'Lexicon_Score']]

In [4]:
train = df[['Clean_Text', 'indonlp_sentiment']]

#### Data Preprocessing

In [5]:
data = df['Clean_Text'].values.tolist()

### Building a Model

#### Label Encoding of Output

In [6]:
import tensorflow as tf
labels = np.array(train['indonlp_sentiment'])
y = []
for i in range(len(labels)):
    if labels[i] == 'neutral':
        y.append(0)
    if labels[i] == 'negative':
        y.append(1)
    if labels[i] == 'positive':
        y.append(2)
y = np.array(y)
labels = tf.keras.utils.to_categorical(y, 3, dtype="float32")
del y

#### Tokenizing dan Padding

In [7]:
max_words = 50000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
tweets = pad_sequences(sequences, maxlen=max_len)
print(tweets)

[[    0     0     0 ...    29    43     1]
 [    0     0     0 ...    57   218  1485]
 [    0     0     0 ...  1555    10    45]
 ...
 [    0     0     0 ...  1002     5   438]
 [    0     0     0 ...  1914   751 14964]
 [    0     0     0 ...   195     2   630]]


#### Split Dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size = 0.2, random_state = 42)
a, X_val, b, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(32974, 100) (32974, 3)
(8244, 100) (8244, 3)


#### LSTM

In [25]:
model2 = Sequential()
model2.add(layers.Embedding(max_words, 40, input_length=max_len))
model2.add(layers.BatchNormalization())
model2.add(layers.Bidirectional(layers.LSTM(20,dropout=0.20)))
model2.add(layers.Dense(3,activation='softmax'))
model2.compile(optimizer=Adam(learning_rate=0.001),loss='categorical_crossentropy', metrics=['accuracy'])

#### Train the Model

In [26]:
history = model2.fit(X_train,y_train, epochs=15,validation_data=(X_val, y_val))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15

KeyboardInterrupt: 

In [None]:
def plot_training_hist(history):
    '''Function to plot history for accuracy and loss'''
    
    fig, ax = plt.subplots(1,2,figsize=(10,4))
    #first plot
    ax[0].plot(history.history['accuracy'])
    ax[0].plot(history.history['val_accuracy'])
    ax[0].set_title('Model Accuracy')
    ax[0].legend(['Train', 'Validation'], loc='best')
    #second plot
    ax[1].plot(history.history['loss'])
    ax[1].plot(history.history['val_loss'])
    ax[1].set_title('Model Loss')
    ax[1].legend(['Train', 'Validation'], loc='best')

plot_training_hist(history)

In [20]:
model2 = Sequential()
model2.add(layers.Embedding(max_words, 40, input_length=max_len))
model2.add(layers.BatchNormalization())
model2.add(layers.Bidirectional(layers.LSTM(20,dropout=0.6)))
model2.add(layers.Dense(3,activation='softmax'))
model2.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
history = model2.fit(tweets, labels, epochs=15,validation_split=0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Predict sentiment on data test by using model has been created, and then visualize a confusion matrix
y_pred = np.argmax(model2.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)
accuracy = accuracy_score(y_true, y_pred)
print('Model Accuracy on Test Data:', accuracy)
confusion_matrix(y_true=y_true, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(confusion_matrix(y_true=y_true, y_pred=y_pred), fmt='g', annot=True)
ax.xaxis.set_label_position('top')
ax.xaxis.set_ticks_position('top')
ax.set_xlabel('Prediksi', fontsize=14)
ax.set_xticklabels(['Negatif', 'Netral', 'Positif'])
ax.set_ylabel('Actual', fontsize=14)
ax.set_yticklabels(['Negatif', 'Netral', 'Positif'])
plt.show()

In [22]:
# predict classes for test set
y_pred = np.argmax(model2.predict(X_test), axis=-1)

# convert one-hot encoded true labels to integer class labels
y_test_int = np.argmax(y_test, axis=1)

# calculate metrics
print(classification_report(y_test_int, y_pred))

# calculate confusion matrix
conf_mat = confusion_matrix(y_test_int, y_pred)
print(conf_mat)


              precision    recall  f1-score   support

           0       0.96      0.96      0.96      4376
           1       0.94      0.95      0.95      2957
           2       0.91      0.86      0.89       911

    accuracy                           0.95      8244
   macro avg       0.94      0.92      0.93      8244
weighted avg       0.95      0.95      0.95      8244

[[4188  143   45]
 [ 105 2822   30]
 [  79   48  784]]


In [23]:
# Calculate the accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test_int, y_pred)
precision = precision_score(y_test_int, y_pred, average='macro')
recall = recall_score(y_test_int, y_pred, average='macro')
f1 = f1_score(y_test_int, y_pred, average='macro')

In [24]:
# Print the results
print('Accuracy: {:.2f}%'.format(accuracy * 100))
print('Precision: {:.2f}%'.format(precision * 100))
print('Recall: {:.2f}%'.format(recall * 100))
print('F1-score: {:.2f}%'.format(f1 * 100))

Accuracy: 94.54%
Precision: 93.57%
Recall: 92.40%
F1-score: 92.96%


#### Prediksi dan Hasil

In [29]:
sentiment = ['Neutral', 'Negative', 'Positive']

In [30]:
def predict(text):
    sequence = tokenizer.texts_to_sequences(text)
    test = pad_sequences(sequence, maxlen = max_len)
    result =sentiment[np.around(model2.predict(test), decimals=0).argmax(axis=1)[0]]
    return result

In [31]:
df['LSTM_Sentiment'] = df['Clean_Text'].apply(lambda x: predict(x))









































































































































































































































































In [35]:
df['LSTM_Sentiment'].value_counts()

Neutral     36868
Negative     4350
Name: LSTM_Sentiment, dtype: int64

In [30]:
sentiment = ['Neutral', 'Negative', 'Positive']

In [31]:
sequence = tokenizer.texts_to_sequences(['metaverse teknologi yang sangat buruk'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(model2.predict(test), decimals=0).argmax(axis=1)[0]]



'Negative'

In [32]:
sequence = tokenizer.texts_to_sequences(['wow metaverse teknologi yang sangat menjanjikan'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(model2.predict(test), decimals=0).argmax(axis=1)[0]]



'Positive'

In [33]:
sequence = tokenizer.texts_to_sequences(['kita lihat saja bagaimana perkembangan teknologi tersebut'])
test = pad_sequences(sequence, maxlen=max_len)
sentiment[np.around(model2.predict(test), decimals=0).argmax(axis=1)[0]]



'Neutral'