In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import FastText
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [5]:
df = pd.read_csv('../data/Indonesian Sentiment Twitter Dataset Labeled.csv', sep="\t")
df.head()

Unnamed: 0,sentimen,Tweet
0,-1,lagu bosan apa yang aku save ni huhuhuhuhuhuhu...
1,-1,kita lanjutkan saja diam ini hingga kau dan ak...
2,1,doa rezeki tak putus inna haa zaa larizquna ma...
3,1,makasih loh ntar kita bagi hasil aku 99 9 sisa...
4,-1,aku tak faham betul jenis orang malaysia yang ...


In [6]:
df.replace(-1, 0, inplace=True)

In [7]:
# Assuming the CSV file has a column named 'text' that contains the text of each document
corpus = df['Tweet'].tolist()
sentiments = df['sentimen'].values

In [8]:
sentiment_encode = {-1 : 0, 0 : 1, 1 : 2}
y = df['sentimen'].map(sentiment_encode).values

In [9]:
# Convert sentiments to binary labels
binary_labels = np.array([1 if x == 1 else 0 for x in sentiments])

In [10]:
# Preprocess labels (convert categorical labels to numerical labels)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [11]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
X_seq = tokenizer.texts_to_sequences(corpus)

# Pad sequences
max_length = max(len(seq) for seq in X_seq)
X_pad = pad_sequences(X_seq, maxlen=max_length)

In [13]:
# Train FastText model
model_ft = FastText([text.split() for text in corpus], vector_size=100, window=5, min_count=3, epochs=100)

In [14]:
# Create embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = model_ft.wv[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        continue

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [16]:
# Define model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

2024-05-03 15:06:35.746756: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-05-03 15:06:35.747955: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-03 15:06:35.748452: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-03 15:06:35.749068: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-03 15:06:35.749561: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)




In [17]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
# Train the model
epochs = 1
batch_size = 64
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1)

2024-05-03 15:06:57.620181: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




<keras.src.callbacks.History at 0x29419edc0>

In [21]:
# Evaluate the model on test data
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)  # Apply threshold for binary classification
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Accuracy: 0.7882
Precision: 0.6424
Recall: 0.1936
