In [23]:
import re
import warnings

import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('../data/Indonesian Sentiment Twitter Dataset Labeled.csv', sep="\t")
df.head()

Unnamed: 0,sentimen,Tweet
0,-1,lagu bosan apa yang aku save ni huhuhuhuhuhuhu...
1,-1,kita lanjutkan saja diam ini hingga kau dan ak...
2,1,doa rezeki tak putus inna haa zaa larizquna ma...
3,1,makasih loh ntar kita bagi hasil aku 99 9 sisa...
4,-1,aku tak faham betul jenis orang malaysia yang ...


In [5]:
alay_dict = pd.read_csv("../data/new_kamusalay.csv", encoding = 'latin-1', header=None)
alay_dict = alay_dict.rename(columns=  {0: 'original',
                                        1: 'replacement'})
stopword_dict = pd.read_csv('../data/stopwordbahasa.csv', header=None)
stopword_dict = stopword_dict.rename(columns={0: 'stopword'})

In [6]:
df.isnull().sum()

sentimen    0
Tweet       0
dtype: int64

In [7]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#lowercase
def lowercase(text):
    return text.lower()

def remove_unnecessary_char(text):
    text = re.sub('\n',' ',text) # Remove every '\n'
    text = re.sub('rt',' ',text) # Remove every retweet symbol
    text = re.sub('((@[^\s]+)|(#[^\s]+))',' ',text)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove every URL
    text = re.sub('  +', ' ', text) # Remove extra spaces
    return text
    
def remove_nonaplhanumeric(text):
    text = re.sub('[^0-9a-zA-Z]+', ' ', text) 
    return text

alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
def normalize_alay(text):
    return ' '.join([alay_dict_map[word] if word in alay_dict_map else word for word in text.split(' ')])

def remove_stopword(text):
    text = ' '.join(['' if word in stopword_dict.stopword.values else word for word in text.split(' ')])
    text = re.sub('  +', ' ', text) # Remove extra spaces
    text = text.strip()
    return text

def stemming(text):
    return stemmer.stem(text)

def preprocess(text):
    text = lowercase(text) # 1
    text = remove_unnecessary_char(text) # 2
    text = remove_nonaplhanumeric(text) # 3
    text = normalize_alay(text) # 4
    text = remove_stopword(text) # 5
    text = stemming(text) # 6
    return text

In [8]:
df.replace(-1, 0, inplace=True)

In [9]:
X = df['Tweet'].apply(preprocess)

In [10]:
# Assuming the CSV file has a column named 'text' that contains the text of each document
corpus = X.tolist()
sentiments = df['sentimen'].values

In [11]:
max_features = 1000
max_len=50
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X.values)
X = tokenizer.texts_to_sequences(X.values)
X = pad_sequences(X, padding='post', maxlen=max_len)
X.shape

(10806, 50)

In [12]:
sentiment_encode = {-1 : 0, 0 : 1, 1 : 2}
y = df['sentimen'].map(sentiment_encode).values
print(df['sentimen'])
print(y)

0        0
1        0
2        1
3        1
4        0
        ..
10801    1
10802    0
10803    1
10804    1
10805    1
Name: sentimen, Length: 10806, dtype: int64
[1 1 2 ... 2 2 2]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=1)
y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)
y_val = to_categorical(y_val, 3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7348, 50) (7348, 3)
(1621, 50) (1621, 3)


In [14]:
# Convert sentiments to binary labels
binary_labels = np.array([1 if x == 1 else 0 for x in sentiments])

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the vectorizer on the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

In [15]:
# Print TF-IDF matrix and feature names
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("\nFeature Names:")
print(feature_names)

TF-IDF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Feature Names:
['00' '000' '000ampe' ... 'zulkifli' 'zuragan' 'zzz']


Fast Text

In [28]:
# Convert the TF-IDF matrix to a dense array
dense_tfidf_matrix = tfidf_matrix.toarray()

# Initialize a FastText model
model_ft = FastText(vector_size=100, window=5, min_count=1)

# Build vocabulary and train the FastText model
model_ft.build_vocab(corpus_iterable=[text.split() for text in corpus])
model_ft.train(corpus_iterable=[text.split() for text in corpus], total_examples=len(corpus), epochs=10)

# Initialize an empty matrix to store word embeddings
word_embeddings = np.zeros((len(feature_names), model_ft.vector_size))

# Map each word to its FastText vector
for idx, word in enumerate(feature_names):
    if word in model_ft.wv:
        word_embeddings[idx] = model_ft.wv[word]

word_embed_ft = model_ft.wv["dan"]
print(word_embed_ft)

# Print word embeddings for the first few words
# print("Word Embeddings:")
# print(word_embeddings[:10])  # Print embeddings for the first 10 words

[-1.8773283e+00 -5.6173515e-01  5.0368216e-02  7.9887342e-01
  2.1571274e+00 -1.9283055e-01  8.1208950e-01 -5.3469980e-01
  2.9478994e-01 -1.4550249e-01 -1.2917448e+00 -2.8188163e-01
 -1.3942964e+00  6.0440588e-01 -1.2799377e+00 -2.2690591e-01
  1.3942947e+00 -6.5789783e-01 -1.0300760e+00 -2.0422659e+00
  7.2681054e-02 -8.2481004e-02 -1.6583884e+00  3.8017708e-01
 -1.8659068e+00 -1.7409097e-01  6.1934799e-01  8.2431084e-01
 -9.7448117e-01  3.7246206e-01 -8.0249459e-01  3.5207224e-01
  5.8377892e-01 -7.2377008e-01 -6.2820353e-02  6.3628823e-01
  4.7861558e-01 -1.1279988e-01 -1.3395792e+00 -5.3948700e-01
  1.9469628e-01 -2.5458405e+00 -2.8755498e-01  5.4065841e-01
 -8.4550664e-02 -5.1059987e-02 -2.2207949e-01 -8.1495178e-01
 -9.1826133e-02 -9.7936332e-01  4.5537513e-01 -2.4295153e-01
  1.3331388e-03  4.4800773e-01  2.5315753e-01 -1.4656006e+00
  7.2339021e-02  7.9195686e-02  8.2358187e-01 -1.0866533e-01
  9.8380250e-01  5.9673309e-01 -1.1589496e+00  1.3924528e+00
  8.0436367e-01  1.61991

Glove

In [29]:
# Load the GloVe word vectors
glove_file = '../data/vectors.txt'
word_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Initialize an empty matrix to store word embeddings
word_embeddings = np.zeros((len(feature_names), word_vectors.vector_size))

# Map each word to its GloVe vector
for idx, word in enumerate(feature_names):
    if word in word_vectors:
        word_embeddings[idx] = word_vectors[word]
        
word_embed_glove = word_vectors.word_vec('dan')
print(word_embed_glove)

# Print word embeddings for the first few words
# print("Word Embeddings:")
# print(word_embeddings[:10])  # Print embeddings for the first 10 words

[-0.028669 -0.079521 -0.745437  0.947957  0.1456    0.208793  0.563256
  0.144865 -1.168601 -0.770691 -0.868666 -0.099608 -0.677246  0.276379
 -0.202523 -0.938004  0.869196  1.154606 -0.711352  1.530791  0.633017
  1.103734  3.651248 -0.063392 -1.85838   1.355643 -0.342743 -0.638605
  0.393076 -1.268856  1.055029 -0.375233 -1.428337  0.428032  1.361086
  0.711733  1.206595  1.210484 -0.190727  0.194155  0.26984  -0.191284
 -0.169478 -0.336127 -0.449392  0.552006 -0.037513 -0.216293  0.687252
 -0.775978]


Word2vec- Cbow

In [31]:
# Preprocess the corpus for Word2Vec
tokenized_corpus = [text.split() for text in corpus]

# Initialize a Word2Vec CBOW model
model_w2v_cbow = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=0, min_count=1, workers=4)

# Train the Word2Vec CBOW model
model_w2v_cbow.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

# Initialize an empty matrix to store word embeddings
word_embeddings = np.zeros((len(feature_names), model_w2v_cbow.vector_size))

# Map each word to its Word2Vec CBOW vector
for idx, word in enumerate(feature_names):
    if word in model_w2v_cbow.wv:
        word_embeddings[idx] = model_w2v_cbow.wv[word]

word_embed_cbow = model_w2v_cbow.wv.most_similar('dan')
print(word_embed_cbow)

# Print word embeddings for the first few words
# print("Word Embeddings:")
# print(word_embeddings[:10])  # Print embeddings for the first 10 words

[('tok', 0.9949443340301514), ('low', 0.9947314262390137), ('bukit', 0.9947167634963989), ('taman', 0.9946802258491516), ('ro', 0.9945077896118164), ('burung', 0.9943517446517944), ('iklan', 0.9942890405654907), ('wani', 0.9942115545272827), ('guru', 0.9941952228546143), ('ketjup', 0.9941721558570862)]


In [32]:
# Preprocess the corpus for Word2Vec
tokenized_corpus = [text.split() for text in corpus]

# Initialize a Word2Vec Skip-gram model
model_w2v_sg = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=1, min_count=1, workers=4)

# Train the Word2Vec Skip-gram model
model_w2v_sg.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)

# Initialize an empty matrix to store word embeddings
word_embeddings = np.zeros((len(feature_names), model_w2v_sg.vector_size))

# Map each word to its Word2Vec Skip-gram vector
for idx, word in enumerate(feature_names):
    if word in model_w2v_sg.wv:
        word_embeddings[idx] = model_w2v_sg.wv[word]
        
word_embed_skipgram = model_w2v_sg.wv.most_similar('dan')
print(word_embed_skipgram)

# Print word embeddings for the first few words
# print("Word Embeddings:")
# print(word_embeddings[:10])  # Print embeddings for the first 10 words

[('musketeers', 0.9901020526885986), ('indomi', 0.9891538619995117), ('disamperin', 0.9886239767074585), ('dft', 0.9883655309677124), ('tahhh', 0.9881646633148193), ('ditakoni', 0.98738032579422), ('1962aat', 0.9865550398826599), ('cingkrang', 0.9863256216049194), ('tahniah', 0.9862285256385803), ('lban', 0.9862149953842163)]
