<a href="https://colab.research.google.com/github/kangnurrohman/sentiment-analysis-projects/blob/main/src/polarity-determination-with-lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://www.kaggle.com/code/kangnurrohman/polarity-determination-with-lstm?scriptVersionId=112064672" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Install Library

In [None]:
!pip install Sastrawi

## Load Library

In [None]:
import re
import tqdm
import nltk
import string
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from keras.models import load_model
from nltk.tokenize import word_tokenize
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## Load Data

In [None]:
# Read the review export and give the two columns used below shorter names
df = (
    pd.read_excel('/kaggle/input/review-of-the-application-pln-mobile/review of the application PLN mobile.xlsx')
    .rename(columns={'content': 'review', 'score': 'sentiment'})
)
df.info()

In [None]:
# Newest reviews first (by the 'at' column), then keep only text and score
df = df.sort_values(by='at', ascending=False)[['review', 'sentiment']]
df.head()

## Exploratory Data Analysis

In [None]:
# Scores 1-2 count as negative, 3 as neutral, 4-5 as positive
score_to_label = {1: 'negative', 2: 'negative', 3: 'neutral', 4: 'positive', 5: 'positive'}
df = df.replace({'sentiment': score_to_label})
df.head()

In [None]:
# Number of reviews per sentiment label
df.sentiment.value_counts()

In [None]:
# Keep only the positive/negative reviews; neutral ones are dropped
is_polar = df['sentiment'] != "neutral"
df = df[is_polar]

## Data Preprocessing

In [None]:
def remove_tweet_special(text):
    """Strip escape residue, non-ASCII characters, mentions, hashtags and links.

    Args:
        text: a single document as a ``str``.

    Returns:
        The cleaned ``str``. Whitespace runs are collapsed to single spaces
        before the final bare-URL-prefix replacement.
    """
    # Backslash sequences written out as two characters (backslash + t/n/u)
    # become a space; any backslash that is left over is dropped.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Non-ASCII characters (emoji, CJK, ...) are each replaced by '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # Mentions, hashtags and scheme://... links become a space, then whitespace
    # is collapsed. The pattern is a raw string so that \w, \/ and \S reach the
    # regex engine without triggering invalid-escape warnings.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # A bare scheme prefix with nothing after it is not matched above; blank it out
    return text.replace("http://", " ").replace("https://", " ")

def remove_number(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # mapping a code point to None makes str.translate delete that character
    drop_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_table)

def remove_whitespace_LT(text):
    """Trim leading and trailing whitespace from ``text``."""
    trimmed = text.strip()
    return trimmed

def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing runs are collapsed too, not removed.
    """
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    return re.sub(r'\s+', ' ', text)

def remove_singl_char(text):
    """Delete ASCII letters that stand alone as a one-character word."""
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

def word_tokenize_wrapper(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

def stopwords_removal(words):
    """Drop Indonesian stopwords from a list of tokens.

    The NLTK stopword list is loaded on the first call only and kept as a
    frozenset on the function object, so later calls skip the corpus read and
    each membership test is a hash lookup instead of a list scan.

    Args:
        words: iterable of token strings.

    Returns:
        list of the tokens that are not Indonesian stopwords, in input order.
    """
    stopword_set = getattr(stopwords_removal, "_stopword_cache", None)
    if stopword_set is None:
        try:
            stopword_list = nltk.corpus.stopwords.words('indonesian')
        except LookupError:
            # Corpus not installed on this runtime: fetch it once, then retry
            nltk.download('stopwords', quiet=True)
            stopword_list = nltk.corpus.stopwords.words('indonesian')
        stopword_set = frozenset(stopword_list)
        stopwords_removal._stopword_cache = stopword_set
    return [word for word in words if word not in stopword_set]

def stemmed_wrapper(term):
    """Return the Sastrawi stem of ``term``.

    The stemmer is created on the first call and stored on the function
    object, so the factory is not rebuilt for every term.

    Args:
        term: the string to stem.

    Returns:
        Whatever ``stemmer.stem(term)`` returns for the Sastrawi stemmer.
    """
    stemmer = getattr(stemmed_wrapper, "_stemmer", None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemmed_wrapper._stemmer = stemmer
    return stemmer.stem(term)

In [None]:
def pre_process_corpus(docs):
    """Normalise raw review texts into cleaned, space-joined token strings.

    Each document is lower-cased, stripped of mentions/links/hashtags, digits,
    punctuation, surplus whitespace and single-letter words, then tokenised
    and filtered for Indonesian stopwords. Stemming is not applied.

    Args:
        docs: iterable of ``str`` documents.

    Returns:
        list of ``str``, one cleaned document per input, in the same order.
    """
    norm_docs = []
    for doc in tqdm.tqdm(docs):
        # case folding
        doc = doc.lower()
        # noise removal
        doc = remove_tweet_special(doc)
        doc = remove_number(doc)
        doc = remove_punctuation(doc)
        doc = remove_whitespace_LT(doc)
        doc = remove_whitespace_multiple(doc)
        doc = remove_singl_char(doc)
        # tokenization
        tokens = word_tokenize_wrapper(doc)
        # filtering
        tokens = stopwords_removal(tokens)
        norm_docs.append(" ".join(tokens))
    return norm_docs

In [None]:
%%time
# Replace the raw review text with its cleaned version
df['review'] = pre_process_corpus(df['review'])

In [None]:
# Preview the first rows after preprocessing
df.head()

## Handling imbalance (Oversampling)

In [None]:
from sklearn.utils import resample
# Separate majority and minority classes in training data for upsampling 
data_majority = df[df['sentiment'] == 'positive']
data_minority = df[df['sentiment'] == 'negative']

print("majority class before upsample:",data_majority.shape)
print("minority class before upsample:",data_minority.shape)

# Upsample minority class
data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples= data_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_balance = pd.concat([data_majority, data_minority_upsampled])
 
# Display new class counts
print("After upsampling\n",df_balance.sentiment.value_counts(),sep = "")

## Splitting Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_balance.review, df_balance.sentiment, test_size=0.2, random_state=42)
X_train.shape , X_test.shape, y_train.shape, y_test.shape

## Data formatting

In [None]:
import keras

# Word-level tokenizer with '<UNK>' as its out-of-vocabulary token,
# fitted on the training texts only
t = keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
t.fit_on_texts(X_train)
# Register index 0 under a '<PAD>' entry
t.word_index.update({'<PAD>': 0})

In [None]:
# (token, index) pairs with the highest and the lowest index, plus the index of '<UNK>'
vocab_items = list(t.word_index.items())
max(vocab_items, key=lambda pair: pair[1]), min(vocab_items, key=lambda pair: pair[1]), t.word_index['<UNK>']

### Sequence

In [None]:
# Turn each review into its list of vocabulary indices (train first, then test)
X_train, X_test = [t.texts_to_sequences(texts) for texts in (X_train, X_test)]

In [None]:
# Size of the tokenizer's word index (this count includes the '<UNK>' and '<PAD>' entries)
print("Vocabulary size={}".format(len(t.word_index)))
# Number of documents the tokenizer was fitted on
print("Number of Documents={}".format(t.document_count))

### Sequence Normalization

In [None]:
# Fixed length (in tokens) that every review is padded or truncated to before it enters the model
MAX_SEQUENCE_LENGTH = 100

In [None]:
# Bring every review to exactly MAX_SEQUENCE_LENGTH token ids
import tensorflow as tf

pad_to_length = tf.keras.preprocessing.sequence.pad_sequences
X_train = pad_to_length(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_to_length(X_test, maxlen=MAX_SEQUENCE_LENGTH)
X_train.shape, X_test.shape

### Encoding Labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder that turns the string sentiment labels into integer class ids
le = LabelEncoder()
# Two classes remain after the neutral reviews were removed
num_classes=2 # positive -> 1, negative -> 0

In [None]:
# Learn the label -> integer mapping from the training labels, then apply it to both splits
le.fit(y_train)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [None]:
# Number of entries in the tokenizer's word index; used as the Embedding layer's input dimension
VOCAB_SIZE = len(t.word_index)

## Build Model Architecture

In [None]:
EMBEDDING_DIM = 300 # dimension for dense embeddings for each token
LSTM_DIM = 128 # total LSTM units

# Architecture: token ids -> trainable embeddings -> LSTM -> two dense+dropout
# blocks -> one sigmoid unit (probability of the class encoded as 1).
inp = keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
x = keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, trainable=True)(inp)
# Plain LSTM instead of CuDNNLSTM: it also runs on CPU, and with default
# arguments Keras selects the cuDNN kernel by itself when a GPU is available.
# return_sequences=False keeps only the final state, so the model emits ONE
# probability per review (shape (batch, 1)) rather than one per time step.
x = keras.layers.LSTM(LSTM_DIM, return_sequences=False)(x)
x = keras.layers.Dense(LSTM_DIM, activation='relu')(x)
x = keras.layers.Dropout(rate=0.5)(x)
x = keras.layers.Dense(LSTM_DIM, activation='relu')(x)
x = keras.layers.Dropout(rate=0.5)(x)

outp = keras.layers.Dense(1, activation='sigmoid')(x)
# initialize the model
model = keras.models.Model(inputs=inp, outputs=outp)

model.compile(loss='binary_crossentropy', optimizer=tf.optimizers.Adam(), metrics=['accuracy'])
model.summary()

## Train Model

In [None]:
batch_size = 128
epochs = 300  # upper bound; early stopping normally ends training sooner
checkpoint_path = './best_model/best_model_lstm.h5'

# Make sure the checkpoint directory exists before the first save
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Stop once val_loss has not improved for 30 consecutive epochs
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=30)
# Keep on disk only the weights of the epoch with the best val_accuracy
mc = ModelCheckpoint(checkpoint_path, monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
# fit model; 10% of the training arrays are held out for validation
history = model.fit(X_train, y_train,  batch_size=batch_size, shuffle=True, validation_split=0.1, epochs=epochs, verbose=1, callbacks=[es, mc])

## Evaluate Model Performance

In [None]:
# Reload the best checkpoint written during training and score both splits
saved_model = load_model('./best_model/best_model_lstm.h5')
train_acc = saved_model.evaluate(X_train, y_train, verbose=1)
test_acc = saved_model.evaluate(X_test, y_test, verbose=1)
# Index 1 of each result is the accuracy metric
train_pct, test_pct = train_acc[1]*100, test_acc[1]*100
print('Train: %.2f%%, Test: %.2f%%' % (train_pct, test_pct))

In [None]:
# list all data in history
print(history.history.keys())

# One figure per metric. The second curve comes from the validation_split
# hold-out of model.fit, not from the test set, so it is labelled 'validation'.
for metric in ('accuracy', 'loss'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title('model ' + metric)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Predict with the checkpointed model that was evaluated above, so the
# predictions and the reported accuracy come from the same weights.
prediction_probs = saved_model.predict(X_test, verbose=1)
# Keep exactly one probability per review: a (n, 1) output is flattened, and
# if the network emits one value per time step only the last step is used.
prediction_probs = prediction_probs.reshape(len(X_test), -1)[:, -1]
# Threshold at 0.5 -> list of 0/1 class ids aligned with y_test
predictions = (prediction_probs > 0.5).astype(int).tolist()