# Analisis Sentimen Berbasis Leksikon pada Ulasan LinkedIn Singapura dengan Algoritma Naive Bayes, Support Vector Machine, dan LSTM

## Data Collection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
df = pd.read_csv('linkedin_reviews_singapore_limited.csv')
df.head(100)

In [None]:
df.columns

## Data Cleaning

In [None]:
df = df[['content', 'score']]
df.head(10)

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.isnull().any()

In [None]:
# drop null

df.dropna(inplace=True)
df.isnull().sum()

In [None]:
df.isna().sum()

In [None]:
len(df)

## Text Pre-Processing

In [None]:
# 1. Remove punctuation: strip irrelevant symbols, numbers, hashtags, and punctuation marks.
# 2. Case folding: convert uppercase letters to lowercase so that words are not
# mismatched because of differences in letter case.
# 3. Stopword removal: discard words that carry no significant meaning.

import string
import re

# Built once at import time instead of on every call: a translation table that
# deletes each ASCII punctuation character, and the digit-run pattern.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_PATTERN = re.compile(r'\d+')


def remove_punctuation(text):
  """Return ``text`` with ASCII punctuation and digits removed.

  Characters are deleted, not replaced by spaces, so surrounding whitespace
  is left exactly as it was. ``#`` is part of ``string.punctuation`` and is
  therefore already covered by the translation step.

  Args:
    text: The value to clean. Non-string values are coerced with ``str()``,
      matching how ``remove_stopwords`` treats its input.

  Returns:
    The cleaned string.
  """
  without_punctuation = str(text).translate(_PUNCTUATION_TABLE)
  return _DIGITS_PATTERN.sub('', without_punctuation)

def case_folding(text):
  """Return ``text`` converted to lowercase."""
  lowered = text.lower()
  return lowered


# Clean the review text: strip punctuation and digits first, then lowercase.
cleaned_content = df['content'].apply(remove_punctuation).apply(case_folding)
df['content'] = cleaned_content

df.head(10)


In [None]:
# Stopword removal: download NLTK's stopword corpus and build the English
# stopword lookup set.

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def remove_stopwords(text):
  """Return ``text`` without the words contained in ``stop_words``.

  The input is coerced with ``str()`` and split on whitespace; the surviving
  words are re-joined with single spaces.
  """
  words = str(text).split()
  return " ".join(word for word in words if word not in stop_words)


# Drop stopwords from every review, then preview the result.
df['content'] = df['content'].map(remove_stopwords)
df.head(10)


In [None]:
# Remove emoji and pictographic symbols (pure regex; no extra library needed).

# Compiled once at import time. Every range below is a Unicode block of emoji
# or symbol code points; letters of any script (Latin, CJK, Hangul, kana, ...)
# lie outside all of them and are therefore preserved.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (incl. skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    "\u24C2\u3030\u303D\u3297\u3299"  # isolated enclosed / wavy-dash symbols
    "\uFE0F"                 # variation selector-16 that trails many emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
  """Return ``text`` with emoji and pictographic symbols deleted.

  Only code points in the blocks listed in ``_EMOJI_PATTERN`` are removed;
  all other characters, including non-Latin letters, are kept unchanged.

  Args:
    text: The string to clean.

  Returns:
    The string with every matching run of symbols removed.
  """
  return _EMOJI_PATTERN.sub('', text)

# Strip emoji from every review, then preview the result.
df['content'] = df['content'].map(remove_emojis)

df.head(10)

In [None]:
# Tokenizing: import NLTK's word tokenizer and fetch the Punkt data it needs.

from nltk.tokenize import word_tokenize

nltk.download('punkt')
# Recent NLTK releases (3.9 and later) load the Punkt models from the separate
# 'punkt_tab' resource; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')

def tokenize_text(text):
  """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
  tokens = word_tokenize(text)
  return tokens

# Replace each review string with its list of tokens, then preview the result.
df['content'] = df['content'].map(tokenize_text)
df.head(10)

In [None]:
# stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text):
  """Return a list holding the stem of every token in ``text``.

  ``text`` is iterated token by token and each one is passed to
  ``stemmer.stem``.
  """
  return list(map(stemmer.stem, text))

# Stem every token list, then preview the first 100 rows.
df['content'] = df['content'].map(stem_text)
df.head(100)

## Labeling

In [None]:
# Lexicon-based labeling: install the vaderSentiment package, which provides
# the sentiment analyzer used for labeling.

!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
  """Label a token list as 'Positive', 'Negative' or 'Neutral'.

  The tokens are joined with single spaces and scored by ``analyzer``. A
  compound score of at least 0.05 is positive, at most -0.05 is negative,
  and anything in between is neutral.

  NOTE(review): by this point the tokens have been lowercased, stripped of
  punctuation and stemmed; presumably the lexicon matches best on raw
  text, so confirm that labeling on the processed tokens is intended.
  """
  compound = analyzer.polarity_scores(" ".join(text))['compound']
  if compound >= 0.05:
    return 'Positive'
  if compound <= -0.05:
    return 'Negative'
  return 'Neutral'

# Derive one sentiment label per review from its token list.
df['sentiment'] = df['content'].map(get_vader_sentiment)

df.head(100)

In [None]:
# Bar chart of how many reviews carry each sentiment label.

import matplotlib.pyplot as plt
sentiment_counts = df['sentiment'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax)
ax.set_title('Distribution of Sentiment Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Word cloud built from all review tokens.

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Flatten the corpus: join each token list with spaces, then join the reviews.
all_words = ' '.join(' '.join(tokens) for tokens in df['content'])

# Configure the cloud first, then render it from the combined text.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_words)

# Show the rendered image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Naive Bayes classifier on TF-IDF features.

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# The vectorizer expects strings, so re-join each token list with spaces.
df['content_str'] = df['content'].apply(' '.join)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
features = df['content_str']
labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42)

# Learn the TF-IDF vocabulary on the training split only, then reuse it for
# the test split.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit the classifier and predict the held-out labels.
nb_classifier = MultinomialNB().fit(X_train_vec, y_train)
y_pred = nb_classifier.predict(X_test_vec)

# Report accuracy, per-class metrics and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

nb_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", nb_report)

nb_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", nb_confusion)


In [None]:
# Support Vector Machine classifier on the TF-IDF features.

from sklearn.svm import SVC

# Fit a linear-kernel SVM on the TF-IDF training matrix.
svm_classifier = SVC(kernel='linear').fit(X_train_vec, y_train)

# Predict the held-out labels.
y_pred_svm = svm_classifier.predict(X_test_vec)

# Report accuracy, per-class metrics and the confusion matrix.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

svm_report = classification_report(y_test, y_pred_svm)
print("\nSVM Classification Report:\n", svm_report)

svm_confusion = confusion_matrix(y_test, y_pred_svm)
print("\nSVM Confusion Matrix:\n", svm_confusion)


In [None]:
# LSTM classifier on padded integer sequences.

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Hyper-parameters, named once so the tokenizer, the padding and the network
# cannot drift apart.
vocab_size = 5000          # vocabulary cap shared by tokenizer and embedding
max_sequence_length = 100  # every sequence is padded/truncated to this length
embedding_dim = 128
lstm_units = 128

# Encode the string labels as integers; the encoder is fitted on the training
# labels and reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Size the output layer from the classes actually present instead of assuming
# that all three sentiment labels occur.
num_classes = len(label_encoder.classes_)

# Build the word index from the training texts only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Convert each text to a sequence of word indices.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to a fixed length.
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Define the LSTM model. `input_length` is intentionally not passed to
# Embedding: the argument is deprecated in Keras 3 and the sequence length is
# taken from the padded input.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# Integer-encoded targets pair with the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding back 10% of the training data for validation.
model.fit(X_train_padded, y_train_encoded, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate on the test split. `accuracy_lstm` is the unambiguous name;
# `accuracy` is still rebound here so code that reads it after this cell keeps
# seeing the LSTM score.
loss, accuracy_lstm = model.evaluate(X_test_padded, y_test_encoded)
accuracy = accuracy_lstm
print(f"LSTM Accuracy: {accuracy_lstm:.2f}")

# Pick the most probable class per row and map it back to its string label.
y_pred_lstm_encoded = np.argmax(model.predict(X_test_padded), axis=-1)
y_pred_lstm = label_encoder.inverse_transform(y_pred_lstm_encoded)

# Print classification report and confusion matrix.
print("\nLSTM Classification Report:\n", classification_report(y_test, y_pred_lstm))
print("\nLSTM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm))


## Visualisasi Model dan Kesimpulan

In [None]:
# Visualize each model's confusion matrix and print the accuracy summary.

import matplotlib.pyplot as plt

# One fixed label order for all plots, so the three matrices share the same
# axes and the tick labels name the classes.
class_labels = sorted(df['sentiment'].unique())

# Test-set predictions of every model, in presentation order.
model_predictions = {
    'Naive Bayes': y_pred,
    'SVM': y_pred_svm,
    'LSTM': y_pred_lstm,
}

for model_name, predictions in model_predictions.items():
  matrix = confusion_matrix(y_test, predictions, labels=class_labels)
  plt.figure(figsize=(8, 6))
  sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
              xticklabels=class_labels, yticklabels=class_labels)
  plt.title(f'{model_name} Confusion Matrix')
  plt.xlabel('Predicted Label')
  plt.ylabel('True Label')
  plt.show()

# Conclusion. Each accuracy is recomputed from that model's own predictions:
# the shared `accuracy` variable is overwritten by the LSTM evaluation, so
# reading it here would report the LSTM score for Naive Bayes as well.
print("Kesimpulan:")
for model_name, predictions in model_predictions.items():
  print(f"Model {model_name} memiliki akurasi sebesar:", accuracy_score(y_test, predictions))

In [None]:
# Write the output of `pip freeze` to requirements.txt.

!pip freeze > requirements.txt