# Text Mining

## 1. Scraping Data:

Mengambil 20.000 ulasan yang paling relevan untuk aplikasi KAI Access dari Google Play Store.

In [None]:
from google_play_scraper import Sort, reviews

# Fetch reviews of the KAI Access app from Google Play.
# Requests up to 20000 Indonesian-language reviews from the Indonesian store,
# ordered by relevance, with no star-rating filter.
app_id = 'com.kai.kaiticketing'
# `result` is the collection of reviews iterated by the cells below;
# `continuation_token` is returned alongside it and is not used further
# in this notebook.
result, continuation_token = reviews(
    app_id,
    lang='id',
    country='id',
    sort=Sort.MOST_RELEVANT,
    filter_score_with=None,
    count=20000
)

In [None]:
import pandas as pd

# Show the scraping result: number of reviews fetched and the first rows.
df_scraping = pd.DataFrame(result)
print(len(df_scraping))
df_scraping.head()

## 2. Filter Data

Memilih hanya data ulasan yang memiliki kata 'tiket', maksimal 6000 data.

In [None]:
from itertools import islice

# Upper bound on the number of 'tiket' reviews kept for the dataset.
MAX_TIKET_REVIEWS = 6000

# Keep only reviews whose text mentions 'tiket' (case-insensitive), stopping
# after the first MAX_TIKET_REVIEWS matches in the order returned by the
# scraper. `or ''` guards against a review whose 'content' is None, which
# would otherwise raise AttributeError on .lower().
scraping_reviews = list(islice(
    (review for review in result
     if 'tiket' in (review['content'] or '').lower()),
    MAX_TIKET_REVIEWS,
))

In [None]:
import pandas as pd
# Turn the filtered reviews into a DataFrame and report its size and the
# dtype of the review-text column.
df_filter = pd.DataFrame(scraping_reviews)

print("total data: ", len(df_filter))
print("tipe data ulasan (content): ", df_filter['content'].dtypes)
df_filter.head()

## 3. Simpan Dataset CSV

In [None]:
# Persist the filtered reviews (without the DataFrame index) for the
# preprocessing stage below, which reads this same file.
df_filter.to_csv("ulasan_tiket_kai_access.csv", index=False)


# Text Preprocessing

## 1. Data Checking :

Melakukan pengecekan data, seperti nilai null dan memisahkan fitur yang akan digunakan.

In [None]:
import pandas as pd
# Reload the dataset written by the scraping stage.
df = pd.read_csv('ulasan_tiket_kai_access.csv')

# Row count before any preprocessing.
df_total = len(df)
print("total data: ", df_total)
df.head()

In [None]:
# Keep only the 'content' column as the feature: every column whose name is
# not 'content' is dropped.
df = df.drop(columns=df.columns.difference(['content']))

In [None]:
# Count missing review texts.
value_null = df.content.isnull().sum()
# Count review texts that contain the word 'tiket' (case-insensitive).
value_counts = df.content.str.contains('tiket', case=False).sum()

In [None]:
# Report the data-checking results computed in the previous cell.
print("cek data null : ", value_null)
print("jumlah ulasan/content yang memiliki kata 'tiket' : ", value_counts)

# Report the remaining size and columns of the working DataFrame.
print("total data:", len(df))
print("kolom:", len(df.columns),"|", df.columns)
df.head()

## 2. Data Cleaning :

### 2.1 Casefolding

<i>Proses melakukan konversi teks: mengubah huruf besar menjadi huruf kecil, dan mengubah huruf beraksen ke bentuk tanpa aksen yang setara (mis.: huruf é menjadi e, huruf E menjadi e).</i>

In [None]:
from unidecode import unidecode

def casefolding(text):
    """Lower-case *text*, then pass it through ``unidecode``.

    Per the notebook's description, the ``unidecode`` step replaces accented
    letters with their unaccented equivalents (e.g. 'é' becomes 'e').
    """
    lowered = text.lower()
    return unidecode(lowered)


# The 'cleaning' column starts as the case-folded copy of the raw review text.
df['cleaning'] = df['content'].apply(casefolding)

In [None]:
# Preview the 'cleaning' column after case folding.
df[['cleaning']].head()

### 2.2 Cleansing

<i>Proses membersihkan atau membuang noise (angka, tanda baca, emoji, multispasi, dan baris baru).</i>

In [None]:
import re

def cleansing(text):
    """Strip noise from *text*, leaving letters separated by single spaces.

    Punctuation, symbols, underscores and digit runs are each replaced by a
    space (so the words around them stay separate), then all whitespace runs
    (including newlines) are collapsed to one space and the ends trimmed.
    """
    # Replace anything that is not a word character or whitespace with a
    # space. '_' counts as a word character for \w, so it is listed explicitly.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Replace digit runs with a space rather than nothing, so that
    # "tiket2orang" becomes "tiket orang" instead of the fused "tiketorang".
    text = re.sub(r'\d+', ' ', text)
    # Collapse repeated whitespace into a single space and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean the case-folded text in place (the 'cleaning' column is overwritten).
df['cleaning'] = df['cleaning'].apply(cleansing)

In [None]:
# Preview the 'cleaning' column after cleansing.
df[['cleaning']].head()

In [None]:
# Result of the cleaning stage: first five rows of the working DataFrame.
df.head(5)

## 3. Data Normalize :

### 3.1 Stemming

<i>Proses menemukan kata dasar dengan menghilangkan semua imbuhan yang melekat pada kata. Misalnya, kata "diperbaiki" diubah menjadi "baik".</i>

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer once; it is reused for every review.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemming(text):
    """Return *text* after running it through the shared Sastrawi stemmer."""
    return stemmer.stem(text)


# The 'normalize' column starts as the stemmed version of the cleaned text.
df['normalize'] = df['cleaning'].apply(stemming)

In [None]:
# Preview the 'normalize' column after stemming.
df[['normalize']].head()

### 3.2 Slang Word Normalization

<i>Proses mengubah kata tidak baku (slang) menjadi kata baku.</i>

In [None]:
# Load the KBBA slang dictionary: a tab-separated file read without a header
# row, whose two columns are named 'slang' and 'formal' here.
kbba_url = 'https://raw.githubusercontent.com/insomniagung/kamus_kbba/main/kbba.txt'
kbba_dictionary = pd.read_csv(
    kbba_url,
    sep='\t',
    header=None,
    names=['slang', 'formal'],
    encoding='utf-8',
)

# Lookup table for the normalisation step: slang word -> formal word.
# When a slang word occurs more than once, the last occurrence wins.
slang_dict = {
    slang: formal
    for slang, formal in zip(kbba_dictionary['slang'], kbba_dictionary['formal'])
}
kbba_dictionary.iloc[:5]

In [None]:
def convert_slangword(text):
    """Replace each slang word in *text* with its formal form.

    *text* is split on whitespace; every word that is a key of ``slang_dict``
    is swapped for the mapped value, other words are kept unchanged, and the
    words are re-joined with single spaces.
    """
    return ' '.join(slang_dict.get(word, word) for word in text.split())


# Normalise slang in place (the 'normalize' column is overwritten).
df['normalize'] = df['normalize'].apply(convert_slangword)

In [None]:
# Preview the 'normalize' column after slang-word conversion.
df[['normalize']].head()

In [None]:
# Result of the normalisation stage: first eleven rows of the working DataFrame.
df.head(11)

## 4. Words Removal :

### 4.1 Stopword Removal

<i>Proses menghapus seluruh kata yang dianggap tidak memiliki makna, seperti kata hubung "yang", "di", "dan", "dari".</i>

In [None]:
from nlp_id.stopword import StopWord

# Build the nlp_id stopword remover once and share it across all reviews,
# the same way the stemmer and tokenizer are shared. Constructing StopWord()
# inside the function repeated that construction for every row.
_stopword_remover = StopWord()


def remove_stopword(text):
    """Return *text* with nlp_id's stopwords removed."""
    return _stopword_remover.remove_stopword(text)


# The 'removal' column starts as the stopword-free version of 'normalize'.
df['removal'] = df['normalize'].apply(remove_stopword)

In [None]:
# Preview the 'removal' column after stopword removal.
df[['removal']].head(5)

### 4.2 Unwanted Word Removal 

<i>Proses membuat daftar kata yang dianggap kurang bermakna secara manual, lalu menghapus kata-kata tersebut dari ulasan. Kata yang dianggap tidak bermakna misalnya nama bulan dalam kalender.</i>

In [None]:
from nltk.tokenize import word_tokenize

def remove_unwanted_words(text):
    """Drop manually listed unwanted words from *text*.

    *text* is tokenised with NLTK's ``word_tokenize``; tokens found in the
    hand-written set below (English and Indonesian month names and
    abbreviations, plus 'gin') are removed and the remaining tokens are
    re-joined with single spaces.
    """
    unwanted_words = {'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
                      'sep', 'oct', 'nov', 'dec', 'januari', 'februari', 'maret',
                      'april', 'mei', 'juni', 'juli', 'agustus', 'september',
                      'oktober', 'november', 'desember', 'gin'}

    kept_tokens = (token for token in word_tokenize(text)
                   if token not in unwanted_words)
    return ' '.join(kept_tokens)


# Remove the unwanted words in place (the 'removal' column is overwritten).
df['removal'] = df['removal'].apply(remove_unwanted_words)

In [None]:
# Preview the 'removal' column after unwanted-word removal.
df[['removal']].head(5)

### 4.3 Short Word Removal

<i>Proses menghapus setiap kata yang panjangnya kurang dari 3 karakter, seperti kata 'di'.</i>

In [None]:
def remove_short_words(text):
    """Drop every whitespace-separated word shorter than three characters.

    The surviving words are re-joined with single spaces.
    """
    long_enough = []
    for token in text.split():
        if len(token) < 3:
            continue
        long_enough.append(token)
    return ' '.join(long_enough)

# Remove short words in place (the 'removal' column is overwritten).
df['removal'] = df['removal'].apply(remove_short_words)

In [None]:
# Preview the 'removal' column after short-word removal.
df[['removal']].head(5)

In [None]:
# Result of the words-removal stage: first eleven rows of the working DataFrame.
df.head(11)

## 5. Tokenizing  :

### 5.1 Split Words

<i>Proses pemisahan kata pada tiap ulasan.</i>

In [None]:
from nlp_id.tokenizer import Tokenizer
# One shared nlp_id tokenizer instance, reused for every review.
tokenizer = Tokenizer()


def tokenizing(text):
    """Return the tokens the shared nlp_id tokenizer produces for *text*."""
    tokens = tokenizer.tokenize(text)
    return tokens


# The 'tokenizing' column holds the token sequence of each filtered review.
df['tokenizing'] = df['removal'].apply(tokenizing)

In [None]:
# Preview the 'tokenizing' column.
df[['tokenizing']].head()

In [None]:
# df['tokenizing'].to_excel("tokenizing_ulasan_tiket_kai_access.xlsx", index=False)

### 5.2 Labeling

<i>Proses melakukan pelabelan (positif dan negatif) pada ulasan.</i>

In [None]:
# Sentiment lexicons (ID-NegPos): one tab-separated file per polarity.
# Only the first column of each file is used as the word list.
# NOTE(review): read_csv is called without header=None, so the first line of
# each file is consumed as the header and is not part of the word list —
# confirm the files really start with a header row.
positive_url = 'https://raw.githubusercontent.com/SadamMahendra/ID-NegPos/main/positive.txt'
negative_url = 'https://raw.githubusercontent.com/SadamMahendra/ID-NegPos/main/negative.txt'

# Positive word dictionary.
df_positive = pd.read_csv(positive_url, sep='\t')
list_positive = list(df_positive.iloc[:, 0])

# Negative word dictionary.
df_negative = pd.read_csv(negative_url, sep='\t')
list_negative = list(df_negative.iloc[:, 0])

In [None]:
# Build a one-column DataFrame of the positive words and report its size.
df_positive_words = pd.DataFrame({'List Positive': list_positive})
print("Positive : ", df_positive_words.shape[0], "kata.")

# Build a one-column DataFrame of the negative words and report its size.
df_negative_words = pd.DataFrame({'List Negative': list_negative})
print("Negative : ", df_negative_words.shape[0], "kata.")

# Place the two word lists side by side (column-wise concatenation).
df_dictionary = pd.concat([df_positive_words, df_negative_words], axis=1)

# Show the combined table: positive words on the left, negative on the right.
df_dictionary.head()

In [None]:
# Set copies of the two lexicons. Membership tests against the original
# lists scan the whole list for every token (up to four scans per token);
# set lookups make the labelling pass linear in the number of tokens.
_POSITIVE_LEXICON = frozenset(list_positive)
_NEGATIVE_LEXICON = frozenset(list_negative)


def sentiment_analysis_dictionary_id(text):
    """Score one tokenised review against the positive/negative lexicons.

    Args:
        text: iterable of tokens (the words of a single review).

    Returns:
        A 6-tuple ``(score, polarity, result, positive_words,
        negative_words, neutral_words)`` where

        * ``score`` is +1 for each token found in the positive lexicon and
          -1 for each token found in the negative lexicon (a token present
          in both contributes to both, netting zero);
        * ``polarity`` is 'positive' when score > 0, 'negative' when
          score < 0 and 'neutral' otherwise;
        * ``result`` is a dict with keys 'positif', 'negatif' and 'neutral'
          holding the three word lists;
        * the last three items are those same word lists.
    """
    score = 0
    positive_words = []
    negative_words = []
    neutral_words = []

    for word in text:
        is_positive = word in _POSITIVE_LEXICON
        is_negative = word in _NEGATIVE_LEXICON
        if is_positive:
            score += 1
            positive_words.append(word)
        if is_negative:
            score -= 1
            negative_words.append(word)
        if not (is_positive or is_negative):
            neutral_words.append(word)

    if score > 0:
        polarity = 'positive'
    elif score < 0:
        polarity = 'negative'
    else:
        polarity = 'neutral'

    result = {'positif': positive_words, 'negatif': negative_words, 'neutral': neutral_words}
    return score, polarity, result, positive_words, negative_words, neutral_words

# Score every tokenised review; each element is the 6-tuple returned by
# sentiment_analysis_dictionary_id.
hasil = df['tokenizing'].apply(sentiment_analysis_dictionary_id)
# Transpose the per-review tuples into one tuple per returned field.
hasil = list(zip(*hasil))
df['polarity_score'] = hasil[0]
df['polarity'] = hasil[1]
# hasil[2] (the per-review word dict) is not used; the three word lists
# are kept separately for the frequency tables below.
hasil_kata_positive = hasil[3]
hasil_kata_negative = hasil[4]
hasil_kata_neutral = hasil[5]

In [None]:
# Inspect the most frequent words that appear in neither lexicon.
all_netral_words = [word for sublist in hasil_kata_neutral for word in sublist]
# rename_axis + reset_index(name=...) names both columns explicitly. The
# previous rename of {'index', 0} silently left the count column named
# 'count' on pandas >= 2.0. dtype=object keeps an empty word list well-defined.
netral_freq = (
    pd.Series(all_netral_words, dtype=object)
    .value_counts()
    .rename_axis('Neutral Word')
    .reset_index(name='Frequency')
)
topword_neutral = netral_freq.head()

# Drop reviews labelled neutral: only positive and negative reviews are kept
# for the rest of the notebook.
df = df[df.polarity != 'neutral']

In [None]:
# Summarise the labelling result: total labelled reviews and count per polarity.
print("jumlah: ", df['polarity'].value_counts().sum())
print(df['polarity'].value_counts())

In [None]:
# Preview the labelled DataFrame.
df.head()

### 5.2.1 Top Words

Kata-kata yang paling sering muncul di seluruh dokumen, berdasarkan kamus kata positif dan negatif.

In [None]:
def _word_frequency_table(word_lists, word_column):
    """Return a (word_column, 'Frequency') table of all words in *word_lists*, most frequent first."""
    all_words = [word for sublist in word_lists for word in sublist]
    # rename_axis + reset_index(name=...) names both columns explicitly, so
    # the result does not depend on what value_counts() calls its output
    # (0 on pandas < 2.0, 'count' on pandas >= 2.0).
    # dtype=object keeps an empty word list well-defined.
    return (
        pd.Series(all_words, dtype=object)
        .value_counts()
        .rename_axis(word_column)
        .reset_index(name='Frequency')
    )


def top_words(hasil_kata_positive, hasil_kata_negative):
    """Return the 20 most frequent positive and negative words.

    Args:
        hasil_kata_positive: iterable of per-review lists of positive words.
        hasil_kata_negative: iterable of per-review lists of negative words.

    Returns:
        ``(topword_positive, topword_negative)``: two DataFrames with columns
        ('Positive Word', 'Frequency') and ('Negative Word', 'Frequency'),
        each holding at most 20 rows sorted by descending frequency.
    """
    topword_positive = _word_frequency_table(hasil_kata_positive, 'Positive Word').head(20)
    topword_negative = _word_frequency_table(hasil_kata_negative, 'Negative Word').head(20)
    return topword_positive, topword_negative
        
# Compute the top positive/negative words from the per-review word lists.
top_kata_positive, top_kata_negative = top_words(hasil_kata_positive, hasil_kata_negative)
result3 = pd.DataFrame(top_kata_positive)
result4 = pd.DataFrame(top_kata_negative)

# Show both tables side by side (column-wise concatenation).
concate_result = pd.concat([result3, result4], axis=1)
concate_result

### 5.2.2 Pie Chart

Proses melakukan visualisasi jumlah sentimen positive & negative menggunakan Pie Chart.

In [None]:
import matplotlib.pyplot as plt

# Fixed class order shared by the counts, the legend labels and the colours.
# value_counts() orders by frequency, so without pinning the order the
# hard-coded 'Negative'/'Positive' legend would be swapped whenever positive
# reviews outnumber negative ones.
polarity_order = ['negative', 'positive']
labels = ['Negative', 'Positive']
colors = ['#ff9999', '#66b3ff']

df_sub = df.loc[df.polarity.isin(polarity_order)]
# reindex keeps both classes (count 0 if one is absent), so `sizes` always
# has the two entries that `explode`, `labels` and `colors` expect.
sizes = (
    df_sub.polarity.value_counts()
    .reindex(polarity_order, fill_value=0)
    .tolist()
)
explode = (0.1, 0)
total_sizes = sum(sizes)
fig, ax = plt.subplots(figsize=(6, 6), facecolor='none')
wedgeprops = {'width': 0.7, 'edgecolor': 'white', 'linewidth': 2}
# round() before int(): the percentage handed to autopct is a float, and
# plain truncation can show a count that is one too low.
pie = ax.pie(x=sizes, labels=['', ''], colors=colors, explode=explode,
    autopct=lambda pct: "{:.1f}%\n({:d})".format(pct, int(round(pct / 100 * total_sizes))),
    textprops={'fontsize': 9, 'color': 'black'}, shadow=True,
    wedgeprops=wedgeprops)
ax.legend(pie[0], labels, loc='center left', fontsize=10)
ax.set_title(f"Sentiment Analysis on KAI Access Reviews \n(Total: {total_sizes} reviews)",
             fontsize=10, color='black', pad=4)
# plt.show() takes no figure argument; it displays the open figures.
plt.show()

### 5.2.3 Wordcloud

<i>Proses menampilkan seluruh kata dalam tiap sentimen pada Wordcloud. Semakin sering sebuah kata muncul, semakin besar ukuran tampilannya.</i>

In [None]:
from collections import Counter
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import numpy as np

# Flatten the token lists of each polarity into one flat word list.
# A plain comprehension replaces apply(pd.Series).stack(), which builds one
# Series per review (slow) and fails when a polarity has no rows at all.
# NOTE(review): expects 'tokenizing' to still hold token lists, i.e. this
# cell runs before the TF-IDF cell converts that column to strings.
positive_words = [
    word
    for tokens in df.loc[df.polarity == 'positive', 'tokenizing']
    for word in tokens
]
positive_word_counts = Counter(positive_words)

negative_words = [
    word
    for tokens in df.loc[df.polarity == 'negative', 'tokenizing']
    for word in tokens
]
negative_word_counts = Counter(negative_words)

# Mask images give each cloud its shape and, via ImageColorGenerator, its colours.
mask_pos = np.array(Image.open("img/train_pos.jpg"))
mask_neg = np.array(Image.open("img/train_neg.jpg"))

positive_wordcloud = WordCloud(width=1000, height=800, mask=mask_pos, max_words=2000,
                               background_color='black').generate_from_frequencies(positive_word_counts)

negative_wordcloud = WordCloud(width=1000, height=800, mask=mask_neg, max_words=2000,
                               background_color='black').generate_from_frequencies(negative_word_counts)

# plt.show() takes no figure argument; it displays the open figures.
figPos, axPos = plt.subplots(figsize=(9, 4))
axPos.imshow(positive_wordcloud.recolor(color_func=ImageColorGenerator(mask_pos)), interpolation='bilinear')
axPos.axis('off')
plt.show()

figNeg, axNeg = plt.subplots(figsize=(9, 4))
axNeg.imshow(negative_wordcloud.recolor(color_func=ImageColorGenerator(mask_neg)), interpolation='bilinear')
axNeg.axis('off')
plt.show()

### 5.3 Pembobotan TF-IDF

<i>Proses memberikan nilai bobot pada dokumen. TF-IDF (Term Frequency-Inverse Document Frequency) bertujuan mengetahui seberapa penting suatu kata dalam dokumen tersebut.</i>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Work on an independent copy: df was produced by boolean filtering above,
# and a column is about to be overwritten.
df = df.copy()
# Convert each row's tokens to a single string, because the vectorizer is fed
# plain strings below. This overwrites the 'tokenizing' column.
df['tokenizing'] = df['tokenizing'].astype(str)
tf_idf = TfidfVectorizer()
review = df['tokenizing'].values.tolist()
# NOTE(review): the vectorizer is fitted on ALL reviews here, before the
# train/test split in the next section, so the test documents also contribute
# to the fitted vocabulary/weights — confirm this is acceptable.
tf_idf_vector = tf_idf.fit(review)
# X: TF-IDF feature matrix (one row per review); y: the polarity labels.
X = tf_idf_vector.transform(review)
y = df['polarity']

# Show the TF-IDF entries of the first two reviews.
print(X[0:2])

## 6. Modeling :

### 6.1 Pemisahan Data (Train & Test) 

<i>Proses pemisahan data latih (train) dan data uji (test). Data latih ditetapkan sebesar 90% dan data uji sebesar 10%.</i>

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# 90% of the rows go to training, the rest to testing; random_state=0 makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=0)
all_data = len(y)
data_train = len(y_train)
data_test = len(y_test)
# Shapes of the train and test feature matrices (not used further in this notebook).
vector = X_train.shape, X_test.shape

print("Total Data : ", all_data)
print("Total Data Train : ", data_train)
print("Total Data Test : ", data_test)


### 6.2 Random Forest Classifier

<i>Pada proses ini, data yang telah dibagi dimodelkan dengan Random Forest Classifier untuk mendapatkan akurasi.</i>

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the fitted forest, and therefore every metric reported below,
# is reproducible, in line with the seeded train/test split (random_state=0).
rfc = RandomForestClassifier(random_state=0)

# Fit on the training split, then predict the held-out test split.
rfc_fit = rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Same predictions under a second name; the fitted model gives the same
# result for the same input, so there is no need to run it twice.
predict = y_pred

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the test split, as a percentage rounded to one decimal.
# Arguments follow scikit-learn's (y_true, y_pred) order, like the
# classification report and confusion matrix cells below.
akurasi = accuracy_score(y_test, y_pred) * 100
akurasi_bulat = round(akurasi, 1)
print("Random Forest Classifier Accuracy Score: ", akurasi_bulat, "%")

## 7. Evaluasi Performa Model :

### 7.1 Classification Report 

<i>Proses menampilkan hasil kinerja model klasifikasi, untuk membantu menganalisis dan memahami seberapa baik model memprediksi label dengan benar. Semakin tinggi nilai Precision, Recall, dan F1-Score, semakin baik dan seimbang kinerja model.</i>

In [None]:
from sklearn.metrics import classification_report

# Build and print the classification report for the test split
# (true labels first, predictions second).
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n\n", classification_rep)

### 7.2 Confusion Matrix

<i>Proses menampilkan Confusion Matrix dan menghitung akurasi model. Confusion Matrix menyatakan jumlah data uji (test) yang diklasifikasikan dengan benar dan salah, dan menghasilkan nilai True Positive, True Negative, False Positive, dan False Negative. Jika jumlah True (Positive & Negative) lebih banyak daripada False (Positive & Negative), hasil pada data uji dapat dikatakan sudah baik.</i>

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Pin the class order so the matrix layout is explicit:
# rows = true label, columns = predicted label,
# index 0 = 'negative', index 1 = 'positive'.
cm_labels = ['negative', 'positive']
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm, display_labels=cm_labels).plot(ax=ax)
# plt.show() takes no figure argument; it displays the open figures.
plt.show()

print()
# Treating 'positive' (index 1) as the positive class:
#   cm[1, 1] true positive,  cm[0, 0] true negative,
#   cm[0, 1] false positive (truly negative, predicted positive),
#   cm[1, 0] false negative (truly positive, predicted negative).
# The previous indexing had TP/TN and FP/FN swapped.
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Scale to percent before rounding, which avoids float artefacts such as
# 91.10000000000001 from round(x, 3) * 100.
resultAccuracy = round(Accuracy * 100, 1)

equation = "(TP + TN) / (TP + TN + FP + FN) = Accuracy"
# Parentheses added so the printed calculation matches the equation above.
calculate = f"({TP} + {TN}) / ({TP} + {TN} + {FP} + {FN}) = {resultAccuracy}"

df_cm = pd.DataFrame({
    "Value": [TP, TN, FP, FN],
    "Label": [
        "True Positive", "True Negative", "False Positive", "False Negative"
    ],
    "As": ["TP", "TN", "FP", "FN"]
})

print("Equation Accuracy:")
print(equation)
print()
print("Calculate Accuracy:")
print(calculate)

print()
df_cm

<center>- Selesai. -</center>