<a href="https://colab.research.google.com/github/inyunita/Text-Classification-NN/blob/main/Text_Classification_Neural_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [1]:
import pandas as pd
import numpy as np
#text preprocessing
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
#split data
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Overview


---

Dataset merupakan data komentar news dengan 3 buah label yaitu:

1 'not abusive' : contohnya "Pengetahuan tidak dapat menggantikan persahabatan. Aku (Patrick) lebih suka jadi idiot daripada kehilanganmu (Spongebob)"

2 'abusive but not offensive': contohnya "Kencing onta asli apa onta bonbin bray yg bagus ?"

3 'abusive and offensive' : ⁣contohnya "GOBLOK,ngapain beli indosat ? g ada untungnya, jaringan aja super lemot,di papua aja g ada jaringannya, mentok2 cuma 2G, harusnya pikir pakai otak bagaimana beli seluruh saham telkomsel dari singtel ???"

# Data Preparation and Cleaning

In [4]:
data = '/content/drive/MyDrive/Test Data Scientist Atmatech/Abusive_Language_Detection_Indonesia.xlsx'
df = pd.read_excel(data)
df

Unnamed: 0,Kalimat,label
0,san ente aje yg unboxing Indosat. masa ente ka...,1
1,"‚Å£‚Å£GOBLOK,ngapain beli indosat ? g ada untu...",3
2,Ngotot mau beli saham indosat kok jika jd pres...,1
3,Buyback Isat??anda sehat??sdh diminum obatnya?...,1
4,"Saya percaya kalau sama Sandiaga Uno, tapi kal...",1
...,...,...
3179,"hentikan aja, ngapai2n audisi2an, pantesa boca...",1
3180,Ini gerombolan orang2 yang mau meruntuhkan bib...,1
3181,KPAI isinya org bodoh ya.gmana pd djarum ga me...,3
3182,Kpai idak ada gunanya. #bubarkan KPAI,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3184 entries, 0 to 3183
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Kalimat  3184 non-null   object
 1   label    3184 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 49.9+ KB


In [6]:
# Mengecek elemen-elemen yang tidak valid
text_column = 'Kalimat'
invalid_data = df[df[text_column].isna() | df[text_column].apply(lambda x: not isinstance(x, str))]
print(invalid_data)

# Mengecek tipe data kolom 'text'
print(df[text_column].dtype)

    Kalimat  label
484    1111      1
object


In [7]:
df.isna().sum()

Kalimat    0
label      0
dtype: int64

In [8]:
# Mengganti elemen yang bukan string dengan string kosong
df[text_column] = df[text_column].apply(lambda x: str(x) if isinstance(x, str) else "")

In [9]:
# Mengecek elemen-elemen yang tidak valid
text_column = 'Kalimat'
invalid_data = df[df[text_column].isna() | df[text_column].apply(lambda x: not isinstance(x, str))]
print(invalid_data)

# Mengecek tipe data kolom 'text'
print(df[text_column].dtype)

Empty DataFrame
Columns: [Kalimat, label]
Index: []
object


### Hapus Tanda Baca dan Karakter yang Tidak Diperlukan

In [10]:
#remove punctuation function
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)
#apply remove punct
df['Kalimat'] = df['Kalimat'].apply(remove_punctuation)
df

Unnamed: 0,Kalimat,label
0,san ente aje yg unboxing Indosat masa ente kal...,1
1,ÅÅGOBLOKngapain beli indosat g ada untungnya ...,3
2,Ngotot mau beli saham indosat kok jika jd pres...,1
3,Buyback Isatanda sehatsdh diminum obatnya Siny...,1
4,Saya percaya kalau sama Sandiaga Uno tapi kala...,1
...,...,...
3179,hentikan aja ngapai2n audisi2an pantesa boca k...,1
3180,Ini gerombolan orang2 yang mau meruntuhkan bib...,1
3181,KPAI isinya org bodoh yagmana pd djarum ga men...,3
3182,Kpai idak ada gunanya bubarkan KPAI,1


# Tokenisasi

In [11]:
#tokenization function
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens
#apply tokenization to data
df['Kalimat'] = df['Kalimat'].apply(tokenize_text)
df

Unnamed: 0,Kalimat,label
0,"[san, ente, aje, yg, unboxing, Indosat, masa, ...",1
1,"[ÅÅGOBLOKngapain, beli, indosat, g, ada, untun...",3
2,"[Ngotot, mau, beli, saham, indosat, kok, jika,...",1
3,"[Buyback, Isatanda, sehatsdh, diminum, obatnya...",1
4,"[Saya, percaya, kalau, sama, Sandiaga, Uno, ta...",1
...,...,...
3179,"[hentikan, aja, ngapai2n, audisi2an, pantesa, ...",1
3180,"[Ini, gerombolan, orang2, yang, mau, meruntuhk...",1
3181,"[KPAI, isinya, org, bodoh, yagmana, pd, djarum...",3
3182,"[Kpai, idak, ada, gunanya, bubarkan, KPAI]",1


### Remove Stopword

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
stopwords = stopwords.words("indonesian")
df["Kalimat"] = df["Kalimat"].apply(lambda x: [word for word in x if word not in stopwords])
df

Unnamed: 0,Kalimat,label
0,"[san, ente, aje, yg, unboxing, Indosat, ente, ...",1
1,"[ÅÅGOBLOKngapain, beli, indosat, g, untungnya,...",3
2,"[Ngotot, beli, saham, indosat, jd, presiden, A...",1
3,"[Buyback, Isatanda, sehatsdh, diminum, obatnya...",1
4,"[Saya, percaya, Sandiaga, Uno, Jokowi, 100gak,...",1
...,...,...
3179,"[hentikan, aja, ngapai2n, audisi2an, pantesa, ...",1
3180,"[Ini, gerombolan, orang2, meruntuhkan, bibit, ...",1
3181,"[KPAI, isinya, org, bodoh, yagmana, pd, djarum...",3
3182,"[Kpai, idak, gunanya, bubarkan, KPAI]",1


### Split Dataset

In [14]:
# split data 70% train 30% test
X = df['Kalimat']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Feature Selection

In [15]:
# Unique word
unique_words = set()
for text_list in df['Kalimat']:
    for word in text_list:
        unique_words.add(word)

total_unique_words = len(unique_words)
print(f'Jumlah kata unik dalam korpus: {total_unique_words}')

Jumlah kata unik dalam korpus: 12391


In [16]:
# change words in list to string
X_train_text = [' '.join(kalimat) for kalimat in X_train]
X_test_text = [' '.join(kalimat) for kalimat in X_test]

In [17]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=12391)  # 12391 = unique word in corpus (jumlah fitur)

# Transform text data to vektor TF-IDF
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

In [18]:
print(X_train_tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
y_train

1074    1
261     1
1025    1
2906    1
195     1
       ..
1095    3
1130    1
1294    1
860     3
3174    1
Name: label, Length: 2228, dtype: int64

# Train Model

Arsitektur model yang dilatih adalah arsitektur Neural Network, arsitektur neural network dipilih berdasarkan penelitian-penelitian sebelumnya yang menunjukkan bahwa arsitektur neural network cocok dan baik digunakan untuk klasifikasi data teks. Hanya saja arsitektur neural network yang diimplementasikan disini adalah NN yang sederhana dengan 1 buah hidden layer (bukan deep learning).

Karena pada dataset terdapat 3 buah kelas, sehingga fungsi aktivasi yang digunakan adalah softmax, karena softmax cocok digunakan untuk multiclass.

In [20]:
# softmax
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / exp_x.sum(axis=1, keepdims=True)

In [21]:
X_train_tfidf.shape

(2228, 8501)

In [22]:
# Inisialisasi bobot dan bias
np.random.seed(0)
input_size = 8501 # fitur input
hidden_size = 6 #num of neuron
output_size = 3
learning_rate = 0.1
epochs = 100

# Inisialisasi bobot dan bias secara acak
w1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
w2 = np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))


In [24]:
# Pelatihan model
for epoch in range(epochs):
    # Forward pass
    y_train= y_train.astype('float64')
    X_train_tfidf =X_train_tfidf.astype('float64')
    z1 = X_train_tfidf.dot(w1) + b1
    a1 = np.tanh(z1)  # fungsi aktivasi tanh
    z2 = a1.dot(w2) + b2
    a2 = softmax(z2)

    # loss
    one_hot_y = pd.get_dummies(y_train.values.flatten()).values
    loss = -np.mean(one_hot_y * np.log(a2))

    #Backpropagation
    dz2 = a2 - one_hot_y
    dw2 = np.dot(a1.T, dz2)
    db2 = np.sum(dz2, axis=0, keepdims=True)
    dz1 = np.dot(dz2, w2.T) * (1 - a1**2)
    dw1 = np.dot(X_train_tfidf.T, dz1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    # dz2 = a2 - one_hot_y
    # dw2 = np.dot(a1.T, dz2) / X_train_tfidf.shape[1]  # Membagi dengan jumlah sampel
    # db2 = np.sum(dz2, axis=0, keepdims=True) / X_train_tfidf.shape[1]  # Membagi dengan jumlah sampel
    # dz1 = np.dot(dz2, w2.T) * (1 - a1**2)
    # dw1 = np.dot(X_train_tfidf.T, dz1)
    # db1 = np.sum(dz1, axis=0, keepdims=True)


    # Update bobot and bias
    w1 = w1 - learning_rate * dw1
    b1 = b1 - learning_rate * db1
    w2 = w2 - learning_rate * dw2
    b2 = b2 - learning_rate * db2

    if epoch % 100 == 0:
        print(f'Epoch {epoch}: loss = {loss}')

ValueError: ignored