In [270]:
!pip install PySastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [271]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [272]:
# Colab-only: expose Google Drive under /content/drive so the dataset and models can be read from it.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [273]:
%cd ./drive/MyDrive/NLP/

[Errno 2] No such file or directory: './drive/MyDrive/NLP/'
/content/drive/MyDrive/NLP


In [274]:
import gc
import os
import re

import gensim
import nltk
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from torch.nn.utils.rnn import pad_sequence
from torchtext.data import get_tokenizer
# Fetch the 'punkt' tokenizer data; preprocess_x calls nltk.word_tokenize later in the notebook.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Import Data

In [275]:
# Training split; the first CSV column is used as the row index.
df = pd.read_csv(
    "./data_worthcheck/train.csv",
    index_col=0,
)

In [276]:
# Held-out split, read with the default integer index and displayed for a quick visual check.
test_df = pd.read_csv(
    "./data_worthcheck/test.csv",
)
test_df

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes
...,...,...
2795,ku tenang2 bae ku sih ya corona nya ga depok k...,no
2796,guru hati hati ya virus corona uda indonesia t...,yes
2797,4 terawan menyebut virus corona indonesia terd...,yes
2798,realffk buhari can t pronounce corona virus,no


In [277]:
# Slang -> normalised-form dictionary; the CSV has no header row, so column names are supplied here.
text_normalization_url = "https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection/master/new_kamusalay.csv"
text_normalization_df = pd.read_csv(
    text_normalization_url,
    names=['slang', 'normal'],
    encoding='ISO-8859-1',
)
text_normalization_df

Unnamed: 0,slang,normal
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali
...,...,...
15162,mendikbud,menteri pendidikan dan kebudayaan
15163,mendag,menteri perdagangan
15164,menaker,menteri tenaga kerja
15165,memetwit,mentwit


## Preprocess Data

In [278]:
# Replace the textual labels with integer codes ("no" -> 0, "yes" -> 1) in both splits, in place.
for frame in (df, test_df):
  for label_text, label_code in (("no", 0), ("yes", 1)):
    frame.loc[frame["label"] == label_text, 'label'] = label_code

In [279]:
def preprocess_x(df, normaliseText = False):
  """Clean, stem and stop-word-filter every entry of ``df["text_a"]``.

  Each text is lower-cased, stripped of already-flattened ``https t co <id>``
  links and of every non-alphabetical character, tokenized with NLTK,
  optionally normalised through the slang dictionary, stemmed with the
  Sastrawi stemmer and finally filtered against the Sastrawi stop-word list.

  Args:
    df: DataFrame with a ``text_a`` column holding the raw texts.
    normaliseText: when True, tokens found in the ``slang`` column of the
      module-level ``text_normalization_df`` are replaced by the matching
      ``normal`` entry (first occurrence wins). Multi-word replacements are
      split into separate tokens so that stemming and stop-word removal act
      on each word individually.

  Returns:
    A list of cleaned strings (tokens joined by single spaces), one per row
    of ``df`` and in the same order. Non-string rows (e.g. missing values)
    produce an empty string so the result stays aligned with the labels.
  """
  bahasa_stemmer = StemmerFactory().create_stemmer()
  # A set gives O(1) membership tests; the stop-word list was previously scanned per token.
  swords = set(StopWordRemoverFactory().get_stop_words())

  # Build the slang lookup once instead of scanning the DataFrame twice for every token.
  slang_to_normal = {}
  if normaliseText:
    for slang, normal in zip(text_normalization_df['slang'], text_normalization_df['normal']):
      if isinstance(slang, str) and isinstance(normal, str):
        # setdefault keeps the first occurrence of a duplicated slang entry.
        slang_to_normal.setdefault(slang, normal)

  # Compile the patterns once; they are reused for every row.
  link_pattern = re.compile(r'https\s+t\s+co\s\w+')
  non_alpha_pattern = re.compile("[^a-zA-Z]")

  cleanedData = []
  for raw_text in df["text_a"]:
    # Missing / non-string rows are treated as empty text rather than crashing on .lower().
    text = raw_text.lower() if isinstance(raw_text, str) else ""

    # Cleaning links
    text = link_pattern.sub('', text)

    # Cleaning everything except alphabetical characters
    text = non_alpha_pattern.sub(" ", text)

    # Tokenizing
    words = nltk.word_tokenize(text)

    # Normalize text
    if normaliseText:
      normalized_words = []
      for word in words:
        if word in slang_to_normal:
          normalized_words.extend(slang_to_normal[word].split())
        else:
          normalized_words.append(word)
    else:
      normalized_words = words

    # Stem Text
    stemmed_words = [bahasa_stemmer.stem(word) for word in normalized_words]

    # Removing stopwords
    kept_words = [word for word in stemmed_words if word not in swords]

    # Joining
    cleanedData.append(" ".join(kept_words))
  return cleanedData

In [280]:
# Cleaned texts and integer label arrays for the train and test splits (train is processed first).
x_train, x_test = (preprocess_x(frame) for frame in (df, test_df))
y_train, y_test = (frame["label"].to_numpy().astype('int') for frame in (df, test_df))

# Deep Learning

In [281]:
# Pretrained Indonesian word2vec model,
# taken from https://github.com/deryrahman/word2vec-bahasa-indonesia
path = './pretrained_word2vec_bahasa/idwiki_word2vec_300.model'
id_w2v = gensim.models.Word2Vec.load(path)

In [282]:
# Declare global variables
SEED = 42             # fixed seed so weight initialisation and batch shuffling are repeatable
TOKEN_LENGTH = 300    # number of tokens every sample is padded / truncated to
learning_rate = 0.01
num_epochs = 10
CUDA: torch.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed the random number generators used by NumPy and PyTorch.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [283]:
# Split every cleaned text into tokens with torchtext's spaCy tokenizer.
tokenizer = get_tokenizer("spacy")
train_tokens = list(map(tokenizer, x_train))
test_tokens = list(map(tokenizer, x_test))

  f'Spacy model "{language}" could not be loaded, trying "{OLD_MODEL_SHORTCUTS[language]}" instead'


In [284]:
def vectorize_tokens(tokens, keyed_vectors=None):
  """Turn tokenized sentences into per-sentence embedding matrices.

  Args:
    tokens: iterable of sentences, each an iterable of string tokens.
    keyed_vectors: object exposing ``get_vector(token)`` and ``vector_size``.
      Defaults to the module-level ``id_w2v.wv``.

  Returns:
    A list with one float32 tensor of shape ``(len(sentence), vector_size)``
    per sentence. Tokens missing from the vocabulary are mapped to an
    all-zero row; an empty sentence yields a ``(0, vector_size)`` tensor.
  """
  if keyed_vectors is None:
    keyed_vectors = id_w2v.wv
  dim = keyed_vectors.vector_size
  # Shared all-zero row for out-of-vocabulary tokens; np.stack copies it into each matrix.
  unknown_vector = np.zeros(dim, dtype=np.float32)

  vectorized_tokens = []
  for sentence in tokens:
    rows = []
    for token in sentence:
      try:
        rows.append(np.asarray(keyed_vectors.get_vector(token), dtype=np.float32))
      except KeyError:
        # Only a vocabulary miss is expected here; any other error should surface.
        rows.append(unknown_vector)
    if rows:
      # Stack into one contiguous array first: building a tensor from a list of arrays is slow.
      matrix = np.stack(rows)
    else:
      matrix = np.zeros((0, dim), dtype=np.float32)
    vectorized_tokens.append(torch.from_numpy(matrix))
  return vectorized_tokens

In [285]:
# Embed the train tokens first, then the test tokens.
vectorized_train_tokens, vectorized_test_tokens = map(vectorize_tokens, (train_tokens, test_tokens))

In [286]:
class WorthcheckDataset(torch.utils.data.Dataset):
  """Dataset of fixed-length embedding matrices paired with their labels.

  Every sample is zero-padded or truncated along the token axis so that it
  has exactly ``max_token_size`` rows.
  """

  def __init__(self, vectorized_tokens, labels, max_token_size, embedding_dim=300) -> None:
    """Store the samples.

    Args:
      vectorized_tokens: sequence of 2-D tensors, one ``(n_tokens, dim)`` matrix per sample.
      labels: sequence of labels, indexable in parallel with ``vectorized_tokens``.
      max_token_size: number of rows every returned sample has.
      embedding_dim: feature width used only for samples that carry no
        feature axis at all (e.g. an empty 1-D tensor).
    """
    super().__init__()
    self.vectorized_tokens = vectorized_tokens
    self.labels = labels
    self.max_token_size = max_token_size
    self.embedding_dim = embedding_dim

  def __len__(self) -> int:
    """Return the number of samples."""
    return len(self.vectorized_tokens)

  def __getitem__(self, idx) -> tuple:
    """Return ``(matrix, label)`` with ``matrix`` of shape ``(max_token_size, dim)``."""
    vectors = self.vectorized_tokens[idx]
    label = self.labels[idx]

    if vectors.dim() < 2:
      # A sample without a feature axis (e.g. torch.tensor([])) is treated as zero tokens.
      vectors = vectors.new_zeros((0, self.embedding_dim))

    n_tokens = vectors.size(0)
    if n_tokens > self.max_token_size:
      return (vectors[:self.max_token_size, :], label)

    # Pad with zeros of the sample's own width and dtype instead of a hard-coded 300 / float32.
    padding = vectors.new_zeros((self.max_token_size - n_tokens, vectors.size(1)))
    return (torch.cat((vectors, padding), dim=0), label)

In [287]:
# Wrap both splits so every sample is padded / truncated to TOKEN_LENGTH tokens.
train_dataset = WorthcheckDataset(
    vectorized_tokens=vectorized_train_tokens,
    labels=y_train,
    max_token_size=TOKEN_LENGTH,
)
test_dataset = WorthcheckDataset(
    vectorized_tokens=vectorized_test_tokens,
    labels=y_test,
    max_token_size=TOKEN_LENGTH,
)

In [288]:
# Batches of 256 samples; only the training loader shuffles.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=256,
    shuffle=True,
    pin_memory=True,
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=256,
    pin_memory=True,
)

In [289]:
class DeepLearningModel(torch.nn.Module):
  """Feed-forward binary classifier applied to a flattened input.

  The network is Linear -> ReLU -> Linear -> ReLU -> Linear -> Sigmoid, so
  the output is one value in (0, 1) per sample.
  """

  def __init__(self, input_size, hidden_size, num_layers):
    """Build the classifier.

    Args:
      input_size: multiplied by ``num_layers`` to get the width of the first layer's input.
      hidden_size: width of the first hidden layer; the second hidden layer is twice as wide.
      num_layers: multiplied by ``input_size`` to get the width of the first layer's input.
    """
    super().__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size

    flat_features = input_size * num_layers
    wide_hidden = hidden_size * 2
    stack = [
        torch.nn.Linear(flat_features, hidden_size),
        torch.nn.ReLU(),
        torch.nn.Linear(hidden_size, wide_hidden),
        torch.nn.ReLU(),
        torch.nn.Linear(wide_hidden, 1),
        torch.nn.Sigmoid(),
    ]
    self.clf = torch.nn.Sequential(*stack)

  def forward(self, x):
    """Flatten everything after the batch axis and return the classifier output."""
    batch_size = x.size(0)
    flattened = x.view(batch_size, -1)
    return self.clf(flattened)

In [290]:
# Model over TOKEN_LENGTH tokens of 300 features each, trained with binary cross-entropy and Adam.
dl_model = DeepLearningModel(input_size=300, hidden_size=500, num_layers=TOKEN_LENGTH)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params=dl_model.parameters(), lr=learning_rate)

In [291]:
def clear_cache():
    """Run Python's garbage collector, then ask PyTorch to release its cached CUDA memory."""
    # Collect first so that memory held by unreachable objects can be released by the second call.
    gc.collect()
    torch.cuda.empty_cache()

In [292]:
def fit(dataloader: torch.utils.data.DataLoader, model: torch.nn.Module, criterion: torch.nn.Module, optimizer: torch.optim.Optimizer) -> tuple:
  """Train ``model`` for one pass over ``dataloader``.

  Args:
    dataloader: yields ``(X, y)`` batches.
    model: network to optimise; moved to the module-level ``CUDA`` device.
    criterion: loss applied to ``(model(X), y)`` with ``y`` reshaped to ``(-1, 1)``.
    optimizer: optimizer stepping the model's parameters.

  Returns:
    ``(epoch_loss, epoch_correct, epoch_count)``: the sum of the per-batch
    loss values, the number of samples whose rounded prediction equals the
    label, and the number of samples seen.
  """
  # set model to training mode
  model.to(CUDA).train()

  # log
  epoch_loss: float = 0
  epoch_correct: int = 0
  epoch_count: int = 0

  # load a batch of data
  for X, y in dataloader:

    # move to GPU
    X: torch.Tensor = X.to(torch.float).to(CUDA)
    y: torch.Tensor = y.view(-1, 1).to(torch.float).to(CUDA)

    # forward pass
    y_tilde: torch.Tensor = model(X)
    loss: torch.Tensor = criterion(y_tilde, y)

    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # log (detach: the bookkeeping does not need the autograd graph)
    epoch_loss += loss.item()
    epoch_correct += (y_tilde.detach().round() == y).sum().item()
    epoch_count += y.size(dim=0)

  # Release cached memory once per epoch: a full garbage collection plus
  # allocator flush after every batch only slows the loop down.
  clear_cache()

  # return log
  return (epoch_loss, epoch_correct, epoch_count)

In [293]:
def evaluate(dataloader: torch.utils.data.DataLoader, model: torch.nn.Module, criterion: torch.nn.Module) -> tuple:
  """Score ``model`` on ``dataloader`` without updating its parameters.

  Args:
    dataloader: yields ``(X, y)`` batches.
    model: network to evaluate; moved to the module-level ``CUDA`` device and set to eval mode.
    criterion: loss applied to ``(model(X), y)`` with ``y`` reshaped to ``(-1, 1)``.

  Returns:
    ``(epoch_loss, epoch_correct, epoch_count)``: the sum of the per-batch
    loss values, the number of samples whose rounded prediction equals the
    label, and the number of samples seen.
  """
  # set model to test mode
  model.to(CUDA).eval()

  # log
  epoch_loss: float = 0
  epoch_correct: int = 0
  epoch_count: int = 0

  # no gradients are needed anywhere in evaluation
  with torch.no_grad():

    # load a batch of data
    for X, y in dataloader:

      # move to GPU
      X: torch.Tensor = X.to(torch.float).to(CUDA)
      y: torch.Tensor = y.view(-1, 1).to(torch.float).to(CUDA)

      # forward pass
      y_tilde: torch.Tensor = model(X)
      loss: torch.Tensor = criterion(y_tilde, y)

      # log
      epoch_loss += loss.item()
      epoch_correct += (y_tilde.round() == y).sum().item()
      epoch_count += y.size(dim=0)

  # Release cached memory once per pass: a full garbage collection plus
  # allocator flush after every batch only slows the loop down.
  clear_cache()

  # return log
  return (epoch_loss, epoch_correct, epoch_count)

In [294]:
# Train the model
n_total_steps = len(train_dataloader)

# torch.save does not create missing directories, so make sure the checkpoint folder exists.
os.makedirs("./models", exist_ok=True)

for epoch in range(num_epochs):
  epoch_loss, epoch_correct, epoch_count = fit(train_dataloader, dl_model, criterion, optimizer)
  print (f'Epoch [{epoch+1}/{num_epochs}], loss: {epoch_loss}, accuracy: {epoch_correct / epoch_count} ({epoch_correct}/{epoch_count})')
  # One checkpoint per epoch so that every epoch's weights can be evaluated afterwards.
  torch.save(dl_model.state_dict(), f"./models/model_state_linear{epoch+1}.pt")

Epoch [1/10], loss: 55.49845510721207, accuracy: 0.8004722003610943 (17291/21601)
Epoch [2/10], loss: 31.549093410372734, accuracy: 0.887412619786121 (19169/21601)
Epoch [3/10], loss: 22.444929763674736, accuracy: 0.9251886486736726 (19985/21601)
Epoch [4/10], loss: 23.54626925289631, accuracy: 0.9494930790241193 (20510/21601)
Epoch [5/10], loss: 19.401399575173855, accuracy: 0.9632887366325633 (20808/21601)
Epoch [6/10], loss: 20.25689112767577, accuracy: 0.9706958011203185 (20968/21601)
Epoch [7/10], loss: 20.082257814705372, accuracy: 0.9710661543447062 (20976/21601)
Epoch [8/10], loss: 20.67242281138897, accuracy: 0.9705569186611731 (20965/21601)
Epoch [9/10], loss: 30.124452006071806, accuracy: 0.9658812092032776 (20864/21601)
Epoch [10/10], loss: 27.569613875821233, accuracy: 0.969168094069719 (20935/21601)


In [295]:
# Reload each per-epoch checkpoint in turn and report its loss / accuracy on the test loader.
for i_epoch in range(num_epochs):
  dl_model.load_state_dict(
      torch.load(f"./models/model_state_linear{i_epoch+1}.pt", map_location=CUDA)
  )
  epoch_loss, epoch_correct, epoch_count = evaluate(
      dataloader=test_dataloader,
      model=dl_model,
      criterion=criterion,
  )
  print(f"Epoch {i_epoch + 1} model, test set loss: {epoch_loss}, accuracy: {epoch_correct / epoch_count} ({epoch_correct}/{epoch_count})")

Epoch 1 model, test set loss: 5.035563409328461, accuracy: 0.8242857142857143 (2308/2800)
Epoch 2 model, test set loss: 5.888532340526581, accuracy: 0.8175 (2289/2800)
Epoch 3 model, test set loss: 8.63590782880783, accuracy: 0.8128571428571428 (2276/2800)
Epoch 4 model, test set loss: 8.66850996017456, accuracy: 0.8160714285714286 (2285/2800)
Epoch 5 model, test set loss: 10.536758244037628, accuracy: 0.8042857142857143 (2252/2800)
Epoch 6 model, test set loss: 11.139420926570892, accuracy: 0.8042857142857143 (2252/2800)
Epoch 7 model, test set loss: 11.89089810848236, accuracy: 0.8014285714285714 (2244/2800)
Epoch 8 model, test set loss: 14.786003947257996, accuracy: 0.8135714285714286 (2278/2800)
Epoch 9 model, test set loss: 12.802927196025848, accuracy: 0.7978571428571428 (2234/2800)
Epoch 10 model, test set loss: 18.265276312828064, accuracy: 0.8075 (2261/2800)
