In [1]:
# Install the third-party packages this notebook imports (pandas, Sastrawi, tqdm).
!pip install pandas sastrawi tqdm



You should consider upgrading via the 'D:\@Programming\@On Going\School\spam_detector\venv\Scripts\python.exe -m pip install --upgrade pip' command.


# Preparing NLP Indonesia Data (Cleaning Module)

In [60]:
# imported from iFest 2021 Data Cleaning Module by Yaudahlah Teams,
# Refactored by Kaenova Mahendra Auditama (Yaudahlah Teams)

import pandas as pd
from tqdm import tqdm
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

class DataCleaning:
  """
  Text cleaner for Indonesian text.

  Holds a Sastrawi stemmer, a stopword list and a slang dictionary, and applies them
  either to a single string (CleanOneText) or to the text column of a pandas
  DataFrame (CleanDataFrame).
  """

  def __init__(self, stopword:list = None, slang_word:dict = None) -> None:
    """
    Parameters
    ----------
    stopword   : list of words removed from every text (default: empty list).
    slang_word : dict mapping a slang word to its replacement (default: empty dict).
    """
    factory     = StemmerFactory()
    self.stemmer     = factory.create_stemmer()
    # None sentinels instead of mutable [] / {} defaults, so instances never share a container.
    self.stopword = [] if stopword is None else stopword
    self.slang_word = {} if slang_word is None else slang_word

  def AddKamusAlay(self, new_dict:dict = None):
    """
    Merge additional slang entries into the slang dictionary.
    Entries in new_dict override existing entries with the same key.

    Raises TypeError if new_dict is not a dict.
    """
    if new_dict is None:
      new_dict = {}
    if not isinstance(new_dict, dict): raise TypeError("Not a valid type")
    self.slang_word = {**self.slang_word, **new_dict}

  def AddStopWord(self, stopword:list = None):
    """
    Append additional words to the stopword list.

    Raises TypeError if stopword is not a list.
    """
    if stopword is None:
      stopword = []
    if not isinstance(stopword, list): raise TypeError("Not a valid type")
    # The original read the non-existent attribute self.custom_word and always failed.
    self.stopword = self.stopword + stopword

  def CleanDataFrame(self, df:pd.DataFrame, text_cols:str, label_cols:str, 
                     word_min:int=0, label_mapping:dict=None, dropna:bool=False):
    """
    Clean the text column of a DataFrame row by row (sequentially, with a tqdm
    progress bar) and return the result as a new DataFrame.

    Parameters
    ----------
    df            : input DataFrame.
    text_cols     : name of the column holding the text to clean.
    label_cols    : name of the column holding the label.
    word_min      : rows whose cleaned text has fewer words than this are skipped.
    label_mapping : optional dict used to translate labels; rows whose label is not
                    a key of the mapping are skipped (a message is printed).
    dropna        : if True, rows of the result containing any NaN are dropped.

    Returns
    -------
    DataFrame with the columns 'raw' (original text), 'processed' (cleaned text)
    and 'label'. 'raw' and 'processed' are cast to str.
    """
    print("Processing...")
    final_list_clean = []
    final_list_dirty = []
    final_label = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
      sentence = row[text_cols]
      label = row[label_cols]

      # Process label
      if label_mapping is not None:
        if label not in label_mapping:
          print(f"Label {label} is not matched any label_mapping you've defined. This label will be ignored")
          continue
        clean_label = label_mapping[label]
      else:
        clean_label = label

      # Process Text
      clean_sentence = self.__cleanText__(sentence, self.slang_word,
                                          self.stopword, self.stemmer)
      if (clean_sentence is None):
        print(f"Sentence '{sentence}' is empty after processing. This sentence will be ignored")
        continue
      if (len(clean_sentence.split()) < word_min):
        continue

      final_list_clean.append(clean_sentence)
      final_list_dirty.append(sentence)
      final_label.append(clean_label)

    # Creating pandas dataframe
    data = {
      'raw': final_list_dirty,
      'processed': final_list_clean,
      'label': final_label
    }
    final_df = pd.DataFrame(data)
    if dropna:
      print("NaN Dropped")
      final_df = final_df.dropna(how='any')
    final_df['processed'] = final_df['processed'].astype(str)
    final_df['raw'] = final_df['raw'].astype(str)

    return final_df

  def CleanOneText(self, text):
    """Clean a single text with this instance's slang dictionary, stopwords and stemmer."""
    return self.__cleanText__(text, self.slang_word, self.stopword, self.stemmer)

  def __cleanText__(self, text:str, slangword:dict, stopword:list, stemmer) -> str:
    '''
    Clean one text: strip URLs, mentions, hashtags and HTML tags, keep only ASCII
    letters, lowercase, drop stopwords, replace slang words and stem the result.

    Returns the stemmed text, or None when text is not a string or nothing is left
    after cleaning.
    '''
    # Missing values (None, NaN, numbers, ...) cannot be regex-processed; treat them as empty.
    if not isinstance(text, str):
      return None

    # URL, mention / hashtag and HTML tag removal
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'(@\w+|#\w+)', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Everything that is not an ASCII letter (punctuation, digits, newlines, ...) becomes a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # Remove placeholder / escape-residue tokens.
    # NOTE(review): the pattern is not anchored on word boundaries, so it also removes these
    # letter sequences at the end / start of longer words (e.g. "smart phone" -> "smaphone").
    # Behaviour kept as-is; confirm whether that is intended.
    text = re.sub(r"(username|user|url|rt|xf|fx|xe|xa)\s|\s(user|url|rt|xf|fx|xe|xa)", "", text)
    # Collapse a character repeated three or more times into a single one.
    text = re.sub(r'(\w)(\1{2,})', r"\1", text)
    # Drop single-letter words.
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    # split() without arguments also normalises whitespace runs. The original used the
    # pattern '(s{2,})' here (missing backslash), which replaced a double "s" inside words
    # with a space instead of collapsing whitespace.
    final_text_split = []
    for word in text.split():
      if word in stopword:
        continue
      if word in slangword:
        word = str(slangword[word])
      final_text_split.append(word)

    stemmed_text = stemmer.stem(" ".join(final_text_split))

    # just to make sure
    if len(stemmed_text) == 0:
      return None

    return stemmed_text

# Preparing Slang Words / Stopwords

In [61]:
import numpy as np
# Slang ("alay") lexicon #1, downloaded from GitHub (this cell needs network access).
kamus_alay1 = pd.read_csv('https://raw.githubusercontent.com/fendiirfan/Kamus-Alay/main/Kamu-Alay.csv')
# Skip entries that have no replacement word. dropna() replaces the former `is np.NaN` test:
# the np.NaN alias was removed in NumPy 2.0, and an identity comparison is not a reliable
# missing-value check.
kamus_alay1_known = kamus_alay1.dropna(subset=["kataBaik"])
# kataAlay -> kataBaik; for a repeated kataAlay the last row wins, as in the original loop.
dict_kamus_alay1 = dict(zip(kamus_alay1_known["kataAlay"], kamus_alay1_known["kataBaik"]))

In [62]:
import numpy as np
# Slang lexicon #2, downloaded from GitHub (this cell needs network access).
# Only the 'slang' and 'formal' columns are kept, and for a repeated slang word
# only its first row is kept.
kamus_alay2 = (
    pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')
    .filter(['slang', 'formal'], axis=1)
    .drop_duplicates(subset=['slang'], keep='first')
)
# Skip entries that have no formal form. dropna() replaces the former `is np.NaN` test:
# the np.NaN alias was removed in NumPy 2.0, and an identity comparison is not a reliable
# missing-value check.
kamus_alay2_known = kamus_alay2.dropna(subset=["formal"])
dict_kamus_alay2 = dict(zip(kamus_alay2_known["slang"], kamus_alay2_known["formal"]))

In [63]:
# Stopword list downloaded from GitHub: the file is read without a header row
# and column 0 is taken as the list of words.
stopword = list(
    pd.read_csv(
        'https://raw.githubusercontent.com/datascienceid/stopwords-bahasa-indonesia/master/stopwords_id_satya.txt',
        header=None,
    )[0]
)

# Start To Clean some Data

Creating Data Cleaner Instance

In [64]:
# Both slang dictionaries are merged; on a shared key the entry from dict_kamus_alay2 wins.
cleaner = DataCleaning(stopword, {**dict_kamus_alay1, **dict_kamus_alay2})

Importing RAW Data

In [65]:
# Load the raw SMS dataset (the path is relative to the notebook's working directory)
# and display it.
df = pd.read_csv("../data/raw/dataset_sms_spam_v2.csv")
df

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,promo
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,promo
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",promo
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",promo
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,promo
...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",normal
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,normal
1140,Mba mau kirim 300 ya,normal
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,normal


Checking Labels and Start Data Cleaning

In [66]:
# Inspect the distinct values of the label column before defining the label mapping.
df["label"].unique()

array(['promo', 'penipuan', 'normal'], dtype=object)

In [67]:
# Collapse the three source labels into a binary target: normal -> 0, promo / penipuan -> 1.
label_mapping = dict(normal=0, promo=1, penipuan=1)

# Clean the "Teks" column; word_min=0 means no cleaned text is dropped for being too short.
cleaned_df = cleaner.CleanDataFrame(
    df,
    text_cols="Teks",
    label_cols="label",
    word_min=0,
    label_mapping=label_mapping,
)

Processing...


100%|██████████| 1143/1143 [02:44<00:00,  6.96it/s]


In [68]:
# Display the cleaned DataFrame (columns: raw, processed, label).
cleaned_df

Unnamed: 0,raw,processed,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,promo beli paket flash mulai gb my telkomsel a...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,gb hari rp ribu spesial pilih aktif promo sd n...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",pulang yang hormat sisa kuota flash kb downloa...,1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",pulang yang hormat sisa kuota flash kb downloa...,1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,gb hari rp ribu spesial pilih aktif buru skb,1
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",ya oke umumin grup kelas,0
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,enggak ada tulis kerudung ira warna jins,0
1140,Mba mau kirim 300 ya,mbak kirim iya,0
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,nama beaok bwrangkat pagi cas atay tranfer,0


Apparently there are duplicates in the `processed` column, so we need to drop them

In [71]:
# Rows with an identical processed text are duplicates; only the first occurrence is kept.
clean_dups_df = cleaned_df.drop_duplicates(subset="processed", keep="first")

In [72]:
# Display the de-duplicated DataFrame.
clean_dups_df

Unnamed: 0,raw,processed,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,promo beli paket flash mulai gb my telkomsel a...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,gb hari rp ribu spesial pilih aktif promo sd n...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",pulang yang hormat sisa kuota flash kb downloa...,1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,gb hari rp ribu spesial pilih aktif buru skb,1
5,5 HARI LAGI ! EKSTRA Pulsa 50rb dg beli paket ...,hari ekstra pulsa ribu dengan beli paket inter...,1
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",ya oke umumin grup kelas,0
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,enggak ada tulis kerudung ira warna jins,0
1140,Mba mau kirim 300 ya,mbak kirim iya,0
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,nama beaok bwrangkat pagi cas atay tranfer,0


Exporting to CSV

In [74]:
# Write the de-duplicated data to CSV; index=False leaves the DataFrame index out of the file.
clean_dups_df.to_csv("../data/processed/clean.csv", index=False)