In [1]:
!pip install sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 5.0 MB/s 
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/gdrive; the dataset paths used by the cells
# below all live under this mount point. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [17]:
# imported from iFest 2021 Data Cleaning Module by Yaudahlah Teams,
# Refactored by Kaenova Mahendra Auditama (Yaudahlah Teams)

import pandas as pd
from tqdm import tqdm
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

class DataCleaning:
  """
  Text cleaner for social-media text.

  The cleaning pipeline (see ``__cleanText__``) removes URLs, mentions, hashtags
  and HTML tags, keeps letters only, lowercases, drops noise tokens and
  single-letter words, collapses elongated characters, removes stopwords,
  replaces slang words and finally stems the text with the Sastrawi stemmer.
  """

  def __init__(self, stopword:list = [], slang_word:dict = {}) -> None:
    """
    Args:
      stopword: words that are removed from the text.
      slang_word: mapping of slang word -> replacement word.
    """
    factory     = StemmerFactory()
    self.stemmer     = factory.create_stemmer()
    # Copy the inputs so instance state never aliases the shared default
    # arguments (or a caller-owned list/dict).
    self.stopword = list(stopword)
    self.slang_word = dict(slang_word)

  def AddKamusAlay(self, new_dict:dict = {}):
    """
    Merge ``new_dict`` into the slang dictionary; new entries win on duplicate keys.

    Raises:
      TypeError: if ``new_dict`` is not a dict.
    """
    if not isinstance(new_dict, dict): raise TypeError("Not a valid type")
    # Unpacking instead of the ``|`` operator keeps this working before Python 3.9.
    self.slang_word = {**self.slang_word, **new_dict}

  def AddStopWord(self, stopword:list = []):
    """
    Append ``stopword`` to the stopword list used by the cleaner.

    Raises:
      TypeError: if ``stopword`` is not a list.
    """
    if not isinstance(stopword, list): raise TypeError("Not a valid type")
    # The list read by the cleaning methods is ``self.stopword``.
    self.stopword = self.stopword + stopword

  def CleanDataFrame(self, df:pd.DataFrame, text_cols:str, label_cols:str,
                     word_min:int=0, label_mapping:dict=None, dropna:bool=False, verbose=False):
    """
    Clean every row of ``df`` sequentially and return a new DataFrame.

    Args:
      df: input data.
      text_cols: name of the column holding the raw text.
      label_cols: name of the column holding the label.
      word_min: rows whose cleaned text has fewer words than this are skipped.
      label_mapping: optional mapping raw label -> output label; rows whose
        label is not a key of the mapping are skipped with a message.
      dropna: when True, rows containing NaN are dropped from the result.
      verbose: forwarded to the text cleaner to print intermediate stages.

    Returns:
      DataFrame with the columns ``raw``, ``processed`` and ``label``.
    """
    print("Processing...")
    final_list_clean = []
    final_list_dirty = []
    final_label = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
      sentence = row[text_cols]
      label = row[label_cols]

      # Process label
      if label_mapping is not None:
        if label not in label_mapping:
          print(f"Label {label} is not matched any label_mapping you've defined. This label will be ignored")
          continue
        clean_label = label_mapping[label]
      else:
        clean_label = label

      # Process Text
      clean_sentence = self.__cleanText__(sentence, self.slang_word,
                                          self.stopword, self.stemmer, verbose)
      if (clean_sentence is None):
        print(f"Sentence '{sentence}' is empty after processing. This sentence will be ignored")
        continue
      if (len(clean_sentence.split()) < word_min):
        continue

      final_list_clean.append(clean_sentence)
      final_list_dirty.append(sentence)
      final_label.append(clean_label)

    # Creating pandas dataframe
    data = {
      'raw': final_list_dirty,
      'processed': final_list_clean,
      'label': final_label
    }
    final_df = pd.DataFrame(data)
    if dropna:
      print("NaN Dropped")
      final_df = final_df.dropna(how='any')
    final_df['processed'] = final_df['processed'].astype(str)
    final_df['raw'] = final_df['raw'].astype(str)

    return final_df

  def CleanOneText(self, text, verbose=False):
    """Clean a single text with this instance's slang dictionary, stopwords and stemmer."""
    return self.__cleanText__(text, self.slang_word, self.stopword, self.stemmer, verbose)

  def __cleanText__(self, text:str, slangword:dict, stopword:list, stemmer, verbose=False) -> str:
    '''
    Clean one text: strip web artefacts, normalise characters, remove stopwords,
    replace slang words and stem the result.

    Args:
      text: raw text; anything that is not a ``str`` (e.g. NaN) is treated as empty.
      slangword: mapping of slang word -> replacement word.
      stopword: words to drop.
      stemmer: object exposing ``stem(text) -> str``.
      verbose: print the text after each stage.

    Returns:
      The stemmed text, or ``None`` when nothing is left after cleaning.
    '''
    # Missing values (None / NaN) cannot be cleaned; report them as empty.
    if not isinstance(text, str):
      return None

    # HTML and text annotation removal
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'(@\w+|#\w+)', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Keep letters only: punctuation, digits and newlines all become spaces.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # Drop noise tokens only when they are whole words; matching without word
    # boundaries would eat word fragments (e.g. "smart phone" -> "smaphone").
    text = re.sub(r'\b(username|user|url|rt|xf|fx|xe|xa)\b', '', text)
    # Collapse characters repeated three or more times into a single one.
    text = re.sub(r'(\w)(\1{2,})', r'\1', text)
    # Remove single-letter words.
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    # Collapse runs of whitespace (a literal "s{2,}" would break words such as "massal").
    text = re.sub(r'\s{2,}', ' ', text)
    if verbose:
      print(f"After Special Character : {text}")
    # The text is already lowercase here; the message is kept so verbose output
    # still lists every stage.
    if verbose:
      print(f"After Lower : {text}")

    # A set makes the per-token stopword lookup O(1).
    stopword_set = set(stopword)
    final_text_split = []
    for token in text.split():
      if token in stopword_set:
        continue
      # Slang replacement happens after the stopword check, so a replacement
      # word is never filtered as a stopword.
      final_text_split.append(str(slangword.get(token, token)))
    text = " ".join(final_text_split)
    if verbose:
      print(f"After Stopword and Slangword : {text}")
    stemmed_text = stemmer.stem(text)
    if verbose:
      print(f"After Stemming : {stemmed_text}")

    # just to make sure
    if len(stemmed_text) == 0:
      return None

    return stemmed_text

# Preparing Slang Word

In [4]:
kamus_alay1 = pd.read_csv('https://raw.githubusercontent.com/fendiirfan/Kamus-Alay/main/Kamu-Alay.csv')
# Build slang -> replacement, skipping rows with a missing replacement.
# dropna() is used instead of `is np.NaN`: the identity check only works when
# the value happens to be the np.nan singleton, and the np.NaN alias was
# removed in NumPy 2.0. On duplicate slang keys the last row wins.
dict_kamus_alay1 = (
    kamus_alay1
    .dropna(subset=["kataBaik"])
    .set_index("kataAlay")["kataBaik"]
    .to_dict()
)

In [5]:
# Load the colloquial lexicon, keep only the slang/formal pair and the first
# entry for every slang word.
kamus_alay2 = (
    pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')
    .filter(['slang', 'formal'], axis=1)
    .drop_duplicates(subset=['slang'], keep='first')
)
# Build slang -> formal, skipping rows with a missing formal word.
# dropna() is used instead of `is np.NaN`: the identity check only works when
# the value happens to be the np.nan singleton, and the np.NaN alias was
# removed in NumPy 2.0.
dict_kamus_alay2 = (
    kamus_alay2
    .dropna(subset=["formal"])
    .set_index("slang")["formal"]
    .to_dict()
)

# Instantiate Cleaner

In [18]:
# No stopwords; both slang dictionaries merged (entries from the second
# dictionary override the first on duplicate keys).
cleaner = DataCleaning(
    stopword=[],
    slang_word={**dict_kamus_alay1, **dict_kamus_alay2},
)

# Script for cleaning text Twitter

In [None]:
# Source directory holding the crawled tweet CSV files.
RAW_DIR = (
    "/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah"
    "/Riset/Crawling/RAW"
)
# Destination directory for the cleaned CSV files.
PROCESSED_DIR = (
    "/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah"
    "/Riset/Crawling/Processed"
)

In [None]:
def clean_files_dir(cleaner: DataCleaning, src_dir:str, dst_dir:str,
                    text_column: str):
    """
    Clean the text column of every CSV file in ``src_dir`` and write the result,
    under the same file name, to ``dst_dir``.

    Existing files in ``dst_dir`` are deleted first. Each output file has the
    columns ``raw`` (original text) and ``clean`` (cleaned text); duplicate rows
    and rows with a missing value are dropped.

    Args:
        cleaner: object whose ``CleanOneText(str)`` returns the cleaned text or None.
        src_dir: directory containing the input CSV files.
        dst_dir: output directory; created when it does not exist.
        text_column: name of the column holding the text to clean.

    Raises:
        ValueError: if ``src_dir`` and ``dst_dir`` are the same directory, because
            wiping the destination would delete the inputs.
    """
    if os.path.abspath(src_dir) == os.path.abspath(dst_dir):
        raise ValueError("src_dir and dst_dir must be different directories")

    # Removing files in destination if exists (sub-directories are left alone)
    os.makedirs(dst_dir, exist_ok=True)
    for name in os.listdir(dst_dir):
        stale_path = os.path.join(dst_dir, name)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # Cleaning files
    for name in os.listdir(src_dir):
        src_path = os.path.join(src_dir, name)
        if not os.path.isfile(src_path):
            continue
        print(f"Processing file {name}")
        df = pd.read_csv(src_path)
        unclean_texts = df[text_column].tolist()
        # str() turns missing values into text so the cleaner always gets a string.
        clean_texts = [cleaner.CleanOneText(str(text)) for text in tqdm(unclean_texts)]
        new_df = pd.DataFrame(data={"raw" : unclean_texts, "clean" : clean_texts})
        new_df = new_df.drop_duplicates().dropna()
        new_df.to_csv(os.path.join(dst_dir, name), index=False)

# Clean the "tweet" column of every crawled file.
clean_files_dir(
    cleaner=cleaner,
    src_dir=RAW_DIR,
    dst_dir=PROCESSED_DIR,
    text_column="tweet",
)

Processing file sinopharm.csv


100%|██████████| 2149/2149 [00:42<00:00, 50.81it/s]


Processing file moderna.csv


100%|██████████| 4693/4693 [07:28<00:00, 10.46it/s]


Processing file efektif.csv


100%|██████████| 10300/10300 [08:05<00:00, 21.20it/s]


Processing file pfizer.csv


100%|██████████| 9972/9972 [10:18<00:00, 16.12it/s]


Processing file bayar.csv


100%|██████████| 1694/1694 [04:11<00:00,  6.73it/s]


Processing file gratis.csv


100%|██████████| 11277/11277 [11:31<00:00, 16.31it/s]


Processing file sinovac.csv


100%|██████████| 21663/21663 [16:29<00:00, 21.90it/s]


Processing file astrazeneca.csv


100%|██████████| 9103/9103 [03:30<00:00, 43.17it/s]


# Script for cleaning text General Sentiment Analysis

In [None]:
# Shift the raw sentiment labels to contiguous class ids:
#   -1 (negative) -> 0,  0 (neutral) -> 1,  1 (positive) -> 2
label_mapping = {raw_label: raw_label + 1 for raw_label in (-1, 0, 1)}


In [None]:
# IndoNLU: clean the "tweet" column and keep it next to its label.
df = pd.read_csv("/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/1. RAW/General Sentiment Analysis Labeled/IndoNLU_SMSA_DOC-SENTIMENT_PROSA.csv")
# Every value is stringified first, so the cleaner always receives text.
unclean_texts = [str(tweet) for tweet in df["tweet"]]
clean_texts = [cleaner.CleanOneText(tweet) for tweet in tqdm(unclean_texts)]
labels = list(df["labels"])
new_df = (
  pd.DataFrame(data={"raw" : unclean_texts, "clean" : clean_texts, "labels": labels})
  .drop_duplicates()
  .dropna()  # drops rows whose text became empty (None) after cleaning
)
new_df.to_csv("/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/General/IndoNLU.csv", index=False)

In [None]:
# Ridlife: clean the "Tweet" column and keep it next to its "sentimen" label.
df = pd.read_csv("/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/1. RAW/General Sentiment Analysis Labeled/ridlife-dataset-idsa-Indonesian Sentiment Twitter Dataset Labeled.csv")
# Every value is stringified first, so the cleaner always receives text.
unclean_texts = [str(tweet) for tweet in df["Tweet"]]
clean_texts = [cleaner.CleanOneText(tweet) for tweet in tqdm(unclean_texts)]
labels = list(df["sentimen"])
new_df = (
  pd.DataFrame(data={"raw" : unclean_texts, "clean" : clean_texts, "labels": labels})
  .drop_duplicates()
  .dropna()  # drops rows whose text became empty (None) after cleaning
)
new_df.to_csv("/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/General/Ridlife.csv", index=False)

100%|██████████| 10806/10806 [23:03<00:00,  7.81it/s]


Improving the Cleaned Data   
This step is needed because the "clean" column still contains duplicates. This tells us that some tweets share the same content, such as news headlines and spam. Hence, we remove those duplicates and keep only the first occurrence. An example can be seen in "clean data moderna v1".

In [None]:
import mimetypes
import os

In [None]:
# We are going to clean all the duplicates on "clean" column on every dataset

# Column whose duplicated values decide which rows are dropped.
based_column_clean = "clean"
# Directory holding the cleaned CSV files to de-duplicate.
input_path = (
    "/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah"
    "/Riset/Data/Processed/v2"
)

def clean_duplicates(input_dir:str, column:str=None) -> None:
  """
  Drop duplicated rows from every CSV file in ``input_dir``.

  Results are written, under the same file name, to ``<input_dir>/improved/``
  (created when missing). Entries whose guessed MIME type is not ``text/csv``
  are skipped.

  Args:
    input_dir: directory containing the CSV files.
    column: column used to detect duplicates; ``None`` compares whole rows.
      The first occurrence of each duplicate is kept.
  """
  improved_dir = f"{input_dir}/improved/"
  if not os.path.exists(improved_dir):
    os.mkdir(improved_dir)

  csv_names = [
    name for name in os.listdir(input_dir)
    if mimetypes.guess_type(f"{input_dir}/{name}")[0] == "text/csv"
  ]
  for name in csv_names:
    target_file = f"{improved_dir}{name}"
    deduplicated = pd.read_csv(f"{input_dir}/{name}").drop_duplicates(subset=column)
    deduplicated.to_csv(target_file, index=False)
    print(f"{name} has been cleaned and outputted to {target_file}")

# De-duplicate every dataset on its cleaned text.
clean_duplicates(input_dir=input_path, column=based_column_clean)

sinopharm.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/Processed/v2/improved/sinopharm.csv
moderna.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/Processed/v2/improved/moderna.csv
efektif.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/Processed/v2/improved/efektif.csv
pfizer.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/Processed/v2/improved/pfizer.csv
bayar.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/Processed/v2/improved/bayar.csv
gratis.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/Processed/v2/improved/gratis.csv
sinovac.csv has been cleaned and outputted to /con

Let's remove the tweets that contain links  
because links tend to point to news articles.

In [None]:
# We are going to remove every row whose "raw" text contains a t.co link, on every dataset
# reference: https://stackoverflow.com/questions/39948757/how-to-delete-rows-in-python-pandas-dataframe-using-regular-expressions
import os
import mimetypes

# De-duplicated Twitter datasets (input) and the link-free version (output).
input_path = (
    "/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah"
    "/Riset/Data/2. Clean/Twitter/v2/improved"
)
output_path = (
    "/content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah"
    "/Riset/Data/2. Clean/Twitter/v3"
)

def remove_row_with_links(input_dir:str, output_path) -> None:
  """
  Remove the rows whose ``raw`` text contains a ``https://t.co`` link from every
  CSV file in ``input_dir`` and write the result, under the same file name, to
  ``output_path``.

  Entries whose guessed MIME type is not ``text/csv`` are skipped. Rows with a
  missing ``raw`` value contain no link and are therefore kept.

  Args:
    input_dir: directory containing the CSV files; each must have a ``raw`` column.
    output_path: output directory; created (including parents) when missing.
  """
  os.makedirs(output_path, exist_ok=True)
  link_marker = "https://t.co" # Because so many news tweets carry these links
  for name in os.listdir(input_dir):
    cur_file = os.path.join(input_dir, name)
    if mimetypes.guess_type(cur_file)[0] != "text/csv":
      continue
    df = pd.read_csv(cur_file)
    # astype(str) keeps missing values from producing NaN in the mask (which
    # cannot be negated); regex=False matches the marker literally, so "."
    # does not stand for an arbitrary character.
    has_link = df['raw'].astype(str).str.contains(link_marker, regex=False)
    df = df[~has_link].reset_index(drop=True)
    output_file = os.path.join(output_path, name)
    df.to_csv(output_file, index=False)
    print(f"{name} has been cleaned and outputted to {output_file}")

# Write the link-free version of every dataset.
remove_row_with_links(input_dir=input_path, output_path=output_path)

sinopharm.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/Twitter/v3/sinopharm.csv
bayar.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/Twitter/v3/bayar.csv
pfizer.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/Twitter/v3/pfizer.csv
moderna.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/Twitter/v3/moderna.csv
efektif.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/Twitter/v3/efektif.csv
gratis.csv has been cleaned and outputted to /content/gdrive/Shareddrives/Riset Sentimen Vaksin COVID: Yaudahlah/Riset/Data/2. Clean/Twitter/v3/gratis.csv
astrazeneca.csv has been cleaned and outputted to /content/gdr

# Tahapan Preprocessing

In [19]:
# Sample sentence used to show the output of each preprocessing stage.
text = (
    "Vaksin ma perawatan covid itu gratis. "
    "Pemerintah untung dari mana coba?"
)

In [20]:
# verbose=True prints the text after every preprocessing stage.
cleaner.CleanOneText(text, verbose=True)

After Special Character : vaksin ma perawatan covid itu gratis  pemerintah untung dari mana coba 
After Lower : vaksin ma perawatan covid itu gratis  pemerintah untung dari mana coba 
After Stopword and Slangword : vaksin sama perawatan covid itu gratis pemerintah untung dari mana coba
After Stemming : vaksin sama awat covid itu gratis perintah untung dari mana coba


'vaksin sama awat covid itu gratis perintah untung dari mana coba'