# Text Preprocessing Pipeline for Bahasa
The process includes:

1. Casefolding (lowercase, remove numbers, remove punctuation, remove non-ASCII characters);
2. Strip HTML, remove url, remove email;
3. Normalize slang words;
4. Stemming;
5. Tokenize;
6. Filtering / stopword removal.

2020 &copy; Kuncahyo Setyo Nugroho <br/>
Faculty of Computer Science, Brawijaya University

Presented at PyCon ID Online 2020, 13 November 2020

In [3]:
import re
import string
import unicodedata
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from emo_unicode import UNICODE_EMO, EMOTICONS
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

class TextPreprocessing():
    """Chainable text-cleaning pipeline for Indonesian (Bahasa) text.

    Every cleaning method mutates ``self.text`` (or ``self.words`` once the
    text has been tokenized) and returns ``self`` so calls can be chained::

        TextPreprocessing('Halo, Dunia!').lowercase().remove_punctuation().text

    ``do_all`` runs the complete pipeline and returns the cleaned string.

    Attributes:
        text: The working string that the cleaning methods rewrite in place.
        words: Token list produced by ``tokenize``; ``join_words`` replaces
            it with a single space-joined string.
        norm_path: Path of the slang-normalization CSV, which must have the
            columns ``singkat`` (slang form) and ``hasil`` (normalized form).
    """

    # Default location of the slang-normalization table.
    norm_path = 'key_norm.csv'

    # Compiled once: the ranges never change between calls.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, dingbats, arrows
                                "]+", flags=re.UNICODE)

    # Words removed in addition to NLTK's Indonesian stopword corpus.
    _EXTRA_STOPWORDS = ('daring', 'online', 'pd')

    # Lazily built resources shared by all instances, so that applying the
    # pipeline to many rows does not rebuild them for every row.
    _norm_cache = {}    # CSV path -> {slang word: normalized word}
    _stemmer = None     # Sastrawi stemmer
    _stopwords = None   # frozenset of stopwords

    def __init__(self, text='test', norm_path='key_norm.csv'):
        """Store the text to clean.

        Args:
            text: Initial text to process.
            norm_path: Path of the slang-normalization CSV read by
                ``normalize_word``.
        """
        self.text = text
        self.norm_path = norm_path
        # Defined up front so stopwords_removal()/join_words() do not raise
        # AttributeError when called before tokenize().
        self.words = []

    def lowercase(self):
        """Convert the text to lowercase and strip surrounding whitespace."""
        self.text = self.text.lower().strip()
        return self

    def strip_html(self):
        """Remove HTML tags, keeping only the text content."""
        soup = BeautifulSoup(self.text, 'lxml')
        self.text = soup.get_text()
        return self

    def remove_url(self):
        """Remove URLs (http/https/www) and bare pic.twitter.com links."""
        self.text = re.sub(r'https?://\S+|www\.\S+', '', self.text)
        # Twitter media links often appear without a scheme. The dots are
        # escaped so that only the literal host name is matched.
        self.text = re.sub(r'pic\.twitter\.com\S+', '', self.text)
        return self

    def remove_email(self):
        """Remove every token containing '@', plus one trailing whitespace."""
        self.text = re.sub(r'\S*@\S*\s?', '', self.text)
        return self

    def remove_between_square_brackets(self):
        """Remove square brackets together with the text between them."""
        self.text = re.sub(r'\[[^]]*\]', '', self.text)
        return self

    def remove_numbers(self):
        """Remove digit runs, including a directly preceding '+' or '-'."""
        self.text = re.sub(r'[-+]?[0-9]+', '', self.text)
        return self

    def remove_emoji(self):
        """Remove emoji and pictographic symbols, e.g 😜😀"""
        self.text = self._EMOJI_PATTERN.sub('', self.text)
        return self

    def remove_emoticon(self):
        """Remove emoticons, e.g :-)"""
        # The EMOTICONS keys are used directly as regex alternatives.
        emoticon_pattern = re.compile('(' + '|'.join(EMOTICONS) + ')')
        self.text = emoticon_pattern.sub('', self.text)
        return self

    def convert_emoji(self):
        """Replace each emoji with its description, words joined by '_'."""
        for emoji, description in UNICODE_EMO.items():
            label = '_'.join(description.replace(',', '').replace(':', '').split())
            self.text = self.text.replace(emoji, label)
        return self

    def convert_emoticon(self):
        """Replace each emoticon with its description, words joined by '_'."""
        for emoticon, description in EMOTICONS.items():
            label = '_'.join(description.replace(',', '').split())
            self.text = re.sub('(' + emoticon + ')', label, self.text)
        return self

    def remove_punctuation(self):
        """Remove every character that is neither a word character nor whitespace."""
        self.text = re.sub(r'[^\w\s]', '', self.text)
        return self

    def remove_non_ascii(self):
        """Drop non-ASCII characters after NFKD decomposition.

        Decomposing first lets accented letters keep their ASCII base
        letter (e.g. 'é' becomes 'e') instead of disappearing entirely.
        """
        decomposed = unicodedata.normalize('NFKD', self.text)
        self.text = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
        return self

    def _load_norm_map(self):
        """Return the slang -> normalized-word mapping for ``self.norm_path``.

        The CSV is read once per path and then served from a class-level
        cache; later changes to the file are not picked up.
        """
        path = self.norm_path
        cache = self._norm_cache
        if path not in cache:
            table = pd.read_csv(path)
            mapping = {}
            for slang, normal in zip(table['singkat'], table['hasil']):
                # setdefault keeps the first row when a slang word repeats.
                mapping.setdefault(slang, normal)
            cache[path] = mapping
        return cache[path]

    def normalize_word(self):
        """Replace slang words with their normalized form.

        Words are looked up in the ``singkat`` column of the normalization
        CSV and replaced by the matching ``hasil`` value; words without an
        entry are kept unchanged.
        """
        mapping = self._load_norm_map()
        self.text = ' '.join(mapping.get(word, word) for word in self.text.split())
        return self

    def stemming(self):
        """Stem the text with the Sastrawi stemmer for Bahasa."""
        cls = type(self)
        if cls._stemmer is None:
            # Build the stemmer once and reuse it for every later call.
            cls._stemmer = StemmerFactory().create_stemmer()
        self.text = cls._stemmer.stem(self.text)
        return self

    def tokenize(self):
        """Split the text into word tokens, stored in ``self.words``."""
        self.words = nltk.word_tokenize(self.text)
        return self

    def stopwords_removal(self):
        """Drop Indonesian stopwords (NLTK corpus plus extras) from ``self.words``."""
        cls = type(self)
        if cls._stopwords is None:
            # A frozenset gives O(1) membership tests; it is built only once.
            cls._stopwords = frozenset(stopwords.words('indonesian')) | frozenset(cls._EXTRA_STOPWORDS)
        stop_set = cls._stopwords
        self.words = [word for word in self.words if word not in stop_set]
        return self

    def join_words(self):
        """Join all words with single spaces.

        Note that ``self.words`` holds a string, not a list, afterwards.
        """
        self.words = ' '.join(self.words)
        return self

    def do_all(self, text):
        """Run the complete preprocessing pipeline on ``text``.

        Args:
            text: Raw input text; it replaces the instance's current text.

        Returns:
            The cleaned text as one space-separated string.
        """
        self.text = text
        # NOTE(review): the remove_* steps for emoticons/emoji run before the
        # convert_* steps, so the conversions can only affect symbols the
        # removal patterns did not match. The order is kept as-is to preserve
        # existing results; reorder or drop steps for a custom process.
        (self.lowercase()
             .strip_html()
             .remove_url()
             .remove_email()
             .remove_between_square_brackets()
             .remove_numbers()
             .remove_emoticon()
             .remove_emoji()
             .convert_emoticon()
             .convert_emoji()
             .remove_punctuation()
             .remove_non_ascii()
             .normalize_word()
             .stemming()
             .tokenize()
             .stopwords_removal()
             .join_words())
        return self.words

In [9]:
# One raw sample tweet: slang, an emoji and a Twitter media link.
sample_text = 'Drama kmarin sore : seharian puasa trus pas lg mandi kaget bcz kedatangan tamu tak diundang pas bgt woi pas lagi adzan maghrib trus jd galau dong seharian itu dianggap sah puasa apa enggak 😀 https://pic.twitter.com/0sl7DKUKFl'

# Run the complete pipeline on the sample and show the cleaned string.
tp = TextPreprocessing(sample_text)
cleaned_sample = tp.do_all(sample_text)
print(cleaned_sample)

drama kemarin sore puasa pas mandi kaget bcz tamu undang pas banget woi pas adzan maghrib galau anggap sah puasa


Example data: https://github.com/meisaputri21/Indonesian-Twitter-Emotion-Dataset

In [6]:
# Clean every tweet of the example dataset with a single pipeline instance
# and keep the result next to the raw text in a new 'clean_tweet' column.
tp = TextPreprocessing()
data = pd.read_csv('twitter_emotion_dataset.csv')
data = data.assign(clean_tweet=data['tweet'].apply(tp.do_all))

# Preview the first ten rows.
data.head(10)

Unnamed: 0,label,tweet,clean_tweet
0,sadness,Separuh hati ini iri jika melihat seorang anak...,paruh hati iri lihat orang anak duduk jalan ib...
1,happy,Ketika aku tersenyum bukan berarti hidupku sem...,senyum arti hidup sempurna syukur tuhan ikan ku
2,sadness,"dari mau tdr, tidur, sampe bgn tdr kok yaa kay...",tdr tidur bangun tdr yaa aneh enak hati bawa i...
3,love,kan kupeluk engkau erat2 hingga tak ada seoran...,peluk engkau erat orang rebut mu peluk ku sena...
4,anger,Kalo mau ganti presiden itu harus jelas siapa ...,ganti presiden calon adu program kerja koalisi...
5,happy,"Sukses n keren banget dgn no.1 kualitas,bhn kr...",sukses n keren banget nomor kualitasbhn krjait...
6,anger,"Udah mau sarjana 2 kali, mbokya mulut nya jang...",sarjana kali mbokya mulut nya tinggal ajar sek...
7,anger,Gimana orang ga nilai dr jilbab/syari/nggak ny...,orang nilai jilbabsyaringgak nya kadang gemez ...
8,fear,Hari ini jadwal presentasi proker di LPPM karn...,jadwal presentasi proker lppm praktikum ketua ...
9,anger,Foto saya di instagram masih ada cuma lupa pas...,foto instagram lupa password instanya buka sal...
