In [75]:
import pandas as pd
import re

In [93]:
# Load the raw comment table; the first CSV column becomes the index.
df_ori = pd.read_csv('data/df_kotor.csv', index_col=0)
# Work on the "comment" column only; df_ori itself is left untouched here.
# NOTE(review): the later cleaning steps call str methods on every value, so a
# missing (NaN) comment would raise — confirm the column has no nulls.
df = df_ori["comment"]
df.head()

0    Izin bertnya apakah studi idenpenden mitra pro...
1                               Dapat uang saku gaksih
2            @rwrt1.0 kelas full english kak batch ini
3                        Saya, saya bang ga lolos msib
4                                                   🔥🔥
Name: comment, dtype: object

## Casefolding

In [94]:
def remove_sentence(text):
    """Blank out a comment that has at most one whitespace-separated word.

    Args:
        text: The comment string to inspect.

    Returns:
        The unchanged ``text`` when it contains two or more words, otherwise
        an empty string.
    """
    has_several_words = len(text.split()) > 1
    return text if has_several_words else ''

In [95]:
def casefolding(text):
    """Lower-case a comment and strip mentions, links, symbols and numbers.

    Steps, in order:
      1. lower-case the text;
      2. delete ``@mentions`` and links (``http://``, ``https://``, ``www.``)
         up to the next whitespace;
      3. turn newlines/tabs into a space so words on adjacent lines stay
         separate tokens;
      4. delete every remaining character that is not ``a-z``, ``0-9`` or a
         space;
      5. replace stand-alone numbers with a space.

    Args:
        text: The raw comment string.

    Returns:
        The normalised string. Runs of spaces are not collapsed.
    """
    # Convert upper-case letters to lower-case.
    text = text.lower()

    # Remove mentions and links. ``www`` must start a word and be followed by
    # a dot, otherwise elongated words such as "awwww" would be truncated.
    text = re.sub(r"(?:@|https?://|\bwww\.)\S+", "", text)

    # Whitespace other than a plain space (newline, tab, ...) becomes a space;
    # deleting it in the next step would glue neighbouring words together.
    text = re.sub(r"[^\S ]+", " ", text)

    # Remove every character other than letters, digits and spaces.
    text = re.sub(r'[^a-z0-9 ]', '', text)

    # Remove stand-alone numbers (digits embedded in a word are kept).
    text = re.sub(r"\b\d+\b", " ", text)
    return text

# Blank out one-word comments first; the word count is taken on the raw text,
# i.e. before mentions, links and punctuation are stripped by casefolding.
df = df.apply(remove_sentence)
# Then normalise the remaining text.
df = df.apply(casefolding)
df.head()

0    izin bertnya apakah studi idenpenden mitra pro...
1                               dapat uang saku gaksih
2                     kelas full english kak batch ini
3                         saya saya bang ga lolos msib
4                                                     
Name: comment, dtype: object

## Tokenization

In [96]:
def tokenization(text):
    """Break a sentence into its whitespace-separated words.

    Args:
        text: The sentence to split.

    Returns:
        A list of tokens; an empty list for an empty or all-blank string.
    """
    return text.split()

# From here on each row holds a list of tokens instead of a string.
df = df.apply(tokenization)
df.head()

0    [izin, bertnya, apakah, studi, idenpenden, mit...
1                          [dapat, uang, saku, gaksih]
2              [kelas, full, english, kak, batch, ini]
3                  [saya, saya, bang, ga, lolos, msib]
4                                                   []
Name: comment, dtype: object

## Filtering

### Replace Slang Words

In [97]:
# Build a slang -> formal lookup from the lexicon's 'slang' and 'formal' columns.
# If a slang term occurs on several rows, the last row wins (dict semantics).
df_slang = pd.read_csv('src/colloquial-indonesian-lexicon.csv')
slang_dict = dict(zip(df_slang['slang'], df_slang['formal']))

In [98]:
import json

# Load the second slang lexicon; the file is parsed as JSON and is used below
# as a token -> replacement mapping.
# The encoding is given explicitly because JSON text is UTF-8, while open()
# otherwise falls back to a platform-dependent default.
# `slangword` is deliberately not pre-initialised to {}: if loading fails, later
# cells should raise a NameError instead of silently replacing nothing.
with open('src/combined_slang_words.txt', 'r', encoding='utf-8') as f:
    slangword = json.load(f)

In [99]:
def replace_slang(text):
    """Swap each token for its entry in ``slang_dict`` when one exists.

    Args:
        text: An iterable of tokens.

    Returns:
        A new list of the same length; tokens without an entry in
        ``slang_dict`` are kept as they are.
    """
    replaced = []
    for token in text:
        replaced.append(slang_dict.get(token, token))
    return replaced

def replace_slang_v2(text):
    """Swap each token for its entry in ``slangword`` when one exists.

    Args:
        text: An iterable of tokens.

    Returns:
        A new list of the same length; tokens without an entry in
        ``slangword`` are kept as they are.
    """
    replaced = []
    for token in text:
        replaced.append(slangword.get(token, token))
    return replaced

# Apply the two lookups in sequence: a token rewritten by the first lexicon
# can be rewritten again by the second one.
df = df.apply(replace_slang)
df = df.apply(replace_slang_v2)
df.head()

0    [izin, bertnya, apakah, studi, idenpenden, mit...
1                          [dapat, uang, saku, gaksih]
2              [kelas, full, english, kak, batch, ini]
3               [saya, saya, bang, tidak, lolos, msib]
4                                                   []
Name: comment, dtype: object

### Remove Stopwords

In [100]:
from nltk.corpus import stopwords

# Combine the Indonesian and English stopword lists.
# Each language is loaded with its own call: the second positional argument of
# stopwords.words() is `ignore_lines_startswith`, not another language, so
# words('indonesian', 'english') would load Indonesian stopwords only.
stop_words = set(stopwords.words('indonesian')) | set(stopwords.words('english'))

# Extra tokens filtered out in addition to the NLTK lists.
stop_words.update([
    'min', 'ga', 'gak', 'iya', 'ya', 'sih', 'gk', 'kak', 'bang', 'pak', 'sir',
    'bro', 'sob', 'sdr', 'mbak', 'mba', 'kakak', 'kakaknya', 'kakakku', 'nya',
    'mas',
])

def remove_stopword(text):
    """Drop every token that appears in ``stop_words``.

    Args:
        text: An iterable of tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Filter the stopwords out of every token list.
df = df.apply(remove_stopword)
df.head()

0    [izin, bertnya, studi, idenpenden, mitra, prog...
1                                 [uang, saku, gaksih]
2                        [kelas, full, english, batch]
3                                        [lolos, msib]
4                                                   []
Name: comment, dtype: object

## Stemming

In [101]:
from sklearn.pipeline import Pipeline
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Stemmer shared by all stemming() calls; created on first use.
_stemmer = None


def stemming(text):
    """Stem every token in a token list with the Sastrawi stemmer.

    The stemmer is built once and reused. Creating a new factory and stemmer
    inside the function would repeat that setup for every row when the
    function is applied to a whole Series.

    Args:
        text: An iterable of tokens.

    Returns:
        A list with the stemmed form of each token, in the original order.
    """
    global _stemmer
    if _stemmer is None:
        _stemmer = StemmerFactory().create_stemmer()
    return [_stemmer.stem(word) for word in text]

# Stem every token of every row.
df = df.apply(stemming)
df.head()

0    [izin, bertnya, studi, idenpenden, mitra, prog...
1                                 [uang, saku, gaksih]
2                        [kelas, full, english, batch]
3                                        [lolos, msib]
4                                                   []
Name: comment, dtype: object

## Save Preprocessed Data

In [102]:
# New frame based on the original data without its 'clean' and 'topic' columns
# (drop returns a new DataFrame, so df_ori is not modified).
df_clean = df_ori.drop(columns=['clean', 'topic'])
# Rebuild 'clean' by joining each token list into one space-separated string.
df_clean["clean"] = df.apply(' '.join)

# Write the result to CSV.
df_clean.to_csv('data/df_clean.csv')

In [104]:
df_clean.head()

Unnamed: 0,comment,clean
0,Izin bertnya apakah studi idenpenden mitra pro...,izin bertnya studi idenpenden mitra programing...
1,Dapat uang saku gaksih,uang saku gaksih
2,@rwrt1.0 kelas full english kak batch ini,kelas full english batch
3,"Saya, saya bang ga lolos msib",lolos msib
4,🔥🔥,
