In [68]:
import pandas as pd
import re

In [69]:
# Read the raw comment dump; only the 'comment' text column is preprocessed below
df_ori = pd.read_csv('data/df_kotor.csv')
df = df_ori.loc[:, "comment"]
df.head()

0    Izin bertnya apakah studi idenpenden mitra pro...
1                               Dapat uang saku gaksih
2            @rwrt1.0 kelas full english kak batch ini
3                        Saya, saya bang ga lolos msib
4                                                   🔥🔥
Name: comment, dtype: object

## Casefolding

In [70]:
def remove_sentence(text):
    """Blank out a comment that contains at most one whitespace-separated word.

    Parameters
    ----------
    text : str
        Raw comment. Non-string values (for example the float NaN pandas
        produces for an empty CSV cell, or None) are treated as empty comments
        instead of raising AttributeError.

    Returns
    -------
    str
        ``text`` unchanged when it holds two or more words, otherwise ``''``.
    """
    # Missing values have no .split(); there is nothing to keep in them
    if not isinstance(text, str):
        return ''

    # split() without arguments ignores leading/trailing/repeated whitespace
    if len(text.split()) <= 1:
        return ''

    return text

In [71]:
def casefolding(text):
    """Lower-case a comment and strip mentions, links, symbols and bare numbers.

    Parameters
    ----------
    text : str
        Comment text.

    Returns
    -------
    str
        The lower-cased text containing only ``a-z``, digits that are part of a
        word, and whitespace. Removed mentions/links leave nothing behind and
        standalone numbers leave a single space, so the result may contain
        leading or repeated whitespace.
    """
    # convert upper-case letters to lower-case
    text = text.lower()

    # remove mentions and links; 'www' must start a word and be followed by a
    # dot so ordinary words that merely contain 'www' (e.g. "awwww") are kept
    text = re.sub(r"(?:@|https?://|\bwww\.)\S+", "", text)

    # remove every character except letters, digits and whitespace; newlines
    # and tabs are kept so words on separate lines are not glued together
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # remove standalone numbers (digits inside a word are left alone)
    text = re.sub(r"\b\d+\b", " ", text)
    return text

# Blank one-word comments first, then normalise whatever text remains.
# NOTE(review): the word count runs on the raw text, before mentions are
# stripped, so "@user word" passes the filter yet ends up as a single word.
# Confirm this ordering is intended.
df = df.apply(remove_sentence).apply(casefolding)
df.head()

0    izin bertnya apakah studi idenpenden mitra pro...
1                               dapat uang saku gaksih
2                     kelas full english kak batch ini
3                         saya saya bang ga lolos msib
4                                                     
Name: comment, dtype: object

## Tokenization

In [72]:
def tokenization(text):
    """Break a sentence into its whitespace-separated words.

    Returns a list of tokens; an empty or all-whitespace string gives ``[]``.
    """
    return text.split()

# Turn every cleaned comment into its list of tokens
df = df.map(tokenization)
df.head()

0    [izin, bertnya, apakah, studi, idenpenden, mit...
1                          [dapat, uang, saku, gaksih]
2              [kelas, full, english, kak, batch, ini]
3                  [saya, saya, bang, ga, lolos, msib]
4                                                   []
Name: comment, dtype: object

## Filtering

### Replace Slangs

In [73]:
# Build a slang -> formal lookup from the colloquial lexicon; when a slang
# word appears on several rows, the last row's formal form is the one kept
df_slang = pd.read_csv('src/colloquial-indonesian-lexicon.csv')
slang_dict = {
    slang: formal
    for slang, formal in zip(df_slang['slang'], df_slang['formal'])
}

In [74]:
import json

# Second slang lexicon, stored as JSON inside a .txt file (presumably an
# object mapping slang -> formal word, since it is queried with .get() below).
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying
# on the platform's locale default. No empty placeholder is assigned first:
# if the load fails, the name stays undefined rather than silently becoming
# an empty lookup.
with open('src/combined_slang_words.txt', 'r', encoding='utf-8') as f:
    slangword = json.load(f)

In [75]:
def replace_slang(text):
    """Look every token up in ``slang_dict``; unknown tokens pass through as-is.

    ``text`` is an iterable of tokens; a new list of the same length is returned.
    """
    normalized = []
    for token in text:
        normalized.append(slang_dict.get(token, token))
    return normalized

def replace_slang_v2(text):
    """Look every token up in ``slangword``; unknown tokens pass through as-is.

    ``text`` is an iterable of tokens; a new list of the same length is returned.
    """
    normalized = []
    for token in text:
        normalized.append(slangword.get(token, token))
    return normalized

# Normalise slang in two passes: the CSV lexicon first, then the JSON lexicon
# on the result of the first pass
df = df.apply(replace_slang).apply(replace_slang_v2)
df.head()

0    [izin, bertnya, apakah, studi, idenpenden, mit...
1                          [dapat, uang, saku, gaksih]
2              [kelas, full, english, kak, batch, ini]
3               [saya, saya, bang, tidak, lolos, msib]
4                                                   []
Name: comment, dtype: object

### Remove Stopwords

In [76]:
# from nltk.corpus import stopwords

# stop_words = set(stopwords.words('indonesian', 'english'))
# stop_words.update([
#     'min', 'ga', 'gak', 'iya', 'ya', 'sih', 'gk', 'kak', 'bang', 'pak', 'sir', 'bro', 'sob', 'sdr','mbak', 'mba', 'kakak', 'kakaknya', 'kakakku', 'nya', 'mas', 
# ])

# def remove_stopword(text):
#     return [word for word in text if not word in stop_words]

# df = df.apply(remove_stopword)
# df.head()

## Stemming

In [77]:
# from sklearn.pipeline import Pipeline
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# def stemming(text):
#     factory = StemmerFactory()
#     stemmer = factory.create_stemmer()
#     return [stemmer.stem(word) for word in text]

# df = df.apply(stemming)
# df.head()

## Save Preprocessed Data

In [80]:
# Take the original frame without its existing 'clean' and 'topic' columns and
# attach the processed text (tokens re-joined with single spaces) as the new
# 'clean' column. drop() returns a new frame, so df_ori itself is not modified.
df_clean = (
    df_ori
    .drop(columns=['clean', 'topic'])
    .assign(clean=df.apply(' '.join))
)

In [81]:
# Discard the 'Unnamed: 0' column (presumably an index saved into the input
# CSV by an earlier export; verify against the file that produced it)
df_clean = df_clean.drop('Unnamed: 0', axis='columns')
df_clean

Unnamed: 0,comment,clean
0,Izin bertnya apakah studi idenpenden mitra pro...,izin bertnya apakah studi idenpenden mitra pro...
1,Dapat uang saku gaksih,dapat uang saku gaksih
2,@rwrt1.0 kelas full english kak batch ini,kelas full english kak batch ini
3,"Saya, saya bang ga lolos msib",saya saya bang tidak lolos msib
4,🔥🔥,
...,...,...
577,@spontanahuy tanggal 15 terakhirnya 🙏🏻 ada bbr...,tanggal terakhirnya ada beberapa perusahaan ya...
578,@spontanahuy ga ush berharap lgi udh tgl segin...,tidak perlu berharap lagi sudah tanggal segini...
579,@spontanahuy asliii butuh kepastian,asli butuh kepastian
580,Siapa saja yg lolos utk ikut survei diinfokan ...,siapa saja yang lolos untuk ikut survei diinfo...


In [82]:
# Persist the cleaned frame. The row index is written as the first, unnamed
# column (pandas' default, spelled out here), so it comes back as
# 'Unnamed: 0' when the file is read again with a plain pd.read_csv.
df_clean.to_csv('data/df_clean.csv', index=True)