# 1. Import Modul

In [None]:
!pip install sastrawi

In [20]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Fetch the NLTK resources used later in the notebook: the Punkt tokenizer
# tables (for word_tokenize) and the stopword corpus.
for nltk_resource in ('punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/adnandi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/adnandi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Load Data

In [None]:
# Load the labelled report dataset (path is relative to the working directory)
# and preview its first rows.
data = pd.read_csv('data/dataset_laporan_berlabel.csv')
data.head()

Unnamed: 0,tgl-laporan,title,link,content,jenis_laporan,tingkat_urgensi,tingkat_bahaya
0,"Jumat, 08:23",Evakuasi Tawon,https://www.lapor.go.id/laporan/detil/evakuasi...,Pernohonan evakuasi tawon di atap rumah di sal...,Evakuasi/Penyelamatan Hewan,Segera,Bahaya Rendah
1,"Jumat, 08:20",ular masuk dapur,https://www.lapor.go.id/laporan/detil/ular-mas...,Laporan dari salah satu warga di Kelurahan Mon...,Evakuasi/Penyelamatan Hewan,Segera,Bahaya Rendah
2,"Jumat, 08:18",ular masuk rumah,https://www.lapor.go.id/laporan/detil/ular-mas...,Laporan dari salah satu warga di Kelurahan Ron...,Evakuasi/Penyelamatan Hewan,Segera,Bahaya Rendah
3,"Jumat, 08:15",Kunci tertinggal di dalam mobil,https://www.lapor.go.id/laporan/detil/kunci-te...,Laporan dari salah satu warga di Desa Tunah Ke...,Penyelamatan Non Hewan & Bantuan Teknis,Normal,Bahaya Rendah
4,"Jumat, 08:12",Evakuasi Ular Sawo Kembang,https://www.lapor.go.id/laporan/detil/evakuasi...,Laporan permohonan evakuasi Ular Sowo kembang ...,Evakuasi/Penyelamatan Hewan,Segera,Bahaya Rendah


In [None]:
# The dictionary file has no header row, so pandas labels its columns 0 and 1;
# give them descriptive names in the same expression.
alay_dict = pd.read_csv(
    'data/alay_dict.csv',
    encoding='latin-1',
    header=None,
).rename(columns={0: 'original', 1: 'replacement'})

# 3. Preprocessing

In [34]:
# Remove rows that are exact duplicates across all columns (first occurrence kept).
data = data.drop_duplicates()

In [35]:
# Drop the 'tingkat_bahaya' column. Reassigning instead of inplace=True, plus
# errors='ignore', keeps this cell re-runnable: running it a second time no
# longer raises KeyError because the column is already gone.
data = data.drop(columns='tingkat_bahaya', errors='ignore')

In [36]:
# Class distribution of the 'jenis_laporan' label. value_counts() excludes
# missing values by default, so rows without a label are not counted here.
data['jenis_laporan'].value_counts()

jenis_laporan
Evakuasi/Penyelamatan Hewan                338
Kebakaran                                  165
Penyelamatan Non Hewan & Bantuan Teknis     74
Layanan Lingkungan & Fasilitas Umum         54
Name: count, dtype: int64

In [37]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

(741, 6)

In [38]:
# NLTK's Indonesian stopword list, stored as a set for fast membership checks
# in remove_stopword.
stop_words = set(stopwords.words('indonesian'))

In [39]:
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip newlines and URLs from ``text`` and collapse repeated spaces.

    Args:
        text: The string to clean.

    Returns:
        The string with every newline replaced by a space, every URL
        (starting with ``www.``, ``http://`` or ``https://``) replaced by a
        space, and every run of two or more spaces reduced to one space.
        Leading/trailing spaces are not stripped.
    """
    # Newlines become spaces so the words on either side stay separated.
    text = text.replace('\n', ' ')
    # Raw string: '\.' and '\S' are regex escapes, not Python string escapes
    # (non-raw forms trigger SyntaxWarning on Python 3.12+). A URL runs until
    # the next whitespace character.
    text = re.sub(r'(?:www\.|https?://)\S+', ' ', text)
    # Collapse the gaps left behind by the replacements above.
    text = re.sub(r' {2,}', ' ', text)
    return text

def remove_nonaplhanumeric(text):
    """Replace each run of disallowed characters in ``text`` with one space.

    Allowed characters are ASCII letters, digits and the punctuation
    marks ``! ? , .``; everything else (including whitespace runs) is
    collapsed to a single space.
    """
    disallowed_run = re.compile('[^0-9a-zA-Z!?,.]+')
    return disallowed_run.sub(' ', text)

# Lookup table from each 'original' entry of alay_dict to its 'replacement'.
alay_dict_map = {
    original: replacement
    for original, replacement in zip(alay_dict['original'], alay_dict['replacement'])
}


def normalize_alay(text):
    """Replace every space-separated token found in ``alay_dict_map``.

    Tokens are obtained with ``text.split(' ')``; a token that is a key of
    the lookup table is swapped for its mapped value, any other token is
    kept unchanged. The tokens are re-joined with single spaces.
    """
    tokens = text.split(' ')
    normalized = [alay_dict_map.get(token, token) for token in tokens]
    return ' '.join(normalized)

def remove_stopword(text):
    """Drop stopwords from ``text`` and return the remaining tokens.

    The text is split with ``word_tokenize``; a token is discarded when its
    lower-cased form is in ``stop_words``. Surviving tokens are joined with
    single spaces.
    """
    # Tokenize first so that stopword removal operates on whole words.
    kept_tokens = (
        token
        for token in word_tokenize(text)
        if token.lower() not in stop_words
    )
    return ' '.join(kept_tokens)

In [40]:
def preprocess(text):
    """Run the full text-cleaning pipeline on a single string.

    Steps, in order: lowercase, strip newlines/URLs, replace disallowed
    characters, normalize tokens via the alay dictionary, remove stopwords.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text as a single space-joined string.
    """
    text = lowercase(text)
    # URLs must be removed BEFORE punctuation is replaced: otherwise '://' and
    # '/' are turned into spaces first and the URL pattern no longer matches
    # the whole address, leaving fragments such as 'https' and path segments.
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)
    text = normalize_alay(text)
    # NOTE: stemming is not applied; the call was already commented out.
    text = remove_stopword(text)
    return text

In [41]:
# Clean every report text element-wise; the 'content' column is overwritten
# with the preprocessed strings.
data['content'] = data['content'].map(preprocess)

In [42]:
# Spot-check a few preprocessed rows. A fixed random_state makes the preview
# reproducible across kernel restarts.
data.sample(7, random_state=42)

Unnamed: 0,tgl-laporan,title,link,content,jenis_laporan,tingkat_urgensi
296,"25 Okt 2024, 08:25",Evakuasi Sarang Tawon,https://www.lapor.go.id/laporan/detil/evakuasi...,laporan permohonan evakuasi sarang tawon salah...,Evakuasi/Penyelamatan Hewan,Segera
527,"29 Apr, 8:06",Evakuasi biawak,https://www.lapor.go.id/laporan/detil/evakuasi...,evakuasi biawak jl . tanggaring ih no 1 rumah ...,Evakuasi/Penyelamatan Hewan,Segera
406,"12 Des 2024, 04:05",Pohon Tumbang,https://www.lapor.go.id/laporan/detil/pohon-tu...,tabe .... melaporkan pohon tumbang akibat angi...,Penyelamatan Non Hewan & Bantuan Teknis,Normal
562,"19 Apr, 9:33",Evakuasi lebah,https://www.lapor.go.id/laporan/detil/evakuasi...,"evakuasi lebah jl . belibis i no.12b minggu , ...",Evakuasi/Penyelamatan Hewan,Segera
317,"12 Sep 2024, 10:04",ODGJ Meresahkan,https://www.lapor.go.id/laporan/detil/odgj-mer...,odgj meresahkan dsn . pareng rt.03 rw . 01 ds ...,,
219,"30 Des 2024, 09:48",Selang gas Bocor,https://www.lapor.go.id/laporan/detil/selang-g...,mohon dibantu kebakaran akibat selang gas boco...,,
354,"31 Jul 2024, 4:29",Kebakaran di Pemakaman Umum,https://www.lapor.go.id/laporan/detil/kebakara...,kebakaran pohon panggang berdiameter 7m berlok...,Kebakaran,Immediat


In [44]:
# Write the preprocessed frame to a CSV file in the current working directory;
# index=False keeps the DataFrame index out of the file.
data.to_csv("data-laporan-preprocessed.csv", index=False)