In [2]:
import pandas as pd 
import numpy as np

In [3]:
pemilu_data = pd.read_csv('dataset/pemilu-2024.csv')

In [4]:
pemilu_data['news-title'].head(50)

0     DPO PPLN Kuala Lumpur Serahkan Diri, Langsung ...
1     Pengamat Nilai Prabowo-Gibran Sebagai Kunci De...
2     Hasil Rekapitulasi 16 Provinsi, Prabowo-Gibran...
3     DPR Undang KPU Rapat Evaluasi Pemilu 2024 Sete...
4      Kris Dayanti Akui Gagal Lolos DPR di Pemilu 2024
5          KPU Tunda Rekapitulasi Suara Nasional Sulbar
6     Hari Pertama Puasa, Prabowo Unggah Foto Makan ...
7     KPU: Airin Rachmi Caleg Suara Terbanyak di Dap...
8     Terpilih Lagi di NTT, Caleg DPR NasDem Ratu Wu...
9     Raja Yordania Sapa Prabowo 'My Brother, My Fri...
10    DPO PPLN Kuala Lumpur Serahkan Diri, Langsung ...
11    Pengamat Nilai Prabowo-Gibran Sebagai Kunci De...
12    Hasil Rekapitulasi 16 Provinsi, Prabowo-Gibran...
13    DPR Undang KPU Rapat Evaluasi Pemilu 2024 Sete...
14     Kris Dayanti Akui Gagal Lolos DPR di Pemilu 2024
15         KPU Tunda Rekapitulasi Suara Nasional Sulbar
16    Hari Pertama Puasa, Prabowo Unggah Foto Makan ...
17    KPU: Airin Rachmi Caleg Suara Terbanyak di

## Case Folding

In [5]:
pemilu_data['news-title'] = pemilu_data['news-title'].str.lower()

print('Case Folding Result : \n')
print(pemilu_data['news-title'].head())

Case Folding Result : 

0    dpo ppln kuala lumpur serahkan diri, langsung ...
1    pengamat nilai prabowo-gibran sebagai kunci de...
2    hasil rekapitulasi 16 provinsi, prabowo-gibran...
3    dpr undang kpu rapat evaluasi pemilu 2024 sete...
4     kris dayanti akui gagal lolos dpr di pemilu 2024
Name: news-title, dtype: object


## Tokenizing

In [6]:
import string
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# Tokenizing
def remove_tweet_special(text):
    # remove tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")

pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_tweet_special)

# remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_singl_char)




In [7]:
# NLTK word rokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

pemilu_data['news-tokens'] = pemilu_data['news-title'].apply(word_tokenize_wrapper)

In [8]:
print('Tokenizing Result : \n') 
print(pemilu_data['news-tokens'].head())
print('\n\n\n')

Tokenizing Result : 

0    [dpo, ppln, kuala, lumpur, serahkan, diri, lan...
1    [pengamat, nilai, prabowogibran, sebagai, kunc...
2    [hasil, rekapitulasi, provinsi, prabowogibran,...
3    [dpr, undang, kpu, rapat, evaluasi, pemilu, se...
4    [kris, dayanti, akui, gagal, lolos, dpr, di, p...
Name: news-tokens, dtype: object






## Menghitung frekuensi distribusi token

In [9]:
def freqDist_wrapper(text):
    return FreqDist(text)

pemilu_data['news-tokens-fdist'] = pemilu_data['news-tokens'].apply(freqDist_wrapper)

print('Frequency Tokens: \n')
print(pemilu_data['news-tokens-fdist'].head())

Frequency Tokens: 

0    {'dpo': 1, 'ppln': 1, 'kuala': 1, 'lumpur': 1,...
1    {'pengamat': 1, 'nilai': 1, 'prabowogibran': 1...
2    {'hasil': 1, 'rekapitulasi': 1, 'provinsi': 1,...
3    {'dpr': 1, 'undang': 1, 'kpu': 1, 'rapat': 1, ...
4    {'kris': 1, 'dayanti': 1, 'akui': 1, 'gagal': ...
Name: news-tokens-fdist, dtype: object


## Filtering (Stopword Removal)

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\M JAWAHIRUL
[nltk_data]     FANANI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords

# get stopword from nltk stopword
# get stopword indonesia
list_stopwords = stopwords.words('indonesian')

# manually add stopword
# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah', 'dan'])

# ----------------------- add stopword from txt file ------------------------------------
# read txt stopword using pandas
# txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
# list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# convert list to dictionary
list_stopwords = set(list_stopwords)

# remove stopword pada list token
def stopword_removal(words):
    return[word for word in words if word not in list_stopwords]

pemilu_data['news-tokens-wsw'] = pemilu_data['news-tokens'].apply(stopword_removal)

print(pemilu_data['news-tokens'].head())

0    [dpo, ppln, kuala, lumpur, serahkan, diri, lan...
1    [pengamat, nilai, prabowogibran, sebagai, kunc...
2    [hasil, rekapitulasi, provinsi, prabowogibran,...
3    [dpr, undang, kpu, rapat, evaluasi, pemilu, se...
4    [kris, dayanti, akui, gagal, lolos, dpr, di, p...
Name: news-tokens, dtype: object


## Normalization

In [12]:

# Membaca file normalisasi.csv
normalized_word = pd.read_excel('dataset/normalisasi.xlsx')

# Membuat kamus untuk kata-kata yang dinormalisasi
normalized_word_dict = {}

# Mengisi kamus dengan pasangan kata asal dan kata normalisasi
for index, row in normalized_word.iterrows():
    if row[0] not in normalized_word_dict:
        normalized_word_dict[row[0]] = row[1]

# Fungsi untuk normalisasi term
def normalized_term(document):
    return [normalized_word_dict[term] if term in normalized_word_dict else term for term in document]

# Mengaplikasikan normalisasi pada kolom 'news-tokens-wsw' dan menyimpan hasilnya di kolom 'news-normalized'
pemilu_data['news-normalized'] = pemilu_data['news-tokens-wsw'].apply(normalized_term)

print(pemilu_data['news-normalized'].head())


0    [dpo, ppln, kuala, lumpur, serahkan, langsung,...
1    [pengamat, nilai, prabowogibran, kunci, demokr...
2    [hasil, rekapitulasi, provinsi, prabowogibran,...
3    [dpr, undang, kpu, rapat, evaluasi, pemilu, re...
4     [kris, dayanti, akui, gagal, lolos, dpr, pemilu]
Name: news-normalized, dtype: object


  if row[0] not in normalized_word_dict:
  normalized_word_dict[row[0]] = row[1]


## Stemmer

In [20]:
import Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in pemilu_data['news-normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ''

print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])

print(term_dict)
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in pemilu_data['news-normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ''

# print(len(term_dict))
# print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])

# print(term_dict)
# print("------------------------")

# apply stemmed term to dataframe
def get_stemmed_term(document):
    return[term_dict[term] for term in document]

pemilu_data['news-tokens-stemmed'] = pemilu_data['news-normalized'].apply(get_stemmed_term)
print(pemilu_data['news-tokens-stemmed'])

1457
------------------------
dpo : dpo
ppln : ppln
kuala : kuala
lumpur : lumpur
serahkan : serah
langsung : langsung
sidang : sidang
pn : pn
jakpus : jakpus
pengamat : amat
nilai : nilai
prabowogibran : prabowogibran
kunci : kunci
demokrasi : demokrasi
indonesia : indonesia
hasil : hasil
rekapitulasi : rekapitulasi
provinsi : provinsi
sapu : sapu
bersih : bersih
kemenangan : menang
dpr : dpr
undang : undang
kpu : kpu
rapat : rapat
evaluasi : evaluasi
pemilu : milu
selesai : selesai
kris : kris
dayanti : dayanti
akui : aku
gagal : gagal
lolos : lolos
tunda : tunda
suara : suara
nasional : nasional
sulbar : sulbar
puasa : puasa
prabowo : prabowo
unggah : unggah
foto : foto
makan : makan
bareng : bareng
titiek : titiek
didit : didit
airin : airin
rachmi : rachmi
caleg : caleg
dapil : dapil
banten : banten
iii : iii
terpilih : pilih
ntt : ntt
nasdem : nasdem
ratu : ratu
wulla : wulla
mengundurkan : undur
raja : raja
yordania : yordania
sapa : sapa
my : my
brother : brother
friend : frien

## Save data

In [21]:

pemilu_data_new = pemilu_data.drop(columns=['web-scraper-order','web-scraper-start-url', 'pages'])

print(pemilu_data.columns)

print(pemilu_data_new.columns)

Index(['web-scraper-order', 'web-scraper-start-url', 'news-title', 'pages',
       'news-tokens', 'news-tokens-fdist', 'news-tokens-wsw',
       'news-normalized', 'news-tokens-stemmed'],
      dtype='object')
Index(['news-title', 'news-tokens', 'news-tokens-fdist', 'news-tokens-wsw',
       'news-normalized', 'news-tokens-stemmed'],
      dtype='object')


In [None]:
# Save to excel

pemilu_data_new.to_excel("results/Pemilu_Text_Preprocessing.xlsx")



In [None]:
# Save to csv

pemilu_data_new.to_csv("results/Pemilu_Test_Preprocessing.csv")