In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Load the source CSV into the DataFrame that every later cell reads and mutates.
# The path is relative to the notebook's working directory.
pemilu_data = pd.read_csv('dataset/pemilu-2024.csv')

In [None]:
# Preview the first 50 values of the 'news-title' column before any cleaning.
pemilu_data['news-title'].head(50)

## Case Folding

In [None]:
# Case folding: lower-case every title. The column is overwritten in place, so the
# original casing is no longer available to later cells.
pemilu_data['news-title'] = pemilu_data['news-title'].str.lower()

print('Case Folding Result : \n')
print(pemilu_data['news-title'].head())

## Tokenizing

In [None]:
import string
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# Text cleaning (runs before tokenizing)
def remove_tweet_special(text):
    """Strip escape artefacts, non-ASCII characters, mentions, hashtags and URLs.

    Args:
        text: A single title string.

    Returns:
        The cleaned string. Whitespace is collapsed by the mention/URL step, but the
        final scheme removal inserts spaces, so the result may still contain
        leading, trailing or repeated spaces.
    """
    # Literal two-character sequences backslash-t, backslash-n and backslash-u
    # (escaped text, not real control characters) become a space; any backslash
    # that is left is dropped.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Each non-ASCII character (emoji, CJK, accented letters, ...) is replaced by "?".
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove @mentions, #hashtags and scheme://... links, then collapse whitespace.
    # Raw string: "\w", "\/" and "\S" are regex escapes, not Python string escapes.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # Drop bare URL schemes that were not followed by anything (incomplete URLs).
    return text.replace("http://", " ").replace("https://", " ")

# Run remove_tweet_special on every title; overwrites 'news-title' in place.
pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_tweet_special)

# remove number
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Run remove_number on every title; overwrites 'news-title' in place.
pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    drop_table = str.maketrans("", "", string.punctuation)
    return text.translate(drop_table)

# Run remove_punctuation on every title; overwrites 'news-title' in place.
pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed

# Run remove_whitespace_LT on every title; overwrites 'news-title' in place.
pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_whitespace_LT)

# collapse runs of whitespace into a single space
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is reduced to a single space, not removed.
    """
    # Raw string so "\s" reaches the regex engine instead of being parsed as an
    # (invalid) Python string escape.
    return re.sub(r'\s+', ' ', text)

# Run remove_whitespace_multiple on every title; overwrites 'news-title' in place.
pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Delete stand-alone single ASCII letters and tidy the gaps they leave.

    Args:
        text: A single title string.

    Returns:
        ``text`` without one-letter words, with whitespace collapsed to single
        spaces and no leading or trailing whitespace.
    """
    without_singles = re.sub(r"\b[a-zA-Z]\b", "", text)
    # Deleting a letter leaves the spaces on both sides behind. This step runs after
    # the strip and collapse steps, so the cleanup has to happen here.
    return " ".join(without_singles.split())

# Run remove_singl_char on every title; overwrites 'news-title' in place.
# This is the last cleaning step before tokenizing.
pemilu_data['news-title'] = pemilu_data['news-title'].apply(remove_singl_char)




In [None]:
# NLTK word tokenizer
def word_tokenize_wrapper(text):
    """Split one cleaned title into a list of tokens with NLTK's ``word_tokenize``.

    NOTE(review): ``word_tokenize`` relies on NLTK tokenizer data; the only
    ``nltk.download`` call visible in this notebook fetches 'stopwords', so
    confirm the tokenizer data is installed in the environment.
    """
    tokens = word_tokenize(text)
    return tokens

# Tokenize every cleaned title; the token lists go to the new 'news-tokens' column.
pemilu_data['news-tokens'] = pemilu_data['news-title'].apply(word_tokenize_wrapper)

In [None]:
# Show the first rows of the token column, followed by blank lines as a visual separator.
print('Tokenizing Result : \n') 
print(pemilu_data['news-tokens'].head())
print('\n\n\n')

## Menghitung frekuensi distribusi token

In [None]:
def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` from one document's token list."""
    distribution = FreqDist(text)
    return distribution

# One FreqDist per row, computed from that row's token list and stored in a new column.
pemilu_data['news-tokens-fdist'] = pemilu_data['news-tokens'].apply(freqDist_wrapper)

print('Frequency Tokens: \n')
print(pemilu_data['news-tokens-fdist'].head())

## Filtering (Stopword Removal)

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# Stopword vocabulary: NLTK's Indonesian list plus manually added informal tokens.
# Despite its name, list_stopwords is a set, so the membership tests in
# stopword_removal are hash lookups. The repeated 'rt' is harmless: the set
# de-duplicates it.
list_stopwords = set(stopwords.words('indonesian')).union([
    "yg", "dg", "rt", "dgn", "ny", "d", 'klo',
    'kalo', 'amp', 'biar', 'bikin', 'bilang',
    'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
    'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
    'jd', 'jgn', 'sdh', 'aja', 'n', 't',
    'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
    '&amp', 'yah', 'dan',
])

# ----------------------- optional: add stopwords from a txt file -----------------------
# To merge an extra space-separated stopword file, uncomment:
# txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)
# list_stopwords.update(txt_stopword["stopwords"][0].split(' '))

# remove stopwords from a token list
def stopword_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in order."""
    kept = []
    for word in words:
        if word in list_stopwords:
            continue
        kept.append(word)
    return kept

# Filter every token list; the result goes to a new column so 'news-tokens' stays intact.
pemilu_data['news-tokens-wsw'] = pemilu_data['news-tokens'].apply(stopword_removal)

# Show the filtered column (not the unfiltered 'news-tokens') so the effect is visible.
print(pemilu_data['news-tokens-wsw'].head())

## Normalization

In [None]:

# Read the normalisation table from dataset/normalisasi.xlsx.
# The first column is used as the original word, the second as its normalised form.
normalized_word = pd.read_excel('dataset/normalisasi.xlsx')

# Lookup table: original word -> normalised word.
normalized_word_dict = {}

# Columns are read by position with .iloc; indexing a label-indexed row with
# row[0] / row[1] relies on pandas' deprecated integer-key-as-position fallback.
# setdefault keeps the first mapping when a word is listed more than once.
for original, replacement in zip(normalized_word.iloc[:, 0], normalized_word.iloc[:, 1]):
    normalized_word_dict.setdefault(original, replacement)

# Term normalisation helper
def normalized_term(document):
    """Replace each term that has an entry in ``normalized_word_dict``; keep the rest as is."""
    return [normalized_word_dict.get(term, term) for term in document]

# Apply the normalisation to the 'news-tokens-wsw' column and store the result in
# the 'news-normalized' column.
pemilu_data['news-normalized'] = pemilu_data['news-tokens-wsw'].apply(normalized_term)

print(pemilu_data['news-normalized'].head())


## Stemmer

In [None]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create the stemmer once; the same instance is reused for every term.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemmed_wrapper(term):
    """Return the stem of a single term using the shared Sastrawi stemmer."""
    return stemmer.stem(term)


# Stem each distinct term exactly once: walk the normalised documents and cache
# term -> stem the first time a term is seen.
term_dict = {}

for document in pemilu_data['news-normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = stemmed_wrapper(term)

print(len(term_dict))
print("------------------------")

# Preview a few mappings instead of printing the whole vocabulary.
for term, stem in list(term_dict.items())[:10]:
    print(term, ":", stem)


def get_stemmed_term(document):
    """Map every term of one document to its cached stem from ``term_dict``."""
    return [term_dict[term] for term in document]


# apply stemmed term to dataframe
pemilu_data['news-tokens-stemmed'] = pemilu_data['news-normalized'].apply(get_stemmed_term)
print(pemilu_data['news-tokens-stemmed'].head())

## Save data

In [25]:
# Save to excel
# Writes the whole pemilu_data frame, including the list-valued token columns.
# NOTE(review): presumably the 'results/' directory must already exist — confirm.

pemilu_data.to_excel("results/Text_Preprocessing.xlsx")

In [27]:
# Save to csv
# NOTE(review): this file is named 'Test_Preprocessing' while the Excel export is
# 'Text_Preprocessing' — possibly a typo; confirm before renaming, since other code
# may read this path.

pemilu_data.to_csv("results/Test_Preprocessing.csv")