In [15]:
# Install the third-party packages this notebook imports (versions are not pinned)
!pip install pandas nltk swifter openpyxl Sastrawi

# Fetch the normalization spreadsheet and the extra stopword list into the
# current working directory (-q: quiet, -O: output file name)
!wget -q -O normalisasi-V1.xlsx https://raw.githubusercontent.com/hopefullycorrect/rulelex/main/py/normalisasi-V1.xlsx
!wget -q -O stopword.txt https://raw.githubusercontent.com/hopefullycorrect/rulelex/main/py/stopword.txt
import urllib.request
# Download the tweet dataset (a CSV file, not an Excel file) into the current
# working directory under the same file name it has in the repository
url = "https://raw.githubusercontent.com/hopefullycorrect/rulelex/main/py/IU(someday-OR-knees-OR-blueming).csv"
urllib.request.urlretrieve(url, "IU(someday-OR-knees-OR-blueming).csv")

# Import libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
import swifter

# Download NLTK data
# word_tokenize needs the Punkt sentence model. Older NLTK releases load it
# from 'punkt'; recent releases (3.9+) load 'punkt_tab' instead. NLTK is
# installed unpinned, so fetch both to keep a fresh run from failing with a
# LookupError at the tokenizing step.
nltk.download('punkt')
nltk.download('punkt_tab')
# Provides stopwords.words('indonesian')
nltk.download('stopwords')

# Import functions from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

# Import Sastrawi Stemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Load data
# Read the CSV from the working directory, the same relative location the
# setup cell's urlretrieve call wrote it to, rather than assuming the
# Colab-only /content folder.
TWEET_DATA = pd.read_csv("IU(someday-OR-knees-OR-blueming).csv")

# Case Folding
# Rows whose full_text is missing become "" so the string-only cleaning step
# that follows never receives a float NaN.
TWEET_DATA['tweet'] = TWEET_DATA['full_text'].fillna("").str.lower()

# Text Preprocessing
def preprocess_text(text):
    """Clean one tweet into plain, single-space-separated words.

    Steps, in order: drop literal backslash escape sequences, replace
    non-ASCII characters, remove mentions / dotted tokens (URLs) / hashtags,
    remove digits and punctuation, remove stand-alone single letters, and
    finally normalize whitespace.

    Args:
        text: The tweet text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as an empty tweet.

    Returns:
        The cleaned text with no leading/trailing whitespace and exactly one
        space between words; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""
    # These match the two-character sequences backslash+t / backslash+n /
    # backslash+u, not real tab or newline characters (those are ordinary
    # whitespace and are collapsed at the end).
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Every non-ASCII character (emoji, accented letters) becomes '?', which
    # the punctuation pass below removes.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Mentions, any token containing a dot (covers URLs), t.co links, hashtags.
    text = re.sub(r"(@[A-Za-z0-9_]+)|([^\s]+\.[^\s]+)|(t\.co/[^\s]+)|(#\w+)", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove stand-alone single letters BEFORE tidying whitespace: deleting a
    # letter leaves its surrounding spaces behind, so doing this last (as the
    # previous version did) produced double, leading and trailing spaces.
    text = re.sub(r"\b[a-zA-Z]\b", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean every tweet with preprocess_text; this overwrites the 'tweet' column,
# so from here on it holds the cleaned text rather than the merely lower-cased text
TWEET_DATA['tweet'] = TWEET_DATA['tweet'].swifter.apply(preprocess_text)

# Tokenize
# Each cleaned tweet becomes a list of word tokens (NLTK word_tokenize)
TWEET_DATA['tweet_tokens'] = TWEET_DATA['tweet'].swifter.apply(word_tokenize)

# Frequency Distribution
# One FreqDist per tweet (token counts within that tweet), not a corpus-wide count
TWEET_DATA['tweet_tokens_fdist'] = TWEET_DATA['tweet_tokens'].swifter.apply(FreqDist)

# Stopwords removal
# Start from NLTK's Indonesian stopword list
list_stopwords = stopwords.words('indonesian')
additional_stopwords = ["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar', 'bikin', 'bilang', 'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 'jd', 'jgn', 'sdh', 'aja', 'n', 't', 'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt', '&amp', 'yah']
# The hand-picked slang/abbreviation list above was defined but never merged
# in, so none of these words were actually being filtered out.
list_stopwords.extend(additional_stopwords)

# Read additional stopwords from file
# The file is read from the working directory, where the setup cell's wget
# wrote it, rather than from the Colab-only /content folder.
txt_stopword = pd.read_csv("stopword.txt", names=["stopwords"], header=None)
# Each line may carry several whitespace-separated words. Take every line,
# not only the first row, so a one-word-per-line file works too and an empty
# file no longer raises on row 0.
for stopword_line in txt_stopword["stopwords"].dropna():
    list_stopwords.extend(str(stopword_line).split())
# A set gives constant-time membership tests in remove_stopwords and drops
# duplicates (e.g. 'rt' appears twice above).
list_stopwords = set(list_stopwords)

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    The relative order of the surviving tokens is kept and the result is
    always a new list.
    """
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Filter stopwords out of every tweet's token list ("WSW" presumably = without stopwords)
TWEET_DATA['tweet_tokens_WSW'] = TWEET_DATA['tweet_tokens'].swifter.apply(remove_stopwords)

# Normalization
# Header-less two-column spreadsheet: column 0 is the token to look up and
# column 1 is its replacement. It is read from the working directory, where
# the setup cell's wget wrote it, rather than from the Colab-only /content folder.
normalizad_word = pd.read_excel("normalisasi-V1.xlsx", header=None)
# Skip rows with an empty cell: a NaN replacement would reach re.sub inside
# normalize_terms and raise a TypeError.
normalizad_pairs = normalizad_word.dropna(subset=[0, 1])
normalizad_word_dict = pd.Series(normalizad_pairs[1].values, index=normalizad_pairs[0]).to_dict()

def normalize_terms(document):
    """Normalize every token of one tweet.

    Each token is first swapped for its entry in ``normalizad_word_dict``
    (tokens without an entry are kept unchanged); then any character repeated
    three or more times in a row is shortened to exactly two occurrences.

    Args:
        document: Iterable of string tokens.

    Returns:
        A new list with one normalized token per input token, in order.
    """
    normalized = []
    for term in document:
        replacement = normalizad_word_dict.get(term, term)
        normalized.append(re.sub(r'(.)\1{2,}', r'\1\1', replacement))
    return normalized

# Normalize the stopword-filtered tokens of every tweet
TWEET_DATA['tweet_normalized'] = TWEET_DATA['tweet_tokens_WSW'].swifter.apply(normalize_terms)

# Stemming
# Build a single Sastrawi stemmer up front; stem_terms reuses this one
# instance for every token instead of creating a stemmer per call
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_terms(document):
    """Stem every token of one tweet with the module-level ``stemmer``.

    Args:
        document: Iterable of string tokens.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in order.
    """
    stemmed = []
    for term in document:
        stemmed.append(stemmer.stem(term))
    return stemmed

# Stem every normalized token of every tweet
TWEET_DATA['tweet_tokens_stemmed'] = TWEET_DATA['tweet_normalized'].swifter.apply(stem_terms)

# Save to Excel
# Writes the whole frame (the loaded columns plus every intermediate column
# added above) to IU.xlsx in the current working directory
TWEET_DATA.to_excel("IU.xlsx")

# Print results
# Preview the first rows of each pipeline stage, one labelled section per
# stage. The last stages used to be printed without any heading, and the
# normalized tokens were not shown at all.
# NOTE: 'tweet' was overwritten by the cleaning step, so the first section
# shows the cleaned text, not only the lower-cased text.
print('Case Folding Result : \n')
print(TWEET_DATA['tweet'].head(5))
print('\nTokenizing Result : \n')
print(TWEET_DATA['tweet_tokens'].head())
print('\nFrequency Tokens : \n')
# most_common() turns each FreqDist into a readable list of (token, count) pairs
print(TWEET_DATA['tweet_tokens_fdist'].head().apply(lambda x: x.most_common()))
print('\nStopword Removal Result : \n')
print(TWEET_DATA['tweet_tokens_WSW'].head())
print('\nNormalization Result : \n')
print(TWEET_DATA['tweet_normalized'].head())
print('\nStemming Result : \n')
print(TWEET_DATA['tweet_tokens_stemmed'].head())


Pandas Apply:   0%|          | 0/1831 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1831 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1831 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1831 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1831 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1831 [00:00<?, ?it/s]

Case Folding Result : 

0                     iu sama kiyong prnah main dimana
1    mba iu bingung jakaena pada sedihsedih bgt min...
2                       spesial itu ya jakarta bagi iu
3    abis meminum obat pcd alias nonton lagi fancam...
4    karna mohon maaf pelajar saat itu untuk ke kor...
Name: tweet, dtype: object

Tokenizing Result : 

0              [iu, sama, kiyong, prnah, main, dimana]
1    [mba, iu, bingung, jakaena, pada, sedihsedih, ...
2                [spesial, itu, ya, jakarta, bagi, iu]
3    [abis, meminum, obat, pcd, alias, nonton, lagi...
4    [karna, mohon, maaf, pelajar, saat, itu, untuk...
Name: tweet_tokens, dtype: object

Frequency Tokens : 

0    [(iu, 1), (sama, 1), (kiyong, 1), (prnah, 1), ...
1    [(minta, 2), (the, 2), (mba, 1), (iu, 1), (bin...
2    [(spesial, 1), (itu, 1), (ya, 1), (jakarta, 1)...
3    [(lagi, 2), (jadi, 2), (lagu, 2), (abis, 1), (...
4    [(yang, 2), (di, 2), (aku, 2), (so, 2), (lagu,...
Name: tweet_tokens_fdist, dtype: object
0    