# ***PREPROCESSING DATA***

In [None]:
import pandas as pd

# Load the raw dataset. With on_bad_lines='warn', pandas emits a warning for
# each malformed row and skips it instead of aborting the read.
data = pd.read_csv("dataset.csv", on_bad_lines='warn')

# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Work on an independent frame that contains only the raw text column.
df = data.loc[:, ['teks']].copy()
df.head(5)

**PROSES HAPUS DATA DUPLIKAT**

---



In [None]:
# Summary before de-duplication, for comparison with the df.info() call made
# right after drop_duplicates.
df.info()

In [None]:
# Keep only the first occurrence of each distinct text. The result is rebound
# to `df` instead of mutating in place (inplace=True), which keeps the step an
# explicit transformation and avoids in-place mutation between cells.
df = df.drop_duplicates(subset="teks", keep="first")
df.info()

**WORDCLOUD SEBELUM PREPROCESSING**

---



In [None]:
import pandas as pd
import numpy as np
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Replace missing texts with empty strings before building the corpus.
df['teks'] = df['teks'].fillna('')

# Concatenate every document into one space-separated string.
text = ' '.join(df['teks'].astype(str))

# Start from the default STOPWORDS collection shipped with wordcloud.
stopwords = set(STOPWORDS)

wc = WordCloud(
    stopwords=stopwords,
    background_color="white",
    max_words=500,
    width=800,
    height=400,
)
wc.generate(text)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Build one corpus string. astype(str) mirrors the word-cloud cell so that a
# non-string entry in the column cannot break the join.
text = " ".join(df["teks"].astype(str))

tokens = text.split()
word_counts = Counter(tokens)

# Ten most frequent whitespace-separated tokens, split into parallel tuples.
top_words = word_counts.most_common(10)
word, count = zip(*top_words)
# Use a softer colour palette
colors = plt.cm.Pastel1(range(len(word)))

plt.figure(figsize=(15, 4))
bars = plt.bar(word, count, color=colors)
plt.xlabel("Kata-Kata Sering Muncul", fontsize=12, fontweight='bold')
plt.ylabel("Jumlah Kata", fontsize=12, fontweight='bold')
plt.title("Frekuensi Kata", fontsize=18, fontweight='bold')
plt.xticks(rotation=45)

# Write each count centred above its bar. ha='center' already centres the
# text on the given x, so x must be the exact midpoint of the bar; the former
# extra "- 0.1" pushed every label left of centre.
for bar, num in zip(bars, count):
    plt.text(bar.get_x() + bar.get_width() / 2, num + 1, str(num), fontsize=12, color='black', ha='center')


plt.show()

**PROSES CLEANING**

---



In [None]:
import re
import string
import nltk

# Fungsi untuk menghapus emoji
def remove_emoji(tweet):
    """Strip emoji and pictographic symbols from a tweet.

    Values that are not strings (including None) are returned unchanged.
    """
    if not isinstance(tweet, str):
        return tweet

    # Unicode ranges that make up the character class to delete.
    unicode_blocks = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U0001F004-\U0001F0CF"  # Additional emoticons
        u"\U0001F1E0-\U0001F1FF"  # flags
    )
    matcher = re.compile("[" + unicode_blocks + "]+", flags=re.UNICODE)
    return matcher.sub(r'', tweet)

# Fungsi untuk menghapus simbol
def remove_symbols(tweet):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Non-string values (including None) are passed through untouched.
    """
    if not isinstance(tweet, str):
        return tweet
    return re.sub(r'[^a-zA-Z0-9\s]', '', tweet)

# Fungsi untuk menghapus angka
def remove_numbers(tweet):
    """Drop all digit characters from a tweet.

    Non-string values (including None) are returned as they are.
    """
    return re.sub(r'\d', '', tweet) if isinstance(tweet, str) else tweet

def remove_username(text):
    """Remove @mentions: an '@' followed by any run of non-whitespace characters.

    Non-string values such as None or NaN are returned unchanged, in line with
    the other cleaning helpers in this cell, so a missing text does not raise
    a TypeError when the function is applied to a whole column.

    Args:
        text: The tweet text, or any non-string placeholder.

    Returns:
        The text with every mention deleted (surrounding whitespace is kept),
        or the original value when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'@[^\s]+', '', text)


# Run the cleaning helpers in a fixed order: mentions, emoji, symbols, digits.
# Mentions go first because remove_symbols would otherwise strip the '@' that
# remove_username looks for.
cleaned = df['teks']
for cleaning_step in (remove_username, remove_emoji, remove_symbols, remove_numbers):
    cleaned = cleaned.apply(cleaning_step)
df['cleaning'] = cleaned

df.head(5)

**PROSES CASE FOLDING**

---



In [None]:
def case_folding(text):
    """Return text converted to lower case; non-string values are returned as-is."""
    return text.lower() if isinstance(text, str) else text

# Lower-case the cleaned text and store it in its own column.
lowered = df['cleaning'].apply(case_folding)
df['case_folding'] = lowered
df.head(5)

**Normalisasi Kata**

---



In [None]:
# Upload kaggle.json
from google.colab import files
files.upload() # Pilih kaggle.json

# Setup Kaggle API credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset dari Kaggle
!kaggle datasets download -d fornigulo/kamus-slag

# Unzip dataset
!unzip kamus-slag.zip


In [None]:
import pandas as pd

# Fungsi penggantian kata tidak baku
def replace_taboo_words(text, kamus_tidak_baku):
    """Replace non-standard words in a sentence using a lookup dictionary.

    The sentence is split on whitespace. A word is replaced only when the
    dictionary holds a usable standard form for it, i.e. a string made up
    solely of letters and spaces. Every other word is kept as it is; this
    includes words whose dictionary entry is unusable (NaN, digits, symbols),
    which were previously dropped from the sentence altogether.

    Args:
        text: Sentence to normalise.
        kamus_tidak_baku: Mapping from a non-standard word to its standard form.

    Returns:
        A 4-tuple ``(replaced_text, kalimat_baku, kata_diganti,
        kata_tidak_baku_hash)``: the normalised sentence joined with single
        spaces, the standard forms that were substituted in, the original
        words they replaced, and ``hash()`` of each replaced word. A
        non-string ``text`` yields ``('', [], [], [])``.
    """
    kalimat_baku = []
    kata_diganti = []
    kata_tidak_baku_hash = []

    if not isinstance(text, str):
        return '', kalimat_baku, kata_diganti, kata_tidak_baku_hash

    replaced_words = []
    for word in text.split():
        baku_word = kamus_tidak_baku[word] if word in kamus_tidak_baku else None
        is_usable = isinstance(baku_word, str) and all(
            char.isalpha() or char.isspace() for char in baku_word
        )
        if is_usable:
            replaced_words.append(baku_word)
            kalimat_baku.append(baku_word)
            kata_diganti.append(word)
            # str hashes are salted per interpreter process, so these values
            # are only comparable within a single run.
            kata_tidak_baku_hash.append(hash(word))
        else:
            # No entry, or an entry that cannot be used: keep the original
            # word so that no content is lost from the sentence.
            replaced_words.append(word)

    replaced_text = ' '.join(replaced_words)
    return replaced_text, kalimat_baku, kata_diganti, kata_tidak_baku_hash

In [None]:
# Take an independent copy of the columns produced so far
data = df.loc[:, ['teks', 'cleaning', 'case_folding']].copy()
data.head(5)

In [None]:
kamus_data = pd.read_excel('kamuskatabaku.xlsx')
# Lookup table: non-standard spelling -> standard word. When a key appears
# more than once, the later row wins.
kamus_tidak_baku = {
    tidak_baku: kata_baku
    for tidak_baku, kata_baku in zip(kamus_data['tidak_baku'], kamus_data['kata_baku'])
}
kamus_data.head()

In [None]:
# Apply the non-standard word replacement; every call returns a 4-tuple
normalization_results = [
    replace_taboo_words(sentence, kamus_tidak_baku)
    for sentence in data['case_folding']
]
# Transpose the per-row tuples into four columns.
(
    data['normalization'],
    data['Kata_Baku'],
    data['Kata_Tidak_Baku'],
    data['Kata_Tidak_Baku_Hash'],
) = zip(*normalization_results)

# Carry only the text columns forward; the bookkeeping columns stay in `data`.
df = data.loc[:, ['teks', 'cleaning', 'case_folding', 'normalization']].copy()

df.head(100)

**TOKENIZATION**

---



In [None]:
def tokenize(text):
    """Split text on runs of whitespace and return the resulting list of tokens."""
    return text.split()

# One list of tokens per normalised sentence.
token_lists = df['normalization'].apply(tokenize)
df['tokenize'] = token_lists

df.head(5)

**PROSES STOPWORD REMOVAL**

---



In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# Store the Indonesian stop words as a set: remove_stopwords tests every token
# for membership, which is constant-time on a set but a linear scan on a list.
stop_words = set(stopwords.words('indonesian'))

In [None]:
def remove_stopwords(text):
    """Return the tokens of `text` that are not in the global `stop_words` collection.

    `text` is an iterable of tokens; the original order is preserved.
    """
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Filter the stop words out of every token list.
filtered_tokens = df['tokenize'].apply(remove_stopwords)
df['stopword removal'] = filtered_tokens

df.head(5)

**PROSES STEMMING DATA**

---



In [None]:
!pip install Sastrawi

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Build the Sastrawi stemmer once; stem_text below reuses this single
# `stemmer` instance for every word.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(text):
    """Stem every word in an iterable of words with the global `stemmer`.

    Returns the stems as a list, in the same order as the input words.
    """
    stems = []
    for token in text:
        stems.append(stemmer.stem(token))
    return stems

def _stem_and_join(tokens):
    """Stem a list of tokens and join the stems back into one string."""
    return ' '.join(stem_text(tokens))


df['steming_data'] = df['stopword removal'].apply(_stem_and_join)
df.head()

**PROSES HAPUS DATA BERNILAI KOSONG (NAN)**

---



In [None]:
# Non-null counts per column before removing empty rows.
df.info()

In [None]:
# After the steps above, an "empty" document is an empty string rather than
# NaN (missing texts were filled with '' and the stems are joined into a
# string), so dropna() alone would keep it. Drop rows whose stemmed text is
# blank first, then drop any row that still holds a real missing value.
is_blank = df['steming_data'].astype(str).str.strip().eq('')
data = df.loc[~is_blank].dropna()
data.info()

**WORDCLOUD SETELAH PREPROCESSING**

---



In [None]:
import pandas as pd
import numpy as np
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Join the text of the 'steming_data' column into one corpus string
text = ' '.join(data['steming_data'].astype(str))

# Default wordcloud stop words plus extra words to hide from the cloud.
stopwords = set(STOPWORDS).union([
    'abang', 'adik', 'adek', 'allah', 'amin', 'awas', 'banget', 'bawa', 'berat', 'biar', 'boys', 'cuman', 'cuy',
    'dan', 'dek', 'di', 'dik', 'download', 'duluan', 'efek', 'erick', 'full', 'ganti', 'gue', 'guardiola', 'hilang',
    'hoki', 'ini', 'jalan', 'jajar', 'kah', 'kali', 'kalah', 'kayak', 'ke', 'ku', 'lagu', 'lagi', 'latih', 'lawan',
    'lu', 'lupa', 'mafia', 'malaysia', 'nih', 'nya', 'pas', 'pecat', 'sih', 'sia', 'suka', 'susul', 'takut', 'tau',
    'tidur', 'tinggal', 'tohir', 'tolong', 'towel', 'tengah', 'tunggu', 'untung', 'vietnam', 'wasit', 'ya', 'yaman',
    'yang', 'yg', 'yuk'
])

wc = WordCloud(
    stopwords=stopwords,
    background_color="white",
    max_words=500,
    width=800,
    height=400,
)
wc.generate(text)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import STOPWORDS

text = " ".join(data["steming_data"])

# Default wordcloud stop words plus extra words to leave out of the ranking.
stopwords = set(STOPWORDS)
stopwords.update([
    'abang', 'adik', 'adek', 'allah', 'amin', 'awas', 'banget', 'bawa', 'berat', 'biar', 'boys', 'cuman', 'cuy',
    'dan', 'dek', 'di', 'dik', 'download', 'duluan', 'efek', 'erick', 'full', 'ganti', 'gue', 'guardiola', 'hilang',
    'hoki', 'ini', 'jalan', 'jajar', 'kah', 'kali', 'kalah', 'kayak', 'ke', 'ku', 'lagu', 'lagi', 'latih', 'lawan',
    'lu', 'lupa', 'mafia', 'malaysia', 'nih', 'nya', 'pas', 'pecat', 'sih', 'sia', 'suka', 'susul', 'takut', 'tau',
    'tidur', 'tinggal', 'tohir', 'tolong', 'towel', 'tengah', 'tunggu', 'untung', 'vietnam', 'wasit', 'ya', 'yaman',
    'yang', 'yg', 'yuk'
])

tokens = [word for word in text.split() if word not in stopwords]
word_counts = Counter(tokens)

# Ten most frequent remaining tokens, split into parallel tuples.
top_words = word_counts.most_common(10)
word, count = zip(*top_words)
# Use a softer colour palette
colors = plt.cm.Pastel1(range(len(word)))

# Build the plot
plt.figure(figsize=(12, 5))
bars = plt.bar(word, count, color=colors)
plt.xlabel("Kata-Kata Sering Muncul", fontsize=12, fontweight='bold')
plt.ylabel("Jumlah Kata", fontsize=12, fontweight='bold')
plt.title("Frekuensi Kata", fontsize=18, fontweight='bold')
plt.xticks(rotation=45)

# Write each count centred above its bar. The midpoint is computed directly
# as width / 2; the former "width / 1.6 - 0.1" matched the midpoint only for
# the default bar width of 0.8.
for bar, num in zip(bars, count):
    plt.text(bar.get_x() + bar.get_width() / 2, num + 1, str(num), fontsize=12, color='black', ha='center')

# Show the plot
plt.show()


In [None]:
# Write the preprocessed frame to CSV (UTF-8 encoded, without the index column).
# NOTE(review): the list-valued columns ('tokenize', 'stopword removal') are
# presumably stored as their text representation; confirm downstream readers
# expect that.
data.to_csv('hasil_preprocessing.csv',encoding='utf8', index=False)