In [None]:
%restart_python

In [None]:
import os

import requests
import pandas as pd

# Connection settings are read from the environment so real credentials never
# have to be typed into the notebook. The original placeholder strings are kept
# as fallbacks, so behaviour is unchanged when the variables are not set.
# NOTE(review): on Databricks a secret scope may be the preferred source — confirm.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY", "SERVICE_ROLE_API_KEY")

# REST endpoint that selects every column of the `news` table.
endpoint = f"{SUPABASE_URL}/rest/v1/news?select=*"

# The same key is sent both as the `apikey` header and as a bearer token.
headers = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}"
}

# Without a timeout `requests` waits forever on an unresponsive server, which
# would hang the whole notebook run.
REQUEST_TIMEOUT_SECONDS = 30

resp = requests.get(endpoint, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
# Fail loudly on 4xx/5xx instead of building a DataFrame from an error payload.
resp.raise_for_status()

# NOTE(review): the API may cap the number of rows returned per request —
# confirm whether pagination is needed to fetch the full table.
pdf = pd.DataFrame(resp.json())
# `spark` is the session object provided by the notebook runtime.
df = spark.createDataFrame(pdf)

df.show(5)

In [None]:
import pandas as pd
import re
import string
from pyspark.sql.functions import col, trim
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
%pip install wordcloud

In [None]:
# Row count of the raw frame, reported for comparison with the cleaned one.
rows_before = df.count()
print("Total data sebelum pembersihan:", rows_before)

# Clean-up steps, applied in order:
#   1. drop rows whose title is null
#   2. drop rows whose title is empty after trimming whitespace
#   3. keep one row per distinct title
#   4. keep only the title column
title_column = col("title")
titles_present = df.dropna(subset=["title"])
titles_non_blank = titles_present.filter(trim(title_column) != "")
titles_unique = titles_non_blank.dropDuplicates(["title"])
df_clean = titles_unique.select("title")

rows_after = df_clean.count()
print("Total data setelah pembersihan:", rows_after)

df_clean.show(5)

In [None]:
# Collect the cleaned Spark titles into a pandas DataFrame. This rebinds `pdf`,
# which previously held the raw API response frame.
pdf = df_clean.toPandas()

In [None]:
# Patterns are compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_MENTION_PATTERN = re.compile(r'@\w+')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Normalise a raw title into lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase, drop URLs, drop @mentions, drop the ``#`` sign
    (keeping the hashtag word), replace every remaining character that is not
    ``a-z`` or whitespace with a space, then collapse whitespace runs.

    Args:
        text: The value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string; ``""`` for missing input or when nothing is left.
    """
    # Missing values must not turn into the literal tokens "none" / "nan".
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    text = _URL_PATTERN.sub('', text)

    text = _MENTION_PATTERN.sub('', text)
    text = text.replace('#', '')

    # Emoji and other symbols are handled here rather than by a separate
    # delete-with-'' pass: replacing them with a space keeps the words on
    # either side apart ("banjir😢jakarta" -> "banjir jakarta").
    text = _NON_LETTER_PATTERN.sub(' ', text)

    # Tidy up whitespace.
    return _WHITESPACE_PATTERN.sub(' ', text).strip()

In [None]:
def tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    An empty or whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

In [None]:
# Informal spellings and abbreviations, grouped by the standard word they map to.
_VARIANTS_BY_STANDARD_FORM = {
    "tidak": ("gk", "ga", "gak", "nggak", "ngga", "tdk", "tak"),
    "sudah": ("udh", "udah", "sdh"),
    "dari": ("dr",),
    "karena": ("krn",),
    "padahal": ("pdhl",),
    "yang": ("yg",),
    "dengan": ("dgn",),
    "dalam": ("dlm",),
    "saja": ("aja",),
    "banget": ("bgt",),
    "juga": ("jg",),
}

# Flat lookup table: informal variant -> standard form.
normalization_dict = {
    variant: standard
    for standard, variants in _VARIANTS_BY_STANDARD_FORM.items()
    for variant in variants
}


def normalize_tokens(tokens):
    """Return a new list where known informal tokens are replaced by their standard form.

    Tokens without an entry in ``normalization_dict`` are passed through unchanged.
    """
    normalized = []
    for token in tokens:
        normalized.append(normalization_dict.get(token, token))
    return normalized

In [None]:
# Stopword list shipped with Sastrawi.
stopword_factory = StopWordRemoverFactory()
default_stopwords = set(stopword_factory.get_stop_words())

# Extra colloquial stopwords, assembled from small thematic groups.
custom_stopwords = set().union(
    # discourse particles
    {"nih", "sih", "dong", "deh", "kan", "lah", "nya", "kok", "loh", "pun"},
    # casual limiters / intensifiers
    {"aja", "banget", "cuma", "doang", "gitu"},
    # interjections and laughter
    {"iya", "ya", "oh", "eh", "hehe", "hmm", "wkwk", "haha"},
    # aspect / time markers
    {"udah", "lagi", "masih", "dah", "telah", "baru", "akan"},
    # modals and degree words
    {"ingin", "harus", "boleh", "bisa", "sangat", "sekali"},
)

# Negations are deliberately excluded from the stopword set so they survive filtering.
negation_words = {"tidak", "tak", "gak", "nggak", "ngga", "bukan", "belum"}
stopwords = default_stopwords.union(custom_stopwords).difference(negation_words)


def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stopwords`` set, in order."""
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# One shared Sastrawi stemmer instance, created once and reused for every token.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()


def stemming_tokens(tokens):
    """Stem each token with the shared Sastrawi ``stemmer`` and return the results as a list."""
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

In [None]:
def preprocess_text(text):
    """Run the full title pipeline and return the final list of stemmed tokens.

    Order of stages: clean -> tokenize -> normalise informal words ->
    remove stopwords -> stem.
    """
    tokens = tokenize_text(clean_text(text))
    tokens = remove_stopwords(normalize_tokens(tokens))
    return stemming_tokens(tokens)

# Run the pipeline over every title and store the token lists next to the raw text.
title_tokens = pdf["title"].apply(preprocess_text)
pdf["title_clean"] = title_tokens

# Preview raw title vs. processed tokens.
pdf[["title", "title_clean"]].head()

In [None]:
# Path of the NRC emotion lexicon CSV file to load.
nrc_path = "/Volumes/workspace/drive/kamus-nrc/NRC-Emotion-Lexicon (1).csv"
# Read the lexicon into a pandas DataFrame using read_csv defaults.
nrc_df = pd.read_csv(nrc_path)

# Print the column index so the column names used when building the lexicon
# dictionary can be checked against the file.
print("Kolom NRC:", nrc_df.columns)

In [None]:
def _polarity_flag(value):
    """Coerce one lexicon cell to ``int``; blank, NaN or non-numeric cells count as 0."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # int(NaN) raises ValueError, int(None) raises TypeError.
        return 0


def _column_values(frame, name, default):
    """Return column ``name`` as a plain list, or ``default`` repeated if the column is absent."""
    if name in frame.columns:
        return frame[name].tolist()
    return [default] * len(frame)


def _build_lexicon(frame):
    """Build a ``word -> {"positive": int, "negative": int}`` lookup from the lexicon frame.

    Indonesian words are the primary keys. English words are added only as a
    fallback and never replace an Indonesian entry with the same spelling.
    Within one language a later row still overrides an earlier one.
    """
    indonesian_entries = {}
    english_entries = {}

    rows = zip(
        _column_values(frame, "Indonesian (id)", None),
        _column_values(frame, "English (en)", None),
        _column_values(frame, "Positive", 0),
        _column_values(frame, "Negative", 0),
    )
    for id_word, en_word, positive, negative in rows:
        score = {
            "positive": _polarity_flag(positive),
            "negative": _polarity_flag(negative),
        }

        # Indonesian. Non-string cells (e.g. NaN for a missing translation) are skipped.
        if isinstance(id_word, str) and id_word.strip():
            # dict(score): each key gets its own dict, so entries never alias.
            indonesian_entries[id_word.strip().lower()] = dict(score)

        # English (kept just in case a title contains English words).
        if isinstance(en_word, str) and en_word.strip():
            english_entries[en_word.strip().lower()] = dict(score)

    # English goes in first so that Indonesian entries win on any key collision.
    lexicon = dict(english_entries)
    lexicon.update(indonesian_entries)
    return lexicon


lexicon_dict = _build_lexicon(nrc_df)

print("Jumlah kata lexicon:", len(lexicon_dict))

In [None]:
def sentiment_score(tokens, lexicon):
    """Sum the positive and negative lexicon values of the tokens found in ``lexicon``.

    Args:
        tokens: List of tokens. Any value that is not a ``list`` scores ``(0, 0)``.
        lexicon: Mapping of word -> dict with ``"positive"`` and ``"negative"`` values.

    Returns:
        A ``(positive_total, negative_total)`` tuple.
    """
    if not isinstance(tokens, list):
        return 0, 0

    # Only tokens present in the lexicon contribute; repeats count each time.
    matched_entries = [lexicon[token] for token in tokens if token in lexicon]
    positive_total = sum(entry["positive"] for entry in matched_entries)
    negative_total = sum(entry["negative"] for entry in matched_entries)
    return positive_total, negative_total

In [None]:
def sentiment_label(tokens, lexicon):
    """Return ``"Positif"`` or ``"Negatif"`` for a token list based on its lexicon scores.

    Ties — including the case where no token is found in the lexicon — are
    labelled ``"Positif"``.
    """
    positive_total, negative_total = sentiment_score(tokens, lexicon)
    return "Positif" if positive_total >= negative_total else "Negatif"

In [None]:
# Label every processed title; the lexicon is forwarded as the second
# positional argument of sentiment_label.
pdf["sentiment"] = pdf["title_clean"].apply(sentiment_label, args=(lexicon_dict,))

# Show the raw title, its tokens and the assigned label side by side.
labelled_columns = ["title", "title_clean", "sentiment"]
display(pdf[labelled_columns])

In [None]:
# Number of titles per sentiment label, to show the class balance.
sentiment_counts = pdf["sentiment"].value_counts()
display(sentiment_counts)

In [None]:
# Join each token list back into one space-separated string for the vectoriser.
pdf["text"] = pdf["title_clean"].apply(" ".join)

# Numeric class ids for the classifier.
label_ids = {"Negatif": 0, "Positif": 1}
pdf["label"] = pdf["sentiment"].map(label_ids)

pdf[["text", "label"]].head()

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both parts keep the class ratio;
# the fixed seed makes the split repeatable.
texts = pdf["text"]
labels = pdf["label"]

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

for split_name, split_data in (("Train:", X_train), ("Test :", X_test)):
    print(split_name, len(split_data))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser settings: unigrams and bigrams, terms must occur in at least
# 3 documents, vocabulary capped at 5000 features.
tfidf_settings = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "min_df": 3,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with class weights balanced against label frequency.
# random_state is fixed (same seed as the train/test split) so repeated runs
# of the notebook fit the same model; LinearSVC uses the seed when shuffling
# data in its solver.
svm_model = LinearSVC(
    class_weight="balanced",
    max_iter=5000,
    random_state=42
)

svm_model.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Predict labels for the held-out test set.
y_pred = svm_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# f1_score is called with its defaults here.
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Per-class breakdown; the names are listed in label order (0, 1).
class_names = ["Negatif", "Positif"]
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Combine all processed text into a single string.
# The cloud is built from the processed tokens in `title_clean`, matching the
# plot title. `pdf` is intentionally NOT rebound here: reloading the raw titles
# into `pdf` would discard the processed columns and make the earlier cells
# fail when re-run after this one.
processed_titles = pdf["title_clean"]

all_words = ' '.join(
    # Each entry is normally a list of tokens; anything else is stringified.
    ' '.join(tokens) if isinstance(tokens, list) else str(tokens)
    for tokens in processed_titles
)

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white'
).generate(all_words)

# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Processed Text')
plt.show()