In [1]:
!pip install newspaper3k lxml_html_clean
!pip install --upgrade lxml
!pip install sastrawi

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downlo

In [2]:
!pip install newspaper3k



In [4]:
import feedparser
import pandas as pd
import nltk, re, html
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from wordcloud import WordCloud

nltk.download("punkt")
nltk.download("punkt_tab")

# ============================
# 1. AMBIL DATA DARI GOOGLE NEWS
# ============================

RSS = "https://news.google.com/rss/search?q=%22kesehatan+mental%22+OR+%22depresi%22+OR+%22gangguan+mental%22&hl=id&gl=ID&ceid=ID:id"

feed = feedparser.parse(RSS)

titles = []
summaries = []
links = []

for entry in feed.entries:
    titles.append(entry.title)
    summaries.append(entry.summary)
    links.append(entry.link)

print("Total artikel ditemukan:", len(titles))

df = pd.DataFrame({
    "title": titles,
    "summary": summaries,
    "link": links
})

df["raw_text"] = df["title"] + " " + df["summary"]

# ============================
# 2. CLEAN HTML & ENTITIES
# ============================

def clean_html(text):
    # hilangkan HTML tags
    text = BeautifulSoup(text, "html.parser").get_text(" ")
    # decode html entities (&nbsp;, &quot;, dll)
    text = html.unescape(text)
    # hapus hex warna/angka (misal 6f6f6f)
    text = re.sub(r"\b[0-9a-fA-F]{6}\b", " ", text)
    # hapus kata font, color, news, google bila muncul dalam HTML
    text = re.sub(r"\b(font|color|google|news|com|href|rss|articles|nbsp)\b", " ", text)
    # bersihkan karakter non-alfanumerik
    text = re.sub(r"[^A-Za-z0-9\u00C0-\u024F ]", " ", text)
    return text

df["clean_html"] = df["raw_text"].apply(clean_html)

# ============================
# 3. PREPROCESS NLP
# ============================

factory = StemmerFactory()
stemmer = factory.create_stemmer()

stopwords = set([
    "yang","dan","di","ke","dari","pada","untuk","dengan","sebagai","itu","ini","atau","oleh",
    "para","adalah","juga","tidak","saat","akan","karena","lebih","agar","bagi","dalam",
    "kita","mereka","akan","sudah","belum","jadi","hingga","sehat","mental","kesehatan"
])

def preprocess(text):
    t = text.lower()
    tokens = nltk.word_tokenize(t)
    tokens = [stemmer.stem(w) for w in tokens if w not in stopwords and len(w) > 2]
    return " ".join(tokens)

df["clean"] = df["clean_html"].apply(preprocess)
df.to_csv("cleaned_articles.csv", index=False)

# ============================
# 4. TF-IDF
# ============================

vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X = vectorizer.fit_transform(df["clean"])

scores = np.asarray(X.mean(axis=0)).ravel()
terms = vectorizer.get_feature_names_out()

top_idx = scores.argsort()[::-1][:20]
top_df = pd.DataFrame({"term": terms[top_idx], "score": scores[top_idx]})

print("\nTOP 20 TF-IDF:")
print(top_df)

# ============================
# 5. WORDCLOUD
# ============================

wc = WordCloud(width=1200, height=600, background_color="white")
wc.generate(" ".join(df["clean"]))
wc.to_file("wordcloud.png")

print("\nWordcloud berhasil dibuat: wordcloud.png")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total artikel ditemukan: 100

TOP 20 TF-IDF:
             term     score
0         depresi  0.036340
1            rata  0.022247
2       indonesia  0.019078
3            jaga  0.017196
4       mahasiswa  0.015978
5            kuat  0.015964
6          remaja  0.015402
7         jakarta  0.015044
8          antara  0.014744
9           warga  0.013822
10           news  0.013564
11         ganggu  0.013325
12           anak  0.012688
13  angka depresi  0.012542
14          angka  0.012542
15       nasional  0.012523
16    antara news  0.012285
17          alami  0.012132
18        tingkat  0.012027
19           guru  0.011837

Wordcloud berhasil dibuat: wordcloud.png


In [None]:
from google.colab import files
files.download("wordcloud.png")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>