In [None]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## PART I: Preparing the documents/webpages

In [None]:
# Section landing pages to download.
urls = ['https://travel.kompas.com/',
        'https://www.kompas.com/homey',
        'https://properti.kompas.com/',
        'https://lifestyle.kompas.com/',
        'https://www.kompas.com/food',
        'https://money.kompas.com/',
        'https://tekno.kompas.com/',
        'https://bola.kompas.com/',
        'https://edukasi.kompas.com/',
        'https://health.kompas.com/']

# Download and parse every page in turn.
# NOTE(review): `soup` is rebound on each iteration, so after the loop it holds
# only the page parsed from the LAST entry of `urls`; the earlier pages are
# fetched and then discarded. Confirm whether that is intended.
for url in urls:
  # A timeout keeps a stalled connection from blocking the notebook forever.
  response = requests.get(url, timeout=10)
  soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Collect the article URLs found inside the <div class="most__wrap"> element of
# the page currently held in `soup`, appending '?page=all' to each one.
most_wrap = soup.find('div', {'class':'most__wrap'})
if most_wrap is None:
    raise ValueError("no <div class='most__wrap'> found in the parsed page")

# Build new strings instead of writing back into the parsed tags: mutating
# `soup` made the cell non-idempotent (a second run appended '?page=all' twice).
# href=True skips anchors that carry no href attribute.
link = [anchor['href'] + '?page=all' for anchor in most_wrap.find_all('a', href=True)]

print(link)

['https://health.kompas.com/read/2021/01/14/100200768/15-kondisi-orang-yang-tak-bisa-disuntik-vaksin-covid-19?page=all', 'https://health.kompas.com/read/2021/01/12/200800868/10-penyakit-yang-ditandai-dengan-kelelahan?page=all', 'https://health.kompas.com/read/2021/01/12/080400068/12-makanan-yang-mengandung-potasium-tinggi?page=all', 'https://health.kompas.com/read/2021/01/13/060600668/minum-kopi-sebelum-olahraga-bagaimana-baiknya-?page=all', 'https://health.kompas.com/read/2021/01/12/060600368/12-gejala-hipotiroid-yang-perlu-diwaspadai?page=all', 'https://health.kompas.com/read/2021/01/13/080700168/kapan-harus-ke-dokter-ketika-sakit-tenggorokan-?page=all', 'https://health.kompas.com/read/2021/01/14/140700468/3-reaksi-yang-mungkin-terjadi-setelah-divaksin-covid-19?page=all', 'https://health.kompas.com/read/2021/01/12/100800968/10-dampak-buruk-konsumsi-karbohidrat-berlebihan?page=all', 'https://health.kompas.com/read/2021/01/12/120200468/7-gejala-anemia-pada-wanita?page=all', 'https://he

In [None]:
# Retrieve Paragraphs
# For every URL in `link`, download the page and join the text of all <p>
# elements inside <div class="read__content"> into one string. `webpages[k]`
# therefore corresponds to `link[k]`.
webpages = []
for article_url in link:
    # A timeout keeps a stalled connection from blocking the notebook forever.
    r = requests.get(article_url, timeout=10)
    # Use a dedicated name so the listing page stored in `soup` is not
    # overwritten (the link-extraction cell still needs it when re-run).
    article_soup = BeautifulSoup(r.content, 'html.parser')
    body = article_soup.find('div', {'class':'read__content'})
    if body is None:
        # Fail with the offending URL rather than an opaque AttributeError;
        # skipping silently would break the webpages/link index alignment.
        raise ValueError("no <div class='read__content'> found in " + article_url)

    webpages.append(' '.join(paragraph.text for paragraph in body.find_all('p')))

## PART II: Preprocessing the documents/webpages

In [None]:
# Stemmer instance shared by the preprocessing loop (used there as `steam.stem`).
# NOTE(review): Porter stemming targets English, while the scraped articles
# appear to be Indonesian — confirm this is the intended stemmer.
steam = PorterStemmer()

In [None]:
# Normalise every scraped document into a space-separated string of stems.
# The stopword pattern does not depend on the document, so compile it once.
# NOTE(review): the English stopword list is applied to text that appears to
# be Indonesian — confirm this is intended.
stopword_pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')

word = []
for d in webpages:
    text = re.sub(r'[^\x00-\x7F]+', ' ', d)   # replace runs of non-ASCII characters with a space
    text = text.lower()                        # case folding
    text = re.sub(r'[0-9]', '', text)          # remove digits
    text = stopword_pattern.sub('', text)      # stopword removal
    tokens = text.split()                      # whitespace tokenizing
    word.append(" ".join(steam.stem(token) for token in tokens))  # stemming

# The indexing and retrieval cells read the preprocessed corpus under the name
# `alldoclist`; bind it here so the notebook runs top-to-bottom on a fresh kernel.
alldoclist = word

## PART III: Indexing

In [None]:
# Fit a TF-IDF vectoriser on the corpus. `X` keeps the fitted vocabulary and is
# reused later to vectorise search queries; `Y` is the resulting sparse matrix.
# NOTE(review): `alldoclist` must already exist when this cell runs —
# presumably the preprocessed documents from Part II; verify on a fresh kernel.
X = TfidfVectorizer()
Y = X.fit_transform(alldoclist)

In [None]:
# Show the sparse TF-IDF matrix: each printed entry is an index pair followed by its weight.
print(Y)

  (0, 514)	0.040587517997996085
  (0, 988)	0.030186133947862858
  (0, 739)	0.040587517997996085
  (0, 1007)	0.040587517997996085
  (0, 28)	0.040587517997996085
  (0, 595)	0.034503098373069
  (0, 346)	0.034503098373069
  (0, 10)	0.021788525338776343
  (0, 680)	0.018017294698008706
  (0, 518)	0.016436252146278453
  (0, 639)	0.040587517997996085
  (0, 1040)	0.030186133947862858
  (0, 1398)	0.040587517997996085
  (0, 1100)	0.040587517997996085
  (0, 74)	0.02410171432293578
  (0, 344)	0.040587517997996085
  (0, 913)	0.040587517997996085
  (0, 7)	0.030186133947862858
  (0, 141)	0.040587517997996085
  (0, 1321)	0.040587517997996085
  (0, 334)	0.040587517997996085
  (0, 679)	0.016436252146278453
  (0, 1239)	0.034503098373069
  (0, 1104)	0.026837636196411673
  (0, 87)	0.034503098373069
  :	:
  (9, 1363)	0.0160920583200353
  (9, 3)	0.029383563883082164
  (9, 61)	0.19099316524003407
  (9, 244)	0.11753425553232866
  (9, 990)	0.019370434717604745
  (9, 1401)	0.029554001947785468
  (9, 355)	0.019370

In [None]:
# Dense term-by-document view of the TF-IDF matrix: `Y` is transposed so each
# row is a vocabulary term and each column is a document.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(X, "get_feature_names_out"):
    feature_names = X.get_feature_names_out()
else:
    feature_names = X.get_feature_names()
df = pd.DataFrame(Y.T.toarray(), index=feature_names)
print(df)

                0         1         2  ...         7         8         9
abnorm   0.000000  0.000000  0.000000  ...  0.000000  0.025068  0.000000
academi  0.000000  0.000000  0.000000  ...  0.012049  0.000000  0.000000
ada      0.021789  0.014517  0.000000  ...  0.007609  0.000000  0.042664
adalah   0.015006  0.129972  0.036656  ...  0.026202  0.027805  0.029384
adanya   0.000000  0.027042  0.000000  ...  0.000000  0.000000  0.000000
...           ...       ...       ...  ...       ...       ...       ...
yang     0.135054  0.249945  0.244375  ...  0.282983  0.120488  0.102842
yoghurt  0.000000  0.000000  0.011016  ...  0.000000  0.000000  0.000000
yogurt   0.000000  0.000000  0.077114  ...  0.000000  0.000000  0.000000
york     0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000
zat      0.000000  0.032116  0.019625  ...  0.016834  0.119089  0.000000

[1515 rows x 10 columns]


## PART IV: Retrieval

In [None]:
def getSimilarity(results, df):
  """Rank the indexed documents against a query and print the matches.

  Parameters
  ----------
  results : str
      Query text. It is vectorised with the module-level fitted vectoriser
      ``X``.
  df : pandas.DataFrame
      Term-by-document weight matrix: one row per vocabulary term and one
      column per document. The column labels are used to index the
      module-level ``alldoclist`` and ``link`` lists.

  Returns
  -------
  None
      The ranking is printed: a "Result :" header followed, for every
      document with a non-zero cosine similarity (highest first), by its
      score, its text from ``alldoclist`` and its URL from ``link``.
  """
  query_vec = X.transform([results]).toarray().reshape(df.shape[0],)
  query_norm = np.linalg.norm(query_vec)

  similarity = {}
  # Iterate over the actual document columns instead of a hard-coded count.
  for doc_id in df.columns:
    doc_vec = df.loc[:, doc_id].values
    # Cosine similarity divides by the PRODUCT of both norms; without the
    # parentheses the expression evaluated as (dot / |doc|) * |query|.
    denominator = np.linalg.norm(doc_vec) * query_norm
    if denominator:
      similarity[doc_id] = float(np.dot(doc_vec, query_vec) / denominator)
    else:
      # A query with no known terms or an empty document has no direction;
      # score it 0 instead of producing nan (nan would pass the filter below).
      similarity[doc_id] = 0.0

  # Sort once, after every score is known.
  similarity_sorted = sorted(similarity.items(), key=lambda x: x[1], reverse=True)

  print()
  print("Result :")

  for k, v in similarity_sorted:
    if v != 0.0:
      print("Nilai Similaritas:", v)
      print(alldoclist[k])
      print(link[k])
      print()

## PART V: Search

In [None]:
# Prompt for a free-text query and print the documents ranked against it.
term = input("search : ")
getSimilarity(term, df)

search : hidup sehat

Result :
Nilai Similaritas: 0.03955004340807893
kompas.com - anemia adalah salah satu masalah kesehatan rentan menyerang wanita. anemia merupakan kondisi saat tubuh kekurangan sel darah merah yang sehat. sel darah merah bertuga membawa oksigen yang cukup ke jaringan tubuh. saat pasokan oksigen berkurang, fungsi tubuh dapat terganggu. baca juga: waspada, suka makan es batu berisiko rusak gigi dan tanda anemia dilansir dari women health, anemia yang banyak menyerang wanita adalah jeni anemia defisiensi zat besi. penyebab anemia defisiensi zat besi beras dari kekurangan zat besi. wanita rentan terkena anemia ini saat haid. zat besi dibutuhkan untuk memproduksi darah baru untuk menggantikan darah yang terbuang setiap period menstruasi. risiko wanita usia subur terkena anemia meningkat saat period haidnya lebih lama atau darah yang keluar cukup banyak. sejumlah wanita juga rentan mengalami anemia saat mengalami fibroid rahim (pertumbuhan sel abnorm di dalam rahim), ata