## Crawling


In [1]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def crawl(url, num_page):
    
    result = []
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    
    #find paging page 
    paging = soup.find_all("nav",{'class':'pagination'})
    paging_link = paging[0].find_all('a',{'class':'pagination__page'})
    last_page = num_page

    #looping through paging
    for i in range(1,last_page+1):
        print(url+'page/'+str(i))

        #find article link
        req = requests.get(url+'page/'+str(i))
        soup = BeautifulSoup(req.text, "lxml")
        news_links = soup.find_all("div",{'class':'thumb-container thumb-65'})
        # print(news_links)

        #looping through article link
    
        for idx,news in enumerate(news_links):
            news_dict = {}

            #find news title
            title_news= news.find('h1',{'class':'entry__title'}).text.replace("\n", "")
    

            #find urll news
            url_news = news.find('a',{'class':''}).get('href')

            
            #find news content in url
            req_news =  requests.get(url_news)
            soup_news = BeautifulSoup(req_news.text, "lxml")

            #find news content 
            news_content = soup_news.find("div",{'class':'entry__article-wrap'})

            #find paragraph in news content 
            p = news_content.find_all('p')
            content = ' '.join(item .text for item in p)
            news_content = content.encode('utf8','replace')

            #wrap in dictionary 
            news_dict['id']=idx+1
            news_dict['url'] = url_news
            news_dict['title'] = title_news
            news_dict['content'] = news_content
            result.append(news_dict)
         
    return result

In [3]:
url = 'https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/'
crwl  = crawl(url, 4)

https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/page/1
https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/page/2
https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/page/3


In [None]:
crwl = pd.DataFrame(crwl)
crwl.head()

Unnamed: 0,id,url,title,content
0,1,https://visitjawatengah.jatengprov.go.id/id/re...,PURI KAYANA CAMPING GROUND,b'#SobatWisata\xc2\xa0kangen ngecamp nggak nih...
1,2,https://visitjawatengah.jatengprov.go.id/id/re...,MENGUNJUNGI WISATA RELIGI NYATNYONO,b'Desa Nyatnyono di lereng Gunung Ungaran dike...
2,3,https://visitjawatengah.jatengprov.go.id/id/re...,"CANDI DUKUH, PETILASAN PRABU BRAWIJAYA","b'Candi Dukuh terletak di Desa Rowoboni, Kecam..."
3,4,https://visitjawatengah.jatengprov.go.id/id/re...,5 HAL SPESIAL DI DESA WISATA LEREP,b'Jalan-jalan ke Semarang? Mampirlah ke Desa W...
4,5,https://visitjawatengah.jatengprov.go.id/id/re...,ALL IN ONE IN SUNRISE HILL GEDONG SONGO,b'#SobatWisata\xc2\xa0lagi cari referensi rest...


In [None]:
crwl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       38 non-null     int64 
 1   url      38 non-null     object
 2   title    38 non-null     object
 3   content  38 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.3+ KB


In [None]:
def documents_cleaning(doc):
  docs_clean = []

  for d in doc:
      html = d.decode("utf-8") 
      document_test = re.sub(r'[^\x00-\x7F]+', ' ', html)
      document_test = re.sub(r'@\w+', '', document_test)
      document_test = document_test.lower()
      document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
      document_test = re.sub(r'[0-9]', '', document_test)
      document_test = re.sub(r'\s{2,}', ' ', document_test)
      document_test = document_test.replace("www", " ")
      document_test = document_test.replace(" com ", " ")
      document_test = document_test.replace("kompas", " ")
      document_test = document_test.replace("visitjateng", " ")
      document_test = document_test.replace("visitjawatengah", " ")
      document_test = document_test.replace("kabsemarangtourism", " ")
      document_test = document_test.replace("html", " ")
      document_test = document_test.replace("http", " ")
      document_test = document_test.replace("https", " ")
      docs_clean.append(document_test)

  return docs_clean

In [None]:
crwl['clean_content'] = documents_cleaning(crwl['content'])

In [None]:
crwl.to_csv('crawl_wisata.csv')
crwl.head()


Unnamed: 0,id,url,title,content,clean_content
0,1,https://visitjawatengah.jatengprov.go.id/id/re...,PURI KAYANA CAMPING GROUND,b'#SobatWisata\xc2\xa0kangen ngecamp nggak nih...,sobatwisata kangen ngecamp nggak nih cuss ber...
1,2,https://visitjawatengah.jatengprov.go.id/id/re...,MENGUNJUNGI WISATA RELIGI NYATNYONO,b'Desa Nyatnyono di lereng Gunung Ungaran dike...,desa nyatnyono di lereng gunung ungaran dikena...
2,3,https://visitjawatengah.jatengprov.go.id/id/re...,"CANDI DUKUH, PETILASAN PRABU BRAWIJAYA","b'Candi Dukuh terletak di Desa Rowoboni, Kecam...",candi dukuh terletak di desa rowoboni kecamata...
3,4,https://visitjawatengah.jatengprov.go.id/id/re...,5 HAL SPESIAL DI DESA WISATA LEREP,b'Jalan-jalan ke Semarang? Mampirlah ke Desa W...,jalan jalan ke semarang mampirlah ke desa wisa...
4,5,https://visitjawatengah.jatengprov.go.id/id/re...,ALL IN ONE IN SUNRISE HILL GEDONG SONGO,b'#SobatWisata\xc2\xa0lagi cari referensi rest...,sobatwisata lagi cari referensi resto pengina...


In [None]:
datalink = crwl[['clean_content','url']]

In [None]:
# Activate CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorizer
vect = CountVectorizer()  
vects = vect.fit_transform(datalink.clean_content)

td = pd.DataFrame(vects.toarray())
td.columns = vect.get_feature_names_out()
term_document_matrix = td.T
term_document_matrix.columns =  ['Doc'+str(i+1) for i in range(td.shape[0])]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

print(term_document_matrix.head())
print(term_document_matrix.shape)



term_document_matrix.to_csv('term_document.csv')

           Doc1  Doc2  Doc3  Doc4  Doc5  Doc6  Doc7  Doc8  Doc9  Doc10  ...  \
abad          0     0     1     0     0     0     0     0     0      0  ...   
acara         0     0     0     0     0     0     0     0     0      0  ...   
ada           0     0     0     0     1     0     1     0     0      0  ...   
adalah        0     0     0     0     0     0     0     0     0      0  ...   
adrenalin     0     0     0     0     0     0     0     0     0      1  ...   

           Doc30  Doc31  Doc32  Doc33  Doc34  Doc35  Doc36  Doc37  Doc38  \
abad           1      0      0      0      0      0      0      0      0   
acara          0      2      0      0      0      0      0      0      0   
ada            0      0      0      0      0      0      2      1      0   
adalah         0      1      0      0      1      0      1      0      1   
adrenalin      0      0      0      0      0      0      0      0      0   

           total_count  
abad                 2  
acara             

In [None]:
df = pd.read_csv('crawl_wisata.csv')
df.head()
print(df['title'],df['clean_content'])

0                     PURI KAYANA CAMPING GROUND 
1            MENGUNJUNGI WISATA RELIGI NYATNYONO 
2         CANDI DUKUH, PETILASAN PRABU BRAWIJAYA 
3             5 HAL SPESIAL DI DESA WISATA LEREP 
4        ALL IN ONE IN SUNRISE HILL GEDONG SONGO 
5                               Danuwo Waterpark 
6                                Watu Gajah Park 
7                            Khatulistiwa Garden 
8                         Gua Maria Rosa Mystica 
9                                  Dusun Semilir 
10                     SKIBAN DI KOPENG TREE TOP 
11                      GEDONGSONGO SUNRISE HILL 
12                      GUA MARIA KEREP AMBARAWA 
13    SALOKA, THEME PARK TERBESAR DI JAWA TENGAH 
14                      Jembatan Kaca Gumuk Reco 
15                           Taman Bunga Celosia 
16                             DESA WISATA LEREP 
17                            TIRTO ARGO SIWARAK 
18                         KAMPOENG KOPI BANARAN 
19                                    CURUG LAWE 


In [None]:
df_parse = df.copy()

cols = ["url","title","content","clean_content"]
for col in cols:
    df_parse[col] = df_parse[col].str.split()

print(df_parse[col])
df_parse[col].to_csv("df_parse.csv",index=False)

0     [sobatwisata, kangen, ngecamp, nggak, nih, cus...
1     [desa, nyatnyono, di, lereng, gunung, ungaran,...
2     [candi, dukuh, terletak, di, desa, rowoboni, k...
3     [jalan, jalan, ke, semarang, mampirlah, ke, de...
4     [sobatwisata, lagi, cari, referensi, resto, pe...
5     [woro, woro, ayo, ke, danuwo, segar, dan, biki...
6     [warnai, liburanmu, di, watu, gajah, park, wui...
7     [petik, stroberi, on, the, spot, cobain, deh, ...
8     [keheningan, di, gua, maria, rosa, mystica, ma...
9     [akhir, pekan, seru, di, dusun, semilir, cari,...
10    [skiban, di, kopeng, tree, top, seru, dan, men...
11    [gedongsongo, sunrise, hill, banyak, spot, ins...
12    [pesona, gua, maria, kerep, ambarawa, gua, mar...
13    [curug, kaliurip, ikon, wisata, baru, di, purw...
14    [masih, ingat, sensasi, menegangkan, ayunan, l...
15    [liburan, ke, eropa, tak, perlu, jauh, jauh, d...
16    [desa, wisata, lerep, memiliki, atraksi, wisat...
17    [tirto, argo, siwarak, di, dusun, blanten,

In [None]:
#Term with ID
df_term = pd.DataFrame(columns = ["Terms","ID"])

size = df_parse.shape[0]
for y in range(size):
    row = df_parse[col].iloc[y]

    for item in row:
        # df_term = pd.concat([df_term,{"Terms":item,"ID":y}], ignore_index=True)
        df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
        # https://www.balioglu.net/solved-how-to-convert-frame-append-to-pandas-concat/

        # pd.concat([pd.DataFrame([y], columns=["Terms","ID"]) for i in range(size)],ignore_index=True)

print(df_term.head(None))

  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms

            Terms  ID
0     sobatwisata   0
1          kangen   0
2         ngecamp   0
3           nggak   0
4             nih   0
...           ...  ..
2796         gula  37
2797         aren  37
2798         foto  37
2799            s  37
2800            p  37

[2801 rows x 2 columns]


  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"ID":y}, ignore_index=True)


In [None]:
# Sort Term berdasar alfabet
df_term_sorted = df_term.sort_values(by="Terms")

# print(df_term_sorted.tail(10))
print(df_term_sorted.shape[0])

2801


In [None]:
# inisialisasi inverted index
df_invertedindex = pd.DataFrame(columns = ["Terms","docFreq","postList"])
# copy nila term unik
size = df_term_sorted.shape[0]
df_invertedindex['Terms'] = df_term_sorted['Terms'].unique()

print(df_invertedindex.head())

       Terms docFreq postList
0       abad     NaN      NaN
1      acara     NaN      NaN
2        ada     NaN      NaN
3     adalah     NaN      NaN
4  adrenalin     NaN      NaN


In [None]:
# inisialisasi nilai
df_invertedindex["docFreq"]=0
print(df_invertedindex.tail(50))

               Terms  docFreq postList
1048          tropis        0      NaN
1049           tubuh        0      NaN
1050          tumpah        0      NaN
1051         tuntang        0      NaN
1052            udah        0      NaN
1053           udara        0      NaN
1054        udaranya        0      NaN
1055           ujung        0      NaN
1056            umat        0      NaN
1057           umbul        0      NaN
1058  umbulsidomukti        0      NaN
1059         ungaran        0      NaN
1060            unik        0      NaN
1061           untuk        0      NaN
1062              us        0      NaN
1063           utama        0      NaN
1064           utara        0      NaN
1065          valley        0      NaN
1066            view        0      NaN
1067            vila        0      NaN
1068           villa        0      NaN
1069      villajoglo        0      NaN
1070           wafat        0      NaN
1071          wahana        0      NaN
1072           waktu     

In [None]:
# inverted index
size = df_term_sorted.shape[0]
invertedIndexCounter = 0
temp_list = []
for i in range(size):
    if (df_term_sorted['Terms'].iloc[i] == df_invertedindex['Terms'].iloc[invertedIndexCounter]):
        df_invertedindex["docFreq"].iloc[invertedIndexCounter]+=1
        temp_list.append(df_term_sorted["ID"].iloc[i])
        df_invertedindex["postList"].iloc[invertedIndexCounter] = temp_list
    else :
        temp_list = []
        invertedIndexCounter += 1
        df_invertedindex["docFreq"].iloc[invertedIndexCounter] += 1
        temp_list.append(df_term_sorted["ID"].iloc[i])
        df_invertedindex["postList"].iloc[invertedIndexCounter] = temp_list

print(df_invertedindex.iloc[1:200])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invertedindex["docFreq"].iloc[invertedIndexCounter]+=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invertedindex["postList"].iloc[invertedIndexCounter] = temp_list
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invertedindex["docFreq"].iloc[invertedIndexCounter] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

          Terms  docFreq                            postList
1         acara        2                            [30, 30]
2           ada        9  [14, 35, 28, 11, 36, 4, 10, 35, 6]
3        adalah        7        [30, 37, 35, 27, 33, 15, 18]
4     adrenalin        2                             [17, 9]
5     advanture        1                                [34]
..          ...      ...                                 ...
195      coffee        1                                [18]
196     cokelat        5                [36, 36, 36, 36, 36]
197         com        1                                [35]
198     contact        1                                [24]
199  convention        2                            [28, 18]

[199 rows x 3 columns]


In [None]:
# Export to CSV
print(df_invertedindex)
df_invertedindex.to_csv("invertedIndex.csv",index=False)

           Terms  docFreq                            postList
0           abad        2                             [2, 29]
1          acara        2                            [30, 30]
2            ada        9  [14, 35, 28, 11, 36, 4, 10, 35, 6]
3         adalah        7        [30, 37, 35, 27, 33, 15, 18]
4      adrenalin        2                             [17, 9]
...          ...      ...                                 ...
1093    youghurt        1                                [36]
1094         yuk        3                           [0, 3, 6]
1095      ziarah        1                                [22]
1096  zipcoaster        1                                [10]
1097        zona        1                                [17]

[1098 rows x 3 columns]


In [None]:
print(df_invertedindex[i])

KeyError: 2800