## Crawling


In [1]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

**Fungsi Crawling**

In [2]:
def crawl(url, num_page):
    
    result = []
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    
    #find paging page 
    paging = soup.find_all("nav",{'class':'pagination'})
    paging_link = paging[0].find_all('a',{'class':'pagination__page'})
    last_page = num_page

    #looping through paging
    for i in range(1,last_page+1):
        print(url+'page/'+str(i))

        #find article link
        req = requests.get(url+'page/'+str(i))
        soup = BeautifulSoup(req.text, "lxml")
        news_links = soup.find_all("div",{'class':'thumb-container thumb-65'})
        # print(news_links)

        #looping through article link
    
        for idx,news in enumerate(news_links):
            news_dict = {}

            #find news title
            title_news= news.find('h1',{'class':'entry__title'}).text.replace("\n", "").lower()
            title_news = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', title_news)
            title_news = re.sub(r'[0-9]', '', title_news)
    

            #find urll news
            url_news = news.find('a',{'class':''}).get('href')

            
            #find news content in url
            req_news =  requests.get(url_news)
            soup_news = BeautifulSoup(req_news.text, "lxml")

            #find news content 
            news_content = soup_news.find("div",{'class':'entry__article-wrap'})

            #find paragraph in news content 
            p = news_content.find_all('p')
            content = ' '.join(item .text for item in p)
            news_content = content.encode('utf8','replace')

            #wrap in dictionary 
            news_dict['no']=idx+1
            news_dict['url'] = url_news
            news_dict['title'] = title_news
            news_dict['content'] = news_content
            result.append(news_dict)
         
    return result

In [3]:
url = 'https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/'
crwl  = crawl(url, 4)

https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/page/1
https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/page/2
https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/page/3
https://visitjawatengah.jatengprov.go.id/id/regency/kabupaten-semarang/destinasi-wisata/page/4


In [4]:
crwl = pd.DataFrame(crwl)
crwl.head()

Unnamed: 0,no,url,title,content
0,1,https://visitjawatengah.jatengprov.go.id/id/re...,puri kayana camping ground,b'#SobatWisata\xc2\xa0kangen ngecamp nggak nih...
1,2,https://visitjawatengah.jatengprov.go.id/id/re...,mengunjungi wisata religi nyatnyono,b'Desa Nyatnyono di lereng Gunung Ungaran dike...
2,3,https://visitjawatengah.jatengprov.go.id/id/re...,candi dukuh petilasan prabu brawijaya,"b'Candi Dukuh terletak di Desa Rowoboni, Kecam..."
3,4,https://visitjawatengah.jatengprov.go.id/id/re...,hal spesial di desa wisata lerep,b'Jalan-jalan ke Semarang? Mampirlah ke Desa W...
4,5,https://visitjawatengah.jatengprov.go.id/id/re...,all in one in sunrise hill gedong songo,b'#SobatWisata\xc2\xa0lagi cari referensi rest...


In [5]:
crwl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   no       38 non-null     int64 
 1   url      38 non-null     object
 2   title    38 non-null     object
 3   content  38 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.3+ KB


**Cleaning Dokumen hasil crwaling**

In [6]:
def documents_cleaning(doc):
  docs_clean = []

  for d in doc:
      html = d.decode("utf-8") 
      document_test = re.sub(r'[^\x00-\x7F]+', ' ', html)
      document_test = re.sub(r'@\w+', '', document_test)
      document_test = document_test.lower()
      document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
      document_test = re.sub(r'[0-9]', '', document_test)
      document_test = re.sub(r'\s{2,}', ' ', document_test)
      document_test = document_test.replace("www", " ")
      document_test = document_test.replace(" com ", " ")
      document_test = document_test.replace("kompas", " ")
      document_test = document_test.replace("visitjateng", " ")
      document_test = document_test.replace("visitjawatengah", " ")
      document_test = document_test.replace("kabsemarangtourism", " ")
      document_test = document_test.replace("html", " ")
      document_test = document_test.replace("http", " ")
      document_test = document_test.replace("https", " ")
      docs_clean.append(document_test)

  return docs_clean

In [7]:
crwl['clean_content'] = documents_cleaning(crwl['content'])

In [8]:
crwl.head()

Unnamed: 0,no,url,title,content,clean_content
0,1,https://visitjawatengah.jatengprov.go.id/id/re...,puri kayana camping ground,b'#SobatWisata\xc2\xa0kangen ngecamp nggak nih...,sobatwisata kangen ngecamp nggak nih cuss ber...
1,2,https://visitjawatengah.jatengprov.go.id/id/re...,mengunjungi wisata religi nyatnyono,b'Desa Nyatnyono di lereng Gunung Ungaran dike...,desa nyatnyono di lereng gunung ungaran dikena...
2,3,https://visitjawatengah.jatengprov.go.id/id/re...,candi dukuh petilasan prabu brawijaya,"b'Candi Dukuh terletak di Desa Rowoboni, Kecam...",candi dukuh terletak di desa rowoboni kecamata...
3,4,https://visitjawatengah.jatengprov.go.id/id/re...,hal spesial di desa wisata lerep,b'Jalan-jalan ke Semarang? Mampirlah ke Desa W...,jalan jalan ke semarang mampirlah ke desa wisa...
4,5,https://visitjawatengah.jatengprov.go.id/id/re...,all in one in sunrise hill gedong songo,b'#SobatWisata\xc2\xa0lagi cari referensi rest...,sobatwisata lagi cari referensi resto pengina...


In [9]:
datalink = crwl[['no','title','content','clean_content','url']]
datalink.to_csv('crawl_wisata.csv', index=False)
datalink.head()

Unnamed: 0,no,title,content,clean_content,url
0,1,puri kayana camping ground,b'#SobatWisata\xc2\xa0kangen ngecamp nggak nih...,sobatwisata kangen ngecamp nggak nih cuss ber...,https://visitjawatengah.jatengprov.go.id/id/re...
1,2,mengunjungi wisata religi nyatnyono,b'Desa Nyatnyono di lereng Gunung Ungaran dike...,desa nyatnyono di lereng gunung ungaran dikena...,https://visitjawatengah.jatengprov.go.id/id/re...
2,3,candi dukuh petilasan prabu brawijaya,"b'Candi Dukuh terletak di Desa Rowoboni, Kecam...",candi dukuh terletak di desa rowoboni kecamata...,https://visitjawatengah.jatengprov.go.id/id/re...
3,4,hal spesial di desa wisata lerep,b'Jalan-jalan ke Semarang? Mampirlah ke Desa W...,jalan jalan ke semarang mampirlah ke desa wisa...,https://visitjawatengah.jatengprov.go.id/id/re...
4,5,all in one in sunrise hill gedong songo,b'#SobatWisata\xc2\xa0lagi cari referensi rest...,sobatwisata lagi cari referensi resto pengina...,https://visitjawatengah.jatengprov.go.id/id/re...


# Index

**Membaca crawl_wisata.csv**

In [10]:
df = pd.read_csv('crawl_wisata.csv')
df.head()

Unnamed: 0,no,title,content,clean_content,url
0,1,puri kayana camping ground,b'#SobatWisata\xc2\xa0kangen ngecamp nggak nih...,sobatwisata kangen ngecamp nggak nih cuss ber...,https://visitjawatengah.jatengprov.go.id/id/re...
1,2,mengunjungi wisata religi nyatnyono,b'Desa Nyatnyono di lereng Gunung Ungaran dike...,desa nyatnyono di lereng gunung ungaran dikena...,https://visitjawatengah.jatengprov.go.id/id/re...
2,3,candi dukuh petilasan prabu brawijaya,"b'Candi Dukuh terletak di Desa Rowoboni, Kecam...",candi dukuh terletak di desa rowoboni kecamata...,https://visitjawatengah.jatengprov.go.id/id/re...
3,4,hal spesial di desa wisata lerep,b'Jalan-jalan ke Semarang? Mampirlah ke Desa W...,jalan jalan ke semarang mampirlah ke desa wisa...,https://visitjawatengah.jatengprov.go.id/id/re...
4,5,all in one in sunrise hill gedong songo,b'#SobatWisata\xc2\xa0lagi cari referensi rest...,sobatwisata lagi cari referensi resto pengina...,https://visitjawatengah.jatengprov.go.id/id/re...


**Menggabungkan title dan clean_content dalam 1 kolom**

In [11]:
df['combined']=df['title']+' '+df['clean_content']

**Menampilkan kata-kata yang ada, lokasi kata, dan jumlah kata pada tiap dokumen, menggunakan *CountVectoriser***

In [12]:
# Activate CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorizer
vect = CountVectorizer()  
vects = vect.fit_transform(df['combined'])

td = pd.DataFrame(vects.toarray())
td.columns = vect.get_feature_names_out()
term_document_matrix = td.T
term_document_matrix.columns =  ['Doc'+str(i+1) for i in range(td.shape[0])]
term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

print(term_document_matrix.head())


term_document_matrix.to_csv('term_document.csv')

           Doc1  Doc2  Doc3  Doc4  Doc5  Doc6  Doc7  Doc8  Doc9  Doc10  ...  \
abad          0     0     1     0     0     0     0     0     0      0  ...   
acara         0     0     0     0     0     0     0     0     0      0  ...   
ada           0     0     0     0     1     0     1     0     0      0  ...   
adalah        0     0     0     0     0     0     0     0     0      0  ...   
adrenalin     0     0     0     0     0     0     0     0     0      1  ...   

           Doc30  Doc31  Doc32  Doc33  Doc34  Doc35  Doc36  Doc37  Doc38  \
abad           1      0      0      0      0      0      0      0      0   
acara          0      2      0      0      0      0      0      0      0   
ada            0      0      0      0      0      0      2      1      0   
adalah         0      1      0      0      1      0      1      0      1   
adrenalin      0      0      0      0      0      0      0      0      0   

           total_count  
abad                 2  
acara             

**Tokenization** memecah kata pada hasil kombinasi title dan clean_content yang ada menggunakan

In [13]:
df_parse = df.copy()

cols = ["combined"]
for col in cols:
  df_parse[col] = df_parse[col].str.split()

df_parse.head()

Unnamed: 0,no,title,content,clean_content,url,combined
0,1,puri kayana camping ground,b'#SobatWisata\xc2\xa0kangen ngecamp nggak nih...,sobatwisata kangen ngecamp nggak nih cuss ber...,https://visitjawatengah.jatengprov.go.id/id/re...,"[puri, kayana, camping, ground, sobatwisata, k..."
1,2,mengunjungi wisata religi nyatnyono,b'Desa Nyatnyono di lereng Gunung Ungaran dike...,desa nyatnyono di lereng gunung ungaran dikena...,https://visitjawatengah.jatengprov.go.id/id/re...,"[mengunjungi, wisata, religi, nyatnyono, desa,..."
2,3,candi dukuh petilasan prabu brawijaya,"b'Candi Dukuh terletak di Desa Rowoboni, Kecam...",candi dukuh terletak di desa rowoboni kecamata...,https://visitjawatengah.jatengprov.go.id/id/re...,"[candi, dukuh, petilasan, prabu, brawijaya, ca..."
3,4,hal spesial di desa wisata lerep,b'Jalan-jalan ke Semarang? Mampirlah ke Desa W...,jalan jalan ke semarang mampirlah ke desa wisa...,https://visitjawatengah.jatengprov.go.id/id/re...,"[hal, spesial, di, desa, wisata, lerep, jalan,..."
4,5,all in one in sunrise hill gedong songo,b'#SobatWisata\xc2\xa0lagi cari referensi rest...,sobatwisata lagi cari referensi resto pengina...,https://visitjawatengah.jatengprov.go.id/id/re...,"[all, in, one, in, sunrise, hill, gedong, song..."


In [14]:
# Memasukkan tokenisasi ke dalam df_combined
df_combined = pd.DataFrame(columns = ["document"])
df_combined["document"] = df_parse['combined']

df_combined.head()

Unnamed: 0,document
0,"[puri, kayana, camping, ground, sobatwisata, k..."
1,"[mengunjungi, wisata, religi, nyatnyono, desa,..."
2,"[candi, dukuh, petilasan, prabu, brawijaya, ca..."
3,"[hal, spesial, di, desa, wisata, lerep, jalan,..."
4,"[all, in, one, in, sunrise, hill, gedong, song..."


**Menambahkan nilai ID dokumen dari lokasi kata ditemukan berdasarkan letak kata pada dokumen ke-n, n=index**

In [15]:
df_term = pd.DataFrame(columns = ["Terms","DocID"])

size = df_combined.shape[0]
for index in range(size):
    row = df_combined['document'].iloc[index]

    for item in row:
        df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)

  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = df_term.append({"Terms":item,"DocID":index}, ignore_index=True)
  df_term = 

In [16]:
df_term.head(10)

Unnamed: 0,Terms,DocID
0,puri,0
1,kayana,0
2,camping,0
3,ground,0
4,sobatwisata,0
5,kangen,0
6,ngecamp,0
7,nggak,0
8,nih,0
9,cuss,0


In [17]:
df_term.shape

(2930, 2)

**Mengurutkan Term berdasar alfabetnya**

In [18]:
df_term_sorted = df_term.sort_values(by="Terms")
df_term_sorted.head(20)

Unnamed: 0,Terms,DocID
158,abad,2
2241,abad,29
2376,acara,30
2363,acara,30
628,ada,10
688,ada,11
383,ada,6
2716,ada,35
872,ada,14
299,ada,4


In [19]:
df_term_sorted.shape

(2930, 2)

# inverted index

**Inisialisasi inverted index**

In [20]:
df_invertedindex = pd.DataFrame(columns = ["Terms","docFreq","postList"])
df_invertedindex

Unnamed: 0,Terms,docFreq,postList


In [21]:
# Memasukkan daftar term
size = df_term_sorted.shape[0]
df_invertedindex['Terms'] = df_term_sorted['Terms'].unique()

df_invertedindex.head()

Unnamed: 0,Terms,docFreq,postList
0,abad,,
1,acara,,
2,ada,,
3,adalah,,
4,adrenalin,,


In [22]:
# inisialisasi nilai
df_invertedindex["docFreq"]=0
df_invertedindex.head()

Unnamed: 0,Terms,docFreq,postList
0,abad,0,
1,acara,0,
2,ada,0,
3,adalah,0,
4,adrenalin,0,


In [23]:
size = df_term_sorted.shape[0]
print(size)
invertedIndexCounter = 0
temp_list = []
for i in range(size):
    if (df_term_sorted['Terms'].iloc[i] == df_invertedindex['Terms'].iloc[invertedIndexCounter]):
        df_invertedindex["docFreq"].iloc[invertedIndexCounter]+=1
        temp_list.append(df_term_sorted["DocID"].iloc[i])
        df_invertedindex["postList"].iloc[invertedIndexCounter] = temp_list
    else :
        temp_list = []
        invertedIndexCounter += 1
        df_invertedindex["docFreq"].iloc[invertedIndexCounter] += 1
        temp_list.append(df_term_sorted["DocID"].iloc[i])
        df_invertedindex["postList"].iloc[invertedIndexCounter] = temp_list


2930


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invertedindex["docFreq"].iloc[invertedIndexCounter]+=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invertedindex["postList"].iloc[invertedIndexCounter] = temp_list
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_invertedindex["docFreq"].iloc[invertedIndexCounter] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

In [24]:
print(df_invertedindex.head())

       Terms  docFreq                            postList
0       abad        2                             [2, 29]
1      acara        2                            [30, 30]
2        ada        9  [10, 11, 6, 35, 14, 4, 28, 35, 36]
3     adalah        7        [37, 33, 15, 18, 27, 35, 30]
4  adrenalin        2                             [9, 17]


In [25]:
# Export to CSV
print(df_invertedindex)
df_invertedindex.to_csv("invertedIndex.csv",index=False)

           Terms  docFreq                            postList
0           abad        2                             [2, 29]
1          acara        2                            [30, 30]
2            ada        9  [10, 11, 6, 35, 14, 4, 28, 35, 36]
3         adalah        7        [37, 33, 15, 18, 27, 35, 30]
4      adrenalin        2                             [9, 17]
...          ...      ...                                 ...
1104    youghurt        1                                [36]
1105         yuk        3                           [3, 0, 6]
1106      ziarah        1                                [22]
1107  zipcoaster        1                                [10]
1108        zona        1                                [17]

[1109 rows x 3 columns]


# Kompresi Index

**Membaca Dataset Inverted index**

In [26]:
df = pd.read_csv('invertedIndex.csv')
df.head(None)

Unnamed: 0,Terms,docFreq,postList
0,abad,2,"[2, 29]"
1,acara,2,"[30, 30]"
2,ada,9,"[10, 11, 6, 35, 14, 4, 28, 35, 36]"
3,adalah,7,"[37, 33, 15, 18, 27, 35, 30]"
4,adrenalin,2,"[9, 17]"
...,...,...,...
1104,youghurt,1,[36]
1105,yuk,3,"[3, 0, 6]"
1106,ziarah,1,[22]
1107,zipcoaster,1,[10]


In [27]:
# Membuat dataframe baru
df_dict = pd.DataFrame(columns = ["Freq","Posting_ptr","Terms_ptr"])

df_dict["Freq"] = df["docFreq"]
df_dict["Posting_ptr"] = df["postList"]

print(df_dict)


      Freq                         Posting_ptr Terms_ptr
0        2                             [2, 29]       NaN
1        2                            [30, 30]       NaN
2        9  [10, 11, 6, 35, 14, 4, 28, 35, 36]       NaN
3        7        [37, 33, 15, 18, 27, 35, 30]       NaN
4        2                             [9, 17]       NaN
...    ...                                 ...       ...
1104     1                                [36]       NaN
1105     3                           [3, 0, 6]       NaN
1106     1                                [22]       NaN
1107     1                                [10]       NaN
1108     1                                [17]       NaN

[1109 rows x 3 columns]


In [28]:
# Membuat term string dan term pointer
term_str = ""
term_counter = 0
for i in range(df_dict.shape[0]):
    term = df["Terms"].iloc[i]
    term_str += term
    df_dict["Terms_ptr"].iloc[i] = term_counter
    term_counter += len(term)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dict["Terms_ptr"].iloc[i] = term_counter


In [29]:
print(term_str)

abadacaraadaadalahadrenalinadvantureagamaagendakanagroairairnyaajaajakakanakhirakomodasiaksesaktivitasalaalamalamialasallalternatifamanambarawaanakandandaanekaanginangklungantarantaraantaranyaantikapaapiareaarenarenaargoarsitekstunyaarsitekturartinyaasalnyaasliaspalassumptaasyikatapatasatauatraksiatvawalayoayooayunanbbagbagaibagaikanbagaimanabagibagianbahanbahasabahkanbaikbakalbaladewabalemongbambubanbanaranbandunganbangetbangsabangunanbantalanbanyakbanyubanyubirubanyukuningbarangbaratbarengbarubasahbasahanbaturbawahbawenbebatuanbeberapabedabegitubelandabeliaubenangbenihbentangbentukberadaberagamberartiberasalberbagaiberbalutberbasisberbauberbedabercocokberdakwahberdasarkanberdekatanberdirinyaberdoaberenangberfotoberjarakberjauhanberjuangberkelokberkemahberkonsepberkunjungberlamaberlikuberlokasibermainbermotorbernamabernuansaberolahragaberpusatbersamabersandarbersantaiberselerabersumberbertingkatberupaberziarahbesarbesarnyabesibetahbeternakbiasabiayabibitbidadaribikinbilliardbintorobir

In [30]:
print(df_dict)

      Freq                         Posting_ptr Terms_ptr
0        2                             [2, 29]         0
1        2                            [30, 30]         4
2        9  [10, 11, 6, 35, 14, 4, 28, 35, 36]         9
3        7        [37, 33, 15, 18, 27, 35, 30]        12
4        2                             [9, 17]        18
...    ...                                 ...       ...
1104     1                                [36]      7154
1105     3                           [3, 0, 6]      7162
1106     1                                [22]      7165
1107     1                                [10]      7171
1108     1                                [17]      7181

[1109 rows x 3 columns]


In [32]:
# Export to csv
df_dict.to_csv('compressed_index.csv',index=False)
term_file = open("term_compress.txt", "w")
term_file.write(term_str)
term_file.close()

