## Scrape Detik

In [4]:
from urllib.parse import quote, urlencode, urlsplit

import pandas as pd
from bs4 import BeautifulSoup
from requests import get

class DetikNewsApi:
    """Small scraper around the detik.com search page and article pages.

    The class builds search URLs, downloads the result page, and turns each
    ``<article>`` element into a dict with the keys ``judul``, ``link``,
    ``gambar``, ``body`` and ``waktu``.
    """

    def __init__(self, timeout: float = 10):
        """Store the search endpoint and the per-request timeout.

        Args:
            timeout: Seconds handed to every ``requests.get`` call so that a
                stalled connection raises instead of blocking forever.
        """
        self.search_url = 'https://www.detik.com/search/searchall?'
        self.timeout = timeout

    def build_search_url(self, query: str, page_number: int) -> str:
        """Return the search URL for ``query`` at the given result page.

        The query is percent-encoded, so spaces, ``&``, ``#`` and non-ASCII
        characters cannot break or truncate the query string.

        Args:
            query: Free-text search term.
            page_number: Result page to request.

        Returns:
            The full URL as a string.
        """
        # Fixed parameters are kept exactly as the site expects them;
        # dict order is preserved so simple queries yield the same URL
        # as plain string concatenation would.
        params = {
            'query': query,
            'sortby': 'time',
            'sorttime': 0,
            'siteid': 29,
            'page': page_number,
        }
        # quote_via=quote encodes spaces as %20 rather than '+'.
        return self.search_url + urlencode(params, quote_via=quote)

    def build_detail_url(self, url: str) -> str:
        """Return ``url`` with its query/fragment replaced by ``single=1``.

        The ``single=1`` flag is intended to turn off pagination on the
        article page so the whole body is served in one response.
        """
        parts = urlsplit(url)
        return f'{parts.scheme}://{parts.netloc}{parts.path}?single=1'

    def result_count(self, search_response) -> int:
        """Extract the number of hits from a search response page.

        Args:
            search_response: Object exposing the page HTML as ``.text``.

        Returns:
            The first integer token found in the ``span.fl.text`` element,
            or ``0`` when the element is missing or holds no integer.
        """
        soup = BeautifulSoup(search_response.text, 'html.parser')
        tag = soup.find('span', 'fl text')
        if tag is None:
            return 0
        # isdecimal() only accepts characters that int() can parse.
        numbers = [int(tok) for tok in tag.text.split() if tok.isdecimal()]
        return numbers[0] if numbers else 0

    def detail(self, url: str) -> str:
        """Download an article and return its body text.

        Args:
            url: Article URL as found in the search results.

        Returns:
            The concatenated text of all ``<p>`` tags inside
            ``div.detail__body-text``; the container's own text when it has
            no paragraphs; or ``''`` when the container is absent.
        """
        detail_url = self.build_detail_url(url)
        response = get(detail_url, timeout=self.timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        container = soup.find('div', class_="detail__body-text")
        if container is None:
            # Page layout differs (or the request failed): no body to extract.
            return ''
        paragraphs = container.find_all('p')
        if paragraphs:
            return ''.join(p.text for p in paragraphs)
        return container.text

    def parse(self, search_response, detail):
        """Convert a search response page into a list of article dicts.

        Args:
            search_response: Object exposing the page HTML as ``.text``.
            detail: When truthy, each article page is fetched as well and its
                text stored under ``body``; otherwise ``body`` is ``''``.

        Returns:
            A list of dicts with keys ``judul``, ``link``, ``gambar``,
            ``body`` and ``waktu``. Missing text elements yield ``''`` and
            missing ``href``/``src`` attributes yield ``None`` instead of
            raising.
        """
        soup = BeautifulSoup(search_response.text, 'html.parser')
        data = []

        for article in soup.find_all('article'):
            title_tag = article.find('h2')
            link_tag = article.find('a')
            image_tag = article.find('img')
            date_tag = article.find('span', class_="date")

            link = link_tag.get('href') if link_tag is not None else None
            # Only follow the link when there is one to follow.
            body = self.detail(link) if detail and link else ''

            data.append({
                'judul': title_tag.text if title_tag is not None else '',
                'link': link,
                'gambar': image_tag.get('src') if image_tag is not None else None,
                'body': body,
                'waktu': date_tag.text if date_tag is not None else '',
            })
        return data

    def search(self, query, page_number=1, detail=False):
        """Run a search and return the parsed articles of one result page.

        Args:
            query: Free-text search term.
            page_number: Result page to fetch (default ``1``).
            detail: Forwarded to :meth:`parse`; when truthy each article body
                is downloaded too, costing one extra HTTP request per result.

        Returns:
            The list of article dicts produced by :meth:`parse`.
        """
        url = self.build_search_url(query, page_number)
        search_response = get(url, timeout=self.timeout)
        return self.parse(search_response, detail)


In [8]:
# Create the scraper, then fetch page 2 of the 'gojek' search results.
# detail=True also downloads every article body (one request per result).
detik = DetikNewsApi()
res_detik = detik.search('gojek', page_number=2, detail=True)

In [9]:
# Tabulate the scraped articles: one row per search result, one column per
# dict key (judul, link, gambar, body, waktu). pandas is imported with the
# other dependencies at the top of the notebook.
df = pd.DataFrame(res_detik)
df.head()

Unnamed: 0,judul,link,gambar,body,waktu
0,"Tinggalkan 'Status' Startup, GOTO Buka Bab Baru",https://finance.detik.com/bursa-dan-valas/d-65...,https://akcdn.detik.net.id/community/media/vis...,PT GoTo Gojek Tokopedia Tbk (GOTO) disebut-seb...,"detikFinanceSelasa, 14 Feb 2023 14:06 WIB"
1,"2 Hari ARB Beruntun, Saham GoTo Hari Ini Ngega...",https://finance.detik.com/bursa-dan-valas/d-65...,https://akcdn.detik.net.id/community/media/vis...,Saham PT GoTo Gojek Tokopedia Tbk (GOTO) mengu...,"detikFinanceSenin, 13 Feb 2023 11:24 WIB"
2,"Pasar Modal Terguncang, 32 Saham Anjlok hingga...",https://finance.detik.com/bursa-dan-valas/d-65...,https://akcdn.detik.net.id/community/media/vis...,Indeks Harga Saham Gabungan (IHSG) hari ini te...,"detikFinanceJumat, 10 Feb 2023 19:00 WIB"
3,"Astaga! 32 Saham ARB Hari ini, GoTo-Unilever T...",https://finance.detik.com/bursa-dan-valas/d-65...,https://akcdn.detik.net.id/community/media/vis...,Indeks Harga Saham Gabungan (IHSG) hari ini me...,"detikFinanceJumat, 10 Feb 2023 17:11 WIB"
4,"Emiten Raksasa Terseok-seok, Saham GOTO hingga...",https://finance.detik.com/bursa-dan-valas/d-65...,https://akcdn.detik.net.id/community/media/vis...,Sederet saham-saham tenar hari ini kebakaran. ...,"detikFinanceJumat, 10 Feb 2023 10:56 WIB"


In [10]:
# Optional: uncomment the next line to save the scraped results to detik.csv.
# df.to_csv('detik.csv', index=False)