In [None]:
import re #pola kata
import json #kelola data response dari tokopedia
import requests #minta request ke situs tokopedia
from bs4 import BeautifulSoup #scraping data


# HTTP headers sent with every request to the Tokopedia GraphQL endpoint.
# The values imitate a desktop Chrome 127 browser on Windows making a
# cross-origin JSON call from https://www.tokopedia.com.
HEADERS = {
    'accept': '*/*',
    'content-type': 'application/json',
    'origin': 'https://www.tokopedia.com',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'x-source': 'tokopedia-lite',
    'x-tkpd-lite-service': 'zeus',
}

def get_shop_id(shop_domain, timeout=30):
  """
  Look up a Tokopedia shop ID from the shop's domain slug.

  The slug is the path segment of the shop URL, e.g. ``wardah-official`` for
  https://www.tokopedia.com/wardah-official.

  Args:
    shop_domain: Shop domain slug, sent as the ``domain`` GraphQL variable.
    timeout: Seconds to wait for the server before giving up; forwarded to
      ``requests.post`` so a stalled connection cannot hang the caller.

  Returns:
    The ``shopID`` field of the first result's ``shopCore`` object, or ``0``
    when the response body is not JSON or does not have the expected shape.
    Only the shop ID is returned; the shop name and other fields requested
    by the query are discarded.

  Raises:
    requests.exceptions.RequestException: If the HTTP request itself fails
      (for example a connection error or a timeout).
  """
  url = "https://gql.tokopedia.com/graphql/ShopInfoCore"
  data = [
    {
        "operationName": "ShopInfoCore",
        "variables": {
            # The lookup is done by domain; the numeric id is left at 0.
            "id": 0,
            "domain": shop_domain
        },
        "query": """
        query ShopInfoCore($id: Int!, $domain: String) {
          shopInfoByID(input: {shopIDs: [$id], fields: ["active_product", "allow_manage_all", "assets", "core", "closed_info", "create_info", "favorite", "location", "status", "is_open", "other-goldos", "shipment", "shopstats", "shop-snippet", "other-shiploc", "shopHomeType", "branch-link", "goapotik", "fs_type"], domain: $domain, source: "shoppage"}) {
            result {
              shopCore {
                description
                domain
                shopID
                name
                tagLine
                defaultSort
                __typename
              }
              createInfo {
                openSince
                __typename
              }
              favoriteData {
                totalFavorite
                alreadyFavorited
                __typename
              }
              activeProduct
              shopAssets {
                avatar
                cover
                __typename
              }
              location
              isAllowManage
              branchLinkDomain
              isOpen
              shipmentInfo {
                isAvailable
                image
                name
                product {
                  isAvailable
                  productName
                  uiHidden
                  __typename
                }
                __typename
              }
              shippingLoc {
                districtName
                cityName
                __typename
              }
              shopStats {
                productSold
                totalTxSuccess
                totalShowcase
                __typename
              }
              statusInfo {
                shopStatus
                statusMessage
                statusTitle
                tickerType
                __typename
              }
              closedInfo {
                closedNote
                until
                reason
                detail {
                  status
                  __typename
                }
                __typename
              }
              bbInfo {
                bbName
                bbDesc
                bbNameEN
                bbDescEN
                __typename
              }
              goldOS {
                isGold
                isGoldBadge
                isOfficial
                badge
                shopTier
                __typename
              }
              shopSnippetURL
              customSEO {
                title
                description
                bottomContent
                __typename
              }
              isQA
              isGoApotik
              partnerInfo {
                fsType
                __typename
              }
              __typename
            }
            error {
              message
              __typename
            }
            __typename
          }
        }
        """
    }
  ]
  response = requests.post(url, headers=HEADERS, json=data, timeout=timeout)
  try:
    return response.json()[0]['data']['shopInfoByID']['result'][0]['shopCore']['shopID']
  except (ValueError, LookupError, TypeError):
    # ValueError: body is not valid JSON.
    # LookupError / TypeError: JSON decoded but a key, index or level is
    # missing (e.g. unknown domain, or "data" is null on a GraphQL error).
    # Anything else (KeyboardInterrupt, programming errors) now propagates.
    return 0

def get_reviews_data(shop_id, page=1, limit=10, sort_by="", filter_by="rating=1,2,3,4,5", timeout=30):
  """
  Fetch one page of shop reviews from the Tokopedia GraphQL endpoint.

  Args:
    shop_id: Shop identifier, sent as the ``shopID`` variable (declared as
      ``String!`` in the query).
    page: Page number to request.
    limit: Number of reviews requested per page.
    sort_by: Sort expression passed through as ``sortBy``, e.g.
      ``"create_time desc"``; empty string sends no explicit ordering.
    filter_by: Filter expression passed through as ``filterBy``, e.g.
      ``"rating=5"``; the default asks for every rating from 1 to 5.
    timeout: Seconds to wait for the server before giving up; forwarded to
      ``requests.post`` so a stalled connection cannot hang the caller.

  Returns:
    The decoded JSON response body, unmodified. No validation is done here;
    callers are expected to check the shape themselves.

  Raises:
    requests.exceptions.RequestException: If the HTTP request fails
      (connection error, timeout) or the body is not valid JSON.
  """
  url = 'https://gql.tokopedia.com/graphql/ReviewList'
  data = [{
      "operationName": "ReviewList",
      "variables": {
          "shopID": shop_id,
          "page": page,
          "limit": limit,
          "sortBy": sort_by,
          "filterBy": filter_by
      },
      "query": """
      query ReviewList($shopID: String!, $limit: Int!, $page: Int!, $filterBy: String, $sortBy: String) {
          productrevGetShopReviewReadingList(shopID: $shopID, limit: $limit, page: $page, filterBy: $filterBy, sortBy: $sortBy) {
              list {
                  id: reviewID
                  product {
                      productID
                      productName
                      productPageURL
                      productStatus
                  }
                  rating
                  reviewTime
                  reviewText
                  reviewerID
                  reviewerName
                  replyText
                  replyTime
                  badRatingReasonFmt
              }
              hasNext
              shopName
              totalReviews
          }
      }
      """
  }]

  response = requests.post(url, headers=HEADERS, json=data, timeout=timeout)
  return response.json()

In [None]:
# Look up the shop ID of the Wardah official store by its domain slug.
shopID = get_shop_id('wardah-official')

In [None]:
# Display the shop ID that was found (0 means the lookup failed).
shopID

In [None]:
# Smoke test: request a single review from the first page.
json_log = get_reviews_data(shopID, page=1, limit=1)

In [None]:
# Display the raw response of the smoke-test request.
json_log

## 10.000 data

In [None]:
# Module-level accumulator: every review dict fetched by rekursif_scrape is
# appended here, so results survive even if a later request raises.
data = []


def rekursif_scrape(shop_id, start=1, limit=10, iteration=10, max_retries=5, rating=5, sort_order="desc"):
    """Collect shop reviews page by page into the module-level ``data`` list.

    Pages are fetched with ``get_reviews_data`` for one rating at a time,
    sorted by ``create_time`` in ``sort_order``. When a rating has no further
    page (``hasNext`` is falsy) or a page keeps failing, the scrape moves on
    to the next lower rating, restarting at page 1. After rating 1 in
    ``"desc"`` order the whole sweep is repeated from rating 5 in ``"asc"``
    order; after rating 1 in ``"asc"`` order the scrape stops.

    The traversal is a plain loop rather than one recursive call per page, so
    a long scrape cannot exhaust Python's recursion limit. The name is kept
    for backward compatibility.

    NOTE(review): the ascending sweep may fetch reviews that the descending
    sweep already collected; no de-duplication is done here.

    Args:
        shop_id: Shop identifier forwarded to ``get_reviews_data``.
        start: First page number to request for the initial rating.
        limit: Number of reviews requested per page.
        iteration: Unused; accepted only for backward compatibility.
        max_retries: Attempts allowed per page when the response has an
            unexpected shape (``TypeError``). ``0`` skips fetching entirely.
        rating: Rating (5 down to 1) to start the sweep from.
        sort_order: ``"desc"`` or ``"asc"``; ordering of the initial sweep.

    Returns:
        None. Results are appended to the module-level ``data`` list.
    """
    if max_retries < 0:
        # Nothing can be attempted; matches the historical behaviour of
        # returning without fetching or advancing.
        return None

    page = start
    while True:
        has_next = False
        for attempt in range(1, max_retries + 1):
            # Reset so the error message never shows an unbound or stale
            # response from a previous attempt.
            json_log = None
            try:
                json_log = get_reviews_data(
                    shop_id,
                    page=page,
                    limit=limit,
                    sort_by=f"create_time {sort_order}",
                    filter_by=f"rating={rating}",
                )
                result = json_log[0]['data']['productrevGetShopReviewReadingList']
                data.extend(result['list'])
                has_next = result['hasNext']
                break
            except TypeError:
                # Raised when a level of the response is None / not
                # subscriptable, i.e. the server returned no usable data.
                print('Terjadi kesalahan', json_log)
                if attempt == max_retries:
                    print(f"Gagal mengambil data setelah {max_retries} percobaan pada halaman {page} dengan rating={rating} dan sort_order={sort_order}")

        if has_next:
            page += 1
            continue

        # Either this rating is finished or the page could not be fetched:
        # move on to the next rating / sort order, or stop.
        if rating > 1:
            rating -= 1
        elif rating == 1 and sort_order == "desc":
            rating, sort_order = 5, "asc"
        elif rating == 1 and sort_order == "asc":
            print("Semua data sudah diambil atau tidak ada data yang bisa diambil.")
            return None
        else:
            # Rating below 1 or an unrecognised sort order: nothing to do.
            return None
        page = 1


In [None]:
# Example call: request 10 reviews per page and try each page only once.
# NOTE: `iteration` is accepted but not used by rekursif_scrape, so it does
# not cap the number of pages fetched.
rekursif_scrape(shopID, limit=10, iteration=1, max_retries=1)

In [None]:
# Display the collected review dicts.
data

In [None]:
# Check how many reviews were collected.
len(data)

In [None]:
# Spot check: pull the fields of interest out of the first collected review.
# Raises IndexError if `data` is empty.
nama_toko = None  # shop name: placeholder, not populated here
rating_toko = None  # shop rating: placeholder, not populated here
user_name = data[0]['reviewerName']
user_review = data[0]['reviewText']
user_rating = data[0]['rating']
user_product = data[0]['product']['productName']
user_date =  data[0]['reviewTime']

In [None]:
# Display the text of the first review.
user_review

In [None]:
# Load the collected reviews into a table for analysis.
import pandas as pd
df = pd.DataFrame(data)

In [None]:
# Count how many reviews share each reviewTime value.
df['reviewTime'].value_counts()

In [None]:
# Count how many reviews were collected for each rating value.
df['rating'].value_counts()


In [None]:
# Save the review table to an Excel workbook named data.xlsx.
df.to_excel('data.xlsx')

## 4500 Data

In [None]:
# data = []

# def rekursif_scrape(shop_id, start=1, limit=10, iteration=10, max_retries=5):
#     """ ambil komentar berdasarkan paginasi dengan retry mechanism """
#     if start >= iteration:
#         return None

#     retries = 0

#     while retries < max_retries:
#         try:
#             json_log = get_reviews_data(shop_id, page=start, limit=limit, sort_by="create_time desc", filter_by="rating=5")
#             data.extend(json_log[0]['data']['productrevGetShopReviewReadingList']['list'])
#             has_next = json_log[0]['data']['productrevGetShopReviewReadingList']['hasNext']
#             break
#         except TypeError:
#             print('Terjadi kesalahan', json_log)
#             has_next = False
#             retries += 1
#             if retries == max_retries:
#                 print(f"Gagal mengambil data setelah {max_retries} percobaan pada halaman {start}")
#                 return None

#     if has_next:
#         rekursif_scrape(shop_id, start=start+1, limit=limit, iteration=iteration, max_retries=max_retries)
#     else:
#         return None

# # Contoh pemanggilan fungsi, ambil 500 data tiap paginasi dengan pindah paginasi sebanyak 10x
# rekursif_scrape(shopID, limit=500, iteration=10)