<a href="https://colab.research.google.com/github/ikanx101/G-Colab/blob/main/Shopee_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Shopee Scraper v4.1**

## ___What's new?___

Ada beberapa perubahan yang terjadi pada versi ini. Yakni:

1. Ada perubahan pada **API** Shopee sehingga tidak memungkinkan untuk mengambil nilai **terjual** dari suatu produk. Jika ada *updates* terkait hal ini, akan saya kabari kemudian.

2. Pada versi terbaru ini, akan ada dua _files_ hasil _scrape_, yakni:

   1. Informasi seputar produk; berada pada _file_ bernama `Hasil Scrape Produk.csv`.
   2. Informasi seputar toko; berada pada _file_ bernama `Hasil Scrape Toko.csv`.

Kedua _files_ tersebut dapat digabungkan menggunakan variabel `link`, yang nilainya sama di kedua _files_.

## __Petunjuk Pemakaian__ 

1. Silakan _upload file_ `.txt` berisi _links_ produk __Shopee__ yang hendak di-_scrape_ datanya.
1. Pastikan bahwa setiap baris dari file `.txt` tersebut hanya berisi satu _link_ produk saja.
1. Tuliskan nama file `.txt` tersebut dalam kolom yang disediakan secara lengkap. Misalkan nama _file:_ `link produk.txt`.
1. Silakan pilih `Runtime` > `Run All` dan tunggu hingga selesai.
1. Setelah proses selesai, silakan _download files_ yang tersedia.

## __Notes__

Dilarang mengubah algoritma yang ada. Silakan di-_clone_ ke _GDrive_ sendiri jika memang ingin memodifikasi algoritma ini.

_Created by:_ [Ikang](https://ikanx101.com/)

_Last Modified:_ Senin, 15 Maret 2021 ~ 1.30 pm

In [6]:
#@title Nama File `.txt`
# Remove every object from the current workspace so the run starts clean.
rm(list=ls())
# Name of the uploaded `.txt` file with the product links (one link per line,
# per the usage notes above); the value is set through the Colab form field.
nama_file <- "satu doang.txt" #@param {type:"string"}

In [7]:
#@title Preparation Product Links 
print("-- Proses Scrape Produk Dimulai --")
library(jsonlite)
library(dplyr)
library(tidyr)

# Built-in example links, used only when the uploaded file cannot be found.
contoh <- c("https://shopee.co.id/Tropicana-Slim-Kecap-Manis-200Ml-i.12656836.95387848",
            "https://shopee.co.id/Tropicana-Slim-Sugar-Free-Cookies-Choco-200G-Tropicana-Slim-Hokkaido-Cheese-Cookies-100gr-i.12656836.6149828589",
            "https://shopee.co.id/HiLo-Thai-Tea-15-gr-10's-i.12656836.1389108883")

# Read the uploaded list of links (one per line): trim whitespace, drop blank
# lines and duplicates. Fall back to the example links when no file exists.
if (exists("nama_file") && file.exists(nama_file)) {
  links <- trimws(readLines(nama_file, warn = FALSE))
  link <- unique(links[nzchar(links)])
} else {
  message("File daftar link tidak ditemukan; memakai contoh link bawaan.")
  link <- contoh
}

# A product link ends in "-i.<shopid>.<itemid>". Matching that suffix with an
# escaped dot and digit-only groups avoids accidental matches such as "-ic" or
# "-in" inside the product name, and ignores any trailing query string.
pola_id <- ".*-i\\.([0-9]+)\\.([0-9]+)"

dummy <- data.frame(id = seq_along(link),
                    url = link,
                    asli = link) %>%
  filter(grepl("-i.", url, fixed = TRUE)) %>%
  # skip help-centre pages, which are not product pages
  filter(!grepl("help", url)) %>%
  # info1 = shop id, info2 = item id; rows without a match become NA
  tidyr::extract(url, into = c("info1", "info2"), regex = pola_id) %>%
  filter(!is.na(info2)) %>%
  mutate(link_final = paste0("https://shopee.co.id/api/v2/item/get?itemid=",
                             info2,
                             "&shopid=",
                             info1))

# API endpoints that the scraping step will request, one per product.
url <- dummy$link_final

[1] "-- Proses Scrape Produk Dimulai --"


In [8]:
#@title Product Scrape Function
#' Scrape one product from the Shopee item API.
#'
#' @param url API endpoint of a single product (string).
#' @return A one-row data frame with the columns `nama`, `merek`,
#'   `diskon_produk`, `lokasi`, `status`, `kategori` and `link`. A field that
#'   is missing from the response is returned as `NA`, so every call yields
#'   the same columns and the rows can always be bound together.
scrape_shopee <- function(url) {
  # read the JSON response
  tes <- read_json(url)
  item <- tes[["item"]]

  # First element of a field, or NA when the field is absent or empty.
  # `data.frame()` silently drops NULL arguments, hence the explicit NA.
  atau_na <- function(nilai) {
    if (is.null(nilai) || length(nilai) == 0) NA else nilai[[1]]
  }

  # The third category level is used; guard against shorter category lists,
  # where `[[3]]` would raise "subscript out of bounds".
  kategori <- NULL
  if (length(item[["categories"]]) >= 3) {
    kategori <- item[["categories"]][[3]][["display_name"]]
  }

  # build the data frame
  data.frame(
    nama = atau_na(item[["name"]]),
    merek = atau_na(item[["brand"]]),
    diskon_produk = atau_na(item[["discount"]]),
    lokasi = atau_na(item[["shop_location"]]),
    status = atau_na(item[["item_status"]]),
    kategori = atau_na(kategori),
    link = url
  )
}

In [9]:
#@title Proses Scraping (Mohon Bersabar ya)
# Stop early with a clear message when there is nothing to scrape.
if (length(url) == 0) {
  stop("Tidak ada link produk yang valid untuk di-scrape.", call. = FALSE)
}

# Scrape every product link and bind the rows once at the end. Iterating over
# `url` directly also handles a single link correctly (`2:length(url)` would
# count down to `2, 1` and request the non-existent `url[2]`).
data <- do.call(rbind, lapply(unname(url), scrape_shopee))

# Stamp all rows with the scrape time, then drop exact duplicate rows.
data$waktu.scrape <- Sys.time()
raw <- distinct(data)

In [10]:
#@title Cleaning Hasil Scrape Produk
# Replace the scraped `merek` with a brand label derived from the product name
# (first matching rule wins; NA when no rule matches), sort the rows, and shift
# the scrape timestamp forward by seven hours.
data_clean <- local({
  # case-insensitive match of a pattern against the product names
  cocok <- function(pola) grepl(pola, raw$nama, ignore.case = TRUE)

  berlabel <- mutate(
    raw,
    merek = case_when(
      cocok("tropicana") ~ "Tropicana Slim",
      cocok("l-men") ~ "L-Men",
      cocok("nutri") ~ "NutriSari",
      cocok("teen") ~ "HiLo Teen",
      cocok("school") ~ "HiLo School",
      cocok("hilo") & cocok("rtd") ~ "HiLo RTD",
      cocok("hilo") & !cocok("teen|school|rtd") ~ "HiLo Active/Gold",
      cocok("lokala") ~ "Lokalate",
      cocok("wdan|dank|wedan") ~ "WDank"
    )
  )

  terurut <- arrange(berlabel, merek, nama, waktu.scrape)

  # NOTE(review): +7 hours presumably converts a UTC clock to WIB -- confirm.
  mutate(terurut, waktu.scrape = as.POSIXct(waktu.scrape) + 7 * 60 * 60)
})

In [11]:
#@title Exporting Hasil Scrape Produk to `.csv`
# Write the cleaned product table to disk, then announce the next stage.
judul <- "Hasil Scrape Produk.csv"
write.csv(x = data_clean, file = judul)
print("-- Hasil Scrape Produk Sudah Selesai --")
print("-- Proses Scrape Toko Dimulai --")

[1] "-- Hasil Scrape Produk Sudah Selesai --"
[1] "-- Proses Scrape Toko Dimulai --"


In [12]:
#@title Preparation utk Scrape Toko
# For each product API link, take the text after "shopid=" and build the
# matching shop API link; keep only the product link and the shop link.
data_toko <- data_clean %>%
  separate(link,
           into = c("ikanx", "fadhli"),
           sep = "shopid=",
           remove = FALSE) %>%
  transmute(link,
            link_new = paste0("https://shopee.co.id/api/v2/shop/get?&shopid=",
                              fadhli))

# Each shop only needs to be requested once.
link_toko <- unique(data_toko[["link_new"]])

In [13]:
#@title Function Scrape Toko
#' Scrape one shop from the Shopee shop API.
#'
#' @param url API endpoint of a single shop (string).
#' @return A one-row data frame with the columns `link_toko` (the `url` that
#'   was requested), `nama_seller`, `lokasi`, `is_official`, `is_verified`,
#'   `rating`, `follower_count` and `item_count`. A field that is missing from
#'   the response is returned as `NA`.
scrape_toko_donk <- function(url) {
  toko <- read_json(url)[["data"]]

  # First element of a response field, or NA when it is absent or empty.
  ambil <- function(kunci) {
    nilai <- toko[[kunci]]
    if (is.null(nilai) || length(nilai) == 0) NA else nilai[[1]]
  }

  # `link_toko` must be this call's own url, not the global vector of all shop
  # links; otherwise every call returns one row per shop with recycled data.
  data.frame(
    link_toko = url,
    nama_seller = ambil("name"),
    lokasi = ambil("place"),
    is_official = ambil("is_official_shop"),
    is_verified = ambil("is_shopee_verified"),
    rating = ambil("rating_star"),
    follower_count = ambil("follower_count"),
    item_count = ambil("item_count")
  )
}

In [14]:
#@title Scraping Toko
# Scrape every unique shop link and bind the rows once at the end instead of
# growing the data frame inside a loop. Iterating over `link_toko` directly is
# also safe when it is empty (`1:length(link_toko)` would yield `1, 0`).
hasil_toko <- lapply(unname(link_toko), scrape_toko_donk)
temp <- if (length(hasil_toko) > 0) {
  do.call(rbind, hasil_toko)
} else {
  data.frame()
}

In [15]:
#@title Exporting Hasil Scrape Toko to `.csv`
# Attach the shop information to each product by matching the product's shop
# link (`link_new`) to the scraped shop link (`link_toko`). The two tables
# share no column name, so a plain `merge()` would build a Cartesian product,
# which is only correct when exactly one shop was scraped.
data_toko_clean_banget <-
  merge(data_toko, temp, by.x = "link_new", by.y = "link_toko") %>%
  rename(link_toko = link_new) %>%
  select(link, link_toko, everything())
judul <- "Hasil Scrape Toko.csv"
write.csv(data_toko_clean_banget, judul)
print("-- Hasil Scrape Toko Sudah Selesai --")
print("--- DONE ---")

[1] "-- Hasil Scrape Toko Sudah Selesai --"
[1] "--- DONE ---"


In [18]:
# List the names of all objects currently in the workspace.
ls()

# Display the cleaned product table and the merged product/shop table.
data_clean
data_toko_clean_banget

nama,merek,diskon_produk,lokasi,status,kategori,link,waktu.scrape
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dttm>
HiLo Thai Tea 15 gr,HiLo Active/Gold,63%,KOTA B,n,Minuman,https://shopee.co.id/api/v2/item/get?itemid=1389108883&shopid=12656836,2021-03-15 13:20:34
Tropicana Slim Kecap Manis,Tropicana Slim,10%,KOTA B,n,Kecap & S,https://shopee.co.id/api/v2/item/get?itemid=95387848&shopid=12656836,2021-03-15 13:20:34
Tropicana Slim Sugar Free Cookies Choco 200G + Tropicana Slim Hokkaido Cheese Cookies,Tropicana Slim,23%,KOTA B,n,Kue K,https://shopee.co.id/api/v2/item/get?itemid=6149828589&shopid=12656836,2021-03-15 13:20:34


link,link_toko,nama_seller,lokasi,is_official,is_verified,rating,follower_count,item_count
<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<dbl>,<int>,<int>
https://shopee.co.id/api/v2/item/get?itemid=1389108883&shopid=12656836,https://shopee.co.id/api/v2/shop/get?&shopid=12656836,Nutrimart,"KOTA BEKASI - MEDAN SATRIA, JAWA BARAT, ID",True,False,4.906609,179690,424
https://shopee.co.id/api/v2/item/get?itemid=95387848&shopid=12656836,https://shopee.co.id/api/v2/shop/get?&shopid=12656836,Nutrimart,"KOTA BEKASI - MEDAN SATRIA, JAWA BARAT, ID",True,False,4.906609,179690,424
https://shopee.co.id/api/v2/item/get?itemid=6149828589&shopid=12656836,https://shopee.co.id/api/v2/shop/get?&shopid=12656836,Nutrimart,"KOTA BEKASI - MEDAN SATRIA, JAWA BARAT, ID",True,False,4.906609,179690,424
