In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import time
import re
import os
import urllib.request
from joblib import Parallel, delayed

# pandas options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# selenium imports
from selenium import webdriver

In [3]:
# selenium preferences to prevent load images
# define webdriver options
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_argument("--headless")
chrome_options.add_argument('window-size=1920x1080');
chrome_options.add_argument("--start-maximized");
chrome_options.add_experimental_option("prefs", prefs)

In [4]:
# read in scraping url
df = pd.read_csv('listing_url.csv')
df.head()

Unnamed: 0,listing_url,featured
0,https://garasi.id/mobil-bekas/2017-toyota-avan...,True
1,https://garasi.id/mobil-bekas/2015-toyota-avan...,True
2,https://garasi.id/mobil-bekas/2015-toyota-avan...,False
3,https://garasi.id/mobil-bekas/2016-toyota-avan...,False
4,https://garasi.id/mobil-bekas/2019-toyota-avan...,False


In [5]:
# preprocessing of pandas dataframe
models = []
year = []
listing_id = []

def preprocess_listing_url(x):
    temp = x['listing_url'].split('/')
    listing_id.append(temp[-1])
    
    splits = temp[-2].split('-')
    year.append(splits[0])
    mod_type = ''
    for i in range(1,len(splits)):
        mod_type = mod_type + ' ' + splits[i] 
    models.append(mod_type.strip())
    
df.apply(lambda x: preprocess_listing_url(x), axis = 1)
df['models'] = models
df['year'] = year
df['listing_id'] = listing_id

In [6]:
df.groupby(['models']).agg('count')

Unnamed: 0_level_0,listing_url,featured,year,listing_id
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
daihatsu ayla,128,128,128,128
daihatsu sigra,128,128,128,128
daihatsu terios,128,128,128,128
daihatsu xenia,143,143,143,143
honda br v,131,131,131,131
honda brio,149,149,149,149
honda cr v,126,126,126,126
honda hr v,330,330,330,330
mitsubishi pajero sport,133,133,133,133
mitsubishi xpander,129,129,129,129


In [7]:
# Scale down samples in HRV
df = df[(df.index <= 1230) | (df.index >= 1444)]
df.reset_index(inplace = True)
df.groupby(['models']).agg('count')

Unnamed: 0_level_0,index,listing_url,featured,year,listing_id
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
daihatsu ayla,128,128,128,128,128
daihatsu sigra,128,128,128,128,128
daihatsu terios,128,128,128,128,128
daihatsu xenia,143,143,143,143,143
honda br v,130,130,130,130,130
honda brio,149,149,149,149,149
honda cr v,126,126,126,126,126
honda hr v,118,118,118,118,118
mitsubishi pajero sport,133,133,133,133,133
mitsubishi xpander,129,129,129,129,129


In [8]:
df['listing_url'][0]

'https://garasi.id/mobil-bekas/2017-toyota-avanza/00qf715'

In [9]:
df['featured'][0]

True

In [10]:
df['models'][0]

'toyota avanza'

In [11]:
# global vars
filename_list = []
labels_list = []

In [12]:
len(filename_list)

0

In [14]:
def scrape_images(df, row):
    print('Scraping Row: ', row)
    
    # scrape featured listing => Take only first 4 exterior images
    if df['featured'][row] == True:
        
        # initialize driver
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(df['listing_url'][row])
        
        currlisting = df['listing_id'][row]
        im_count = 0
        
        # find image xpath
        for i in range(4):
            # find and save images with unique filename (listing_id_im_count.jpeg)
            img = driver.find_element_by_xpath("//*[@id='root']/div[1]/main/div/div[1]/div/div[1]/div/div[" + str(i+1) + "]/a/div")
            img = img.find_element_by_tag_name("img")
            img_url = img.get_property('src')
            extension = img_url.split('.')[-1]
            
            filename = currlisting + '_' + str(im_count) + '.' + extension
            labels = df['models'][row]
            
            urllib.request.urlretrieve(img_url, os.path.join(os.getcwd(), 'dataset', filename))
            im_count = im_count + 1
            
            # book keep labels and filenames
            filename_list.append(filename)
            labels_list.append(labels)
            
            # click next button
            next_button = driver.find_element_by_class_name('SlideshowArrow-Right')
            next_button.click()
        
        # close driver
        driver.close()

    # non featured listing
    else:
        
        # initialize driver
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(df['listing_url'][row])
        
        currlisting = df['listing_id'][row]
        im_count = 0
        
        cont = driver.find_element_by_class_name('SlideshowChip_Container')
        capt = cont.find_elements_by_class_name('fg-white')
        num_images = int(capt[1].text.split('/')[-1].strip())
        
        # find image xpath
        for i in range(num_images):
            # find and save images with unique filename (listing_id_im_count.jpeg)
            img = driver.find_element_by_xpath("//*[@id='root']/div[1]/main/div/div[1]/div/div[1]/div/div[" + str(i+1) + "]/a/div")
            img = img.find_element_by_tag_name("img")
            img_url = img.get_property('src')
            extension = img_url.split('.')[-1]
            
            filename = currlisting + '_' + str(im_count) + '.' + extension
            labels = df['models'][row]
            
            urllib.request.urlretrieve(img_url, os.path.join(os.getcwd(), 'dataset', filename))
            im_count = im_count + 1
            
            # book keep labels and filenames
            filename_list.append(filename)
            labels_list.append(labels)
            
            # click next buttion
            next_button = driver.find_element_by_class_name('SlideshowArrow-Right')
            next_button.click()
        
        # close driver
        driver.close()
        
        time.sleep(4)

In [54]:
Parallel(n_jobs=-1, require='sharedmem')(delayed(scrape_images)(df, i) for i in range(2336, df.shape[0]));

Scraping Row: Scraping Row: Scraping Row:  Scraping Row:   23362338
 2339
2337

Scraping Row:  2340
Scraping Row:  2341
Scraping Row:  2342
Scraping Row:  2343
Scraping Row:  2344
Scraping Row:  2345
Scraping Row:  2346
Scraping Row:  2347
Scraping Row:  2348
Scraping Row:  2349
Scraping Row:  2350
Scraping Row:  2351
Scraping Row:  2352
Scraping Row:  2353
Scraping Row:  2354
Scraping Row:  2355
Scraping Row:  2356
Scraping Row:  2357
Scraping Row:  2358
Scraping Row:  2359
Scraping Row:  2360
Scraping Row:  2361
Scraping Row:  2362
Scraping Row:  2363
Scraping Row:  2364
Scraping Row:  2365
Scraping Row:  2366
Scraping Row:  2367
Scraping Row:  2368
Scraping Row:  2369
Scraping Row:  2370
Scraping Row:  2371
Scraping Row:  2372
Scraping Row:  2373
Scraping Row:  2374
Scraping Row:  2375
Scraping Row:  2376
Scraping Row:  2377
Scraping Row:  2378
Scraping Row:  2379
Scraping Row:  2380
Scraping Row:  2381
Scraping Row:  2382
Scraping Row:  2383
Scraping Row:  2384
Scraping Row:  2385


In [55]:
if len(np.unique(filename_list)) == len(os.listdir(os.path.join(os.getcwd(), 'dataset'))):
    print('Same length can continue')
else:
    print('different length!, please investigate')

Same length can continue


In [62]:
# stroe all labels
labels_df = pd.DataFrame()
labels_df['file_name'] = filename_list
labels_df['labels'] = labels_list 
labels_df.head()

Unnamed: 0,file_name,labels
0,00qf715_0.jpeg,toyota avanza
1,00px849_0.jpeg,toyota avanza
2,00pr438_0.jpeg,toyota avanza
3,00qh040_0.png,toyota avanza
4,00qf715_1.jpeg,toyota avanza


In [63]:
# save to output
labels_df.to_csv('labels_df.csv', index = False)

In [64]:
labels_df.shape

(14792, 2)

In [65]:
len(os.listdir(os.path.join(os.getcwd(), 'dataset')))

14792