In [1]:
import pandas as pd
import numpy as np
import time
import requests

In [2]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webelement import WebElement
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Never truncate long string values when a DataFrame is displayed.
pd.options.display.max_colwidth = None

In [4]:
# Chrome launch configuration shared by every webdriver session in this notebook.
# The browser runs with a visible window; add '--headless' or '--incognito' to
# the tuple below to enable those modes.
user_agent = UserAgent()
options = Options()

for chrome_flag in (
    '--disable-gpu',
    '--disable-extensions',
    '--disable-notifications',
    '--disable-popup-blocking',
    f'user-agent={user_agent.random}',  # random User-Agent string picked once, here
):
    options.add_argument(chrome_flag)

# NOTE(review): value 2 presumably blocks the geolocation permission prompt - confirm.
options.add_experimental_option(
    "prefs",
    {"profile.default_content_setting_values.geolocation": 2},
)

In [5]:
def getMobbiData(soup):
    """Build a listing table from a parsed Mobbi results page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page. Listing links are read from the ``href`` of every
        ``<a class="featured-car-product-link">``; every other field is read
        from the matching ``data-product-*`` attribute of ``<div>`` elements.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``product_link``,
        ``product_brand``, ``product_model``, ``product_variant``,
        ``product_price (IDR)``, ``product_transmission``,
        ``product_mileage (KM)``, ``production_year`` and
        ``product_location``. All values are the raw attribute strings.

    Raises
    ------
    ValueError
        If the page yields a different number of values for different
        columns, in which case rows cannot be aligned.
    """
    # Repeated hrefs are dropped with dict.fromkeys() rather than set(): a set
    # returns the links in arbitrary order, which would pair each link with
    # the attributes of a different car. dict.fromkeys() keeps page order.
    # NOTE(review): assumes the anchors appear in the same order as the
    # data-product-* divs - confirm against the page markup.
    unique_links = dict.fromkeys(
        "www.mobbi.id" + anchor['href']
        for anchor in soup.find_all('a', class_='featured-car-product-link')
    )

    # Output column -> div attribute that carries its value (output column order).
    div_attributes = {
        'product_brand': 'data-product-brand',
        'product_model': 'data-product-category',
        'product_variant': 'data-product-variant',
        'product_price (IDR)': 'data-product-price',
        'product_transmission': 'data-product-transmission',
        'product_mileage (KM)': 'data-product-mileage',
        'production_year': 'data-product-year',
        'product_location': 'data-product-location',
    }

    columns = {'product_link': list(unique_links)}
    for column, attribute in div_attributes.items():
        columns[column] = [
            div[attribute] for div in soup.find_all('div', {attribute: True})
        ]

    # Fail with the per-column counts instead of pandas' generic message.
    lengths = {column: len(values) for column, values in columns.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(
            f"Scraped fields have different lengths and cannot be aligned: {lengths}"
        )

    return pd.DataFrame(columns)

In [6]:
# Scrape the listings of the first brands offered by the Mobbi search form
# and collect them in `data` (one row per listing, index 0..n-1).
SCROLL_PAUSE_TIME = 5          # seconds to wait after each scroll for more content to load
PAGE_LOAD_WAIT = 5             # seconds to wait after opening the landing page
BRAND_POSITIONS = range(1, 4)  # 1-based <li> positions in the brand dropdown to scrape

url = 'https://www.mobbi.id/'


def _scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll to the bottom repeatedly until the page height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


brand_frames = []

# NOTE(review): passing the driver path positionally is rejected by recent
# Selenium 4 releases; switch to a Service object when upgrading Selenium.
driver = webdriver.Chrome('chromedriver.exe', options=options)
try:
    driver.get(url)
    time.sleep(PAGE_LOAD_WAIT)

    # Close the banner.
    driver.find_element(By.XPATH, """//*[@id="btnwClear"]""").click()

    for i in BRAND_POSITIONS:
        # Open the brand search box, then pick the i-th brand from its list.
        driver.find_element(By.XPATH, """//*[@id="headerNonIbid"]/li/div/div/form/div/div[1]/input[1]""").click()
        driver.find_element(By.XPATH, f"""//*[@id="list-brand-search"]/li[{i}]""").click()

        # Scroll down so further content is loaded before the page is parsed.
        _scroll_to_bottom(driver)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        brand_frames.append(getMobbiData(soup))
finally:
    # Always shut the browser down, even when a locator or parse step fails.
    driver.quit()

# Concatenate once, after the loop, with a fresh 0..n-1 index.
data = pd.concat(brand_frames, ignore_index=True)

In [7]:
# DataFrame cleaning

In [8]:
# Clean a copy so the scraped frame `data` is left unchanged.
df_c = data.copy()

In [9]:
# Inspect column dtypes and non-null counts before cleaning.
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   product_link          121 non-null    object
 1   product_brand         121 non-null    object
 2   product_model         121 non-null    object
 3   product_variant       121 non-null    object
 4   product_price (IDR)   121 non-null    object
 5   product_transmission  121 non-null    object
 6   product_mileage (KM)  121 non-null    object
 7   production_year       121 non-null    object
 8   product_location      121 non-null    object
dtypes: object(9)
memory usage: 8.6+ KB


In [10]:
# Convert the price column to float.
df_c = df_c.astype({'product_price (IDR)': 'float64'})

In [11]:
# Spell out the transmission codes; values that are neither code stay as they are.
TRANSMISSION_LABELS = {'AT': 'Automatic', 'MT': 'Manual'}
df_c['product_transmission'] = df_c['product_transmission'].replace(TRANSMISSION_LABELS)

In [12]:
# Remove every parenthesised group, together with the whitespace before it,
# from the variant name.
PARENTHESISED_GROUP = r'\s*\([^)]*\)'
df_c['product_variant'] = df_c['product_variant'].str.replace(
    PARENTHESISED_GROUP, '', regex=True
)

In [13]:
# Number of listings per brand, largest first.
(
    df_c.groupby("product_brand")[["product_link"]]
    .count()
    .sort_values("product_link", ascending=False)
)

Unnamed: 0_level_0,product_link
product_brand,Unnamed: 1_level_1
Daihatsu,67
Toyota,44
Honda,10


In [17]:
# Write the cleaned table to a CSV file in the working directory, without the index.
CSV_PATH = "Internship_Kiran_mobbi_data.csv"
df_c.to_csv(CSV_PATH, index=False)
# The message is Indonesian for "File downloaded successfully".
print("File berhasil diunduh")

File berhasil diunduh


In [19]:
import os
from urllib.parse import quote_plus

import mysql.connector  # not referenced directly; the mysql+mysqlconnector URL below names this driver
import sqlalchemy

# Connection settings come from the environment so credentials need not be
# stored in the notebook. The defaults are the original local-development values.
database_username = os.environ.get('MOBBI_DB_USER', 'root')
database_password = os.environ.get('MOBBI_DB_PASSWORD', '')
database_ip       = os.environ.get('MOBBI_DB_HOST', '127.0.0.1')
database_name     = os.environ.get('MOBBI_DB_NAME', 'astra_dm_scraping')

# quote_plus() keeps characters such as '@' or '/' in the username/password
# from corrupting the connection URL.
# NOTE(review): pool_recycle=1 and pool_timeout=57600 are kept from the
# original; confirm these values are intentional.
database_engine = sqlalchemy.create_engine(
    'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
        quote_plus(database_username), quote_plus(database_password),
        database_ip, database_name),
    pool_recycle=1, pool_timeout=57600)

# The with-block closes the connection even if the upload raises.
# if_exists='append' adds rows to the existing table, so re-running this cell
# inserts the same listings again.
with database_engine.connect() as database_connection:
    df_c.to_sql(con=database_connection, name='internship_kiran_mobbi_scraping',
                if_exists='append', index=False, chunksize=100)