### Import Modules

In [12]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from ipywidgets import IntProgress
from IPython.display import display



### Initialize global variables

In [13]:
# Result containers, populated by the scraping and cleaning cells further down.
# Each name is bound to its own, independent empty list.
movies_title, movies_sinopsis, movies_rating, movies_genre = [], [], [], []

# Upper bound on how many titles are loaded and exported. Adjust freely:
# the site lists far more titles (roughly 19,000 according to the original
# note), and processing all of them would be very slow.
max_data = 100

### Request the page source (IMDb)

In [14]:

url = "https://www.imdb.com/search/title/?title_type=feature,tv_series&count=100&sort=num_votes,desc"

# Start a Chrome session through Selenium and open the search page.
# A matching WebDriver must be available on this machine.
driver = webdriver.Chrome()
driver.get(url)

# Explicit-wait helper with a 10-second timeout (raise it on slow connections).
wait = WebDriverWait(driver, 10)

# Number of items assumed to be on the page already; the URL requests count=100.
data_loaded = 100
keep_loading = True
while keep_loading:
    try:
        print("data loaded: ", data_loaded)
        if data_loaded < max_data:
            # Wait until the "Load More" control is clickable, then click it.
            load_more_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'ipc-see-more__text')))
            ActionChains(driver).move_to_element(load_more_button).click(load_more_button).perform()

            # Fixed pause for the next batch; every click is counted as 100 more items.
            time.sleep(2)
            data_loaded += 100
        else:
            print(f"Already loaded {data_loaded} items. Stopping.")
            keep_loading = False
    except Exception as e:
        # Any failure (including the button not becoming clickable in time) ends the loop.
        print("No more data to load or error:", e)
        keep_loading = False


data loaded:  100
Already loaded 100 items. Stopping.


In [15]:
# Snapshot the rendered page once and parse it offline with BeautifulSoup.
html_source = driver.page_source
soup = BeautifulSoup(html_source, "html.parser")

print("Begin to extract data")
# Progress bar: advances once per processed info button (successful or not).
f = IntProgress(min=0, max=max_data)
display(f)

# Elements scraped from the static snapshot: titles, ratings, synopses.
titles = soup.find_all('h3', class_="ipc-title__text")
sinopsis = soup.find_all('div', class_ = "ipc-html-content-inner-div")
rating = soup.find_all('span','ipc-rating-star--rating')
print(len(rating), len(titles), len(sinopsis))

# Genres are read from the per-title info modal, which requires live clicks.
info_buttons = driver.find_elements(By.CLASS_NAME, 'ipc-icon-button.dli-info-icon')

# Visit at most max_data buttons: later cells truncate every column to max_data
# anyway, and the progress bar above is sized for max_data steps.
for index, button in enumerate(info_buttons[:max_data]):
    # Start from "no genres" so that a failed lookup still contributes exactly
    # one entry. Skipping the append would shift every later genre onto the
    # wrong title and leave the genre column shorter than the others.
    categories = []
    try:
        wait.until(EC.element_to_be_clickable(button))
        ActionChains(driver).move_to_element(button).click(button).perform()
        modal = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "ipc-promptable-base__panel")))
        # Short fixed pause before reading the modal's list items.
        time.sleep(1)

        # Read the genre list items shown in the open modal.
        list_items = driver.find_elements(By.XPATH, "//ul[@data-testid='btp_gl']//li[@class='ipc-inline-list__item']")
        categories = [item.text for item in list_items]

        # Close the modal so the next button can be clicked.
        close_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'ipc-promptable-base__close')))
        close_button.click()

    except Exception as e:
        # Best effort: report the failure and continue with the next title.
        print(f"Error clicking button {index + 1}: {e}")

    # One genre entry per button keeps movies_genre index-aligned with the buttons.
    movies_genre.append(categories)

    # Update progress bar
    f.value += 1

Begin to extract data


IntProgress(value=0)

100 101 100


In [16]:
# Shut down the Selenium browser. The cleaning cells below only use the
# already-collected `titles`, `sinopsis`, `rating` and `movies_genre`.
driver.quit()

### Clean the data

In [17]:
# Movie title
# Ranked headings have the form "<rank>. <title>". Only the first period
# separates rank from title, so titles that contain periods themselves
# (e.g. "Mr. Robot") are kept whole. Headings without a numeric rank before
# the first period are skipped; the recorded run found 101 matching <h3>
# elements but only 100 ranked titles (presumably one page-level heading).
for i in titles:
    text = i.text.strip()
    rank, dot, title = text.partition(".")
    if dot and rank.strip().isdigit():
        # Drop the space that follows "<rank>." in the heading text.
        movies_title.append(title.strip())
len(movies_title)


100

In [18]:
# Sinopsis
# Append the whitespace-trimmed text of every synopsis element, in page order.
movies_sinopsis.extend(node.text.strip() for node in sinopsis)
len(movies_sinopsis)

100

In [None]:
# Rating
# Ratings are stored as the trimmed text of each rating element (strings).
for i in rating:
    text = i.text.strip()
    movies_rating.append(text)
# Show the count, as the title and synopsis cells do.
len(movies_rating)

100

### Export the cleaned data to JSON/CSV

In [None]:
# Truncate every column to at most max_data entries.
movies_title_fixed = movies_title[:max_data]
movies_sinopsis_fixed = movies_sinopsis[:max_data]
movies_rating_fixed = movies_rating[:max_data]
movies_genre_fixed= movies_genre[:max_data]

data = {"title": movies_title_fixed, "sinopsis": movies_sinopsis_fixed, "rating": movies_rating_fixed, "genre": movies_genre_fixed}

# pd.DataFrame requires all columns built from lists to have the same length,
# so validate all four truncated columns (not just title vs. synopsis) before
# building the frame.
column_lengths = {name: len(values) for name, values in data.items()}

if len(set(column_lengths.values())) != 1:
    print("Error: Arrays must have the same length.")
    # Show which column is off so the mismatch can be traced to its cell.
    print(f"Column lengths: {column_lengths}")
else:
    df = pd.DataFrame(data)

    # Save to CSV
    csv_filename = "movies.csv"
    df.to_csv(csv_filename, index=False, encoding="utf-8")
    print(f"Data successfully saved to {csv_filename}")

    # Save to JSON
    json_filename = "movies.json"
    df.to_json(json_filename, orient="records", lines=False, force_ascii=False)
    print(f"Data successfully saved to {json_filename}")

Data successfully saved to movies2.csv
Data successfully saved to movies2.json
