# Google Scraping
---

In [1]:
import sys
sys.path.append('../')

import pandas as pd
import requests

from API_keys.config_omdb import omdb_key

import json
from pprint import pprint

from splinter import Browser
from bs4 import BeautifulSoup

import re
import time

In [2]:
from splinter import Browser
from bs4 import BeautifulSoup

In [3]:
!where chromedriver

c:\Projects\TUGAS\ETL-Melodi\venv\Scripts\chromedriver.exe


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Scraping IMDB for Top 250 movies
___

In [5]:
# Initialize Chrome WebDriver (make sure you have chromedriver installed)
driver = webdriver.Chrome()

# Navigate to IMDb Top 250
url = "https://www.imdb.com/chart/top/"
driver.get(url)

# Wait for page to load and scroll down to load all movies
SCROLL_PAUSE_TIME = 2

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get page source after all content is loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Initialize lists to store data
movie_data = []

# Find all movie entries
movies = soup.find_all('li', class_='ipc-metadata-list-summary-item')

for movie in movies:
    movie_dict = {}
    
    # Extract title (remove ranking number from title)
    title = movie.find('h3', class_='ipc-title__text')
    if title:
        movie_dict['Title'] = title.text.strip()
    else:
        movie_dict['Title'] = 'Unknown'
    
    # Extract IMDB ID
    movie_link = movie.find('a', class_='ipc-title-link-wrapper')
    if movie_link:
        imdb_id = re.search(r'/title/(tt\d+)/', movie_link['href'])
        movie_dict['IMDb_ID'] = imdb_id.group(1) if imdb_id else 'Unknown'
    
    # Extract rating
    rating = movie.find('span', class_='ipc-rating-star--imdb')
    if rating:
        movie_dict['Rating'] = rating['aria-label'].split()[-1]
    
    # Extract year
    year = movie.find('span', class_='cli-title-metadata-item')
    movie_dict['Year'] = year.text if year else 'Unknown'
    
    movie_data.append(movie_dict)

# Close the browser
driver.quit()

# Create DataFrame
Raw_df = pd.DataFrame(movie_data)

# Display the data
print("Successfully scraped", len(Raw_df), "movies")
Raw_df.head()

Successfully scraped 250 movies


Unnamed: 0,Title,IMDb_ID,Rating,Year
0,1. The Shawshank Redemption,tt0111161,9.3,1994
1,2. The Godfather,tt0068646,9.2,1972
2,3. The Dark Knight,tt0468569,9.0,2008
3,4. The Godfather Part II,tt0071562,9.0,1974
4,5. 12 Angry Men,tt0050083,9.0,1957


### Separate 'Rank & Title' column to Rank and Title columns
---

In [11]:
# Extract rank numbers from title using str.extract()
Raw_df['Rank'] = Raw_df['Title'].str.extract(r'^(\d+)')

# Remove rank numbers and period from title 
Raw_df['Title'] = Raw_df['Title'].str.replace(r'^\d+\.\s*', '', regex=True)

# Reorder columns
Imdb_df = Raw_df[['Rank', 'Title', 'IMDb_ID', 'Year', 'Rating']]
Imdb_df.head()

Unnamed: 0,Rank,Title,IMDb_ID,Year,Rating
0,,The Shawshank Redemption,tt0111161,1994,9.3
1,,The Godfather,tt0068646,1972,9.2
2,,The Dark Knight,tt0468569,2008,9.0
3,,The Godfather Part II,tt0071562,1974,9.0
4,12.0,12 Angry Men,tt0050083,1957,9.0


### Building the URL to scrape Google
---

In [12]:
movies = Imdb_df['Title']
movies = movies.str.replace(r"[,:'.]", '', regex=True)

In [8]:
base_url = 'https://www.google.com/search?&q='

movie_list = movies.str.lower().str.replace(' ', '+', n = -1, case=None, regex=True)

query_url=[]

for movie in movie_list:
    query_url.append(f'{base_url}{movie}+watch+movie')

In [9]:
google_query_url_df = pd.DataFrame({'Rank': Imdb_df['Rank'],
                                    'Title': Imdb_df['Title'],
                                    'Google Query URL' : query_url
                                   })

google_query_url_df = google_query_url_df.set_index(['Rank'])

google_query_url_df.head(10)

Unnamed: 0_level_0,Title,Google Query URL
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Shawshank Redemption,https://www.google.com/search?&q=the+shawshank...
2,The Godfather,https://www.google.com/search?&q=the+godfather...
3,The Dark Knight,https://www.google.com/search?&q=the+dark+knig...
4,The Godfather Part II,https://www.google.com/search?&q=the+godfather...
5,12 Angry Men,https://www.google.com/search?&q=12+angry+men+...
6,The Lord of the Rings: The Return of the King,https://www.google.com/search?&q=the+lord+of+t...
7,Schindler's List,https://www.google.com/search?&q=schindlers+li...
8,Pulp Fiction,https://www.google.com/search?&q=pulp+fiction+...
9,The Lord of the Rings: The Fellowship of the Ring,https://www.google.com/search?&q=the+lord+of+t...
10,"The Good, the Bad and the Ugly",https://www.google.com/search?&q=the+good+the+...


In [10]:
google_query_url_df.to_csv('../Output/Google_Query_Url.csv')

#### Sample Google scraping
---

Web Browser Setup

In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time

service = Service(executable_path=r'c:\Projects\TUGAS\ETL-Project\venv\Scripts\chromedriver.exe')
browser = webdriver.Chrome(service=service)


In [14]:
# Sample data
sample = 'The Godfather'
base_url = 'https://www.google.com/search?&q='
query_url = f'{base_url}{sample}+watch+movie'

# Visit the query URL
try:
    browser.get(query_url)
    time.sleep(5)  # Wait for the page to load
except Exception as e:
    print(f"Error visiting the URL: {e}")

# Parse the page content
try:
    soup = BeautifulSoup(browser.page_source, 'lxml')
except Exception as e:
    print(f"Error parsing the page source: {e}")

# Extract data
streaming = []
title = []
price = []

results1 = soup.find_all('div', class_='ellip bclEt')
for result in results1:
    streaming.append(result.text)
    title.append(sample.capitalize())

results2 = soup.find_all('div', class_='ellip rsj3fb')
for result in results2:
    price.append(result.text)

# Create DataFrame
Sample_Streaming_df = pd.DataFrame({'Title': title,
                                    'Streaming On' : streaming,
                                    'Price' : price
                                   })

Sample_Streaming_df['Price'] = Sample_Streaming_df['Price'].str.replace("$","", case = True, regex=True)
Sample_Streaming_df['Price'] = Sample_Streaming_df['Price'].str.replace("From ","", case = True, regex=True)

Sample_Streaming_df


Unnamed: 0,Title,Streaming On,Price
0,The godfather,Netflix,Langganan
1,The godfather,Vidio,Langganan
2,The godfather,Google Play Film & TV,"Dari Rp 25.000,00"
3,The godfather,Apple TV,"Dari Rp 25.000,00"


In [17]:
Sample_Streaming_df.to_csv('../Output/Sample_Google_Scraping.csv')

## Scraping Google for few movies at a time
---

In [15]:
Streaming = []
Title = []
Price = []

count = 0

movies = google_query_url_df['Title']

query_urls = google_query_url_df['Google Query URL']

base_url = 'https://www.google.com/search?&q='

In [None]:
# Loop through all movies in the query list
for count in range(len(movies)):
    try:
        # Construct and visit URL
        query_url = f'{base_url}{movies[count]}+watch+movie'
        browser.get(query_url)
        time.sleep(3)  # Wait for page load
        
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # Look for streaming services
        results1 = soup.find_all('div', class_='ellip bclEt')
        if results1:
            # Found streaming options
            for result in results1:
                Streaming.append(result.text)
                Title.append(movies[count].capitalize())
                
                # Look for corresponding price
                price_div = soup.find('div', class_='ellip rsj3fb')
                Price.append(price_div.text if price_div else 'Not Available')
        else:
            # No streaming options found - add one row with NA values
            Title.append(movies[count].capitalize())
            Streaming.append('Not Available') 
            Price.append('Not Available')
            
        print(f"Processed {count + 1}/250: {movies[count]}")
            
    except Exception as e:
        # Log error but continue with next movie
        print(f"Error processing {movies[count]}: {str(e)}")
        Title.append(movies[count].capitalize())
        Streaming.append('Error')
        Price.append('Error')
        continue
        
    # Optional: Save progress periodically
    if count % 50 == 0 and count > 0:
        temp_df = pd.DataFrame({
            'Title': Title,
            'Streaming On': Streaming, 
            'Price': Price
        })
        temp_df.to_csv(f'../Output/Google_Checkpoints/Google_Scraping_checkpoint_{count}.csv', index=False)

# Create final dataframe
Streaming_df = pd.DataFrame({
    'Title': Title,
    'Streaming On': Streaming,
    'Price': Price
})

# Save complete results
Streaming_df.to_csv('../Output/Google_Checkpoints/Google_Scraping_complete.csv', index=False)

print(f"Completed processing {len(movies)} movies")

  query_url = f'{base_url}{movies[count]}+watch+movie'
  Title.append(movies[count].capitalize())
  print(f"Processed {count + 1}/250: {movies[count]}")


Processed 1/250: The Shawshank Redemption
Processed 2/250: The Godfather
Processed 3/250: The Dark Knight
Processed 4/250: The Godfather Part II
Processed 5/250: 12 Angry Men
Processed 6/250: The Lord of the Rings: The Return of the King
Processed 7/250: Schindler's List
Processed 8/250: Pulp Fiction
Processed 9/250: The Lord of the Rings: The Fellowship of the Ring
Processed 10/250: The Good, the Bad and the Ugly
Processed 11/250: Forrest Gump
Processed 12/250: The Lord of the Rings: The Two Towers


  Title.append(movies[count].capitalize())


Processed 13/250: Fight Club
Processed 14/250: Inception
Processed 15/250: Star Wars: Episode V - The Empire Strikes Back
Processed 16/250: The Matrix
Processed 17/250: Goodfellas
Processed 18/250: One Flew Over the Cuckoo's Nest
Processed 19/250: Interstellar
Processed 20/250: Se7en
Processed 21/250: It's a Wonderful Life
Processed 22/250: Seven Samurai
Processed 23/250: The Silence of the Lambs
Processed 24/250: Saving Private Ryan
Processed 25/250: City of God
Processed 26/250: The Green Mile
Processed 27/250: Life Is Beautiful
Processed 28/250: Terminator 2: Judgment Day
Processed 29/250: Star Wars: Episode IV - A New Hope
Processed 30/250: Back to the Future
Processed 31/250: Spirited Away
Processed 32/250: The Pianist
Processed 33/250: Parasite
Processed 34/250: Psycho
Processed 35/250: Gladiator
Processed 36/250: The Lion King
Processed 37/250: The Departed
Processed 38/250: Spider-Man: Across the Spider-Verse
Processed 39/250: Grave of the Fireflies
Processed 40/250: Whiplash
P

In [None]:
Streaming_df = pd.DataFrame({'Title': Title,
                             'Streaming On' : Streaming,
                             'Price' : Price
                            })


In [18]:
Streaming_df.head

<bound method NDFrame.head of                          Title           Streaming On              Price
0     The Shawshank Redemption               Apple TV  Dari Rp 25.000,00
1                The Godfather                Netflix          Langganan
2                The Godfather                  Vidio          Langganan
3                The Godfather  Google Play Film & TV          Langganan
4                The Godfather               Apple TV          Langganan
..                         ...                    ...                ...
327                   The Help          Not Available      Not Available
328             Cool Hand Luke               Apple TV  Dari Rp 25.000,00
329             Cool Hand Luke  Google Play Film & TV  Dari Rp 25.000,00
330  A Silent Voice: The Movie                Netflix          Langganan
331               Paris, Texas          Not Available      Not Available

[332 rows x 3 columns]>

Data Cleaning 

In [24]:
Streaming_df_cleaned = Streaming_df.dropna(how='any')  # Remove rows with any null values

# Clean up text fields
Streaming_df_cleaned['Title'] = (Streaming_df_cleaned['Title']
    .str.strip()  # Remove leading/trailing whitespace
    .str.title()  # Consistent capitalization
)

Streaming_df_cleaned['Streaming On'] = (Streaming_df_cleaned['Streaming On']
    .str.strip()
    .str.replace('  ', ' ')  # Remove double spaces
)

# Clean up price field
Streaming_df_cleaned['Price'] = (Streaming_df_cleaned['Price']
    .str.replace('Rp', '', regex=False)  # Remove dollar signs
    .str.replace('Dari ', '', regex=False)  # Remove "From " text
    .str.strip()
)

# Remove duplicate entries
Streaming_df_cleaned = Streaming_df_cleaned.drop_duplicates()

# Reset index after all cleaning
Streaming_df_cleaned = Streaming_df_cleaned.reset_index(drop=True)


In [25]:
Streaming_df_cleaned

Unnamed: 0,Title,Streaming On,Price
0,The Shawshank Redemption,Apple TV,"25.000,00"
1,The Godfather,Netflix,Langganan
2,The Godfather,Vidio,Langganan
3,The Godfather,Google Play Film & TV,Langganan
4,The Godfather,Apple TV,Langganan
...,...,...,...
327,The Help,Not Available,Not Available
328,Cool Hand Luke,Apple TV,"25.000,00"
329,Cool Hand Luke,Google Play Film & TV,"25.000,00"
330,A Silent Voice: The Movie,Netflix,Langganan


In [26]:
Streaming_df_cleaned.to_csv('../Output/Google_Scraping_Cleaned_Complete.csv', index=False)

In [27]:
import os
os.getcwd()

'c:\\Projects\\TUGAS\\ETL-Melodi\\Extract'