# Google Scraping
---

In [24]:
import sys
sys.path.append('../')

import pandas as pd

import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

browser = webdriver.Chrome()


In [25]:
!where chromedriver

d:\Applications\anaconda3\envs\cuda\chromedriver.exe


### Scraping IMDB for Top 250 movies
___

In [26]:
# Initialize Chrome WebDriver (make sure you have chromedriver installed)
driver = webdriver.Chrome()

# Navigate to IMDb Top 250
url = "https://www.imdb.com/chart/top/"
driver.get(url)

# Wait for page to load and scroll down to load all movies
SCROLL_PAUSE_TIME = 2

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get page source after all content is loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Initialize lists to store data
movie_data = []

# Find all movie entries
movies = soup.find_all('li', class_='ipc-metadata-list-summary-item')

for movie in movies:
    movie_dict = {}
    
    # Extract title (remove ranking number from title)
    title = movie.find('h3', class_='ipc-title__text')
    if title:
        movie_dict['Title'] = title.text.strip()
    else:
        movie_dict['Title'] = 'Unknown'
    
    # Extract IMDB ID
    movie_link = movie.find('a', class_='ipc-title-link-wrapper')
    if movie_link:
        imdb_id = re.search(r'/title/(tt\d+)/', movie_link['href'])
        movie_dict['IMDb_ID'] = imdb_id.group(1) if imdb_id else 'Unknown'
    
    # Extract rating
    rating = movie.find('span', class_='ipc-rating-star--imdb')
    if rating:
        movie_dict['Rating'] = rating['aria-label'].split()[-1]
    
    # Extract year
    year = movie.find('span', class_='cli-title-metadata-item')
    movie_dict['Year'] = year.text if year else 'Unknown'
    
    movie_data.append(movie_dict)

# Close the browser
driver.quit()

# Create DataFrame
Raw_df = pd.DataFrame(movie_data)

# Display the data
print("Successfully scraped", len(Raw_df), "movies")
Raw_df.head()

Successfully scraped 250 movies


Unnamed: 0,Title,IMDb_ID,Rating,Year
0,1. The Shawshank Redemption,tt0111161,9.3,1994
1,2. The Godfather,tt0068646,9.2,1972
2,3. The Dark Knight,tt0468569,9.0,2008
3,4. The Godfather Part II,tt0071562,9.0,1974
4,5. 12 Angry Men,tt0050083,9.0,1957


### Separate 'Rank & Title' column to Rank and Title columns
---

In [27]:
# Extract rank numbers from title using str.extract()
Raw_df['Rank'] = Raw_df['Title'].str.extract(r'^(\d+)')

# Remove rank numbers and period from title 
Raw_df['Title'] = Raw_df['Title'].str.replace(r'^\d+\.\s*', '', regex=True)

# Reorder columns
Imdb_df = Raw_df[['Rank', 'Title', 'IMDb_ID', 'Year', 'Rating']]
Imdb_df.head()

Unnamed: 0,Rank,Title,IMDb_ID,Year,Rating
0,1,The Shawshank Redemption,tt0111161,1994,9.3
1,2,The Godfather,tt0068646,1972,9.2
2,3,The Dark Knight,tt0468569,2008,9.0
3,4,The Godfather Part II,tt0071562,1974,9.0
4,5,12 Angry Men,tt0050083,1957,9.0


### Building the URL to scrape Google
---

In [28]:
movies = Imdb_df['Title']
movies = movies.str.replace(r"[,:'.]", '', regex=True)

In [29]:
base_url = 'https://www.google.com/search?&q='

movie_list = movies.str.lower().str.replace(' ', '+', n = -1, case=None, regex=True)

query_url=[]

for movie in movie_list:
    query_url.append(f'{base_url}{movie}+watch+movie')

In [30]:
google_query_url_df = pd.DataFrame({'Rank': Imdb_df['Rank'],
                                    'Title': Imdb_df['Title'],
                                    'Google Query URL' : query_url
                                   })

google_query_url_df = google_query_url_df.set_index(['Rank'])

google_query_url_df.head(10)

Unnamed: 0_level_0,Title,Google Query URL
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Shawshank Redemption,https://www.google.com/search?&q=the+shawshank...
2,The Godfather,https://www.google.com/search?&q=the+godfather...
3,The Dark Knight,https://www.google.com/search?&q=the+dark+knig...
4,The Godfather Part II,https://www.google.com/search?&q=the+godfather...
5,12 Angry Men,https://www.google.com/search?&q=12+angry+men+...
6,The Lord of the Rings: The Return of the King,https://www.google.com/search?&q=the+lord+of+t...
7,Schindler's List,https://www.google.com/search?&q=schindlers+li...
8,Pulp Fiction,https://www.google.com/search?&q=pulp+fiction+...
9,The Lord of the Rings: The Fellowship of the Ring,https://www.google.com/search?&q=the+lord+of+t...
10,"The Good, the Bad and the Ugly",https://www.google.com/search?&q=the+good+the+...


In [31]:
google_query_url_df.to_csv('../Output/IMDb/Google_Query_Url.csv')

#### Sample Google scraping
---

In [32]:
def find_imdb_url(browser, max_pages=5):
    imdb_pattern = r'https://www\.imdb\.com/title/[^/\s"]+'
    
    for page in range(max_pages):
        # Search in current page
        page_source = browser.page_source
        matches = re.findall(imdb_pattern, page_source)
        
        if matches:
            return matches[0]
            
        # Try next page if available
        try:
            next_button = browser.find_element(By.ID, 'pnnext')
            next_button.click()
            time.sleep(3)
        except:
            break
    return None

# Main scraping code
sample = 'The Godfather'
base_url = 'https://www.google.com/search?&q='
query_url = f'{base_url}{sample}+watch+movie'

try:
    browser.get(query_url)
    time.sleep(5)
    
    # Get streaming data
    soup = BeautifulSoup(browser.page_source, 'lxml')
    
    streaming = []
    title = []
    price = []

    results1 = soup.find_all('div', class_='ellip bclEt')
    for result in results1:
        streaming.append(result.text)
        title.append(sample.capitalize())

    results2 = soup.find_all('div', class_='ellip rsj3fb')
    for result in results2:
        price.append(result.text)
        
    # Find IMDb URL by scanning through pages
    imdb_url = find_imdb_url(browser)
    
    # Create DataFrame
    Sample_Streaming_df = pd.DataFrame({
        'Title': title,
        'Streaming On': streaming,
        'Price': price,
        'IMDb_URL': imdb_url
    })

    Sample_Streaming_df['Price'] = Sample_Streaming_df['Price'].str.replace("$","", case=True, regex=True)
    Sample_Streaming_df['Price'] = Sample_Streaming_df['Price'].str.replace("From ","", case=True, regex=True)

except Exception as e:
    print(f"Error: {e}")

Sample_Streaming_df

Unnamed: 0,Title,Streaming On,Price,IMDb_URL
0,The godfather,Netflix,Subscription,
1,The godfather,Vidio,Subscription,
2,The godfather,Google Play Movies & TV,"Rp 25.000,00",
3,The godfather,Apple TV,"Rp 25.000,00",


In [33]:
Sample_Streaming_df.to_csv('../Output/IMDb/Sample_Google_Scraping.csv')

## Scraping Google 
---

In [12]:
Streaming = []
Title = []
Price = []
IMDb_URLs = []

count = 0

movies = google_query_url_df['Title']

query_urls = google_query_url_df['Google Query URL']

base_url = 'https://www.google.com/search?&q='

In [13]:
def find_imdb_url(browser, max_pages=5):
    imdb_pattern = r'https://www\.imdb\.com/title/[^/\s"]+'
    
    for page in range(max_pages):
        # Search in current page
        page_source = browser.page_source
        matches = re.findall(imdb_pattern, page_source)
        
        if matches:
            return matches[0]
            
        # Try next page if available
        try:
            next_button = browser.find_element(By.ID, 'pnnext')
            next_button.click()
            time.sleep(2)
        except:
            break
    return None

# Loop through all movies in the query list
for count in range(len(movies)):
    try:
        # Construct and visit URL
        query_url = f'{base_url}{movies[count]}+watch+movie'
        browser.get(query_url)
        time.sleep(2)
        
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # Look for streaming services
        results1 = soup.find_all('div', class_='ellip bclEt')
        if results1:
            for result in results1:
                Streaming.append(result.text)
                Title.append(movies[count].capitalize())
                
                price_div = soup.find('div', class_='ellip rsj3fb')
                Price.append(price_div.text if price_div else 'Not Available')
                
                # Find IMDb URL for each streaming entry
                imdb_url = find_imdb_url(browser)
                IMDb_URLs.append(imdb_url)
                
                # Return to original search results
                browser.get(query_url)
                time.sleep(2)
        else:
            Title.append(movies[count].capitalize())
            Streaming.append('Not Available') 
            Price.append('Not Available')
            IMDb_URLs.append('Not Available')
            
        print(f"Processed {count + 1}/250: {movies[count]}")
            
    except Exception as e:
        print(f"Error processing {movies[count]}: {str(e)}")
        Title.append(movies[count].capitalize())
        Streaming.append('Error')
        Price.append('Error')
        IMDb_URLs.append('Error')
        continue
        
    # Save progress periodically
    if count % 50 == 0 and count > 0:
        temp_df = pd.DataFrame({
            'Title': Title,
            'Streaming On': Streaming, 
            'Price': Price,
            'IMDb_URL': IMDb_URLs
        })
        temp_df.to_csv(f'../Output/IMDb/Google_Checkpoints/Google_Scraping_checkpoint_{count}.csv', index=False)

# Create final dataframe
Streaming_df = pd.DataFrame({
    'Title': Title,
    'Streaming On': Streaming,
    'Price': Price,
    'IMDb_URL': IMDb_URLs
})

# Save complete results
Streaming_df.to_csv('../Output/IMDb/Google_Checkpoints/Google_Scraping_complete.csv', index=False)

print(f"Completed processing {len(movies)} movies")

  query_url = f'{base_url}{movies[count]}+watch+movie'
  Title.append(movies[count].capitalize())
  print(f"Processed {count + 1}/250: {movies[count]}")


Processed 1/250: The Shawshank Redemption
Processed 2/250: The Godfather
Processed 3/250: The Dark Knight
Processed 4/250: The Godfather Part II
Processed 5/250: 12 Angry Men
Processed 6/250: The Lord of the Rings: The Return of the King
Processed 7/250: Schindler's List
Processed 8/250: Pulp Fiction
Processed 9/250: The Lord of the Rings: The Fellowship of the Ring
Processed 10/250: The Good, the Bad and the Ugly
Processed 11/250: Forrest Gump
Processed 12/250: The Lord of the Rings: The Two Towers


  Title.append(movies[count].capitalize())


Processed 13/250: Fight Club
Processed 14/250: Inception
Processed 15/250: Star Wars: Episode V - The Empire Strikes Back
Processed 16/250: The Matrix
Processed 17/250: GoodFellas
Processed 18/250: One Flew Over the Cuckoo's Nest
Processed 19/250: Interstellar
Processed 20/250: Seven
Processed 21/250: It's a Wonderful Life
Processed 22/250: Seven Samurai
Processed 23/250: The Silence of the Lambs
Processed 24/250: Saving Private Ryan
Processed 25/250: City of God
Processed 26/250: The Green Mile
Processed 27/250: Life Is Beautiful
Processed 28/250: Terminator 2: Judgment Day
Processed 29/250: Star Wars: Episode IV - A New Hope
Processed 30/250: Back to the Future
Processed 31/250: Spirited Away
Processed 32/250: The Pianist
Processed 33/250: Parasite
Processed 34/250: Psycho
Processed 35/250: Gladiator
Processed 36/250: The Lion King
Processed 37/250: The Departed
Processed 38/250: Spider-Man: Across the Spider-Verse
Processed 39/250: Grave of the Fireflies
Processed 40/250: Whiplash
P

In [None]:
browser.quit()

In [18]:
Streaming_df = pd.DataFrame({'Title': Title,
                             'Streaming On' : Streaming,
                             'Price' : Price,
                             'IMDb_URL': IMDb_URLs
                            })


In [19]:
Streaming_df.head

<bound method NDFrame.head of                         Title             Streaming On              Price  \
0    The shawshank redemption                      Max       Subscription   
1    The shawshank redemption                 Apple TV       Subscription   
2               The godfather                  Netflix       Subscription   
3               The godfather                    Vidio       Subscription   
4               The godfather  Google Play Movies & TV       Subscription   
..                        ...                      ...                ...   
350                  The help            Not Available      Not Available   
351            Cool hand luke                 Apple TV  From Rp 25.000,00   
352            Cool hand luke  Google Play Movies & TV  From Rp 25.000,00   
353            A silent voice                  Netflix       Subscription   
354              Paris, texas            Not Available      Not Available   

                                 IMDb_URL  
0

Data Cleaning 

In [20]:
Streaming_df_cleaned = Streaming_df.dropna(how='any')  # Remove rows with any null values

# Clean up text fields
Streaming_df_cleaned['Title'] = (Streaming_df_cleaned['Title']
    .str.strip()  # Remove leading/trailing whitespace
    .str.title()  # Consistent capitalization
)

Streaming_df_cleaned['Streaming On'] = (Streaming_df_cleaned['Streaming On']
    .str.strip()
    .str.replace('  ', ' ')  # Remove double spaces
)

# Clean up price field
Streaming_df_cleaned['Price'] = (Streaming_df_cleaned['Price']
    .str.replace('Rp', '', regex=False)  # Remove dollar signs
    .str.replace('Dari ', '', regex=False)  # Remove "From " text
    .str.strip()
)

# Clean IMDb URLs and extract IDs
Streaming_df_cleaned['IMDb_URL'] = (Streaming_df_cleaned['IMDb_URL']
    .str.strip()
    .str.rstrip('/')  # Remove trailing slashes
)

# Extract IMDb ID using regex
Streaming_df_cleaned['imdb_id'] = (Streaming_df_cleaned['IMDb_URL']
    .str.extract(r'title/(tt\d+)', expand=False)  # Extract ttXXXXXXX pattern
    .fillna('Not Available')  # Handle missing/invalid URLs
)

# Verify IMDb IDs are in correct format
Streaming_df_cleaned.loc[~Streaming_df_cleaned['imdb_id'].str.match(r'^tt\d+$|^Not Available$'), 'imdb_id'] = 'Invalid Format'

# Drop IMDb_URL column
Streaming_df_cleaned = Streaming_df_cleaned.drop('IMDb_URL', axis=1)

# Remove duplicate entries
Streaming_df_cleaned = Streaming_df_cleaned.drop_duplicates()

# Reset index after all cleaning
Streaming_df_cleaned = Streaming_df_cleaned.reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Streaming_df_cleaned['Title'] = (Streaming_df_cleaned['Title']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Streaming_df_cleaned['Streaming On'] = (Streaming_df_cleaned['Streaming On']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Streaming_df_cleaned['Price'] = (Streaming_df_cleaned['Price']
A 

In [21]:
Streaming_df_cleaned

Unnamed: 0,Title,Streaming On,Price,imdb_id
0,The Shawshank Redemption,Max,Subscription,tt0111161
1,The Shawshank Redemption,Apple TV,Subscription,tt0111161
2,The Dark Knight,Max,Subscription,tt0468569
3,The Dark Knight,Google Play Movies & TV,Subscription,tt0468569
4,The Dark Knight,Apple TV,Subscription,tt0468569
...,...,...,...,...
285,The Help,Not Available,Not Available,Not Available
286,Cool Hand Luke,Apple TV,"From 25.000,00",tt0061512
287,Cool Hand Luke,Google Play Movies & TV,"From 25.000,00",tt0061512
288,A Silent Voice,Netflix,Subscription,tt5323662


In [22]:
Streaming_df_cleaned.to_csv('../Output/IMDb/Google_Scraping_Cleaned_Complete.csv', index=False)

In [23]:
import os
os.getcwd()

'e:\\VSCProjects\\ETL-Melodi\\Extract'