## Collecting the streaming options for top 250 IMDb rated movies

### Source Link: https://rapidapi.com/utelly/api/utelly

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import requests

from API_keys.config_omdb import omdb_key

import json 
import re
import time

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC

In [2]:
!where chromedriver

c:\Projects\TUGAS\ETL-Melodi\venv\Scripts\chromedriver.exe


#### Step 1: Collecting Top 250 IMDb from IMDb website

Source Link: https://www.imdb.com/chart/top/?ref_=nv_mv_250

In [4]:
# Initialize Chrome WebDriver (make sure you have chromedriver installed)
driver = webdriver.Chrome()

# Navigate to IMDb Top 250
url = "https://www.imdb.com/chart/top/"
driver.get(url)

# Wait for page to load and scroll down to load all movies
SCROLL_PAUSE_TIME = 2

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get page source after all content is loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Initialize lists to store data
movie_data = []

# Find all movie entries
movies = soup.find_all('li', class_='ipc-metadata-list-summary-item')

for movie in movies:
    movie_dict = {}
    
    # Extract title (remove ranking number from title)
    title = movie.find('h3', class_='ipc-title__text')
    if title:
        movie_dict['Title'] = title.text.strip()
    else:
        movie_dict['Title'] = 'Unknown'
    
    # Extract IMDB ID
    movie_link = movie.find('a', class_='ipc-title-link-wrapper')
    if movie_link:
        imdb_id = re.search(r'/title/(tt\d+)/', movie_link['href'])
        movie_dict['IMDb_ID'] = imdb_id.group(1) if imdb_id else 'Unknown'
    
    # Extract rating
    rating = movie.find('span', class_='ipc-rating-star--imdb')
    if rating:
        movie_dict['Rating'] = rating['aria-label'].split()[-1]
    
    # Extract year
    year = movie.find('span', class_='cli-title-metadata-item')
    movie_dict['Year'] = year.text if year else 'Unknown'
    
    movie_data.append(movie_dict)

# Close the browser
driver.quit()

# Create DataFrame
Raw_df = pd.DataFrame(movie_data)

# Display the data
print("Successfully scraped", len(Raw_df), "movies")
Raw_df.head()

Successfully scraped 249 movies


Unnamed: 0,Title,IMDb_ID,Rating,Year
0,1. The Shawshank Redemption,tt0111161,9.3,1994
1,2. The Godfather,tt0068646,9.2,1972
2,3. The Dark Knight,tt0468569,9.0,2008
3,4. The Godfather Part II,tt0071562,9.0,1974
4,5. 12 Angry Men,tt0050083,9.0,1957


### Separate 'Rank & Title' column to Rank and Title columns
---

In [5]:
# Extract rank numbers from title using str.extract()
Raw_df['Rank'] = Raw_df['Title'].str.extract(r'^(\d+)')

# Remove rank numbers and period from title 
Raw_df['Title'] = Raw_df['Title'].str.replace(r'^\d+\.\s*', '', regex=True)

# Reorder columns
Imdb_df = Raw_df[['Rank', 'Title', 'IMDb_ID', 'Year', 'Rating']]
Imdb_df.head()

Unnamed: 0,Rank,Title,IMDb_ID,Year,Rating
0,1,The Shawshank Redemption,tt0111161,1994,9.3
1,2,The Godfather,tt0068646,1972,9.2
2,3,The Dark Knight,tt0468569,2008,9.0
3,4,The Godfather Part II,tt0071562,1974,9.0
4,5,12 Angry Men,tt0050083,1957,9.0


In [6]:
Imdb_df.to_csv('../Output/OMDb_Utelly/Top_250_IMDb.csv', index=False)

### Step 2: Collecting IMDb unique ID and other movie details from OMDb API

Source Link: http://www.omdbapi.com/

In [None]:
url = "https://www.omdbapi.com/?apikey=" + omdb_key + "&t="

In [8]:
movie_list = Imdb_df['Title']

In [9]:
results_omdb = []

for movie in movie_list:
    movie_data = requests.get(url + movie).json()
    results_omdb.append(movie_data)

In [10]:
# Saving into a JSON file
# Source Link: https://stackabuse.com/reading-and-writing-json-files-in-python-with-pandas/

with open ('../Output/OMDb_Utelly/OMDb_250.json', 'w') as f:
    json.dump(results_omdb, f)

In [11]:
# reading JSON file through dataframe

raw_omdb_df = pd.read_json('../Output/OMDb_Utelly/OMDb_250.json')
raw_omdb_df.head

<bound method NDFrame.head of                          Title    Year      Rated     Released  Runtime  \
0     The Shawshank Redemption  1994.0          R  14 Oct 1994  142 min   
1                The Godfather  1972.0          R  24 Mar 1972  175 min   
2              The Dark Knight  2008.0      PG-13  18 Jul 2008  152 min   
3        The Godfather Part II  1974.0          R  18 Dec 1974  202 min   
4                 12 Angry Men  1957.0   Approved  10 Apr 1957   96 min   
..                         ...     ...        ...          ...      ...   
244              Amores Perros  2000.0          R  13 Apr 2001  154 min   
245                    Rebecca  1940.0   Approved  12 Apr 1940  130 min   
246                   The Help  2011.0      PG-13  10 Aug 2011  146 min   
247             Cool Hand Luke  1967.0   Approved  01 Nov 1967  127 min   
248  A Silent Voice: The Movie  2016.0  Not Rated  17 Sep 2016  130 min   

                       Genre                       Director  \
0     

In [12]:
raw_omdb_df.shape

(249, 26)

In [13]:
# Showing all column titles
list(raw_omdb_df.columns)

['Title',
 'Year',
 'Rated',
 'Released',
 'Runtime',
 'Genre',
 'Director',
 'Writer',
 'Actors',
 'Plot',
 'Language',
 'Country',
 'Awards',
 'Poster',
 'Ratings',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'imdbID',
 'Type',
 'DVD',
 'BoxOffice',
 'Production',
 'Website',
 'Response',
 'Error']

In [14]:
id_imdb_list = raw_omdb_df['imdbID']

In [15]:
len(id_imdb_list)

249

In [16]:
print(id_imdb_list)

0      tt0111161
1      tt0068646
2      tt0468569
3      tt0071562
4      tt0050083
         ...    
244    tt0245712
245    tt0032976
246    tt1454029
247    tt0061512
248    tt5323662
Name: imdbID, Length: 249, dtype: object


### Step 3: Collecting streaming options for Top 250 IMDb movies

Endpoint: GET/idlookup

In [17]:
from API_keys.config import api_key

In [18]:
url = "https://utelly-tv-shows-and-movies-availability-v1.p.rapidapi.com/idlookup"

querystring = {"country":"us","source":"imdb"}

headers = {
	"x-rapidapi-key": api_key,
	"x-rapidapi-host": "utelly-tv-shows-and-movies-availability-v1.p.rapidapi.com"
}

In [19]:
id_imdb_list_sample = ['tt0111161', 'tt0068646', 'tt0071562']

In [20]:
# Convert pandas Series to clean Python list
id_imdb_list = id_imdb_list.tolist()

imdb_id = []
title = []
streaming_service = []
streaming_url = []
all_requests_json = []

for id_imdb in id_imdb_list:
    time.sleep(2)
    querystring = {
            "source_id": id_imdb,
            "source": "imdb",
            "country": "us"
        }
    
    try:
        response = requests.request("GET", url, headers=headers, params=querystring)

        json_result = response.json()
        all_requests_json.append(json_result)

        for location in json_result['collection']['locations']:
            imdb_id.append(json_result['id'])
            title.append(json_result['collection']['name'])
            streaming_service.append(location['display_name'])
            streaming_url.append(location['url'])
        
    except:
        print('Data Not Available')
        print(id_imdb)
        print('\n')

Data Not Available
tt15239678


Data Not Available
nan


Data Not Available
tt23849204


Data Not Available
tt5074352


Data Not Available
tt0055031


Data Not Available
tt29623480


Data Not Available
tt26548265


Data Not Available
tt0476735


Data Not Available
tt0035446


Data Not Available
tt0032976




In [21]:
with open ('../Output/OMDB_Utelly/json_files/utelly_all_requests_json.json', 'w') as f:
    json.dump(all_requests_json, f)

In [22]:
streaming_dict = {'IMDb ID': imdb_id,
                 'Title': title,
                 'Streaming Service': streaming_service,
                 'Streaming URL' : streaming_url
                }

In [23]:
streaming_df = pd.DataFrame(streaming_dict)
streaming_df

Unnamed: 0,IMDb ID,Title,Streaming Service,Streaming URL
0,tt0111161,The Shawshank Redemption,Amazon Instant Video,https://www.amazon.com/gp/video/detail/B001EBV...
1,tt0111161,The Shawshank Redemption,Google Play,https://play.google.com/store/movies/details/T...
2,tt0111161,The Shawshank Redemption,iTunes,https://tv.apple.com/us/movie/the-shawshank-re...
3,tt0068646,The Godfather,Amazon Instant Video,https://www.amazon.com/gp/video/detail/B00BQRP...
4,tt0068646,The Godfather,Google Play,https://play.google.com/store/movies/details/T...
...,...,...,...,...
758,tt0061512,Cool Hand Luke,iTunes,https://tv.apple.com/us/movie/cool-hand-luke/u...
759,tt0061512,Cool Hand Luke,Amazon Prime Video,https://watch.amazon.com/detail?asin=B002V6RB1...
760,tt5323662,A Silent Voice: The Movie,Amazon Instant Video,https://www.amazon.com/gp/video/detail/B08DRR9...
761,tt5323662,A Silent Voice: The Movie,Google Play,https://play.google.com/store/movies/details/A...


In [24]:
streaming_df.shape

(763, 4)

In [25]:
streaming_df.to_csv('../Output/OMDb_Utelly/streaming_df.csv', index=False)