In [1]:
import pandas as pd
pd.set_option('display.max_rows',500)
import numpy as np
import os
import gzip
import re

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

In [6]:
def preparing_data(path, platform):
    df = pd.read_csv(path)
    df['platform'] = platform
    return df

In [7]:
def get_nulls(df):
    return df[df['imdb_rating'].isnull()].shape
    

In [8]:
netflix = preparing_data('data\\kaggle_data\\netflix_titles.csv', 'netflix')

In [9]:
amazon = preparing_data('data\\kaggle_data\\amazon_prime_titles.csv', 'amazon')

In [10]:
disney = preparing_data('data\\kaggle_data\\disney_plus_titles.csv', 'disney')

In [11]:
data = pd.concat([netflix, amazon, disney])

In [18]:
data[data.title.str.contains('The I')]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,platform
178,s179,Movie,The Interview,"Evan Goldberg, Seth Rogen","James Franco, Seth Rogen, Lizzy Caplan, Randal...",United States,"September 1, 2021",2014,R,112 min,"Action & Adventure, Comedies",Seth Rogen and James Franco star in this provo...,netflix
562,s563,Movie,Austin Powers: International Man of Mystery,Jay Roach,"Mike Myers, Elizabeth Hurley, Michael York, Mi...",United States,"July 1, 2021",1997,PG-13,90 min,Comedies,A swingin' fashion photographer by day and a g...,netflix
741,s742,Movie,Blind Intersections,Lara Saba,"Alaa Hammoud, Ghida Nouri, Carole Hajj, Chadi ...","Lebanon, United Arab Emirates","June 10, 2021",2012,TV-MA,91 min,"Dramas, International Movies",This film follows the societal challenges thre...,netflix
816,s817,Movie,The Interpreter,Sydney Pollack,"Nicole Kidman, Sean Penn, Catherine Keener, Je...","United Kingdom, France, Germany, United States","June 2, 2021",2005,PG-13,128 min,Thrillers,Silvia Broome is a translator at the United Na...,netflix
1274,s1275,TV Show,Canine Intervention,,Jas Leverette,United States,"February 24, 2021",2021,TV-PG,1 Season,Reality TV,"No dog, no breed, no behavior is unfixable for...",netflix
2458,s2459,Movie,Kenny Sebastian: The Most Interesting Person i...,Angshuman Ghosh,Kenny Sebastian,India,"May 29, 2020",2020,TV-MA,68 min,Stand-Up Comedy,"Fusing his musical and stand-up chops, Kenny S...",netflix
2636,s2637,Movie,The International Player,Ahmed Medhat,"Youssef El Sherif, Arwa Gouda, Salah Abdullah,...",Egypt,"April 25, 2020",2009,TV-14,93 min,"Dramas, International Movies, Sports Movies",A footballer for a local club aspires to join ...,netflix
2748,s2749,TV Show,Well-Intended Love,,"Xu Kaicheng, Simona Wang, Ian Yi, Huang Qian S...",China,"April 1, 2020",2020,TV-14,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...","To secure a bone marrow donation, an actress d...",netflix
3121,s3122,TV Show,Don't F**k with Cats: Hunting an Internet Killer,Mark Lewis,,"United Kingdom, United States","December 18, 2019",2019,TV-MA,1 Season,"Crime TV Shows, Docuseries",A twisted criminal's gruesome videos drive a g...,netflix
3400,s3401,TV Show,Interior Design Masters,,,United Kingdom,"October 18, 2019",2019,TV-PG,1 Season,"British TV Shows, International TV Shows, Real...",Aspiring interior designers transform a variet...,netflix


In [12]:
r = data[data.title == 'The Interview']

### Selenium extraction

In [10]:
os.environ['WDM_LOG_LEVEL'] = '0'
browser = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(browser, 10)
browser.maximize_window()
browser.get("https://www.imdb.com/")
# Wait 10 seconds for available elements to appear -> Is it necessary if I have a wait.until right after ?
# browser.implicitly_wait(10)
# Wait until the search button is available -> I store the path XPath of the button in the search variable
# I insert the title in the search box and after waiting 7 second I click the search button to look for all results
search = wait.until(ec.element_to_be_clickable((By.XPATH, '//*[@id="suggestion-search"]')))
search.send_keys('Interstellar')
search_button = browser.find_element_by_id('suggestion-search-button')
search_button.click()





In [3]:
def get_rating(value):
    
    try:
    
        os.environ['WDM_LOG_LEVEL'] = '0'
        browser = webdriver.Chrome(ChromeDriverManager().install())
        wait = WebDriverWait(browser, 10)
        browser.maximize_window()
        browser.get("https://www.imdb.com/")
        # Wait 10 seconds for available elements to appear -> Is it necessary if I have a wait.until right after ?
        #browser.implicitly_wait(10)
        # Wait until the search button is available -> I store the path XPath of the button in the search variable
        # I insert the title in the search box and after waiting 7 second I click the search button to look for all results
        search = wait.until(ec.element_to_be_clickable((By.XPATH, '//*[@id="suggestion-search"]')))
        search.send_keys(value)
        browser.implicitly_wait(7)

        search_button = browser.find_element_by_id('suggestion-search-button')
        search_button.click()

        #data_year = str(l.release_year[l.title == value].item())
        # I store the release year of the title in a string variable to better find the right title in IMdb
        data_year = str(data.release_year[data.title == value].to_list()[0])

        try:
            # I identify the table with all the results from the search that correspond to the searched title
            results_table = browser.find_element_by_class_name('findList')
            # I iterate thorugh the table and get the number of rows availale
            rows = results_table.find_elements(By.TAG_NAME, "tr")
            col = [row.find_elements(By.TAG_NAME, "td")[1].text for row in rows]
            # With a FOR loop I iterate through the table using the number of lines as range
            for rg in range(0, len(col)):
                try:
                    #There might results that share the exact same title, so I use the date in the df to identify the correct one
                    # by comparing it to the date in IMdb
                    imdb_year = re.findall('\(\d{4}\)',col[rg])[0].replace('(','').replace(')','')
                    if imdb_year == data_year:
                        '''
                        Wait for the table with the different results to be available -> explain Explicit Wait
                        selenium documentation -> https://selenium-python.readthedocs.io/waits.html
                        '''
                        # If the release year matches the function clicks the title and extract the rating
                        search_result = wait.until(ec.element_to_be_clickable((By.XPATH, f'/html/body/div[3]/div/div[2]/div[3]/div[1]/div/div[2]/table/tbody/tr[{rg+1}]/td[2]/a')))
                        search_result.click()
                        rating = browser.find_element_by_class_name('AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV').text
                        # After the rating has been obtained the browser closes and starts again with the next title
                        browser.close()
                        return rating
                    else:
                        pass

                except: 
                    pass
        except:
            pass
    except:
        pass

In [13]:
os.environ['WDM_LOG_LEVEL'] = '0'
browser = webdriver.Chrome(ChromeDriverManager().install())





In [7]:
amazon_null = amazon[amazon.imdb_rating.isnull()]

In [11]:
pd.set_option('display.max_rows',5000)

In [22]:
amazon_dir = amazon_null[~amazon_null.director.isnull()]

In [23]:
amazon_dir.shape

(3398, 14)

In [5]:
def rating_2(value):
    
    try:
        os.environ['WDM_LOG_LEVEL'] = '0'
        browser = webdriver.Chrome(ChromeDriverManager().install())
        wait = WebDriverWait(browser, 10)
        browser.maximize_window()
        browser.get("https://www.imdb.com/")
        browser.implicitly_wait(10)

        search = wait.until(ec.element_to_be_clickable((By.XPATH, '//*[@id="suggestion-search"]')))
        search.send_keys(value)
        browser.implicitly_wait(7)

        search_button = browser.find_element_by_id('suggestion-search-button')
        search_button.click()
        search_result = wait.until(ec.element_to_be_clickable((By.XPATH, f'/html/body/div[3]/div/div[2]/div[3]/div[1]/div/div[2]/table/tbody/tr[1]/td[2]/a')))
        search_result.click()
        rating = browser.find_element_by_class_name('AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV').text
        browser.close()
        return rating
    except:
        pass

In [16]:
amazon_null.to_csv(r'C:\Users\juanp\proyectos_varios\streaming_platforms\data\notebook_df\amazon_null.csv' , index = False)

In [81]:
os.environ['WDM_LOG_LEVEL'] = '0'
browser = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(browser, 10)
browser.maximize_window()
browser.get("https://www.imdb.com/")
# Wait 10 seconds for available elements to appear -> Is it necessary if I have a wait.until right after ?
browser.implicitly_wait(10)
# Wait until the search button is available -> I store the path XPath of the button in the search variable
# I insert the title in the search box and after waiting 7 second I click the search button to look for all results
search = wait.until(ec.element_to_be_clickable((By.XPATH, '//*[@id="suggestion-search"]')))
search.send_keys('The Interview')
browser.implicitly_wait(7)

search_button = browser.find_element_by_id('suggestion-search-button')
search_button.click()

#There might results that share the exact same title, so I use the date in the df to identify the correct one
# by comparing it to the date in IMdb
'''
Wait for the table with the different results to be available -> explain Explicit Wait
selenium documentation -> https://selenium-python.readthedocs.io/waits.html
'''
# If the release year matches the function clicks the title and extract the rating
search_result = wait.until(ec.element_to_be_clickable((By.XPATH, f'/html/body/div[3]/div/div[2]/div[3]/div[1]/div/div[2]/table/tbody/tr[1]/td[2]/a')))
search_result.click()
rating_ = browser.find_element_by_class_name('ipc-button__text')
rating_.find_element(By.TAG_NAME, "span")


#r_2 = wait.until(ec.element_to_be_clickable((By.XPATH, 
                                                  #'/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[2]/div/div[1]/a')))

# s_ = r_.find_elements(By.TAG_NAME, "div")

# members = wait.until(ec.element_to_be_clickable((By.XPATH, 
                                                  #'/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[3]/ul')))
# lineas = members.find_elements(By.TAG_NAME, "li")

#<span class="sc-7ab21ed2-1 jGRxWM">6.5</span>
#rating = browser.find_element_by_class_name('AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV').text
# After the rating has been obtained the browser closes and starts again with the next title
browser.close()





NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"span"}
  (Session info: chrome=100.0.4896.127)


In [73]:
rating

NameError: name 'rating' is not defined