In [2]:
import pandas as pd
pd.set_option('display.max_rows',2000)
import numpy as np
import os
import gzip
import re

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

In [4]:
def preparing_data(path, platform):
    df = pd.read_csv(path)
    df['platform'] = platform
    return df

In [5]:
def get_nulls(df):
    return df[df['imdb_rating'].isnull()].shape
    

In [6]:
netflix = preparing_data('data\\kaggle_data\\netflix_titles.csv', 'netflix')

In [7]:
netflix.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'platform'],
      dtype='object')

In [29]:
amazon = preparing_data('data\\kaggle_data\\amazon_prime_titles.csv', 'amazon')

In [30]:
disney = preparing_data('data\\kaggle_data\\disney_plus_titles.csv', 'disney')

In [31]:
data = pd.concat([netflix, amazon, disney])

In [32]:
data = data.drop_duplicates(subset='title', keep=False)

In [33]:
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,platform
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",netflix
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",netflix
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,netflix
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",netflix
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,netflix


In [34]:
data = data[~(data.director.isnull()) | ~(data.cast.isnull())]

In [35]:
data.fillna('unknown', axis = 1, inplace = True)

In [36]:
data['full_cast'] = data['cast'].astype('str') + ',' + data['director'] 

In [37]:
data

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,platform,full_cast
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,unknown,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",netflix,"unknown,Kirsten Johnson"
1,s2,TV Show,Blood & Water,unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",netflix,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",unknown,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,netflix,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi..."
4,s5,TV Show,Kota Factory,unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,netflix,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K..."
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",unknown,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...,netflix,"Kate Siegel, Zach Gilford, Hamish Linklater, H..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445,s1446,Movie,X-Men Origins: Wolverine,Gavin Hood,"Hugh Jackman, Liev Schreiber, Danny Huston, wi...","United States, United Kingdom","June 4, 2021",2009,PG-13,108 min,"Action-Adventure, Family, Science Fiction",Wolverine unites with legendary X-Men to fight...,disney,"Hugh Jackman, Liev Schreiber, Danny Huston, wi..."
1446,s1447,Movie,Night at the Museum: Battle of the Smithsonian,Shawn Levy,"Ben Stiller, Amy Adams, Owen Wilson, Hank Azar...","United States, Canada","April 2, 2021",2009,PG,106 min,"Action-Adventure, Comedy, Family",Larry Daley returns to rescue some old friends...,disney,"Ben Stiller, Amy Adams, Owen Wilson, Hank Azar..."
1447,s1448,Movie,Eddie the Eagle,Dexter Fletcher,"Tom Costello, Jo Hartley, Keith Allen, Dickon ...","United Kingdom, Germany, United States","December 18, 2020",2016,PG-13,107 min,"Biographical, Comedy, Drama","True story of Eddie Edwards, a British ski-jum...",disney,"Tom Costello, Jo Hartley, Keith Allen, Dickon ..."
1448,s1449,Movie,Bend It Like Beckham,Gurinder Chadha,"Parminder Nagra, Keira Knightley, Jonathan Rhy...","United Kingdom, Germany, United States","September 18, 2020",2003,PG-13,112 min,"Buddy, Comedy, Coming of Age",Despite the wishes of their traditional famili...,disney,"Parminder Nagra, Keira Knightley, Jonathan Rhy..."


### Selenium extraction

In [38]:
def get_rating(value):
    
    # Extract the list of full_cast to later find the right title
    #full_cast = data[data.title == value].full_cast.item().split(',')
    
    full_cast = data[data.title==value].full_cast.item().split(',')
    
    len_full_cast = len(full_cast)
    
    try:
        # Set selenium lines for scrapping
        os.environ['WDM_LOG_LEVEL'] = '0'
        browser = webdriver.Chrome(ChromeDriverManager().install())
        wait = WebDriverWait(browser, 10)
        browser.maximize_window()
        browser.get("https://www.imdb.com/")
        # Wait 10 seconds for available elements to appear -> Is it necessary if I have a wait.until right after ?
        #browser.implicitly_wait(10)
        # Wait until the search button is available -> I store the path XPath of the button in the search variable
        # I insert the title in the search box and after waiting 7 second I click the search button to look for all results
        search = wait.until(ec.element_to_be_clickable((By.XPATH, '//*[@id="suggestion-search"]')))
        search.send_keys(value)
        browser.implicitly_wait(7)


        search_button = browser.find_element_by_id('suggestion-search-button')
        search_button.click()

        try:        
        # I identify the table with all the results from the search that correspond to the searched title
            results_table = browser.find_element_by_class_name('findList')
            # I iterate thorugh the table and get the number of rows availale
            rows = results_table.find_elements(By.TAG_NAME, "tr")
            col = [row.find_elements(By.TAG_NAME, "td")[1].text for row in rows]

            # With a FOR loop I iterate through the table using the number of lines as range
            for rg in range(0, len(rows)):
                try:
                    search_result = wait.until(ec.element_to_be_clickable((By.XPATH, f'/html/body/div[3]/div/div[2]/div[3]/div[1]/div/div[2]/table/tbody/tr[{rg+1}]/td[2]/a')))
                    search_result.click()    

                    movie_team = wait.until(ec.element_to_be_clickable((By.XPATH, 
                                                                      '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[3]/ul')))
                    members = movie_team.find_elements(By.TAG_NAME, "li")

                    members_imdb_list = [member.text for member in members]

                    count = 0

                    for cast in full_cast:

                        count+=1

                        if cast.strip() in members_imdb_list:

                            rating = browser.find_element_by_class_name('sc-7ab21ed2-1.jGRxWM').text
                            return rating
                            #browser.close()
                            break

                        elif count == len(full_cast):
                            browser.back()

                        else:
                            pass     
                except:
                    passs

        except:
            pass
    except:
        pass


In [47]:
round_8 = data[4000: 4500]

In [51]:
round_8.shape

(500, 15)

In [45]:
round_8.imdb_rating.isnull().sum()

83

In [49]:
round_8['imdb_rating'] = list(map(get_rating, round_8.title))











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  round_8['imdb_rating'] = list(map(get_rating, round_8.title))


In [55]:
round_8.to_csv('C:\\Users\\juanp\\proyectos_varios\\streaming_platforms\\data\\notebook_df\\round_8.csv', index = False)

In [55]:
full_cast = [ 'Gordon Dav', 'Nate Duncan']

len_full_cast = len(full_cast)

# Set selenium lines for scrapping
os.environ['WDM_LOG_LEVEL'] = '0'
browser = webdriver.Chrome(ChromeDriverManager().install())
wait = WebDriverWait(browser, 10)
browser.maximize_window()
browser.get("https://www.imdb.com/")
# Wait 10 seconds for available elements to appear -> Is it necessary if I have a wait.until right after ?
#browser.implicitly_wait(10)
# Wait until the search button is available -> I store the path XPath of the button in the search variable
# I insert the title in the search box and after waiting 7 second I click the search button to look for all results
search = wait.until(ec.element_to_be_clickable((By.XPATH, '//*[@id="suggestion-search"]')))
search.send_keys('The Interview')
browser.implicitly_wait(7)


search_button = browser.find_element_by_id('suggestion-search-button')
search_button.click()

# I identify the table with all the results from the search that correspond to the searched title
results_table = browser.find_element_by_class_name('findList')
# I iterate thorugh the table and get the number of rows availale
rows = results_table.find_elements(By.TAG_NAME, "tr")
col = [row.find_elements(By.TAG_NAME, "td")[1].text for row in rows]

# With a FOR loop I iterate through the table using the number of lines as range
for rg in range(0, len(rows)):
    search_result = wait.until(ec.element_to_be_clickable((By.XPATH, f'/html/body/div[3]/div/div[2]/div[3]/div[1]/div/div[2]/table/tbody/tr[{rg+1}]/td[2]/a')))
    search_result.click()    

    movie_team = wait.until(ec.element_to_be_clickable((By.XPATH, 
                                                      '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[3]/ul')))
    members = movie_team.find_elements(By.TAG_NAME, "li")

    members_imdb_list = [member.text for member in members]

    count = 0
    
    print(len_full_cast)

    for cast in full_cast:
        count+=1
        print(count)
        print(cast)

        if cast.strip() in members_imdb_list:

            rating = browser.find_element_by_class_name('sc-7ab21ed2-1.jGRxWM').text
            print(rating)
            break
            
        elif count == len(full_cast):
            browser.back()
            
        else:
            pass




2
1
Gordon Dav
2
Nate Duncan
2
1
Gordon Dav
2
Nate Duncan
2
1
Gordon Dav
2
Nate Duncan
8,2
