# Import relevant libraries

In [1]:
import pandas as pd
import re, requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

# create crawler class

In [55]:
class crawler:
    """Small wrapper around a Selenium Chrome driver for scraping listing pages.

    The driver is created in ``__init__`` and stored on ``self.driver``; every
    other method operates on the page currently loaded in that browser.
    """

    def __init__(self):
        """Initialise the crawler"""
        # NOTE(review): passing the driver path positionally matches the older
        # Selenium API; newer Selenium releases expect a Service object --
        # confirm against the installed version.
        self.driver = webdriver.Chrome(ChromeDriverManager().install())

    def get_page(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def scroll_down(self, times=20, pause=0.6, initial_delay=0.5):
        """Scroll to the bottom of the current page repeatedly.

        Parameters
        ----------
        times : int
            Number of scroll-to-bottom commands to issue (default 20).
        pause : float
            Seconds to sleep after each scroll (default 0.6).
        initial_delay : float
            Seconds to sleep before the first scroll (default 0.5).
        """
        sleep(initial_delay)
        for _ in range(times):
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(pause)

    def get_page_source(self):
        """Return the HTML source of the page currently loaded in the browser."""
        return self.driver.page_source

    def get_title_links(self):
        """Return the ``href`` of every ``<a class="title">`` on the current page.

        Anchors without an ``href`` attribute are skipped rather than raising
        ``KeyError``.
        """
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        return [anchor['href'] for anchor in soup.find_all('a', class_='title', href=True)]

# set parameters

Scrape movies first, then TV shows.

In [62]:
def flixable(content_type):
    """Build the Flixable Netflix-originals listing URL for a content type.

    ``content_type`` must be 'tv-shows' or 'movies'. For a valid type a GET
    request is sent with the filter parameters attached and the URL of the
    response is returned. For any other value a message is printed and
    ``None`` is returned.
    """
    if content_type not in ('tv-shows', 'movies'):
        print('KeyError. Please enter either tv-shows or movies')
        return None

    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}
    query = {'min-rating': 0, 'min-year': 1920, 'max-year': 2019, 'order': 'title'}
    listing_url = f'https://flixable.com/netflix-originals/genre/{content_type}/#filterContainer'

    # The response object carries the final URL with the query string applied.
    response = requests.get(listing_url, query, headers=browser_headers)
    return response.url

In [63]:
# Create the browser session once. The instance deliberately takes over the
# name `crawler` (later cells call `crawler.get_title_links()`), so the
# instantiation is guarded to keep this cell safe to re-run.
if isinstance(crawler, type):
    crawler = crawler()
crawler.get_page('https://www.imdb.com/title/tt11194518/?ref_=plg_rt_1')

In [66]:
# Parse the page currently loaded in the browser; without this line `soup`
# is undefined when the notebook is run top to bottom.
soup = BeautifulSoup(crawler.get_page_source(), 'html.parser')
ratingValue = soup.find('span', {'itemprop':'ratingValue'}).text
votes = soup.find('span', {'itemprop':'ratingCount'}).text

In [69]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

# Get title links

In [5]:
# Load the movie listing before collecting links; otherwise the links come
# from whatever page the browser happens to be on.
crawler.get_page(flixable('movies'))
# presumably the listing loads more titles as the page is scrolled -- confirm
crawler.scroll_down()
movie_links = crawler.get_title_links()

In [49]:
def get_info(content_type, links):
    """Scrape the Flixable detail page of every title in ``links``.

    Parameters
    ----------
    content_type : str
        Label stored verbatim in the ``type`` column of every row.
    links : sequence of str
        Site-relative title paths; each is appended to 'https://flixable.com'.

    Returns
    -------
    pandas.DataFrame
        One row per link with the columns title, type, release_year, rating,
        runtime, description, genres, cast, director, country, imdb_link and
        date_added. The genres/cast/director/country/imdb_link cells hold
        lists. An empty ``links`` yields an empty frame with these columns.
    """
    columns = ['title', 'type', 'release_year', 'rating',
               'runtime', 'description', 'genres', 'cast',
               'director', 'country', 'imdb_link', 'date_added']
    # The first href fragment that matches decides which list receives the
    # anchor text; the order mirrors the precedence of the checks.
    href_to_field = (('genre', 'genres'),
                     ('director', 'director'),
                     ('actor', 'cast'),
                     ('country', 'country'))

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    records = []
    for link in links:
        r = requests.get('https://flixable.com' + link)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Everything except the title and description sits inside this block.
        details = soup.find('div', {'class': 'col-lg-8'})
        spans = details.find('h6').find_all('span')

        record = {
            'title': soup.find('h1').text,
            'type': content_type,
            'release_year': spans[0].text,
            'rating': spans[1].text,
            'runtime': spans[2].text,
            'description': soup.find('p', {'class': 'card-description'}).text,
            'genres': [],
            'cast': [],
            'director': [],
            'country': [],
            'imdb_link': [],
            'date_added': details.find_all('p')[-1].text.strip(),
        }

        # href=True skips anchors without an href instead of raising KeyError.
        for a in details.find_all('a', href=True):
            href = a['href']
            for fragment, field in href_to_field:
                if fragment in href:
                    record[field].append(a.text)
                    break
            else:
                # IMDb anchors contribute the link itself, not the anchor text.
                if 'www.imdb.com' in href:
                    record['imdb_link'].append(href)

        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [50]:
# Scrape the detail page of every movie link into one DataFrame.
movie_db = get_info(content_type='movies', links=movie_links)

In [58]:
# Load the TV-show listing before collecting links; without this the browser
# is still on the previously loaded page and the same links come back.
crawler.get_page(flixable('tv-shows'))
# presumably the listing loads more titles as the page is scrolled -- confirm
crawler.scroll_down()
tvshow_links = crawler.get_title_links()

In [59]:
# Scrape the detail page of every TV-show link into one DataFrame.
tvshow_db = get_info(content_type='tvshow', links=tvshow_links)

In [71]:
# Stack movies and TV shows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataset = pd.concat([movie_db, tvshow_db], ignore_index=True)

In [70]:
def parser(url, timeout=30):
    """Return bs4 object

    Fetches ``url`` with a desktop-browser User-Agent header and parses the
    response body with the 'html.parser' backend.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float
        Seconds to wait for the server before requests raises a timeout
        error (default 30). Without a timeout a stalled connection would
        block the caller indefinitely.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}
    r = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(r.text, 'html.parser')

In [None]:
# Try the parser on one sample IMDb title page; without this line `soup` is
# undefined when the notebook is run top to bottom.
soup = parser('https://www.imdb.com/title/tt11194518/?ref_=plg_rt_1')
ratingValue = soup.find('span', {'itemprop':'ratingValue'}).text
votes = soup.find('span', {'itemprop':'ratingCount'}).text

In [94]:
# Shape of the combined movies + TV-shows frame as (rows, columns).
dataset.shape

(1411, 12)

In [93]:
# Print the positional index of every title that has no IMDb link.
for position, imdb_links in enumerate(dataset['imdb_link']):
    if imdb_links == []:
        print(position)

322
1274
1349
1350


In [95]:
import timeit

# Row counts at which progress is echoed during the long-running scrape.
progress_marks = {100, 300, 500, 700, 900, 1200, 1400}

start = timeit.default_timer()
rating_value = list()
votes = list()
for count, link in enumerate(dataset['imdb_link'], start=1):
    if count in progress_marks:
        print(count)
    try:
        # link is a list of IMDb URLs; an empty list raises IndexError here.
        soup = parser(link[0])
        ratingValue = soup.find('span', {'itemprop':'ratingValue'}).text
        ratingCount = soup.find('span', {'itemprop':'ratingCount'}).text
    except Exception:
        # Missing link, failed request, or missing rating markup: keep the
        # row aligned with a placeholder. Catching Exception instead of a
        # bare except lets KeyboardInterrupt stop the run.
        rating_value.append('NaN')
        votes.append('NaN')
    else:
        rating_value.append(ratingValue)
        votes.append(ratingCount)

stop = timeit.default_timer()
print('Time: ', stop - start)

100
300
500
700
900
1200
1400
Time:  2323.908137589984


In [99]:
# Attach the scraped values as new columns. Both lists received exactly one
# entry per element of dataset['imdb_link'], so they line up with the rows.
dataset['rating_value'] = rating_value
dataset['votes'] = votes

In [103]:
# Derive the duration from the timers recorded around the scrape loop rather
# than a number copied from an earlier output, so it stays correct on re-run.
t = int((stop - start) / 60)
print(f"It took {t} minutes to scrape rating values and votes.")

It took 38 minutes to scrape rating values and votes.
