In [1]:
# Import some standard libraries
import time # count the elapsed time in scraping operations and allow to pause during the scraping process
import re # regular expressions
import datetime # manipulate the datetime object
import pandas as pd
import numpy as np
import scipy as sp # to perform some statistics for box & whiskers plots

import urllib.request  as urlreq # make http requests
import urllib.error as urlerr # for handling error exceptions 
from bs4 import BeautifulSoup # for web scraping

In [2]:
# Column order of the DataFrame returned by imdb_scraping.
_IMDB_COLUMNS = [
    'IMDB_ID', 'Title', 'TrailerUrl', 'PicUrl', 'Year', 'YearPT', 'Score', 'Votes', 'Genre',
    'Directors', 'Stars', 'StarsPicUrl', 'StarsPicBIGUrl', 'Duration', 'Budget', 'Gross',
    'Storyline', 'SummaryText',
]

# Columns that hold a list per movie; every other column holds a single string.
_IMDB_LIST_COLUMNS = ('Genre', 'Directors', 'Stars', 'StarsPicUrl', 'StarsPicBIGUrl')


def _empty_imdb_row(imdb_id):
    """Return a row with every field blank ('' or []) except the movie id."""
    row = {column: '' for column in _IMDB_COLUMNS}
    for column in _IMDB_LIST_COLUMNS:
        row[column] = []
    row['IMDB_ID'] = imdb_id
    return row


def _amount_digits(text):
    """Reduce a 'Budget'/'Gross' text block to its numeric characters (no thousands commas)."""
    # drop any trailing "(...)" note and the thousands separators first
    before_note = text.split('(')[0].strip().replace(',', '')
    return re.sub(r'[^\d.,]+', '', before_note).strip(' ')


def _parse_imdb_page(soup, row):
    """Fill `row` in place with every field found in the parsed title page `soup`.

    Each element is looked up defensively: a field whose HTML node is missing simply keeps
    the blank default already stored in `row`.
    """
    # Title: the text that sits directly inside <h1> (child tags are ignored)
    if soup.h1 is not None:
        title_text = soup.h1.find(text = True, recursive = False)
        if title_text is not None:
            row['Title'] = re.sub('\xa0', '', title_text)

    # Release year
    year_span = soup.find('span', {'id': 'titleYear'})
    if year_span is not None and year_span.a is not None:
        row['Year'] = year_span.a.text

    # Trailer url: when several '/video/' links exist the last one is kept
    slate = soup.find('div', attrs = {'class': 'slate'})
    if slate is not None:
        for link in slate.find_all('a'):
            href = link.get('href')
            if href is not None and '/video/' in href:
                row['TrailerUrl'] = 'https://www.imdb.com' + href

    # Poster url: only taken when the poster block contains a link
    poster = soup.find('div', attrs = {'class': 'poster'})
    if poster is not None and poster.find('a') is not None:
        poster_img = poster.find('img')
        if poster_img is not None and poster_img.get('src') is not None:
            row['PicUrl'] = poster_img.get('src')

    # Storyline
    storyline = soup.find('div', attrs = {'class': 'article', 'id': 'titleStoryLine'})
    if storyline is not None:
        story_div = storyline.find('div', attrs = {'class': 'inline canwrap'})
        story_span = story_div.find('span') if story_div is not None else None
        if story_span is not None:
            row['Storyline'] = re.sub(r'[\t\r\n]', '', story_span.get_text())

    # Summary text
    summary = soup.find('div', attrs = {'class': 'summary_text'})
    if summary is not None:
        row['SummaryText'] = re.sub(r'[\t\r\n]', '', summary.get_text())

    # Score and number of votes
    rating_span = soup.find('span', attrs = {'itemprop': 'ratingValue'})
    if rating_span is not None:
        row['Score'] = re.sub(r'\s+', '', rating_span.get_text())
        votes_span = soup.find('span', attrs = {'itemprop': 'ratingCount'})
        if votes_span is not None:
            # '.' is removed as a thousands separator
            # NOTE(review): presumably the page is served with a locale that uses '.' - confirm
            row['Votes'] = re.sub(r'\s+', '', votes_span.get_text().replace('.', ''))

    # Directors and stars live in sibling 'credit_summary_item' blocks
    for credit in soup.find_all('div', attrs = {'class': 'credit_summary_item'}):
        header = credit.find('h4', attrs = {'class': 'inline'})
        if header is None:
            continue
        header_text = header.get_text()
        names = [link.get_text() for link in credit.find_all('a')]
        if 'Director' in header_text:
            # skip the "... more credits" link
            row['Directors'].extend(name for name in names if 'credit' not in name)
        elif 'Stars' in header_text:
            # skip the "See full cast" and "... more credits" links
            row['Stars'].extend(name for name in names
                                if 'See full' not in name and 'credit' not in name)

    # Star pictures: for each star, use the first cast photo whose alt text contains the name
    photo_cells = soup.find_all('td', attrs = {'class': 'primary_photo'})
    for star in row['Stars']:
        for cell in photo_cells:
            photo = cell.find('img', alt = True)
            if photo is None or star not in photo.get('alt'):
                continue
            small_url = photo.get('loadlate')
            if small_url is not None:
                row['StarsPicUrl'].append(small_url)
                # same picture with the thumbnail size token swapped for a larger one
                row['StarsPicBIGUrl'].append(small_url.replace('._V1_UX32_CR0,0,32,44_AL_',
                                                               '._V1_UY317_CR2,0,214,317_AL_'))
            break

    # Genres and release date are both links inside the 'subtext' block
    subtext = soup.find('div', attrs = {'class': 'subtext'})
    if subtext is not None:
        for link in subtext.find_all('a'):
            href = link.get('href') or ''
            if 'genres' in href:
                row['Genre'].append(link.get_text())
            elif 'releaseinfo' in href:
                # keep only the date, dropping the trailing "(country)" part
                row['YearPT'] = link.get_text().split('(')[0].strip()

    # Runtime, budget and gross are 'txt-block' entries identified by their <h4> header
    for block in soup.find_all('div', attrs = {'class': 'txt-block'}):
        header = block.find('h4', attrs = {'class': 'inline'})
        if header is None:
            continue
        header_text = header.get_text()
        if 'Runtime' in header_text:
            runtime = block.find('time')
            if runtime is not None:
                row['Duration'] = runtime.get_text()
        elif 'Budget' in header_text:
            row['Budget'] = _amount_digits(block.get_text())
        elif 'Cumulative Worldwide Gross' in header_text:
            row['Gross'] = _amount_digits(block.get_text())


def imdb_scraping(id_list):
    '''Scrape IMDB title pages for a list of movie ids.

    For every id the page https://www.imdb.com/title/tt<id> is downloaded and the title,
    trailer/poster urls, release year, release date, score, votes, genres, directors, stars
    (with picture urls), runtime, budget, gross, storyline and summary text are extracted.

    Parameters
    ----------
    id_list : iterable of str
        IMDB ids without the 'tt' prefix (e.g. '0112682').

    Returns
    -------
    pandas.DataFrame
        One row per input id, in input order. A page that cannot be downloaded, or a field
        that is missing from the page, yields blank values ('' or []) so the row count always
        matches the input.

    Notes
    -----
    The result is also stored in the module-level name `moviesIMDB` (kept for backward
    compatibility). Start time, elapsed time and download errors are printed.
    '''
    from http.client import HTTPException  # raised for truncated / malformed responses

    start_time = time.time()
    print('StartTime ' + time.strftime('%H:%M:%S', time.gmtime(start_time)))

    rows = []
    for imdb_id in id_list:
        row = _empty_imdb_row(imdb_id)
        IMDB_url = 'https://www.imdb.com/title/tt' + str(imdb_id)

        try:
            # the context manager closes the connection even if the read fails
            with urlreq.urlopen(IMDB_url) as page:
                html = page.read()
        except (OSError, HTTPException) as e:
            # URLError/HTTPError are OSError subclasses; resets and timeouts raised while
            # reading the response are plain OSError. One bad page must not abort the run.
            print(getattr(e, 'reason', e))
        else:
            soup = BeautifulSoup(html, 'html.parser', from_encoding = 'utf-8')
            _parse_imdb_page(soup, row)

        rows.append(row)

        # Wait for 50 milliseconds (prevent the site to interrupt the process)
        time.sleep(.050)

    elapsed_time = time.time() - start_time
    print('ElapsedTime ' + time.strftime('%H:%M:%S', time.gmtime(elapsed_time)))

    # build the pandas DataFrame from the collected rows
    global moviesIMDB
    moviesIMDB = pd.DataFrame(rows, columns = _IMDB_COLUMNS)
    return moviesIMDB

In [4]:
# Load the movieId -> imdbId / tmdbId link table.
# imdbId is read as text so it can be appended to the IMDB url unchanged;
# tmdbId is read as float (the column contains missing values).
links_dtypes = {'movieId': int, 'imdbId': str, 'tmdbId': float}
links = pd.read_csv(
    'C:/Users/nuno2/OneDrive - NOVAIMS/S2/Big_Data_Foundations/Project/ml-20m/links.csv',
    sep = ',',
    encoding = 'utf-8',
    dtype = links_dtypes,
)
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
movieId    27278 non-null int32
imdbId     27278 non-null object
tmdbId     27026 non-null float64
dtypes: float64(1), int32(1), object(1)
memory usage: 532.9+ KB


In [5]:
# Preview the first ten rows of the link table.
links.head(10)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0


In [6]:
# Smoke test: scrape a single movie and display its fields vertically.
# None means "no column width limit", so long urls and texts are shown in full.
# The previous value -1 was deprecated and is no longer accepted by recent pandas releases.
pd.set_option('display.max_colwidth', None)
scraping_id = ['0112682']
IMDB = imdb_scraping(scraping_id)
IMDB.T

StartTime 08:20:54
ElapsedTime 00:00:02


Unnamed: 0,0
IMDB_ID,0112682
Title,A Cidade das Crianças Perdidas
TrailerUrl,https://www.imdb.com/video/imdb/vi4236902681?playlistId=tt0112682
PicUrl,"https://m.media-amazon.com/images/M/MV5BZGQxZDMwYzYtYmFjNi00NWYyLThjZjAtMDJhODZhYTkyZDNhXkEyXkFqcGdeQXVyNTAyODkwOQ@@._V1_UY268_CR7,0,182,268_AL_.jpg"
Year,1995
YearPT,12 July 1996
Score,7.6
Votes,62199
Genre,"[Fantasy, Sci-Fi]"
Directors,"[Marc Caro, Jean-Pierre Jeunet]"


In [7]:
# Scrape the whole link table in chunks of `dim_max` ids and write one CSV per chunk,
# so each finished chunk is already on disk before the next one starts.
dim_max = 2000

for part, pos_inic in enumerate(range(0, len(links), dim_max)):
    # last row of this chunk (inclusive), clamped to the end of the table
    pos_end = min(pos_inic + dim_max, len(links)) - 1
    print('....')
    print(pos_inic)
    print(pos_end)
    scraping_id = links['imdbId'].iloc[pos_inic:pos_end + 1].tolist()
    IMDB = imdb_scraping(scraping_id)
    # index = False: the positional index carries no information worth exporting
    IMDB.to_csv('C:/Users/nuno2/OneDrive - NOVAIMS/S2/Big_Data_Foundations/Project/ml-20m/IMDB_' +
                str(part) + '.csv', index = False, header = True)


....
0
1999
StartTime 08:21:27
Not Found
ElapsedTime 01:06:54
....
2000
3999
StartTime 09:28:21
ElapsedTime 01:01:33
....
4000
5999
StartTime 10:29:54
ElapsedTime 00:59:55
....
6000
7999
StartTime 11:29:50
ElapsedTime 01:00:11
....
8000
9999
StartTime 12:30:02
ElapsedTime 00:58:40
....
10000
11999
StartTime 13:28:43
ElapsedTime 00:58:17
....
12000
13999
StartTime 14:27:00
ElapsedTime 00:59:32
....
14000
15999
StartTime 15:26:33
ElapsedTime 01:00:15
....
16000
17999
StartTime 16:26:48
Not Found
ElapsedTime 01:00:49
....
18000
19999
StartTime 17:27:38
ElapsedTime 00:59:54
....
20000
21999
StartTime 18:27:32
ElapsedTime 00:59:42
....
22000
23999
StartTime 19:27:15
ElapsedTime 01:00:15
....
24000
25999
StartTime 20:27:30
ElapsedTime 01:01:24
....
26000
27277
StartTime 21:28:55
ElapsedTime 00:37:48
