# Scraping the Data from Multiple Pages/Websites

In [1]:
from random import randint
from time import sleep
from time import time
from warnings import warn

import pandas as pd
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from requests import get

In [23]:
# Result pages 1-4 for each release year from 2000 through 2017, kept as
# strings so they can be concatenated straight into the request URL.
pages = list(map(str, range(1, 5)))
years_url = list(map(str, range(2000, 2018)))

In [26]:
# Dry run of the rate monitor: no real request is sent, each iteration just
# counts one "request", waits 1-3 seconds and reports the running frequency.
start_time = time()
requests = 0
for requests in range(1, 6):
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
# Called once after the loop, so all five report lines remain in the output.
clear_output(wait = True)

Request: 1; Frequency: 0.331806401437502 requests/s
Request: 2; Frequency: 0.49795416102615764 requests/s
Request: 3; Frequency: 0.49785774858708187 requests/s
Request: 4; Frequency: 0.5681198104115445 requests/s
Request: 5; Frequency: 0.4974670396821831 requests/s


In [34]:
# Accumulators for the scraped fields; every movie that is kept appends
# exactly one entry to each list, so the lists stay aligned by position.
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Safety cap on the number of requests this cell may issue.
MAX_REQUESTS = 72

# Timer and counter used to report the request frequency.
start_time = time()
requests = 0
# Set when the cap is exceeded so that the outer (year) loop stops as well;
# a bare `break` would only leave the inner (page) loop.
limit_exceeded = False

for year_url in years_url:
    for page in pages:
        response = get('https://www.imdb.com/search/title?release_date=' + year_url +'&sort=num_votes,desc&page=' + page)

        # Random 8-15 second pause between consecutive requests.
        sleep(randint(8,15))

        # Progress report; clear_output(wait=True) lets the next report
        # replace this one instead of piling up lines.
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # `warn` is warnings.warn (see the import cell); a non-200 response
        # is reported but does not abort the run.
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Stop the whole scrape once more requests than expected were made.
        if requests > MAX_REQUESTS:
            warn('Number of requests was greater than expected.')
            limit_exceeded = True
            break

        page_html = BeautifulSoup(response.text, 'html.parser')
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        for container in mv_containers:
            # Containers without a 'ratings-metascore' div are skipped
            # entirely, so every stored row has a metascore.
            if container.find('div', class_ = 'ratings-metascore') is not None:
                name = container.h3.a.text
                names.append(name)

                # Raw label text, e.g. '(2000)' or '(I) (2000)'.
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # The vote count is read from the span's 'data-value' attribute.
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))

    if limit_exceeded:
        break

Request:72; Frequency: 0.07904252487347589 requests/s


In [35]:
# Assemble the scraped lists into a single DataFrame, one column per field.
movie_ratings = pd.DataFrame({
    'movie': names,
    'year': years,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
})
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
movie_ratings.info()
movie_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3244 entries, 0 to 3243
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movie      3244 non-null   object 
 1   year       3244 non-null   object 
 2   imdb       3244 non-null   float64
 3   metascore  3244 non-null   int64  
 4   votes      3244 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 126.8+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Gladiator,(2000),8.5,67,1395652
1,Memento,(2000),8.4,80,1167025
2,Snatch,(2000),8.3,55,810061
3,Requiem for a Dream,(2000),8.3,68,794274
4,X-Men,(2000),7.4,64,587178
5,Cast Away,(2000),7.8,73,550152
6,American Psycho,(2000),7.6,64,523525
7,Unbreakable,(2000),7.3,62,399986
8,Mission: Impossible II,(2000),6.1,59,324962
9,Meet the Parents,(2000),7.0,73,320266


In [36]:
# Pin the column order explicitly, then preview the first rows.
movie_ratings = movie_ratings.loc[:, ['movie', 'year', 'imdb', 'metascore', 'votes']]
movie_ratings.head()

Unnamed: 0,movie,year,imdb,metascore,votes
0,Gladiator,(2000),8.5,67,1395652
1,Memento,(2000),8.4,80,1167025
2,Snatch,(2000),8.3,55,810061
3,Requiem for a Dream,(2000),8.3,68,794274
4,X-Men,(2000),7.4,64,587178


In [37]:
# List the distinct raw year labels to see which formats need cleaning.
pd.unique(movie_ratings['year'])

array(['(2000)', '(I) (2000)', '(2001)', '(2002)', '(2003)', '(2004)',
       '(I) (2004)', '(2005)', '(I) (2005)', '(2006)', '(I) (2006)',
       '(2007)', '(I) (2007)', '(2008)', '(I) (2008)', '(2009)',
       '(I) (2009)', '(2010)', '(I) (2010)', '(2011)', '(I) (2011)',
       '(2012)', '(I) (2012)', '(2013)', '(I) (2013)', '(2014)',
       '(I) (2014)', '(II) (2014)', '(2015)', '(I) (2015)', '(II) (2015)',
       '(2016)', '(II) (2016)', '(IX) (2016)', '(I) (2016)', '(2017)',
       '(I) (2017)'], dtype=object)

In [38]:
# Reduce labels such as '(2000)' or '(I) (2000)' to the trailing four-digit
# year and store it as an integer.
# - Plain column assignment replaces the column, so it takes the integer
#   dtype; `.loc[:, 'year'] = ...` writes in place on recent pandas versions
#   and can leave the column as object dtype.
# - Casting to str first makes the cell safe to re-run after the column has
#   already been converted to integers.
movie_ratings['year'] = (
    movie_ratings['year']
    .astype(str)
    .str.extract(r'(\d{4})\)?$', expand=False)
    .astype(int)
)

In [40]:
# Show only the minimum and maximum of the two rating columns.
movie_ratings[['imdb', 'metascore']].describe().loc[['min', 'max']]

Unnamed: 0,imdb,metascore
min,4.1,24.0
max,9.0,100.0


In [41]:
# Add the IMDb rating multiplied by 10 as a new column
# (presumably to put it on a scale comparable to metascore - confirm).
movie_ratings['n_imdb'] = movie_ratings['imdb'].mul(10)
movie_ratings.head(3)

Unnamed: 0,movie,year,imdb,metascore,votes,n_imdb
0,Gladiator,2000,8.5,67,1395652,85.0
1,Memento,2000,8.4,80,1167025,84.0
2,Snatch,2000,8.3,55,810061,83.0


In [42]:
# Save the cleaned table to disk; with the default settings the row index is
# written out as the first, unnamed column.
movie_ratings.to_csv(path_or_buf='movie_ratings.csv')