### Installation

In [1]:
from bs4 import BeautifulSoup
import requests

### Copy webpage source page

In [2]:
my_html = """
<html>

<head>
</head>

<body>
    <div style="border: 1px solid">
        There isn't much in this file, except a list of to-do items. 
        <ul>
          <li>Make coffee</li>
          <li>Sweep the floor</li>
          <li>Go to the store</li>
          <li>Write BeautifulSoup lecture</li>
        </ul>
    </div>
</body>

</html>
"""

### Display

In [3]:
from IPython.core.display import display, HTML
display(HTML(my_html))     # make sure Jupyter knows to display it as HTML

In [4]:
soup = BeautifulSoup(my_html, "html5lib")

In [5]:
soup

<html><head>
</head>

<body>
    <div style="border: 1px solid">
        There isn't much in this file, except a list of to-do items. 
        <ul>
          <li>Make coffee</li>
          <li>Sweep the floor</li>
          <li>Go to the store</li>
          <li>Write BeautifulSoup lecture</li>
        </ul>
    </div>



</body></html>

### Use .find(), .find_all()

In [6]:
soup.find('li') #Grabs the first element tagged as li (type=bs4.element.Tag)

<li>Make coffee</li>

In [7]:
soup.find('li').text #(type=str)

'Make coffee'

In [8]:
soup.find_all('li')

[<li>Make coffee</li>,
 <li>Sweep the floor</li>,
 <li>Go to the store</li>,
 <li>Write BeautifulSoup lecture</li>]

In [9]:
todos = [element.text for element in soup.find_all('li')]

print(todos)

['Make coffee', 'Sweep the floor', 'Go to the store', 'Write BeautifulSoup lecture']


### Scrape items on a test webpage

In [12]:
with open('../websource/page1.html') as page:
    test_html = page.read()
soup = BeautifulSoup(test_html,'lxml')

In [13]:
def loadpage(page_html):
    with open('../websource/' + page_html) as page:
        test_html = page.read()
    soup = BeautifulSoup(test_html,'lxml')
    return soup

In [14]:
source_list=["page1.html","page2.html","page3.html",\
             "page1_2014.html","page2_2014.html","page3_2014.html",\
             "page1_2015.html","page2_2015.html",\
             "page1_2016.html","page2_2016.html","page3_2016.html",\
             "page1_2017.html","page2_2017.html","page3_2017.html",\
            ]

In [232]:
source_list1=["page1.html","page2.html","page3.html",\
             "page1_2017.html","page2_2017.html","page3_2017.html",\
            ]
source_list2=["page1_2014.html","page2_2014.html","page3_2014.html",\
             "page1_2015.html","page2_2015.html",\
             "page1_2016.html","page2_2016.html","page3_2016.html"
            ]

In [15]:
for i,x in enumerate([1,2,3]):
    print(i,x)

0 1
1 2
2 3


In [280]:
soup_list=[]
for source in (source_list):
    soup_list.append(loadpage(source))

In [281]:
soup_list1=[]
for source in (source_list1):
    soup_list1.append(loadpage(source))

In [282]:
soup_list2=[]
for source in (source_list2):
    soup_list2.append(loadpage(source))

In [17]:
soup_list[0]

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<script type="text/javascript">var ue_t0=ue_t0||+new Date();</script>
<script type="text/javascript">
window.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;
if (window.ue_ihb === 1) {

var ue_csm = window,
    ue_hob = +new Date();
(function(d){var e=d.ue=d.ue||{},f=Date.now||function(){return+new Date};e.d=function(b){return f()-(b?0:d.ue_t0)};e.stub=function(b,a){if(!b[a]){var c=[];b[a]=function(){c.push([c.slice.call(arguments),e.d(),d.ue_id])};b[a].replay=function(b){for(var a;a=c.shift();)b(a[0],a[1],a[2])};b[a].isStub=1}};e.exec=function(b,a){return function(){try{return b.apply(this,arguments)}catch(c){ueLogError(c,{attribution:a||"undefined",logLevel:"WARN"})}}}})(ue_csm);


    var ue_err_chan = 'jserr';
(function(d,e){function h(f,b){if(!(a.ec>a.mxe)&&f){a.ter.push(f);b=b||{};var c=f.logLevel||b.logLevel;c&&c!==k&&c!==m&&c!==n&&c!==p||a.ec++;c&&c!=k||a.ecf++;b.pageURL="

### requests

In [25]:
def loadpage_url(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    return soup

In [26]:
url_p1="https://www.imdb.com/list/ls084133587/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=4dc7ad1a-76a6-49eb-9acb-5d6959572df8&pf_rd_r=NBZKRW93Q003357XR6S9&pf_rd_s=right-4&pf_rd_t=48201&pf_rd_i=watchlist&ref_=wt_otl_1&sort=release_date,asc&st_dt=&mode=detail&page=1"

In [27]:
url_p2="https://www.imdb.com/list/ls084133587/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=4dc7ad1a-76a6-49eb-9acb-5d6959572df8&pf_rd_r=NBZKRW93Q003357XR6S9&pf_rd_s=right-4&pf_rd_t=48201&pf_rd_i=watchlist&ref_=wt_otl_1&sort=release_date,asc&st_dt=&mode=detail&page=2"

In [28]:
url_p3=""

In [29]:
url = 'https://www.imdb.com/list/ls084133587/\
?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=4dc7ad1a-76a6-49eb-9acb-5d6959572df8&pf_rd_r=NBZKRW93Q003357XR6S9&pf_rd_s=right-4&pf_rd_t=48201&pf_rd_i=watchlist&ref_=wt_otl_1&sort=release_date,asc&st_dt=&mode=detail&page=1' 

response = requests.get(url)

In [30]:
response.status_code  #200 = success!

200

In [31]:
response.text[:1000]  #First 1000 characters of the HTML

'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///list/ls084133587?src=mdot">\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>Films adapted from books - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>\n<script>\n    if (

In [32]:
page = response.text

In [83]:
soup = BeautifulSoup(page, "lxml")
# print(soup.prettify())

#### Scrape movie titles on page1 (1-100)

In [276]:
def get_title(soup):
    title_list=[]
    year_list=[]
    c_list=[]
    runtime_list=[]
    rating_list1=[] # Find rating in my_list (pre 2014, 2017-)
    for div in soup.find_all('div', class_="lister-item-content"):
        
        for title in div.find_all('a')[0]:
            title_list.append(title)
        
        for year in div.find_all('span', class_="lister-item-year"):
            year_list.append(year.text)
            
    for div in soup.find_all('div', class_="lister-item-content"):      
        if div.find('span', class_="certificate") is None:
            certificate="Not Rated"
        else:
            for c in div.find_all('span', class_="certificate"):
                certificate=c.text
        c_list.append(certificate)
        
    for div in soup.find_all('div', class_="lister-item-content"):
        if div.find('span', class_="runtime") is None:
            runtime="No runtime"
        else:
            for r in div.find_all('span', class_="runtime"):
                runtime=r.text
        runtime_list.append(runtime)
            

    # Find rating in my_list (pre 2014, 2017-)
    for div in soup_list[0].find_all('div', class_="lister-item-content"):
        rating = div.find_all('span', class_="ipl-rating-star__rating")[0].text
        rating_list1.append(span)
    
    return (title_list,year_list,c_list,runtime_list,rating_list1)

In [307]:
def get_title2(soup):
    title_list=[]
    year_list=[]
    c_list=[]
    runtime_list=[]
    rating_list2=[] # Find rating in my_list (2014-2016)
    for div in soup.find_all('div', class_="lister-item-content"):
        
        for title in div.find_all('a')[0]:
            title_list.append(title)
        
        for year in div.find_all('span', class_="lister-item-year"):
            year_list.append(year.text)
            
    for div in soup.find_all('div', class_="lister-item-content"):      
        if div.find('span', class_="certificate") is None:
            certificate="Not Rated"
        else:
            for c in div.find_all('span', class_="certificate"):
                certificate=c.text
        c_list.append(certificate)
        
    for div in soup.find_all('div', class_="lister-item-content"):
        if div.find('span', class_="runtime") is None:
            runtime="No runtime"
        else:
            for r in div.find_all('span', class_="runtime"):
                runtime=r.text
        runtime_list.append(runtime)
            

    # Find rating in IMDb search (2014-2016)
    for div in soup_list[3].find_all('div', class_="inline-block ratings-imdb-rating"):
        rating_list2.append(div.get('data-value'))
    
    return (title_list,year_list,c_list,runtime_list,rating_list2)

In [193]:
rating_list=[]
for div in soup_list[2].find_all('div', class_="ipl-rating-star__rating"):
    print(div.get('data-value'))
    print(div.text)
    rating_list.append(div.get('data-value'))
# len((rating_list)) inline-block ratings-imdb-rating

In [None]:
# Find rating in IMDb search (2014-2016)
rating_list1=[]
for div in soup_list[3].find_all('div', class_="inline-block ratings-imdb-rating"):
#     print(div.get('data-value'))
    rating_list1.append(div.get('data-value'))

# Find rating in my_list (pre 2014, 2017-)
rating_list2=[]
for div in soup_list[0].find_all('div', class_="lister-item-content"):
    rating = div.find_all('span', class_="ipl-rating-star__rating")[0].text
#     print(rating)
    rating_list2.append(span)

In [178]:
soup_list[1].find(text=re.compile('rating'))

'Your ratings'

In [79]:
title_list1 = get_title(soup_list[0])[0]

### Find all movie titles

In [316]:
# Find all values in my_list (pre 2014, 2017-)
title_list=[]
year_list=[]
c_list=[]
runtime_list=[]
rating_list1=[]
for soup in soup_list1:
    title_list += get_title(soup)[0]
    year_list += get_title(soup)[1]
    c_list += get_title(soup)[2]
    runtime_list += get_title(soup)[3]
    rating_list1 += get_title2(soup)[4]
len(title_list),len(year_list),len(c_list),len(rating_list1)

(383, 383, 383, 300)

In [315]:
# Find all values in my_list (2014-2016)
title_list=[]
year_list=[]
c_list=[]
runtime_list=[]
rating_list2=[]
for soup in soup_list2:
    title_list += get_title2(soup)[0]
    year_list += get_title2(soup)[1]
    c_list += get_title2(soup)[2]
    runtime_list += get_title2(soup)[3]
    rating_list2 += get_title2(soup)[4]
len(title_list),len(year_list),len(c_list),len(rating_list2)

(329, 329, 329, 400)

In [324]:
test_list = [x for x in rating_list2 if (float(x)>0) & (float(x)<10)]
test_list

['5.6',
 '6.6',
 '4.1',
 '6.8',
 '6.8',
 '5.5',
 '7.2',
 '7.2',
 '5.3',
 '5.5',
 '6.8',
 '6.4',
 '6.2',
 '7.3',
 '6.5',
 '6.2',
 '6.4',
 '6.3',
 '5.1',
 '5.7',
 '1.5',
 '6.7',
 '6.6',
 '6.2',
 '6.2',
 '6.4',
 '6.9',
 '6.1',
 '6.5',
 '5.4',
 '7.8',
 '7.7',
 '7.2',
 '6.2',
 '7.1',
 '6.6',
 '5',
 '7.9',
 '5.4',
 '5',
 '4.9',
 '6.6',
 '7.3',
 '7.2',
 '5.9',
 '7.7',
 '5.1',
 '7.3',
 '6.5',
 '5.5',
 '5.6',
 '6.6',
 '4.1',
 '6.8',
 '6.8',
 '5.5',
 '7.2',
 '7.2',
 '5.3',
 '5.5',
 '6.8',
 '6.4',
 '6.2',
 '7.3',
 '6.5',
 '6.2',
 '6.4',
 '6.3',
 '5.1',
 '5.7',
 '1.5',
 '6.7',
 '6.6',
 '6.2',
 '6.2',
 '6.4',
 '6.9',
 '6.1',
 '6.5',
 '5.4',
 '7.8',
 '7.7',
 '7.2',
 '6.2',
 '7.1',
 '6.6',
 '5',
 '7.9',
 '5.4',
 '5',
 '4.9',
 '6.6',
 '7.3',
 '7.2',
 '5.9',
 '7.7',
 '5.1',
 '7.3',
 '6.5',
 '5.5',
 '5.6',
 '6.6',
 '4.1',
 '6.8',
 '6.8',
 '5.5',
 '7.2',
 '7.2',
 '5.3',
 '5.5',
 '6.8',
 '6.4',
 '6.2',
 '7.3',
 '6.5',
 '6.2',
 '6.4',
 '6.3',
 '5.1',
 '5.7',
 '1.5',
 '6.7',
 '6.6',
 '6.2',
 '6.2',
 '6.4',


In [77]:
soup.find_all('div', class_="lister-item-content")

[<div class="lister-item-content">
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">201.</span>
 <a href="/title/tt1405517/?ref_=ttls_li_tt">The Secret Life of Houdini</a>
 <span class="lister-item-year text-muted unbold"></span>
 </h3>
 <p class="text-muted text-small">
 <span class="genre">
 Action, Thriller            </span>
 <span class="ghost">|</span>
 <b>Announced</b>
 </p>
 <p class="">
     A fantasy that recasts Harry Houdini as a spy for Britain and a debunker of con artists.</p>
 <p class="text-muted text-small">
     Director:
 <a href="/name/nm0870469/?ref_=ttls_li_dr_0">Dan Trachtenberg</a>
 </p>
 <div class="wtw-option-standalone" data-baseref="wl_li" data-tconst="tt1405517" data-watchtype="minibar"></div>
 </div>,
 <div class="lister-item-content">
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">202.</span>
 <a href="/title/tt1235533/?ref_=ttls_li_tt">Lie Down in Darkness</a>
 <span class="list

In [84]:
soup.find('title')

<title>Films adapted from books - IMDb</title>

In [203]:
title_list=[]
for div in soup_list[3].find_all('div', class_="lister-item-content"):
    for title in div.find_all('a')[0]:
        #print(title)
        title_list.append(title)
len(title_list)

50

In [204]:
for div in soup_list[3].find_all('div', class_="lister-item-content"):    
    for c in div.find_all('span', class_="certificate"):
#         print("1"+ c.text)

SyntaxError: unexpected EOF while parsing (<ipython-input-204-0c178ec69036>, line 3)

In [129]:
for div in soup_list[3].find_all('div', class_="lister-item-content"):
    print(div)

<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt2219674/?ref_=adv_li_tt">Letters of a Portuguese Nun</a>
<span class="lister-item-year text-muted unbold">(2014)</span>
</h3>
<p class="text-muted">
<span class="runtime">75 min</span>
<span class="ghost">|</span>
<span class="genre">
Drama, History, Romance            </span>
</p>
<div class="ratings-bar">
<div class="inline-block ratings-imdb-rating" data-value="5.6" name="ir">
<span class="global-sprite rating-star imdb-rating"></span>
<strong>5.6</strong>
</div>
<div class="inline-block ratings-user-rating">
<span class="userRatingValue" data-tconst="tt2219674" id="urv_tt2219674">
<span class="global-sprite rating-star no-rating"></span>
<span class="rate" data-no-rating="Rate this" data-value="0" name="ur">Rate this</span>
</span>
<div class="starBarWidget" id="sb_tt2219674">
<div class="rating rating-list" data-auth="BCYm1-osmnKC5M1kFkM

In [154]:
c_list=[]
for div in soup_list[3].find_all('div', class_="lister-item-content"): 
    if div.find('span', class_="certificate") is None:
        certificate="Not Rated"
        print(certificate)
    else:
        for c in div.find_all('span', class_="certificate"):
#             print(c.text)
            certificate=c.text
    c_list.append(certificate)
len(c_list)

Not Rated
TV-PG
Not Rated
R
R
Not Rated
Not Rated
Not Rated
M
PG-13
Not Rated
R
PG-13
Not Rated
Not Rated
PG-13
PG-13
PG-13
Not Rated
Not Rated
Not Rated
Not Rated
PG-13
TV-PG
Not Rated
Not Rated
Not Rated
R
Not Rated
Not Rated
PG
PG-13
PG-13
R
Not Rated
R
Not Rated
PG-13
G
Not Rated
TV-MA
Not Rated
Not Rated
Not Rated
Not Rated
PG
Not Rated
PG
PG-13
R


50

In [140]:
soup_list[3].find('div', class_="lister-item-content").find('span', class_="certificate") is None

True

In [131]:
c_list=[]
for div in soup_list[3].find_all('div', class_="lister-item-content"): 
    if div.find('span', class_="certificate") is None:
        certificate="Not Rated"
        print(certificate)
    else:
        for c in div.find_all('span', class_="certificate"):
#             print(c.text)
            certificate=c.text
    c_list.append(certificate)
len(c_list)

<span class="certificate">R</span>

In [201]:
runtime_list=[]
for div in soup_list[3].find_all('div', class_="lister-item-content"):
    for runtime in div.find_all('span', class_="runtime"):
#         print(year)
#         print(runtime.text)
        runtime_list.append(runtime.text)
len(runtime_list)

49

In [333]:
# Find rating in my_list (pre 2014, 2017-)
rating_list1=[]
for div in soup_list1[3].find_all('div', class_="lister-item-content"):
    rating = div.find_all('span', class_="ipl-rating-star__rating")[0].text
#     print(rating)
    rating_list1.append(span)
len(rating_list1)

IndexError: list index out of range

In [306]:
# Find rating in IMDb search (2014-2016)
rating_list2=[]
for div in soup_list2[7].find_all('div', class_="inline-block ratings-imdb-rating"):
#     print(div.get('data-value'))
    rating_list2.append(div.get('data-value'))
len(rating_list2)

24

In [331]:
len(soup_list1)

6

In [345]:
div=soup_list1[3].find_all('div', class_="lister-item-content")
div

[<div class="lister-item-content">
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">101.</span>
 <a href="/title/tt4558396/?ref_=adv_li_tt">Egon Schiele: Death and the Maiden</a>
 <span class="lister-item-year text-muted unbold">(2016)</span>
 </h3>
 <p class="text-muted">
 <span class="runtime">110 min</span>
 <span class="ghost">|</span>
 <span class="genre">
 Biography, Drama, History            </span>
 </p>
 <div class="ratings-bar">
 <div class="inline-block ratings-imdb-rating" data-value="6.5" name="ir">
 <span class="global-sprite rating-star imdb-rating"></span>
 <strong>6.5</strong>
 </div>
 <div class="inline-block ratings-user-rating">
 <span class="userRatingValue" data-tconst="tt4558396" id="urv_tt4558396">
 <span class="global-sprite rating-star no-rating"></span>
 <span class="rate" data-no-rating="Rate this" data-value="0" name="ur">Rate this</span>
 </span>
 <div class="starBarWidget" id="sb_tt4558396">
 <div class="rating rating-

In [265]:
div = soup_list[0].find_all('div', class_="lister-item-content")[0]
div.find_all('span', class_="ipl-rating-star__rating")[0].text

'7.3'

In [349]:
div = soup_list[0].find_all('div', class_="lister-item-content")[0]
div

<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt0416212/?ref_=ttls_li_tt">The Secret Life of Bees</a>
<span class="lister-item-year text-muted unbold">(2008)</span>
</h3>
<p class="text-muted text-small">
<span class="certificate">PG-13</span>
<span class="ghost">|</span>
<span class="runtime">114 min</span>
<span class="ghost">|</span>
<span class="genre">
Drama            </span>
</p>
<div class="ipl-rating-widget">
<div class="ipl-rating-star small">
<span class="ipl-rating-star__star">
<svg class="ipl-icon ipl-star-icon" fill="#000000" height="24" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg">
<path d="M0 0h24v24H0z" fill="none"></path>
<path d="M12 17.27L18.18 21l-1.64-7.03L22 9.24l-7.19-.61L12 2 9.19 8.63 2 9.24l5.46 4.73L5.82 21z"></path>
<path d="M0 0h24v24H0z" fill="none"></path>
</svg>
</span>
<span class="ipl-rating-star__rating">7.3</span>
</div>
<div class=

In [224]:
div.get('data-value')

'5.5'

In [231]:
soup_list[0].find_all('span', class_="ipl-rating-star__rating")[0].text

'7.3'

In [240]:
soup_list[0].find_all('span', class_="ipl-rating-star__rating")[0]

<span class="ipl-rating-star__rating">7.3</span>

In [123]:
import re

In [125]:
rt_string = soup_list[0].find(text=re.compile('Run'))
print(rt_string)

Runtime


In [126]:
rt_string.findNext()

<option value="date_added:descending">Date Added</option>