# Web scraping Disney movies collection data 

In [145]:
import requests
from bs4 import BeautifulSoup as bs
import re

In [146]:
# Load the Wikipedia page that lists Walt Disney Pictures films.
# The timeout keeps this cell from hanging indefinitely on a stalled connection.
r = requests.get(
    "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films",
    timeout=30,
)
# Fail here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
r.raise_for_status()

# Convert to a BeautifulSoup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers are installed, and
# so bs4 does not warn that no parser was specified.
soup = bs(r.content, "html.parser")


In [147]:
# List every <h3> element on the page. The decade section headings
# (e.g. "1950s") are among them, together with unrelated menu headings.
soup.find_all(name="h3")

[<h3><span id="1930s.E2.80.931940s"></span><span class="mw-headline" id="1930s–1940s">1930s–1940s</span></h3>,
 <h3><span class="mw-headline" id="1950s">1950s</span></h3>,
 <h3><span class="mw-headline" id="1960s">1960s</span></h3>,
 <h3><span class="mw-headline" id="1970s">1970s</span></h3>,
 <h3><span class="mw-headline" id="1980s">1980s</span></h3>,
 <h3><span class="mw-headline" id="1990s">1990s</span></h3>,
 <h3><span class="mw-headline" id="2000s">2000s</span></h3>,
 <h3><span class="mw-headline" id="2010s">2010s</span></h3>,
 <h3><span class="mw-headline" id="2020s">2020s</span></h3>,
 <h3><span class="mw-headline" id="Undated_films">Undated films</span></h3>,
 <h3 class="vector-menu-heading" id="p-personal-label">
 <span class="vector-menu-heading-label">Personal tools</span>
 </h3>,
 <h3 class="vector-menu-heading" id="p-namespaces-label">
 <span class="vector-menu-heading-label">Namespaces</span>
 </h3>,
 <h3 class="vector-menu-heading" id="p-views-label">
 <span class="vecto

The first `<h3>` contains two `<span>` elements, so its `.string` attribute is `None` (`.string` only returns text when a tag has a single child). That heading is therefore skipped when we collect the years, and we need to manually add its value, i.e., 1930s–1940s, as the first item of the list.

In [148]:
# Collect the decade labels from the <h3> section headings.
# Start from an empty list so re-running this cell rebuilds it from scratch.
years_list = []
for heading in soup.find_all("h3"):
    # .string is None when a heading does not have exactly one child (the first
    # decade heading and the menu headings), so those headings are skipped.
    if heading.string is None:
        # Echo the skipped tag so it is visible which headings were left out.
        print(heading)
        continue

    label = str(heading.string)
    # Drop only the trailing plural "s" ("1950s" -> "1950"); a blanket
    # replace('s', '') would also delete any "s" inside the heading text.
    if label.endswith('s'):
        label = label[:-1]
    years_list.append(label)
    print(label)

<h3><span id="1930s.E2.80.931940s"></span><span class="mw-headline" id="1930s–1940s">1930s–1940s</span></h3>
1950
1960
1970
1980
1990
2000
2010
2020
Undated film
<h3 class="vector-menu-heading" id="p-personal-label">
<span class="vector-menu-heading-label">Personal tools</span>
</h3>
<h3 class="vector-menu-heading" id="p-namespaces-label">
<span class="vector-menu-heading-label">Namespaces</span>
</h3>
<h3 class="vector-menu-heading" id="p-views-label">
<span class="vector-menu-heading-label">Views</span>
</h3>
<h3>
<label for="searchInput">Search</label>
</h3>
<h3 class="vector-menu-heading" id="p-navigation-label">
<span class="vector-menu-heading-label">Navigation</span>
</h3>
<h3 class="vector-menu-heading" id="p-interaction-label">
<span class="vector-menu-heading-label">Contribute</span>
</h3>
<h3 class="vector-menu-heading" id="p-tb-label">
<span class="vector-menu-heading-label">Tools</span>
</h3>
<h3 class="vector-menu-heading" id="p-coll-print_export-label">
<span class="vect

In [149]:
# Display the labels collected so far; the 1930s–1940s heading is not in the
# list yet because its <h3> had no .string value.
years_list

['1950',
 '1960',
 '1970',
 '1980',
 '1990',
 '2000',
 '2010',
 '2020',
 'Undated film']

In [150]:
# Put the 1930s–1940s decade at the front of the list; its <h3> has no .string
# value, so it was not collected from the page.
# The guard makes this cell safe to re-run: without it, each extra run would
# prepend another duplicate '1930-1940' entry.
if years_list[:1] != ['1930-1940']:
    years_list.insert(0, '1930-1940')

In [151]:
# Display the list again to confirm that '1930-1940' is now the first entry.
years_list

['1930-1940',
 '1950',
 '1960',
 '1970',
 '1980',
 '1990',
 '2000',
 '2010',
 '2020',
 'Undated film']

Inspecting the table we want to extract, we see that each movie title is a link to a page with more information about that movie. We need to loop through each of these movie titles, get its link, and access the page for that movie.

In [173]:
# Dump the indented HTML of every <table> on the page, each followed by a
# 70-character dashed rule, to inspect the markup of the film tables.
for table in soup.find_all("table"):
    print(table.prettify(), "-" * 70, sep="\n")

<table class="wikitable sortable" style="width:100%;">
 <tbody>
  <tr>
   <th style="width:35%;">
    Title
   </th>
   <th style="width:9em;">
    Release date
   </th>
   <th>
    Notes
   </th>
  </tr>
  <tr>
   <td>
    <i>
     <a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">
      Academy Award Review of Walt Disney Cartoons
     </a>
    </i>
   </td>
   <td>
    May 19, 1937
   </td>
   <td>
    Anthology film. Distributed by
    <a href="/wiki/United_Artists" title="United Artists">
     United Artists
    </a>
    .
   </td>
  </tr>
  <tr>
   <td>
    <i>
     <a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">
      Snow White and the Seven Dwarfs
     </a>
    </i>
   </td>
   <td>
    December 21, 1937
   </td>
   <td>
    First film to be distributed by
    <a href="/wiki/RKO_Pictures" title="RKO Pictures">
     RKO Radio Pictures
    </a>
    .
 