# Web scraping Disney movies collection data 

In [409]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import re

In [341]:
#load the page
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

#convert to bs object
soup = bs(r.content)


In [342]:
#find all headings with size 3 to get the years in string
soup.find_all("h3")

[<h3><span id="1930s.E2.80.931940s"></span><span class="mw-headline" id="1930s–1940s">1930s–1940s</span></h3>,
 <h3><span class="mw-headline" id="1950s">1950s</span></h3>,
 <h3><span class="mw-headline" id="1960s">1960s</span></h3>,
 <h3><span class="mw-headline" id="1970s">1970s</span></h3>,
 <h3><span class="mw-headline" id="1980s">1980s</span></h3>,
 <h3><span class="mw-headline" id="1990s">1990s</span></h3>,
 <h3><span class="mw-headline" id="2000s">2000s</span></h3>,
 <h3><span class="mw-headline" id="2010s">2010s</span></h3>,
 <h3><span class="mw-headline" id="2020s">2020s</span></h3>,
 <h3><span class="mw-headline" id="Undated_films">Undated films</span></h3>,
 <h3 class="vector-menu-heading" id="p-personal-label">
 <span class="vector-menu-heading-label">Personal tools</span>
 </h3>,
 <h3 class="vector-menu-heading" id="p-namespaces-label">
 <span class="vector-menu-heading-label">Namespaces</span>
 </h3>,
 <h3 class="vector-menu-heading" id="p-views-label">
 <span class="vecto

We will get a value of None in the first header3 because we have two span elements inside this first header. So we need to manually replace the first value of the list above with the respective year, i.e., 1930s-1940s.

In [343]:
#collecting the number of years first
#create an emtpy list of years
years_list = []
for years in soup.find_all("h3"):
    
    if years.string is not None:
        years = years.string.replace('s','')
        years_list.append(years)
    
    print(years)

<h3><span id="1930s.E2.80.931940s"></span><span class="mw-headline" id="1930s–1940s">1930s–1940s</span></h3>
1950
1960
1970
1980
1990
2000
2010
2020
Undated film
<h3 class="vector-menu-heading" id="p-personal-label">
<span class="vector-menu-heading-label">Personal tools</span>
</h3>
<h3 class="vector-menu-heading" id="p-namespaces-label">
<span class="vector-menu-heading-label">Namespaces</span>
</h3>
<h3 class="vector-menu-heading" id="p-views-label">
<span class="vector-menu-heading-label">Views</span>
</h3>
<h3>
<label for="searchInput">Search</label>
</h3>
<h3 class="vector-menu-heading" id="p-navigation-label">
<span class="vector-menu-heading-label">Navigation</span>
</h3>
<h3 class="vector-menu-heading" id="p-interaction-label">
<span class="vector-menu-heading-label">Contribute</span>
</h3>
<h3 class="vector-menu-heading" id="p-tb-label">
<span class="vector-menu-heading-label">Tools</span>
</h3>
<h3 class="vector-menu-heading" id="p-coll-print_export-label">
<span class="vect

In [344]:
#printing the list
years_list

['1950',
 '1960',
 '1970',
 '1980',
 '1990',
 '2000',
 '2010',
 '2020',
 'Undated film']

In [345]:
#insert the years 1930s-1940s as a first index value
years_list.insert(0,'1930-1940')
years_list.insert(9,'Upcoming')

In [346]:
#printing the years list
years_list

['1930-1940',
 '1950',
 '1960',
 '1970',
 '1980',
 '1990',
 '2000',
 '2010',
 '2020',
 'Upcoming',
 'Undated film']

In [347]:
for tag in soup.find_all("href"):
    print(tag)

Now upon inspecting the table we want to extract, we have links for each of the movie title and that will provide us with more information about each movie. We need to loop through each of these movie title and get the link and access the page for that movie.

In [348]:
#create empty list of links to movie title
data = {} 

init = 0
#loop through the table tags
for table in soup.find_all("table"):
    movies =[]
    #loop through each table to get all the links
    for link in table.find_all("a"):
        
        #try and except to ignore the errors in title extraction
        try:
            #get the link of movie title
            title_page = 'https://en.wikipedia.org' + link['href']
            #add it to the list of movies
            movies.append(title_page)
        except:
            pass
     
    
    try:
        data[years_list[init]] = movies
        init = init +1
    except:
        pass

    

In [352]:
#we can check the data for upcoming films 
data['Upcoming']

['https://en.wikipedia.org/wiki/Pinocchio_(2022_live-action_film)',
 'https://en.wikipedia.org#cite_note-25',
 'https://en.wikipedia.org/wiki/ImageMovers',
 'https://en.wikipedia.org/wiki/Chris_Weitz',
 'https://en.wikipedia.org/wiki/Disney%2B',
 'https://en.wikipedia.org#cite_note-Pinocchio2022_EW-26',
 'https://en.wikipedia.org#cite_note-27',
 'https://en.wikipedia.org/wiki/Hocus_Pocus_2',
 'https://en.wikipedia.org#cite_note-28',
 'https://en.wikipedia.org/wiki/David_Kirschner',
 'https://en.wikipedia.org/wiki/Disney%2B',
 'https://en.wikipedia.org/wiki/Strange_World_(film)',
 'https://en.wikipedia.org#cite_note-DisneyFox_Release_Date-29',
 'https://en.wikipedia.org/wiki/Walt_Disney_Animation_Studios',
 'https://en.wikipedia.org#cite_note-30',
 'https://en.wikipedia.org/wiki/Disenchanted_(film)',
 'https://en.wikipedia.org#cite_note-Disenchanted_CS-31',
 'https://en.wikipedia.org#cite_note-32',
 'https://en.wikipedia.org/wiki/Barry_Sonnenfield',
 'https://en.wikipedia.org/wiki/Barry

In [362]:
#Create a loop that can go through each of the movie page and collect the relevant data

for link in data.values():
    for items in link:
        page = requests.get(items)
        page_data = bs(page.content)
        
        for table in page_data.find_all("tr"):
            

IndentationError: expected an indented block (3158477037.py, line 9)

In [421]:
page = requests.get("https://en.m.wikipedia.org/wiki/Turning_Red")

In [433]:
df_movie= pd.read_html(page.content)

In [446]:
df1=pd.DataFrame([data])

In [447]:
df1

Unnamed: 0,1930-1940,1950,1960,1970,1980,1990,2000,2010,2020,Upcoming,Undated film
0,[https://en.wikipedia.org/wiki/Academy_Award_R...,[https://en.wikipedia.org/wiki/Cinderella_(195...,[https://en.wikipedia.org/wiki/Toby_Tyler_or_1...,[https://en.wikipedia.org/wiki/King_of_the_Gri...,[https://en.wikipedia.org/wiki/Midnight_Madnes...,[https://en.wikipedia.org/wiki/DuckTales_the_M...,[https://en.wikipedia.org/wiki/The_Tigger_Movi...,[https://en.wikipedia.org/wiki/Alice_in_Wonder...,[https://en.wikipedia.org/wiki/Timmy_Failure:_...,[https://en.wikipedia.org/wiki/Pinocchio_(2022...,[https://en.wikipedia.org/wiki/Diary_of_a_Wimp...
