# Web scraping Disney movies collection data 

In [360]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import re

In [361]:
#load the page
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

#convert to bs object
soup = bs(r.content)


In [362]:
#find all headings with size 3 to get the years in string
soup.find_all("h3")

[<h3><span id="1930s.E2.80.931940s"></span><span class="mw-headline" id="1930s–1940s">1930s–1940s</span></h3>,
 <h3><span class="mw-headline" id="1950s">1950s</span></h3>,
 <h3><span class="mw-headline" id="1960s">1960s</span></h3>,
 <h3><span class="mw-headline" id="1970s">1970s</span></h3>,
 <h3><span class="mw-headline" id="1980s">1980s</span></h3>,
 <h3><span class="mw-headline" id="1990s">1990s</span></h3>,
 <h3><span class="mw-headline" id="2000s">2000s</span></h3>,
 <h3><span class="mw-headline" id="2010s">2010s</span></h3>,
 <h3><span class="mw-headline" id="2020s">2020s</span></h3>,
 <h3><span class="mw-headline" id="Undated_films">Undated films</span></h3>,
 <h3 class="vector-menu-heading" id="p-personal-label">
 <span class="vector-menu-heading-label">Personal tools</span>
 </h3>,
 <h3 class="vector-menu-heading" id="p-namespaces-label">
 <span class="vector-menu-heading-label">Namespaces</span>
 </h3>,
 <h3 class="vector-menu-heading" id="p-views-label">
 <span class="vecto

We will get a value of None in the first header3 because we have two span elements inside this first header. So we need to manually replace the first value of the list above with the respective year, i.e., 1930s-1940s.

In [363]:
#collecting the number of years first
#create an emtpy list of years
years_list = []
for years in soup.find_all("h3"):
    
    if years.string is not None:
        years = years.string.replace('s','')
        years_list.append(years)
    
    print(years)

<h3><span id="1930s.E2.80.931940s"></span><span class="mw-headline" id="1930s–1940s">1930s–1940s</span></h3>
1950
1960
1970
1980
1990
2000
2010
2020
Undated film
<h3 class="vector-menu-heading" id="p-personal-label">
<span class="vector-menu-heading-label">Personal tools</span>
</h3>
<h3 class="vector-menu-heading" id="p-namespaces-label">
<span class="vector-menu-heading-label">Namespaces</span>
</h3>
<h3 class="vector-menu-heading" id="p-views-label">
<span class="vector-menu-heading-label">Views</span>
</h3>
<h3>
<label for="searchInput">Search</label>
</h3>
<h3 class="vector-menu-heading" id="p-navigation-label">
<span class="vector-menu-heading-label">Navigation</span>
</h3>
<h3 class="vector-menu-heading" id="p-interaction-label">
<span class="vector-menu-heading-label">Contribute</span>
</h3>
<h3 class="vector-menu-heading" id="p-tb-label">
<span class="vector-menu-heading-label">Tools</span>
</h3>
<h3 class="vector-menu-heading" id="p-coll-print_export-label">
<span class="vect

In [364]:
#printing the list
years_list

['1950',
 '1960',
 '1970',
 '1980',
 '1990',
 '2000',
 '2010',
 '2020',
 'Undated film']

In [365]:
#insert the years 1930s-1940s as a first index value
years_list.insert(0,'1930-1940')
years_list.insert(9,'Upcoming')

In [366]:
#printing the years list
years_list

['1930-1940',
 '1950',
 '1960',
 '1970',
 '1980',
 '1990',
 '2000',
 '2010',
 '2020',
 'Upcoming',
 'Undated film']

In [367]:
for tag in soup.find_all("href"):
    print(tag)

Now upon inspecting the table we want to extract, we have links for each of the movie title and that will provide us with more information about each movie. We need to loop through each of these movie title and get the link and access the page for that movie.

In [368]:
#create empty list of links to movie title
data = {} 

init = 0
#loop through the table tags
for table in soup.find_all("table"):
    movies =[]
    #loop through each table to get all the links
    for link in table.find_all("a"):
        
        #try and except to ignore the errors in title extraction
        try:
            #get the link of movie title
            title_page = 'https://en.wikipedia.org' + link['href']
            #add it to the list of movies
            movies.append(title_page)
        except:
            pass
     
    
    try:
        data[years_list[init]] = movies
        init = init +1
    except:
        pass

    

In [369]:
#we can check the data for upcoming films 
data['Upcoming']

['https://en.wikipedia.org/wiki/Pinocchio_(2022_live-action_film)',
 'https://en.wikipedia.org#cite_note-25',
 'https://en.wikipedia.org/wiki/ImageMovers',
 'https://en.wikipedia.org/wiki/Chris_Weitz',
 'https://en.wikipedia.org/wiki/Disney%2B',
 'https://en.wikipedia.org#cite_note-Pinocchio2022_EW-26',
 'https://en.wikipedia.org#cite_note-27',
 'https://en.wikipedia.org/wiki/Hocus_Pocus_2',
 'https://en.wikipedia.org#cite_note-28',
 'https://en.wikipedia.org/wiki/David_Kirschner',
 'https://en.wikipedia.org/wiki/Disney%2B',
 'https://en.wikipedia.org/wiki/Strange_World_(film)',
 'https://en.wikipedia.org#cite_note-DisneyFox_Release_Date-29',
 'https://en.wikipedia.org/wiki/Walt_Disney_Animation_Studios',
 'https://en.wikipedia.org#cite_note-30',
 'https://en.wikipedia.org/wiki/Disenchanted_(film)',
 'https://en.wikipedia.org#cite_note-Disenchanted_CS-31',
 'https://en.wikipedia.org#cite_note-32',
 'https://en.wikipedia.org/wiki/Barry_Sonnenfield',
 'https://en.wikipedia.org/wiki/Barry

## Using Pandas library to get the table



In [370]:
page = requests.get("https://en.m.wikipedia.org/wiki/Turning_Red")

In [371]:
df_movie= pd.read_html(page.content)[0]

In [372]:
df_movie.set_index(df_movie.columns[0]).transpose()

Turning Red,Official promotional poster,Directed by,Screenplay by,Story by,Produced by,Starring,Cinematography,Edited by,Music by,Productioncompanies,Distributed by,Release dates,Running time,Country,Language,Budget,Box office
Turning Red.1,Official promotional poster,Domee Shi,Julia Cho Domee Shi,Domee Shi Julia Cho Sarah Streicher,Lindsey Collins,Rosalie Chiang Sandra Oh Ava Morse Hyein Park ...,Mahyar Abousaeedi Jonathan Pytko,Nicholas C. Smith Steve Bloom,Ludwig Göransson,Walt Disney Pictures Pixar Animation Studios,Walt Disney StudiosMotion Pictures,"March 1, 2022El Capitan Theatre) March 11, 2022",100 minutes[1],United States,English,$175 million[2],$19.9 million[3]


The problem here is that we cannot get the names separated with commas or any such information that needs to be punctuated for that matter. 

## Using Beautiful Soup to extract tables

In [373]:
#Create a loop that can go through each of the movie page and collect the relevant data
'''
for link in data.values():
    for items in link:
        page = requests.get(items)
        page_data = bs(page.content)
        
        for table in page_data.find_all("tr"):
 '''           

'\nfor link in data.values():\n    for items in link:\n        page = requests.get(items)\n        page_data = bs(page.content)\n        \n        for table in page_data.find_all("tr"):\n '

In [455]:
page = requests.get("https://en.m.wikipedia.org/wiki/Raya_and_the_Last_Dragon")

In [456]:
data = bs(page.content)

In [457]:
def key_finder(word):
    final_word = ""
    init_num = 0 
    for prop in keys_list:
        count =0

        for letter in word:
            if letter in prop:
                count +=1

        if count > init_num:
            init_num = count 
            final_word = prop

    return final_word

In [460]:
#create empty dictionary with the keys having different properties of a movie
dict_details_orig = {'Title':'','Based on':'','Directed by':'','Written by':'','Screenplay by':'','Story by':'','Produced by':'','Starring':'',\
                     'Cinematography':'','Edited by':'','Music by':'', 'Productioncompanies':'',\
                     'Distributed by':'','Narrated by':'','Created by':'','Genre':'', \
                     'Release dates':'','Running time':'','Country':'','Language':'','Budget':'','Box office':''}

keys_list = [keys for keys in dict_details_orig.keys()]

#find the first table of the movie page
table = data.find("table")

#initialize a number to
num =1
init = 0
idx = 0

#iterate through the table to find th and td tagsf
for items in table.find_all(["th","td"]): 
    
    #get the name of the movie
    if init == 0:
        dict_details_orig['Title'] = items.get_text()
        init += 1
        #print(items.get_text(), num)
    else:
        
        #if the item is even we get the th tag's text as keys of a dictionary
        if num % 2 != 0:
            
            #print(items.get_text(), num)
            key = key_finder(items.get_text()[:10])
            if key in keys_list:
                key = key

        #else we get the td tag values
        else:
            #print(items.get_text(), num)
           
            value = items.get_text().replace("\n",",")
            if value[0] == ',':
                value = value[1:] 
            if num !=2:
                dict_details_orig[key] = value
            else:
                pass
            #print("key = "+ key)

    #print(dict_details_orig)
    num += 1

        

In [461]:
dict_details_orig

{'Title': 'Raya and the Last Dragon',
 'Directed by': 'Don Hall,Carlos López Estrada',
 'Screenplay by': 'Qui Nguyen,Adele Lim',
 'Story by': 'Paul Briggs,Don Hall,Adele Lim,Carlos López Estrada,Kiel Murray,Qui Nguyen,John Ripa,Dean Wellins',
 'Produced by': 'Osnat Shurer,Peter Del Vecho',
 'Starring': 'Kelly Marie Tran,Awkwafina,Izaac Wang,Gemma Chan,Daniel Dae Kim,Benedict Wong,Sandra Oh,Thalia Tran,Lucille Soong,Alan Tudyk',
 'Cinematography': '',
 'Edited by': 'Fabienne Rawley,Shannon Stein',
 'Music by': 'James Newton Howard',
 'Productioncompanies': 'Walt Disney Pictures,Walt Disney Animation Studios',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Narrated by': '',
 'Release dates': 'March\xa05,\xa02021\xa0(2021-03-05) (United States)',
 'Running time': '107 minutes[1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$100 million+[citation needed]',
 'Box office': '$130.4 million[2]'}

In [497]:
def add_null_values(dict_of_props):
    for det in dict_of_props.items():
        if det[1] == '':
            dict_of_props[det[0]] = np.NaN

    return dict_of_props


{'Title': 'Raya and the Last Dragon',
 'Directed by': 'Don Hall,Carlos López Estrada',
 'Screenplay by': 'Qui Nguyen,Adele Lim',
 'Story by': 'Paul Briggs,Don Hall,Adele Lim,Carlos López Estrada,Kiel Murray,Qui Nguyen,John Ripa,Dean Wellins',
 'Produced by': 'Osnat Shurer,Peter Del Vecho',
 'Starring': 'Kelly Marie Tran,Awkwafina,Izaac Wang,Gemma Chan,Daniel Dae Kim,Benedict Wong,Sandra Oh,Thalia Tran,Lucille Soong,Alan Tudyk',
 'Cinematography': nan,
 'Edited by': 'Fabienne Rawley,Shannon Stein',
 'Music by': 'James Newton Howard',
 'Productioncompanies': 'Walt Disney Pictures,Walt Disney Animation Studios',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Narrated by': nan,
 'Release dates': 'March\xa05,\xa02021\xa0(2021-03-05) (United States)',
 'Running time': '107 minutes[1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$100 million+[citation needed]',
 'Box office': '$130.4 million[2]'}

In [500]:
values_list = []
for values in dict_details_orig.values():
    values_list.append(values)
values_list

['Raya and the Last Dragon',
 'Don Hall,Carlos López Estrada',
 'Qui Nguyen,Adele Lim',
 'Paul Briggs,Don Hall,Adele Lim,Carlos López Estrada,Kiel Murray,Qui Nguyen,John Ripa,Dean Wellins',
 'Osnat Shurer,Peter Del Vecho',
 'Kelly Marie Tran,Awkwafina,Izaac Wang,Gemma Chan,Daniel Dae Kim,Benedict Wong,Sandra Oh,Thalia Tran,Lucille Soong,Alan Tudyk',
 nan,
 'Fabienne Rawley,Shannon Stein',
 'James Newton Howard',
 'Walt Disney Pictures,Walt Disney Animation Studios',
 'Walt Disney Studios Motion Pictures',
 nan,
 'March\xa05,\xa02021\xa0(2021-03-05) (United States)',
 '107 minutes[1]',
 'United States',
 'English',
 '$100 million+[citation needed]',
 '$130.4 million[2]']

In [521]:
dict_details_orig_columns = ['Title','Directed by','Screenplay by','Story by','Produced by','Starring',\
                     'Cinematography','Edited by','Music by', 'Productioncompanies',\
                     'Distributed by','Narrated by', \
                     'Release dates','Running time','Country','Language','Budget','Box office']
df= pd.DataFrame(columns = dict_details_orig_columns)
df = df.append(pd.DataFrame([values_list], 
     columns=dict_details_orig_columns), 
     ignore_index=True)

  df = df.append(pd.DataFrame([values_list],


In [522]:
df


Unnamed: 0,Title,Directed by,Screenplay by,Story by,Produced by,Starring,Cinematography,Edited by,Music by,Productioncompanies,Distributed by,Narrated by,Release dates,Running time,Country,Language,Budget,Box office
0,Raya and the Last Dragon,"Don Hall,Carlos López Estrada","Qui Nguyen,Adele Lim","Paul Briggs,Don Hall,Adele Lim,Carlos López Es...","Osnat Shurer,Peter Del Vecho","Kelly Marie Tran,Awkwafina,Izaac Wang,Gemma Ch...",,"Fabienne Rawley,Shannon Stein",James Newton Howard,"Walt Disney Pictures,Walt Disney Animation Stu...",Walt Disney Studios Motion Pictures,,"March 5, 2021 (2021-03-05) (United States)",107 minutes[1],United States,English,$100 million+[citation needed],$130.4 million[2]
