# Imports

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import help_functions as hf

# Performing web scraping from the main page on disney movies wikipedia

In [2]:
# Using the requests library to access the website
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Using BeautifulSoup for parsing html from website e selecionando tabela
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")
base_path = "https://en.wikipedia.org/"
movie_info_list = []

# Selecting the information of each movie in the table
for index, movie in enumerate(movies):
    
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        movie_info_list.append(hf.get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)




  soup = bs(r.content)


Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
The London Connection
'NoneType' object has no attribute 'find'
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
Sister Act 3
'NoneType' object has no attribute 'find'
The Thief
'NoneType' object has no attribute 'find_all'
Tom Sawyer
'NoneType' object has no attribute 'find_all'
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'
FC Barcelona
'NoneType' object has no attribute 'find_all'
Young Woman and the Sea
'NoneType' object has no attribute 'find_all'


In [3]:
# Print the number of films
print(len(movie_info_list))
# Print the first three records
print(movie_info_list[0:3])

520
[{'title': 'Academy Award Review of', 'Production company': 'Walt Disney Productions', 'Distributed by': 'United Artists', 'Release date': ['May 19, 1937'], 'Running time': '41 minutes (74 minutes 1966 release)', 'Country': 'United States', 'Language': 'English', 'Box office': '$45.472'}, {'title': 'Snow White and the Seven Dwarfs', 'Directed by': ['David Hand', 'William Cottrell', 'Wilfred Jackson', 'Larry Morey', 'Perce Pearce', 'Ben Sharpsteen'], 'Written by': ['Ted Sears', 'Richard Creedon', 'Otto Englander', 'Dick Rickard', 'Earl Hurd', 'Merrill De Maris', 'Dorothy Ann Blank', 'Webb Smith'], 'Based on': 'Snow White by The Brothers Grimm', 'Produced by': 'Walt Disney', 'Starring': ['Adriana Caselotti', 'Lucille La Verne', 'Harry Stockwell', 'Roy Atwell', 'Pinto Colvig', 'Otis Harlan', 'Scotty Mattraw', 'Billy Gilbert', 'Eddie Collins', 'Moroni Olsen', 'Stuart Buchanan'], 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'], 'Production company': 'Walt Disney Productio

In [4]:
# Saving data in json
hf.save_data_json('dataframe_disney.json', movie_info_list)

# Creating and clearing the DataFrame


In [5]:
df = pd.DataFrame(movie_info_list)
df.reset_index()
df.head(10)

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Directed by,Written by,...,Original concept by,Music,Lyrics,Book,Basis,Productions,Awards,Created by,Original work,Owner
0,Academy Award Review of,Walt Disney Productions,United Artists,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,,83 minutes,United States,English,$418 million,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...",...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,,88 minutes,United States,English,$164 million,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),"[Samuel Armstrong, James Algar, Bill Roberts, ...",,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",Alfred Werker (live action) Hamilton Luske (an...,Live-action: Ted Sears Al Perkins Larry Clemmo...,...,,,,,,,,,,
5,Dumbo,Walt Disney Productions,RKO Radio Pictures,,64 minutes,United States,English,>$1.3 million (est. United States/Canada renta...,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...",,...,,,,,,,,,,
6,Bambi,Walt Disney Productions,RKO Radio Pictures,,70 minutes,United States,English,$267.4 million,Supervising director David Hand Sequence direc...,,...,,,,,,,,,,
7,Saludos Amigos,Walt Disney Productions,RKO Radio Pictures,,42 minutes,United States,,$1.135 million (worldwide rentals),"[Norman Ferguson, Wilfred Jackson, Jack Kinney...",,...,,,,,,,,,,
8,Victory Through Air Power,Walt Disney Productions,United Artists,"[July 17, 1943]",70 min,United States,English,"$799,000",Animated sequences: James Algar Clyde Geronimi...,Story direction: Perce Pearce Story adaptation...,...,,,,,,,,,,
9,The Three Caballeros,Walt Disney Productions,RKO Radio Pictures,,71 minutes,United States,,$3.355 million (worldwide rentals),"[Norman Ferguson, Clyde Geronimi, Jack Kinney,...",,...,,,,,,,,,,


In [6]:
# Performs a split in the Running time column, selecting the minutes before the space
df['Running time'] = df['Running time'].astype(str).apply(lambda x: x.split(' ')[0])
# Finding elements == 'nan' and setting them 0
df.loc[df['Running time'] == 'nan'] = '0'
# Deleting DataFrame rows that have value == 0
df = df[df['title'] != '0']
# Converting column to int
df['Running time'] = df['Running time'].astype(int)
df
    

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Directed by,Written by,...,Original concept by,Music,Lyrics,Book,Basis,Productions,Awards,Created by,Original work,Owner
0,Academy Award Review of,Walt Disney Productions,United Artists,"[May 19, 1937]",41,United States,English,$45.472,,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,,83,United States,English,$418 million,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...",...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,,88,United States,English,$164 million,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,"[November 13, 1940]",126,United States,English,$76.4–$83.3 million (United States and Canada),"[Samuel Armstrong, James Algar, Bill Roberts, ...",,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,"[June 27, 1941]",74,United States,English,"$960,000 (worldwide rentals)",Alfred Werker (live action) Hamilton Luske (an...,Live-action: Ted Sears Al Perkins Larry Clemmo...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,SpaceCamp,ABC Motion Pictures,20th Century Fox,"[June 6, 1986 (United States)]",107,United States,English,"$9,697,739 (USA)",Harry Winer,,...,,,,,,,,,,
516,The Aristocats,Walt Disney Productions,Buena Vista Distribution,,79,United States,English,$191 million,Wolfgang Reitherman,,...,,,,,,,,,,
517,The Sword in the Stone,Walt Disney Productions,Buena Vista Distribution,"[December 25, 1963]",74,United States,English,$22.2 million (United States and Canada),"[Wolfgang Reitherman, Hamilton Luske (Uncredit...",,...,,,,,,,,,,
518,Tinker Bell,DisneyToon Studios,Walt Disney Studios Home Entertainment,,468,United States,English,,"[Bradley Raymond ( 1 , 3 & 4 ), Klay Hall ( 2 ...",,...,,,,,,,,,,


In [7]:
# Use a lambda function with get_data for each element of datafreme's df['Release date'] column, applying the new date format
df['Release date'] = df['Release date'].apply(lambda x: hf.get_date(x))

df

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Directed by,Written by,...,Original concept by,Music,Lyrics,Book,Basis,Productions,Awards,Created by,Original work,Owner
0,Academy Award Review of,Walt Disney Productions,United Artists,1937/5/19,41,United States,English,$45.472,,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,,83,United States,English,$418 million,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...",...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,,88,United States,English,$164 million,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,1940/11/13,126,United States,English,$76.4–$83.3 million (United States and Canada),"[Samuel Armstrong, James Algar, Bill Roberts, ...",,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,1941/6/27,74,United States,English,"$960,000 (worldwide rentals)",Alfred Werker (live action) Hamilton Luske (an...,Live-action: Ted Sears Al Perkins Larry Clemmo...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,SpaceCamp,ABC Motion Pictures,20th Century Fox,1986/6/6,107,United States,English,"$9,697,739 (USA)",Harry Winer,,...,,,,,,,,,,
516,The Aristocats,Walt Disney Productions,Buena Vista Distribution,,79,United States,English,$191 million,Wolfgang Reitherman,,...,,,,,,,,,,
517,The Sword in the Stone,Walt Disney Productions,Buena Vista Distribution,1963/12/25,74,United States,English,$22.2 million (United States and Canada),"[Wolfgang Reitherman, Hamilton Luske (Uncredit...",,...,,,,,,,,,,
518,Tinker Bell,DisneyToon Studios,Walt Disney Studios Home Entertainment,,468,United States,English,,"[Bradley Raymond ( 1 , 3 & 4 ), Klay Hall ( 2 ...",,...,,,,,,,,,,


In [8]:
# Converting column to datetime type
df['Release date'] = pd.to_datetime(df['Release date'])
df['Release date'].dtypes



dtype('<M8[ns]')

In [9]:
# Selecting numeric string value and converting to int
df['Box office'] = df['Box office'].apply(lambda x: hf.split_value(x))
df['Box office'] = pd.to_numeric(df['Box office'])

df

Unnamed: 0,title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Directed by,Written by,...,Original concept by,Music,Lyrics,Book,Basis,Productions,Awards,Created by,Original work,Owner
0,Academy Award Review of,Walt Disney Productions,United Artists,1937-05-19,41,United States,English,45.0,,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,NaT,83,United States,English,418.0,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...",...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,NaT,88,United States,English,164.0,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,1940-11-13,126,United States,English,76.0,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,1941-06-27,74,United States,English,960.0,Alfred Werker (live action) Hamilton Luske (an...,Live-action: Ted Sears Al Perkins Larry Clemmo...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,SpaceCamp,ABC Motion Pictures,20th Century Fox,1986-06-06,107,United States,English,9.0,Harry Winer,,...,,,,,,,,,,
516,The Aristocats,Walt Disney Productions,Buena Vista Distribution,NaT,79,United States,English,191.0,Wolfgang Reitherman,,...,,,,,,,,,,
517,The Sword in the Stone,Walt Disney Productions,Buena Vista Distribution,1963-12-25,74,United States,English,22.0,"[Wolfgang Reitherman, Hamilton Luske (Uncredit...",,...,,,,,,,,,,
518,Tinker Bell,DisneyToon Studios,Walt Disney Studios Home Entertainment,NaT,468,United States,English,,"[Bradley Raymond ( 1 , 3 & 4 ), Klay Hall ( 2 ...",,...,,,,,,,,,,


In [10]:
# Selecting numeric string value and converting to int
df['Budget'] = df['Budget'].apply(lambda x: hf.split_budget(x))
df['Box office'] = pd.to_numeric(df['Box office'])
print(df['Budget'])

0         None
1         1.49
2          2.6
3         2.28
4      600.000
        ...   
514         18
516          4
517          3
518       None
519      35-40
Name: Budget, Length: 499, dtype: object


In [11]:
# Saving the dataframe in pickle format  
hf.save_data_pickle('dataframe_disney_cleaned.pickle', df)

In [12]:
# Loaded the DataFrame
data = hf.load_data_pickle('dataframe_disney_cleaned.pickle')
print(data[0:5])

                             title       Production company  \
0          Academy Award Review of  Walt Disney Productions   
1  Snow White and the Seven Dwarfs  Walt Disney Productions   
2                        Pinocchio  Walt Disney Productions   
3                         Fantasia  Walt Disney Productions   
4             The Reluctant Dragon  Walt Disney Productions   

       Distributed by Release date  Running time        Country Language  \
0      United Artists   1937-05-19            41  United States  English   
1  RKO Radio Pictures          NaT            83  United States  English   
2  RKO Radio Pictures          NaT            88  United States  English   
3  RKO Radio Pictures   1940-11-13           126  United States  English   
4  RKO Radio Pictures   1941-06-27            74  United States  English   

   Box office                                        Directed by  \
0        45.0                                                NaN   
1       418.0  [David Hand, 