# Scraping with Pandas

In [1]:
import os
from urllib.parse import quote

import pandas as pd

We can use the `read_html` function in pandas to automatically scrape tabular data from a page. It returns a list of DataFrames, one for each table it finds.

In [2]:
# Page holding the Best Picture winners table that is scraped below
url = "https://www.filmsite.org/bestpics4.html"

In [3]:
# Fetch the page and parse its HTML tables; index 1 is the one shown here
tables = pd.read_html(io=url)
tables[1]

Unnamed: 0,0,1,2,3,4,5
0,Film Year,Best Picture Academy Award Winners,Director (* Did not also win Best Director),Title Screen and Studio or Production Company,NOT Nominated for Best Picture,Should Have Won
1,,,,,,
2,1960,The Apartment,Billy Wilder,United Artists,Psycho Spartacus,Psycho
3,1961,West Side Story,Robert Wise and Jerome Robbins,United Artists,Breakfast at Tiffany's Splendor in the Grass ...,OK or The Hustler
4,1962,Lawrence of Arabia,David Lean,Columbia,The Manchurian Candidate (1962) Ride the High...,OK or To Kill a Mockingbird
...,...,...,...,...,...,...
56,2014,Birdman,Alejandro González Iñárritu,Fox Searchlight,Into the Woods Foxcatcher Unbroken Gone Gir...,The Grand Budapest Hotel or Boyhood or Amer...
57,2015,Spotlight,Tom McCarthy*,Open Road Films,Carol The Danish Girl The Hateful Eight Joy ...,The Revenant
58,2016,Moonlight,Barry Jenkins*,A24/Plan B Entertainment,Rogue One: A Star Wars Story Jackie,La La Land Manchester By the Sea or Moonlight
59,2017,The Shape of Water,Guillermo del Toro,Fox Searchlight,"I, Tonya Wonder Woman Blade Runner 2049",


In [4]:
# Check which container type read_html handed back
type(tables)

list

In [5]:
# Work on a copy: assigning to `.columns` mutates the frame it is set on, and
# without the copy `df` would be the same object as `tables[1]`, so the scraped
# original would be silently relabelled as well.
df = tables[1].copy()
# String labels '0'..'5' are what the following cells select and rename by.
df.columns = ['0', '1', '2', '3', '4', '5']
df.head()

Unnamed: 0,0,1,2,3,4,5
0,Film Year,Best Picture Academy Award Winners,Director (* Did not also win Best Director),Title Screen and Studio or Production Company,NOT Nominated for Best Picture,Should Have Won
1,,,,,,
2,1960,The Apartment,Billy Wilder,United Artists,Psycho Spartacus,Psycho
3,1961,West Side Story,Robert Wise and Jerome Robbins,United Artists,Breakfast at Tiffany's Splendor in the Grass ...,OK or The Hustler
4,1962,Lawrence of Arabia,David Lean,Columbia,The Manchurian Candidate (1962) Ride the High...,OK or To Kill a Mockingbird


In [6]:
# Skip the first two rows (scraped header text and an empty row); data starts at row 2
df_new = df.iloc[2:]
df_new.head()

Unnamed: 0,0,1,2,3,4,5
2,1960,The Apartment,Billy Wilder,United Artists,Psycho Spartacus,Psycho
3,1961,West Side Story,Robert Wise and Jerome Robbins,United Artists,Breakfast at Tiffany's Splendor in the Grass ...,OK or The Hustler
4,1962,Lawrence of Arabia,David Lean,Columbia,The Manchurian Candidate (1962) Ride the High...,OK or To Kill a Mockingbird
5,1963,Tom Jones,Tony Richardson,United Artists,Hud The Birds The Haunting,OK or The Birds
6,1964,My Fair Lady,George Cukor,Warner Bros,A Hard Day's Night The Servant A Shot in the...,Dr. Strangelove: Or How I Learned to Stop Worr...


In [7]:
# Keep only the first four columns: year, winning film, director and studio
df_new_name = df_new.loc[:, ['0', '1', '2', '3']]
df_new_name.head()

Unnamed: 0,0,1,2,3
2,1960,The Apartment,Billy Wilder,United Artists
3,1961,West Side Story,Robert Wise and Jerome Robbins,United Artists
4,1962,Lawrence of Arabia,David Lean,Columbia
5,1963,Tom Jones,Tony Richardson,United Artists
6,1964,My Fair Lady,George Cukor,Warner Bros


In [8]:
# Replace the placeholder labels with descriptive column names
df_movies = df_new_name.rename(
    {'0': 'Year', '1': 'Movie', '2': 'Director', '3': 'Studio'},
    axis='columns',
)
df_movies.head()

Unnamed: 0,Year,Movie,Director,Studio
2,1960,The Apartment,Billy Wilder,United Artists
3,1961,West Side Story,Robert Wise and Jerome Robbins,United Artists
4,1962,Lawrence of Arabia,David Lean,Columbia
5,1963,Tom Jones,Tony Richardson,United Artists
6,1964,My Fair Lady,George Cukor,Warner Bros


In [9]:
# Plain Python list of the winning titles, in table order
movie_list = df_movies.loc[:, "Movie"].tolist()
movie_list

['The Apartment',
 'West Side Story',
 'Lawrence of Arabia',
 'Tom Jones',
 'My Fair Lady',
 'The Sound of Music',
 'A Man for All Seasons',
 'In the Heat of the Night',
 'Oliver!',
 'Midnight Cowboy',
 'Patton',
 'The French Connection',
 'The Godfather',
 'The Sting',
 'The Godfather Part II',
 "One Flew Over The Cuckoo's Nest",
 'Rocky',
 'Annie Hall',
 'The Deer Hunter',
 'Kramer vs. Kramer',
 'Ordinary People',
 'Chariots of Fire',
 'Gandhi',
 'Terms of Endearment',
 'Amadeus',
 'Out of Africa',
 'Platoon',
 'The Last Emperor',
 'Rain Man',
 'Driving Miss Daisy',
 'Dances with Wolves',
 'The Silence of the Lambs',
 'Unforgiven',
 "Schindler's List",
 'Forrest Gump',
 'Braveheart',
 'The English Patient',
 'Titanic',
 'Shakespeare in Love',
 'American Beauty',
 'Gladiator',
 'A Beautiful Mind',
 'Chicago',
 'The Lord of the Rings: The Return of the King',
 'Million Dollar Baby',
 'Crash',
 'The Departed',
 'No Country for Old Men',
 'Slumdog Millionaire',
 'The Hurt Locker',
 "The 

## Adding data from the Open Movie Database (OMDb)

In [10]:
import requests
import json
from pprint import pprint
# OMDb request pieces; a full request is built as `url + <title> + api_key`.
# NOTE: this rebinds `url`, which earlier held the filmsite page address.
url = "http://www.omdbapi.com/?t="
# The key is taken from the OMDB_API_KEY environment variable so a personal key
# can be supplied without editing (or committing) the notebook.
# FIXME: the literal fallback is the key that used to be hard-coded here; it is
# kept only so existing runs do not break. Rotate it and drop the fallback.
api_key = "&apikey=" + os.environ.get("OMDB_API_KEY", "d7cf747")

In [11]:
# Look up every Best Picture title on OMDb, keeping the full JSON payload plus
# the two fields used afterwards (OMDb's title and the IMDb rating).
movie_db = []
titles = []
ratings = []
for m in movie_list:
    # Percent-encode the title so characters such as '&', '#' or '+' are sent as
    # part of the title instead of being interpreted as URL syntax.
    # The timeout keeps a stalled connection from hanging the whole notebook.
    response = requests.get(url + quote(m, safe="") + api_key, timeout=10).json()
    # OMDb signals a failed lookup with "Response": "False" and an "Error"
    # message; such payloads have no 'Title'/'imdbRating', so report and skip
    # them rather than dying with a KeyError halfway through the list.
    if response.get('Response') == 'False':
        print("Skipping {!r}: {}".format(m, response.get('Error', 'unknown error')))
        continue
    title = response['Title']
    titles.append(title)
    rating = response['imdbRating']
    ratings.append(rating)
    movie_db.append(response)
# Show one sample record instead of dumping every response into the output.
movie_db[:1]

[{'Title': 'The Apartment',
  'Year': '1960',
  'Rated': 'Not Rated',
  'Released': '16 Sep 1960',
  'Runtime': '125 min',
  'Genre': 'Comedy, Drama, Romance',
  'Director': 'Billy Wilder',
  'Writer': 'Billy Wilder, I.A.L. Diamond',
  'Actors': 'Jack Lemmon, Shirley MacLaine, Fred MacMurray, Ray Walston',
  'Plot': 'A man tries to rise in his company by letting its executives use his apartment for trysts, but complications and a romance of his own ensue.',
  'Language': 'English',
  'Country': 'USA',
  'Awards': 'Won 5 Oscars. Another 19 wins & 8 nominations.',
  'Poster': 'https://m.media-amazon.com/images/M/MV5BNzkwODFjNzItMmMwNi00MTU5LWE2MzktM2M4ZDczZGM1MmViXkEyXkFqcGdeQXVyNDY2MTk1ODk@._V1_SX300.jpg',
  'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.3/10'},
   {'Source': 'Rotten Tomatoes', 'Value': '94%'},
   {'Source': 'Metacritic', 'Value': '94/100'}],
  'Metascore': '94',
  'imdbRating': '8.3',
  'imdbVotes': '149,612',
  'imdbID': 'tt0053604',
  'Type': 'movie',


In [12]:
movie_imdb=pd.DataFrame({"Movie": titles,
                       "Rating": ratings})
movie_imdb.head()

Unnamed: 0,Movie,Rating
0,The Apartment,8.3
1,West Side Story,7.5
2,Lawrence of Arabia,8.3
3,Tom Jones,6.5
4,My Fair Lady,7.8
