# Movies

# Pick a website and describe your objective
- Browse through different sites and pick one to scrape.
- Identify the information you'd like to scrape from the site. Decide the format of the output - CSV file.
- Summarize your project idea and outline your strategy in a Jupyter notebook.

## Outline
- We are going to scrape https://www.themoviedb.org/movie
- Get a list of movies with other information for each movie

In [269]:
!pip install requests --upgrade --quiet

In [270]:
import requests

In [271]:
# Movie listing page that the scrape starts from (the URL named in the Outline above).
topics_url = "https://www.themoviedb.org/movie"

In [272]:
# Download the listing page; the HTTP status code is inspected in the following cell.
response = requests.get(topics_url)

In [273]:
response.status_code

200

In [274]:
len(response.text)

199983

In [275]:
page_contents = response.text

In [276]:
page_contents[:1000]

'<!DOCTYPE html>\n<html lang="en" class="no-js">\n  <head>\n    <title>Popular Movies &#8212; The Movie Database (TMDb)</title>\n    <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n    <meta http-equiv="cleartype" content="on">\n    <meta charset="utf-8">\n    \n    <meta name="keywords" content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast">\n    <meta name="mobile-web-app-capable" content="yes">\n    <meta name="apple-mobile-web-app-capable" content="yes">\n    <meta name="HandheldFriendly" content="True">\n    <meta name="MobileOptimized" content="320">\n    \n    <meta name="viewport" content="width=1120">\n    \n    <meta name="msapplication-TileImage" content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png">\n<meta name="msapplication-TileColor" content="#032541">\n<meta name="theme-color" content="#032541">\n<link rel="apple-touch-ico

In [277]:
# Keep a local copy of the downloaded HTML for offline inspection.
# The page declares <meta charset="utf-8">, so write it as UTF-8 explicitly
# instead of relying on the platform's default encoding, which can fail on
# non-ASCII characters (e.g. cp1252 on Windows).
with open('webpage.html', 'w', encoding='utf-8') as f:
    f.write(page_contents)

## Use Beautiful Soup to parse and extract information

In [278]:
!pip install beautifulsoup4 --upgrade --quiet

In [279]:
from bs4 import BeautifulSoup

In [280]:
# Parse the downloaded HTML using the 'html.parser' backend.
doc = BeautifulSoup(page_contents,'html.parser')

### Movie Title

In [281]:
# Movie titles are rendered as <h2> elements. The first four <h2> tags are skipped;
# NOTE(review): presumably those are non-movie page headings — the magic index 4
# depends on the current page layout, confirm if the site changes.
movie_title = doc.find_all('h2')[4:]

In [282]:
len(movie_title)

20

In [283]:
movie_title[:5]

[<h2><a href="/movie/399566" title="Godzilla vs. Kong">Godzilla vs. Kong</a></h2>,
 <h2><a href="/movie/791373" title="Zack Snyder's Justice League">Zack Snyder's Justice League</a></h2>,
 <h2><a href="/movie/412656" title="Chaos Walking">Chaos Walking</a></h2>,
 <h2><a href="/movie/527774" title="Raya and the Last Dragon">Raya and the Last Dragon</a></h2>,
 <h2><a href="/movie/460465" title="Mortal Kombat">Mortal Kombat</a></h2>]

### Release Date

In [284]:
# Grab every <p> on the page; only some of them are release dates, so the
# list is narrowed down by position in a later cell.
release_date = doc.find_all('p')

In [285]:
len(release_date)

33

In [286]:
# The release dates are the <p> elements at positions 1..20 of the page (one per movie).
# Slice a fresh find_all() result rather than re-slicing `release_date` itself, so
# re-running this cell always yields the same 20 elements instead of shrinking the list.
release_date = doc.find_all('p')[1:21]

In [287]:
release_date[:5]

[<p>Mar 24, 2021</p>,
 <p>Mar 18, 2021</p>,
 <p>Feb 24, 2021</p>,
 <p>Mar 03, 2021</p>,
 <p>Apr 07, 2021</p>]

### User Rating

In [288]:
# Collect every <div class="percent"> (20 matches in this run, same as the number of movies);
# the numeric score itself is read from each div's parent element in a later cell.
user_rating = doc.find_all('div' , {'class':'percent'})

In [289]:
len(user_rating)

20

In [290]:
# The score is stored in the 'data-percent' attribute of each percent div's parent;
# convert it to float so the column is numeric.
user_ratings = [float(badge.parent['data-percent']) for badge in user_rating]
print(len(user_ratings))

20


### Movie Information URL

In [291]:
# Every <a> element on the listing page.
movie_url = doc.find_all('a')

In [292]:
# Build the absolute detail-page URL for every movie.
# Each title <h2> wraps the movie's own <a href="/movie/<id>"> (visible in the
# movie_title output), so read the link straight from the heading. Matching all
# page anchors by their text instead is quadratic and appends one URL for every
# anchor sharing that text, which breaks the one-URL-per-movie alignment with
# the other columns whenever a title's text appears more than once.
base_url = 'https://www.themoviedb.org'
movie_urls = [base_url + heading.a['href'] for heading in movie_title]

print(len(movie_urls))
print(movie_urls)

20
['https://www.themoviedb.org/movie/399566', 'https://www.themoviedb.org/movie/791373', 'https://www.themoviedb.org/movie/412656', 'https://www.themoviedb.org/movie/527774', 'https://www.themoviedb.org/movie/460465', 'https://www.themoviedb.org/movie/544401', 'https://www.themoviedb.org/movie/587807', 'https://www.themoviedb.org/movie/664767', 'https://www.themoviedb.org/movie/644083', 'https://www.themoviedb.org/movie/458576', 'https://www.themoviedb.org/movie/793723', 'https://www.themoviedb.org/movie/464052', 'https://www.themoviedb.org/movie/587996', 'https://www.themoviedb.org/movie/581389', 'https://www.themoviedb.org/movie/632357', 'https://www.themoviedb.org/movie/802504', 'https://www.themoviedb.org/movie/775996', 'https://www.themoviedb.org/movie/581387', 'https://www.themoviedb.org/movie/797394', 'https://www.themoviedb.org/movie/522444']


### Basic Overview Dataframe Using Pandas

In [293]:
# Reduce the scraped tags to plain text so they can be used as table columns.
movie_titles = [heading.text for heading in movie_title]
release_dates = [paragraph.text for paragraph in release_date]

# Show the size and contents of each column to confirm they line up.
for column in (movie_titles, release_dates, user_ratings):
    print(len(column))
    print(column)

20
['Godzilla vs. Kong', "Zack Snyder's Justice League", 'Chaos Walking', 'Raya and the Last Dragon', 'Mortal Kombat', 'Cherry', 'Tom & Jerry', "Mortal Kombat Legends: Scorpion's Revenge", 'Twist', 'Monster Hunter', 'Sentinelle', 'Wonder Woman 1984', 'Below Zero', 'Space Sweepers', 'The Unholy', 'Just Say Yes', 'Outside the Wire', 'Ashfall', 'Secret Magic Control Agency', 'Black Water: Abyss']
20
['Mar 24, 2021', 'Mar 18, 2021', 'Feb 24, 2021', 'Mar 03, 2021', 'Apr 07, 2021', 'Feb 26, 2021', 'Feb 11, 2021', 'Apr 12, 2020', 'Jan 22, 2021', 'Dec 03, 2020', 'Mar 05, 2021', 'Dec 16, 2020', 'Jan 29, 2021', 'Feb 05, 2021', 'Mar 31, 2021', 'Apr 02, 2021', 'Jan 15, 2021', 'Dec 19, 2019', 'Mar 18, 2021', 'Jul 09, 2020']
20
[83.0, 85.0, 75.0, 83.0, 75.0, 76.0, 73.0, 84.0, 68.0, 71.0, 61.0, 68.0, 64.0, 72.0, 61.0, 58.0, 65.0, 65.0, 72.0, 49.0]


In [294]:
!pip install pandas --upgrade --quiet

In [295]:
import pandas as pd

In [296]:
# Column-oriented table: each key is a column name and each value is the list
# of scraped values for that column.
topics_dict = {
    'title': movie_titles,
    'release date': release_dates,
    'user rating': user_ratings,
    'url': movie_urls,
}

In [297]:
# Build the overview table; every key of the dict becomes a column.
topics_df = pd.DataFrame(topics_dict)

In [298]:
topics_df

Unnamed: 0,title,release date,user rating,url
0,Godzilla vs. Kong,"Mar 24, 2021",83.0,https://www.themoviedb.org/movie/399566
1,Zack Snyder's Justice League,"Mar 18, 2021",85.0,https://www.themoviedb.org/movie/791373
2,Chaos Walking,"Feb 24, 2021",75.0,https://www.themoviedb.org/movie/412656
3,Raya and the Last Dragon,"Mar 03, 2021",83.0,https://www.themoviedb.org/movie/527774
4,Mortal Kombat,"Apr 07, 2021",75.0,https://www.themoviedb.org/movie/460465
5,Cherry,"Feb 26, 2021",76.0,https://www.themoviedb.org/movie/544401
6,Tom & Jerry,"Feb 11, 2021",73.0,https://www.themoviedb.org/movie/587807
7,Mortal Kombat Legends: Scorpion's Revenge,"Apr 12, 2020",84.0,https://www.themoviedb.org/movie/664767
8,Twist,"Jan 22, 2021",68.0,https://www.themoviedb.org/movie/644083
9,Monster Hunter,"Dec 03, 2020",71.0,https://www.themoviedb.org/movie/458576


## Create CSV file(s) with the extracted information

In [299]:
# Save the overview table to disk.
# NOTE(review): with the default index=True the row index is written as an extra
# unnamed first column — pass index=False if that is not wanted. In this notebook's
# cell order the file is also written before the genre/runtime/top cast columns
# are added, so it only holds the four overview columns.
topics_df.to_csv('topics.csv')

## Getting Information Out of a topic page

In [300]:
def topic_page(topics_url):
    """Download a page and return it parsed with BeautifulSoup.

    Args:
        topics_url: URL of the page to fetch.

    Returns:
        A BeautifulSoup document built from the response body with the
        'html.parser' backend.

    Raises:
        Exception: if the server answers with any status code other than 200.
    """
    # Download the page
    response = requests.get(topics_url)
    # Check successful response
    if response.status_code != 200:
        # Format the function's own parameter so the message names the URL
        # that failed (an undefined name here would surface as NameError).
        raise Exception('Failed to load page {}'.format(topics_url))
    # Parse using Beautiful soup
    topic_doc = BeautifulSoup(response.text, 'html.parser')
    return topic_doc

#### Trial on first URL

In [301]:
topic_page_url = movie_urls[0]

In [302]:
topic_page_url

'https://www.themoviedb.org/movie/399566'

In [303]:
response = requests.get(topic_page_url)

In [304]:
response.status_code

200

In [305]:
len(response.text)

100177

In [306]:
topic_doc = BeautifulSoup(response.text,'html.parser')

In [307]:
# movie genre
# Take every second child of the genres <span>, starting at index 1
# (NOTE(review): presumably the children in between are separator text — confirm),
# and keep each one's text.
movie_genres = topic_doc.find('span',{'class':'genres'}).contents[1::2]
movie_genre = [genre_node.text for genre_node in movie_genres]
movie_genre

['Action', 'Science Fiction']

In [308]:
#movie runtime
# Read the text of <span class="runtime">; strip() removes leading/trailing whitespace.
movie_time = topic_doc.find('span',{'class':'runtime'})
movie_time.text.strip()

'1h 53m'

In [309]:
# Movie overview: find('p') returns the first <p> in the document.
# NOTE(review): this relies on the overview being the first paragraph of the
# page — confirm it holds for other movies.
movie_desc = topic_doc.find('p')
movie_desc.text.strip()

'In a time when monsters walk the Earth, humanity’s fight for its future sets Godzilla and Kong on a collision course that will see the two most powerful forces of nature on the planet collide in a spectacular battle for the ages.'

In [310]:
#top cast of movie
# Each cast entry is an <li class="card">. Drop the first four characters of its
# text (NOTE(review): presumably leading whitespace — confirm), then keep only
# the first line, which holds the actor's name.
movie_cast = [card.text[4:] for card in topic_doc.find_all('li',{'class':'card'})]
movies_cast = [entry.split('\n')[0] for entry in movie_cast]
movies_cast

['Alexander Skarsgård',
 'Millie Bobby Brown',
 'Rebecca Hall',
 'Brian Tyree Henry',
 'Shun Oguri',
 'Eiza González',
 'Julian Dennison',
 'Lance Reddick',
 'Kyle Chandler']

### Generalising code

In [311]:
def get_movie_info(topic_doc):
    """Extract genre, runtime and top cast from a parsed movie detail page.

    Args:
        topic_doc: parsed document of one movie page; it must support
            find(name, attrs) and find_all(name, attrs).

    Returns:
        A tuple (movies_genre, movie_time, movies_cast) of strings:
        genres joined by single spaces (no trailing space), the runtime text
        with surrounding whitespace removed, and the names of at most the
        first three cast cards joined by commas. A part that is missing from
        the page is returned as an empty string.
    """
    # movie genre
    genres_tag = topic_doc.find('span', {'class': 'genres'})
    if genres_tag is None:
        # No genres span on this page: report an empty genre instead of crashing.
        movies_genre = ''
    else:
        # Every second child starting at index 1 is used as a genre entry
        # (NOTE(review): presumably the children in between are separators — confirm).
        movies_genre = ' '.join(node.text for node in genres_tag.contents[1::2])

    # movie runtime
    runtime_tag = topic_doc.find('span', {'class': 'runtime'})
    movie_time = runtime_tag.text.strip() if runtime_tag is not None else ''

    # top cast of movie: only the first three cards are kept
    cast_cards = topic_doc.find_all('li', {'class': 'card'})[:3]
    # The first four characters of each card's text are dropped
    # (NOTE(review): presumably leading whitespace — confirm); the first
    # remaining line is the actor's name.
    movies_cast = ','.join(card.text[4:].split('\n')[0] for card in cast_cards)

    return movies_genre, movie_time, movies_cast
    

In [312]:
movies_genre , movie_time , movies_cast = get_movie_info(topic_doc)

In [316]:
# Visit each movie's detail page and gather the extra fields, one list per new column.
genre, time, cast = [], [], []

for topic_page_url in movie_urls:
    topic_doc = topic_page(topic_page_url)
    movies_genre, movie_time, movies_cast = get_movie_info(topic_doc)
    genre.append(movies_genre)
    time.append(movie_time)
    cast.append(movies_cast)

# Attach the collected values to the overview table as new columns.
topics_df['genre'] = genre
topics_df['runtime'] = time
topics_df['top cast'] = cast

In [314]:
topics_df

Unnamed: 0,title,release date,user rating,url,genre,runtime,top cast
0,Godzilla vs. Kong,"Mar 24, 2021",83.0,https://www.themoviedb.org/movie/399566,Action Science Fiction,1h 53m,"Alexander Skarsgård,Millie Bobby Brown,Rebecca..."
1,Zack Snyder's Justice League,"Mar 18, 2021",85.0,https://www.themoviedb.org/movie/791373,Action Adventure Fantasy Science Fiction,4h 2m,"Ben Affleck,Henry Cavill,Gal Gadot"
2,Chaos Walking,"Feb 24, 2021",75.0,https://www.themoviedb.org/movie/412656,Science Fiction Action Adventure Thriller,1h 49m,"Tom Holland,Daisy Ridley,Mads Mikkelsen"
3,Raya and the Last Dragon,"Mar 03, 2021",83.0,https://www.themoviedb.org/movie/527774,Animation Adventure Fantasy Family Action,1h 47m,"Kelly Marie Tran,Awkwafina,Izaac Wang"
4,Mortal Kombat,"Apr 07, 2021",75.0,https://www.themoviedb.org/movie/460465,Fantasy Action Adventure,1h 50m,"Lewis Tan,Jessica McNamee,Josh Lawson"
5,Cherry,"Feb 26, 2021",76.0,https://www.themoviedb.org/movie/544401,Crime Drama,2h 20m,"Tom Holland,Ciara Bravo,Jack Reynor"
6,Tom & Jerry,"Feb 11, 2021",73.0,https://www.themoviedb.org/movie/587807,Comedy Family Animation,1h 41m,"Chloë Grace Moretz,Michael Peña,Colin Jost"
7,Mortal Kombat Legends: Scorpion's Revenge,"Apr 12, 2020",84.0,https://www.themoviedb.org/movie/664767,Animation Action Fantasy,1h 20m,"Patrick Seitz,Jordan Rodrigues,Jennifer Carpenter"
8,Twist,"Jan 22, 2021",68.0,https://www.themoviedb.org/movie/644083,Crime Drama Action,1h 30m,"Raff Law,Michael Caine,Lena Headey"
9,Monster Hunter,"Dec 03, 2020",71.0,https://www.themoviedb.org/movie/458576,Fantasy Action Adventure,1h 44m,"Milla Jovovich,Tony Jaa,T.I."
