## Scraping movies from IMDB Top 1000

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

In [9]:
# HTTP request headers: state a preference for US English content.
headers = {
    "Accept-Language": "en-US, en;q=0.5",
}

In [10]:
# The first results page is requested without explicit paging parameters.
# NOTE(review): this URL carries no `count=100`, so it may return fewer
# titles than the later pages -- confirm the site's default page size.
url_list = ['https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc']

# Every following page asks for 100 titles, starting at 101, 201, ..., 901.
page_template = 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start={}&ref_=adv_nxt'
url_list.extend(page_template.format(start) for start in range(101, 1001, 100))

# Bare expression: displays the list when run as a notebook cell.
url_list

['https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=101&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=201&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=301&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=401&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=501&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=601&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=701&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=801&ref_=adv_nxt',
 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=901&ref_=adv_nxt']

In [11]:
def _text_or_none(tag):
    """Return the stripped text of *tag*, or None when the tag was not found."""
    return tag.text.strip() if tag is not None else None


def _split_credits(credits):
    """Split a credits paragraph into ``(director_names, star_names)``.

    Assumes the page separates the director link(s) from the star links with
    a ``<span class="ghost">`` element (TODO confirm against live markup).
    When no such separator is present, the first link is treated as the
    director and the rest as stars, which is the original positional layout.
    """
    separator = credits.find('span', class_='ghost')
    if separator is None:
        names = [link.text for link in credits.find_all('a')]
        return names[:1], names[1:]

    directors, stars = [], []
    past_separator = False
    # Walk the paragraph in document order so that links are assigned by
    # which side of the separator they are on, not by a fixed index.
    for node in credits.descendants:
        if node is separator:
            past_separator = True
        elif getattr(node, 'name', None) == 'a':
            (stars if past_separator else directors).append(node.text)
    return directors, stars


def get_movies_from_url(url, data):
    """Scrape one search-results page and append one row per movie to *data*.

    Args:
        url: Address of the results page to download.
        data: List that is extended in place. Each appended row is a list of
            17 values: link_imdb, link_poster, title, year, certificate,
            runtime, genre, rating, meta_score, overview, director, star_1,
            star_2, star_3, star_4, vote, gross. Optional fields that are
            absent on the page are stored as None.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # A timeout keeps a stalled connection from hanging the scrape forever,
    # and an error status is surfaced instead of silently yielding zero rows.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    for movie in soup.find_all('div', class_='lister-item mode-advanced'):
        header_link = movie.find('h3', class_='lister-item-header').a
        link_imdb = 'https://www.imdb.com' + header_link['href']
        title = header_link.text

        link_poster = movie.find('div', class_='lister-item-image float-left').a.img['loadlate']

        year = movie.find('span', class_='lister-item-year text-muted unbold').text

        certificate = _text_or_none(movie.find('span', class_='certificate'))
        runtime = _text_or_none(movie.find('span', class_='runtime'))
        genre = _text_or_none(movie.find('span', class_='genre'))

        rating = movie.find('div', class_='inline-block ratings-imdb-rating').strong.text

        # Match on the bare 'metascore' class so the score is captured
        # whatever additional class accompanies it, not only 'favorable'.
        meta_score = _text_or_none(movie.find('span', class_='metascore'))

        overview = movie.find_all('p', class_='text-muted')[1].text.strip()

        directors, stars = _split_credits(movie.find('p', class_=''))
        # Only the first director is kept, as before; extra directors no
        # longer spill into the star columns.
        director = directors[0] if directors else None
        # Pad so that entries listing fewer than four stars do not raise.
        star_1, star_2, star_3, star_4 = (stars + [None] * 4)[:4]

        nv = movie.find_all('span', attrs={'name': 'nv'})
        vote = nv[0].text
        # A second 'nv' value starting with '#' is not a gross figure;
        # startswith() also tolerates an empty string.
        if len(nv) > 1 and not nv[1].text.startswith('#'):
            gross = nv[1].text
        else:
            gross = None

        data.append([link_imdb, link_poster, title, year, certificate,
                     runtime, genre, rating, meta_score, overview,
                     director, star_1, star_2, star_3, star_4, vote, gross])

In [12]:
# Collect the rows from every results page into a single list.
data = []
for page_url in url_list:
    # Wait 3 seconds before each request.
    time.sleep(3)
    get_movies_from_url(page_url, data)

In [13]:
# Column names, listed in the same order as the values of each scraped row.
column_names = [
    "link_imdb", "link_poster", "title", "year", "certificate",
    "runtime", "genre", "rating", "meta_score", "overview",
    "director", "star_1", "star_2", "star_3", "star_4", "vote", "gross",
]
df = pd.DataFrame(data, columns=column_names)

In [14]:
# Write the table to disk without the DataFrame index column.
df.to_csv(
    'imdb_top_1000_movies.csv',
    index=False,
)