<a href="https://colab.research.google.com/github/jordanco-bgu/social_network_movies/blob/main/Imdb_crawler_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMDb Web Crawler

This notebook is a web crawler that takes a list of movie names, searches IMDb for each one, and extracts information from the page of the first search result. The extracted fields are the movie's budget, worldwide gross revenue, and IMDb rating.

In [None]:
!pip install tqdm



In [None]:
!mkdir ./datasets

In [None]:
import os
import re
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Mount Google Drive (Colab only) so the input CSV stored under MyDrive can
# be read by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the input dataset from the Drive mount. The absolute path matches the
# '/content/drive' mount point instead of depending on the current working
# directory.
social_attributes_df = pd.read_csv('/content/drive/MyDrive/social_network_attributes_dataset.csv')
# Titles to look up on IMDb, in dataset order.
movie_title_list = social_attributes_df['movie_name'].tolist()
movie_title_list

['The Night Flier',
 'The Interpreter',
 'The Skull',
 'The Hippopotamus',
 'Snails in the Rain',
 'Final Destination 2',
 'Resurrecting the Champ',
 'Camp',
 'John Wick: Chapter 2',
 'Skyline',
 'So Undercover',
 'Love Is All There Is',
 'Jobs',
 'Foul Play',
 'Wait Until Dark',
 'Men in War',
 'Free Men',
 'The Best Way to Walk',
 'Police Academy 6: City Under Siege',
 'A Tale of Love and Darkness',
 'Be Yourself!',
 'Life of the Party',
 'Chocolat',
 'Cold in July',
 'The Art of Getting By',
 'Frivolous Lola',
 "We're the Millers",
 'Altar',
 'Peelers',
 'We Go On',
 'A Price Above Rubies',
 'The Lake House',
 'The Thief Lord',
 'Coffin 2',
 'Hostage',
 'Rango',
 '2001 Maniacs',
 'Børning',
 'Death Hunt',
 '633 Squadron',
 'Just Bea',
 'The Importance of Being Earnest',
 'The Remains',
 'Quintet',
 'Owning Mahowny',
 'Fargo',
 'Winter Sleep',
 'Carry on Cowboy',
 'Wichita',
 "My Father's Glory",
 'Bonnie and Clyde',
 'The Hangover Part III',
 'Stay Alive',
 'Suspicion',
 'When Did Y

In [None]:
# IMDb title-search URL template; the {title} placeholder is filled in with
# the search text via str.format.
imdb_url = 'https://www.imdb.com/search/title/?title={title}'

def get_movie(soup):
  """Return the URL of the first movie link on an IMDb search-results page.

  Looks up the first ``<a>`` element carrying the ``ipc-title-link-wrapper``
  class and prefixes its ``href`` with the IMDb host.

  Args:
    soup: Parsed search-results page.

  Returns:
    The absolute movie-page URL, or an empty string when no such link exists.
  """
  first_hit = soup.find("a", class_="ipc-title-link-wrapper")

  if not first_hit:
    return ''

  return "https://www.imdb.com" + first_hit["href"]

def get_movie_data(movie):
  """Download one IMDb movie page and collect its scraped fields.

  Args:
    movie: URL of the IMDb movie page.

  Returns:
    dict with the keys "Title", "Budget", "Revenue" and "Rating", each filled
    by the corresponding ``get_*`` helper.
  """
  page_html = get_movie_imdb_page(movie)
  soup = BeautifulSoup(page_html, 'html.parser')

  # Keys are listed in the order the columns should appear in the DataFrame.
  return {
    "Title": get_title(soup),
    "Budget": get_budget(soup),
    "Revenue": get_box_office_revenue(soup),
    "Rating": get_rating(soup),
  }

In [None]:
def get_movie_imdb_page(movie_url, timeout=30):
  """Fetch the HTML of an IMDb page.

  Args:
    movie_url: URL to download.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests waits indefinitely, so a single stalled connection
      would block the whole crawl.

  Returns:
    The response body as text.

  Raises:
    requests.exceptions.RequestException: on connection errors or timeout.
  """
  # A browser-like User-Agent is sent instead of the requests default.
  response = requests.get(
      movie_url, headers={'User-Agent': 'Mozilla/6.0'}, timeout=timeout)
  return response.text

def get_title(soup):
  """Extract the movie name from the page's ``<title>`` tag.

  IMDb page titles look like ``"Name (1997) - IMDb"``. The site suffix and
  one trailing parenthetical (year, or e.g. ``"(TV Mini Series 2023)"``) are
  removed. A fixed-length slice only works for the ``" (YYYY) - IMDb"``
  ending and truncates every other title in the wrong place.

  Args:
    soup: Parsed movie page.

  Returns:
    The cleaned title, or None when the page has no ``<title>`` tag.
  """
  title_tag = soup.find('title')
  if title_tag is None:
    return None

  full_title = title_tag.text.strip()
  # Strip the " - IMDb" site suffix first ...
  name = re.sub(r'\s*-\s*IMDb$', '', full_title)
  # ... then a single trailing "(...)" group such as the release year.
  name = re.sub(r'\s*\([^()]*\)$', '', name)
  return name

def get_budget(soup):
  """Read the budget figure from an IMDb movie page.

  Finds the ``<span>`` whose text is exactly "Budget", moves to the next
  ``<div>`` in the document and returns the text of its first ``<span>``.

  Args:
    soup: Parsed movie page.

  Returns:
    The budget text as shown on the page, or None when any of the expected
    elements is missing.
  """
  try:
    # `string=` and `find_next` are the current Beautiful Soup names for the
    # deprecated `text=` argument and `findNext` method.
    label = soup.find('span', string='Budget')
    wrapper = label.find_next('div')

    return wrapper.find('span').text
  except AttributeError:
    # A missing element yields None, and attribute access on None lands here.
    return None

def get_box_office_revenue(soup):
  """Read the worldwide gross figure from an IMDb movie page.

  Finds the ``<span>`` whose text is exactly "Gross worldwide", moves to the
  next ``<div>`` in the document and returns the text of its first
  ``<span>``.

  Args:
    soup: Parsed movie page.

  Returns:
    The revenue text as shown on the page, or None when any of the expected
    elements is missing.
  """
  try:
    # `string=` and `find_next` are the current Beautiful Soup names for the
    # deprecated `text=` argument and `findNext` method.
    label = soup.find('span', string='Gross worldwide')
    wrapper = label.find_next('div')

    return wrapper.find('span').text
  except AttributeError:
    # A missing element yields None, and attribute access on None lands here.
    return None

def get_rating(soup):
  """Read the IMDb rating from a movie page.

  Args:
    soup: Parsed movie page.

  Returns:
    Text of the first ``<span>`` with class ``sc-bde20123-1``, or None when
    no such element is found.
  """
  try:
    return soup.find('span', class_='sc-bde20123-1').text
  except AttributeError:
    # find() returned None, so there is no rating element on this page.
    return None

In [None]:
from tqdm import tqdm

def scrape_imdb_movies(movie_list, timeout=30):
  """Search IMDb for each title and scrape the first matching movie page.

  Args:
    movie_list: Iterable of movie title strings to look up.
    timeout: Seconds to wait for each IMDb search request.

  Returns:
    List of dicts as produced by get_movie_data, one per title that was
    found and scraped. Titles with no search result, or whose requests
    failed, are skipped.
  """
  movies_data = []

  # Iterate over the argument, not the notebook-level movie_title_list, so
  # the function honours whatever list the caller passes in.
  for title in tqdm(movie_list):
    # Percent-encode the whole title so characters such as '&', '?' or '#'
    # cannot break the ?title= query parameter.
    search_title = quote(title.lower(), safe='')

    try:
      html = requests.get(
          imdb_url.format(title=search_title),
          headers={'User-Agent': 'Mozilla/6.0'},
          timeout=timeout,
      ).text
      soup = BeautifulSoup(html, 'html.parser')

      movie = get_movie(soup)

      # No search hit for this title: nothing to scrape.
      if movie == '':
        continue

      movie_info = get_movie_data(movie)
    except requests.exceptions.RequestException as error:
      # One failed request must not discard hours of collected results;
      # report the title and move on.
      tqdm.write("Skipping {!r}: {}".format(title, error))
      continue

    movies_data.append(movie_info)

  return movies_data

# Crawl IMDb for every title in the input list and tabulate the results.
# This is a long-running cell: the recorded run below took over 11 hours
# for 15,538 titles.
movies_df = pd.DataFrame(scrape_imdb_movies(movie_title_list))
movies_df

  wrapper = soup.find('span', text='Budget').findNext('div')
  wrapper = soup.find('span', text='Gross worldwide').findNext('div')
100%|██████████| 15538/15538 [11:24:07<00:00,  2.64s/it]


Unnamed: 0,Title,Budget,Revenue,Rating
0,The Night Flier,"$1,000,000 (estimated)","$125,397",6.0
1,The Interpreter of Silence (TV Mini Serie,,,7.5
2,Indiana Jones and the Kingdom of the Crystal S...,"$185,000,000 (estimated)","$786,636,033",6.2
3,I'm for the Hippopotamus,,,6.6
4,Snails in the Rain,"₪1,000,000 (estimated)",,6.3
...,...,...,...,...
15483,Ladies in Lavender,,"$20,421,130",7.0
15484,Snow Dogs,"$33,000,000 (estimated)","$115,035,090",5.2
15485,City for Conquest,"$920,000 (estimated)",,7.2
15486,"New York, I Love You","$14,700,000 (estimated)","$9,961,023",6.2


In [None]:
# convert dataframe to csv

# Create the output folder first so this cell also works when the mkdir cell
# was skipped or the runtime was reset.
os.makedirs('./datasets', exist_ok=True)
# The DataFrame index is written as an unnamed first column (to_csv default).
movies_df.to_csv('./datasets/movie_information_dataset.csv')