# Scrapping movies on IMDB

In [181]:
import requests, csv
from bs4 import BeautifulSoup
import re
import pandas as pd
import tqdm
import numpy as np

In [175]:
def movie_imdb(imdb_id):

    #SELECT SYNOPSIS, GENRES, RATING, NUMBER RATE, YEAR, TITLE, DURATION, COUNTRY,
    #SELECT LANGUAGE, BUDGET, PRODUCTION, INCOME
    response = requests.get("https://www.imdb.com/title/"+imdb_id)
    soup = BeautifulSoup(response.text)

    #select title and year
    tit = soup.find_all('div', {'class': 'titleBar'})
    tit_y = [text.text for text in tit]
    tit_y = re.sub('\n', '', tit_y[0])
    title = tit_y.split("\xa0")[0]
    if "original title" in tit_y:
        original_title = tit_y.split(") ")[1].split(" (")[0]
    else :
        original_title = title
    year = int((tit_y.split("\xa0(")[1]).split(")")[0])

    #select synopsis and genres
    syn = soup.find_all(id="titleStoryLine")
    syn_list = [text.text for text in syn]
    syn_list = re.sub('\n', '', syn_list[0])
    syno = re.sub('EditStoryline    ', '', syn_list)
    synopsis = ""

    if "Plot Summary" in syno:
        synopsis = syno.split("Plot Summary")[0]
    if "Written by" in syno:
        synopsis = syno.split("Written by")[0]
    
    genres = (syn_list.split("Genres:")[1]).split(" Certificate")[0]
    if "Certificate" in re.sub("\xa0", "", genres):
        genres = re.sub("\xa0", "", genres).strip().replace("|", ",").split("Certificate")[0]
    else :
        genres = re.sub("\xa0", "", genres).strip().replace("|", ",")[:-1]
    if "Motion Picture" in re.sub("\xa0", "", genres):
        genres = re.sub("\xa0", "", genres).strip().replace("|", ",").split("Motion Picture")[0]
    if "Parents Guide" in re.sub("\xa0", "", genres):
        genres = re.sub("\xa0", "", genres).strip().replace("|", ",").split("Parents Guide")[0]

    #select rating
    try:
        rate = soup.find_all(itemprop="ratingValue")
        rating = [text.text for text in rate][0]
    except IndexError:
        rating = ""

    #select number of ratings
    try:
        n_rate = soup.find_all(itemprop="ratingCount")
        num_rate = [text.text for text in n_rate]
        num_rate = int(re.sub(',', '', num_rate[0]))
    except IndexError:
        num_rate = ""
        
    #select country, language, budget, duration, production, income
    other = soup.find(id='titleDetails').find_all('div', {'class':'txt-block'})
    test = [text.text for text in other]


    country = ""
    language = ""
    budget = ""
    duration = ""
    production = ""
    income = ""

    for i in range(0, len(test)):
        if "Country:" in re.sub('\n', '', test[i]):
            country = re.sub('\n', '', test[i]).split("Country:")[1].replace("|", ", ")

    for i in range(0, len(test)):
        if "Language:" in re.sub('\n', '', test[i]):
            language = re.sub('\n', '', test[i]).split("Language:")[1].replace("|", ", ")

    for i in range(0, len(test)):
        if "Budget:" in re.sub('\n', '', test[i]):
            budget = re.sub('\n', '', test[i]).split("Budget:")[1].split("   ")[0]

    for i in range(0, len(test)):
        if "Production Co:" in re.sub('\n', '', test[i]):
            production = ((re.sub('\n', '', test[i]).split("Production Co:")[-1]).split(" See more")[0].strip()).split(",")[0]

    for i in range(0, len(test)):
        if "Cumulative Worldwide Gross:" in re.sub('\n', '', test[i]):
            income = (re.sub('\n', '', test[i]).split("Cumulative Worldwide Gross:")[1]).split("   ")[0].strip()
            
    #SELECT CASTING, DIRECTOR, WRITERS
    response = requests.get("https://www.imdb.com/title/"+imdb_id+"/fullcredits")
    soup2 = BeautifulSoup(response.text)

    #select casting
    cast_list = soup2.find_all('table', {'class': 'cast_list'})
    cast = []
    casting = ""
    for i in range(1, 30):
        try:
            cast.append((str(cast_list).split('img alt="')[i]).split('" class="')[0])
            casting = str(cast)
            casting = re.sub("'", "", casting[1:-1])
        except IndexError:
            break

    #select director
    try:
        direct = soup2.find('table', {'class': 'simpleTable simpleCreditsTable'}) \
                .find_all('td', {'class': 'name'})
        director = [text.text for text in direct]
        director = str(director)[1:-1].replace("\\n ", "").replace("'", "")
    except (IndexError, AttributeError):
        director = ""
    
    #select writers
    try:
        write = soup2.find_all('table', {'class': 'simpleTable simpleCreditsTable'})[1] \
                .find_all('td', {'class': 'name'})
        writers = [text.text for text in write]
        writers = str(writers)[1:-1].replace("\\n ", "").replace("'", "")
    except (IndexError, AttributeError):
        writers = ""
        
    
    #SELECT RUNTIME
    response = requests.get("https://www.imdb.com/title/"+imdb_id+"/technical")
    soup3 = BeautifulSoup(response.text)

    time = soup3.find('table', {'class': 'dataTable labelValueTable'}).find_all('tr', {'class': 'odd'})
    runtime = [text.text for text in time]
    try:
        if ("min (" in (re.sub('\n', '', runtime[0]))) and (" min)" in (re.sub('\n', '', runtime[0]))):
            duration = int((re.sub('\n', '', runtime[0]).split("min (")[1]).split(" min)")[0].strip())
        elif "hr (" in (re.sub('\n', '', runtime[0])): 
            duration = int((re.sub('\n', '', runtime[0]).split("hr (")[1]).split(" min)")[0].strip())
        else :
            duration = int((re.sub('\n', '', runtime[0]).split("Runtime")[1]).split(" min ")[0].strip())
    except ValueError:
        duration = ""

    return imdb_id, "", "", title, original_title, year, director, casting, genres, duration, country, language, \
    writers, production, synopsis, rating, num_rate, budget, income


In [176]:
df = pd.read_excel('to_scrap.xlsx')

  """Entry point for launching an IPython kernel.


In [177]:
imdb_id = df["imdb"].values

In [178]:
imdb_id

array(['tt6341832', 'tt9654108', 'tt9013182', 'tt6111574', 'tt7390588',
       'tt10288566', 'tt6777370'], dtype=object)

In [179]:
movies = []

In [180]:
for movie in tqdm.tqdm(imdb_id):
    movies.append(movie_imdb(movie))

  0%|          | 0/7 [00:01<?, ?it/s]


IndexError: list index out of range

In [9]:
movies

[('tt7979580',
  '',
  '',
  'Les Mitchell contre les machines',
  'The Mitchells vs the Machines',
  2021,
  'Michael Rianda, Jeff Rowe',
  'Abbi Jacobson, Danny McBride, Maya Rudolph, Michael Rianda, Eric André, Olivia Colman, Fred Armisen, Beck Bennett, Chrissy Teigen, John Legend, Charlyne Yi, Blake Griffin, "Conan OBrien", Doug the Pug, Melissa Sturm, Doug Nicholas, Madeleine McGraw, Ellen Wightman, Sasheer Zamata, Elle Mills, Alex Hirsch, Jay Pharoah, Natalie Canizares, Jeff Rowe, Zeno Robinson, Grey Griffin, Will Allegra, Alison Rich, Natalia del Riego',
  'Animation, Adventure, Comedy, Family, Sci-Fi',
  113,
  'USA, Canada, France, Hong Kong',
  'English',
  'Michael Rianda, Jeff Rowe',
  'Sony Pictures Animation',
  'An animated action-comedy about an ordinary family who find themselves in the middle of their biggest family challenge yet...saving the world from the robot apocalypse. No big deal, right? It all starts when creative outsider Katie Mitchell is accepted into the f

In [10]:
columns_name = ["imdb_id", "vu", "à voir", "title", "original_title", "year", "director", "casting", "genres", "duration", "country", "language", \
"writers", "production", "synopsis", "rating", "num_rate", "budget", "income"]

In [11]:
result = pd.DataFrame(list(movies), columns = columns_name)
result = result.set_index("imdb_id")

In [12]:
result.to_csv("movies_scraped.csv")

# Tests

In [238]:
imdb_id, vu, a_voir, title, original_title, year, director, casting, genres, duration, country, language, \
    writers, production, synopsis, rating, num_rate, budget, income = movie_imdb("tt8521876")

In [239]:
test = [(imdb_id, vu, a_voir, title, original_title, year, director, casting, genres, duration, country, language, \
    writers, production, synopsis, rating, num_rate, budget, income)]

In [240]:
columns_name = ["imdb_id", "vu", "à voir", "title", "original_title", "year", "director", "casting", "genres", "duration", "country", "language", \
"writers", "production", "synopsis", "rating", "num_rate", "budget", "income"]

In [241]:
result = pd.DataFrame(test, columns = columns_name)
result = result.set_index("imdb_id")

In [242]:
result.to_csv("add.csv")

In [193]:
response = requests.get("https://www.imdb.com/title/tt6777370")
soup = BeautifulSoup(response.text)

In [194]:
soup

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/"><head><script></script><style data-styled="" data-styled-version="5.2.1">.esMsKP{width:100%;}/*!sc*/
data-styled.g32[id="IMDbEditorialSingle__StyledSlateCard-ppbgrx-0"]{content:"esMsKP,"}/*!sc*/
.cXfGIs{-webkit-flex-shrink:0;-ms-flex-negative:0;flex-shrink:0;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;margin-left:0.25rem;margin-right:auto;-webkit-order:1;-ms-flex-order:1;order:1;position:relative;}/*!sc*/
@media screen and (min-width:1024px){.cXfGIs{margin-left:auto;margin-right:0.5rem;-webkit-order:0;-ms-flex-order:0;order:0;padding-left:0;}}/*!sc*/
@media (

In [196]:
soup.find_all('div', {'id': 'title-overview-widget'})

[]

In [51]:
test = []
for ligne in soup:
    if "<title>" in ligne:
        test.append(ligne)


In [52]:
test

[]

In [758]:
response = requests.get("https://www.imdb.com/title/tt5440848/fullcredits")
soup2 = BeautifulSoup(response.text)


In [709]:
response = requests.get("https://www.imdb.com/title/tt0008843/technical")
soup3 = BeautifulSoup(response.text)

In [84]:
def year(imdb_id):

    #SELECT SYNOPSIS, GENRES, RATING, NUMBER RATE, YEAR, TITLE, DURATION, COUNTRY,
    #SELECT LANGUAGE, BUDGET, PRODUCTION, INCOME
    response = requests.get("https://www.imdb.com/title/"+imdb_id)
    soup = BeautifulSoup(response.text)

    #select title and year
    tit = soup.find_all('div', {'class': 'titleBar'})
    tit_y = [text.text for text in tit]
    tit_y = re.sub('\n', '', tit_y[0])
  
    year = int((tit_y.split("\xa0(")[1]).split(")")[0])

    return imdb_id, year

In [76]:
df = pd.read_csv('date.csv')

In [77]:
imdb_id = df["date"].values

In [87]:
movies = []

In [130]:
df = pd.read_csv('genre.csv')

In [138]:
imdb_tot = df["imdb_id"].values

In [139]:
genre_tot = df["genres"].values

In [127]:
df2 = pd.read_csv('changenre.csv')

In [143]:
imdb = df2["imdb_id"].values

In [163]:
new_genre = []

In [164]:
for i, val1 in tqdm.tqdm(enumerate(imdb_tot)):
    if val1 in imdb:
        new_genre.append((genre_tot[i] + ", Mystery"))
    else :
        new_genre.append(genre_tot[i])
            

55513it [00:01, 42484.89it/s]


In [175]:
new_genre[372]

'Drama, Mystery'

In [174]:
genre_tot[372]

'Drama'

In [173]:
np.where(imdb_tot =="tt7282468")

(array([372]),)

In [176]:
col = ["imdb_id", "genres"]

In [186]:
finish = imdb_tot, genre_tot

In [189]:
res = pd.DataFrame(list(zip(imdb_tot, new_genre)), columns = col)


In [190]:
res

Unnamed: 0,imdb_id,genres
0,tt0416449,"Action, Drama"
1,tt8579674,"Drama, Thriller, War"
2,tt0078723,"Action, Comedy, War"
3,tt1190080,"Action, Adventure, Sci-Fi"
4,tt1022603,"Comedy, Drama, Romance"
...,...,...
55508,tt1519328,Drama
55509,tt8463258,"Crime, Drama"
55510,tt0272425,"Action, Fantasy, Sci-Fi"
55511,tt2524674,"Comedy, Drama"


In [191]:
res.to_csv("add.csv")

In [53]:
from selenium import webdriver

In [68]:
DRIVER_PATH = "/home/leo/code/leomockel/selenium/bin/chromedriver"

In [118]:
BASE_URL = "https://www.imdb.com/title/tt6777370"

In [73]:
driver = webdriver.Chrome(DRIVER_PATH)

In [119]:
driver.get(BASE_URL)

In [120]:
driver.title

'Sans pitié (2017) - IMDb'

In [134]:
buttons = driver.find_elements_by_class_name("title_block")

In [135]:
buttons

[]

In [122]:
test = []
for button in buttons:
    test.append(button.text)

In [127]:
test[1].split("\n")

['Menu',
 'All',
 'Watchlist',
 'Sign In',
 'Welcome to the new version of this page.',
 'Learn more',
 'Report an issue',
 'Sans pitié',
 'Original title: Bulhandang',
 '2017',
 '12',
 '1h 57min',
 'IMDb RATING',
 '6.7',
 '/10',
 '2.1K',
 'YOUR RATING',
 'Rate',
 'Cast & crew',
 'User reviews',
 'IMDbPro',
 'All topics',
 'Trailer1:36',
 '1 VIDEO',
 '28 PHOTOS',
 'Action',
 'Crime',
 'Drama',
 'A young cop finds his loyalties divided after befriending a notorious criminal while undercover in prison.',
 'Director',
 'Sung-hyun Byun',
 'Writers',
 'Sung-hyun ByunMin-soo KimMyeong-chan Park(comic)',
 'Stars',
 'Kyung-gu SolSi-wan YimKyeong-Yeong Lee',
 'See production, box office & company info',
 'Add to Watchlist',
 '10',
 'User reviews',
 '35',
 'Critic reviews',
 '53',
 'Metascore',
 'Awards',
 '9 wins & 21 nominations',
 'Videos',
 '1',
 'Trailer 1:36',
 'The Merciless',
 'Photos',
 '28',
 'SPONSORED',
 'Top cast',
 'Edit',
 'Kyung-gu Sol',
 'Han Jae-ho',
 'Si-wan Yim',
 'Jo Hyun-so

In [198]:
r = requests.get("https://www.imdb.com/title/tt6777370/")

In [154]:
def scrap (id):
    r = requests.get("https://www.imdb.com/title/"+id)
    bs = BeautifulSoup(r.text)
    for movie in bs.findAll('td','title'):
        title = movie.find('a').contents[0]
        genres = movie.find('span','genre').findAll('a')
        genres = [g.contents[0] for g in genres]
        runtime = movie.find('span','runtime').contents[0]
        rating = movie.find('span','value').contents[0]
        year = movie.find('span','year_type').contents[0]
        imdbID = movie.find('span','rating-cancel').a['href'].split('/')[2]
    return title, genres,runtime, rating, year, imdbID

In [199]:
r = requests.get("https://www.imdb.com/title/tt6777370/")

In [200]:
bs = BeautifulSoup(r.text)

In [203]:
bs2 = bs.findAll('div')

In [225]:
bs2[1].findAll('h1', {'class': re.compile('^TitleHeader')})

[<h1 class="TitleHeader__TitleText-sc-1wu6n3d-0 dxSWFG" data-testid="hero-title-block__title" textlength="10">Sans pitié</h1>]

In [217]:
title = movie.find('a')