# Scraping movies on IMDb

In [838]:
import requests, csv
from bs4 import BeautifulSoup
import re
import pandas as pd

In [821]:
def _squash(text):
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    return " ".join(text.split())


def _credit_names(table):
    """Return the names found in *table*'s ``td.name`` cells joined as "A, B, C"."""
    cells = table.find_all('td', {'class': 'name'})
    return ", ".join(_squash(cell.get_text()) for cell in cells)


def movie_imdb(imdb_id, timeout=30, max_cast=15):
    """Scrape the main facts about one movie from its IMDb pages.

    Two pages are fetched: ``https://www.imdb.com/title/<imdb_id>`` (title,
    year, storyline, rating, details) and ``.../fullcredits`` (cast, director,
    writers). The parsing relies on the ``titleBar``, ``titleStoryLine``,
    ``titleDetails``, ``cast_list`` and ``simpleCreditsTable`` elements being
    present in the returned HTML.

    Args:
        imdb_id: IMDb title identifier, e.g. ``"tt10682266"``.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_cast: Maximum number of cast names to keep.

    Returns:
        A 15-tuple ``(title, year, director, casting, genres, duration,
        country, language, writers, production, synopsis, rating, num_rate,
        budget, income)``. ``year`` and ``num_rate`` are ints, ``duration`` is
        an int when a runtime is listed, everything else is a string.
        Optional fields that are not found on the page are returned as ``""``.

    Raises:
        requests.RequestException: if a request fails, times out, or returns
            an HTTP error status.
        IndexError, AttributeError, ValueError: if the title bar, storyline,
            rating or details section is missing or not in the expected form.
    """
    base_url = "https://www.imdb.com/title/" + imdb_id

    # ---- main title page: synopsis, genres, rating, year, title, details ----
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # Title bar text looks like "<title>\xa0(<year>) ...".
    title_bar = soup.find_all('div', {'class': 'titleBar'})[0].text.replace("\n", "")
    title_parts = title_bar.split("\xa0(")
    title = title_parts[0]
    year = int(title_parts[1].split(")")[0])

    # Storyline section holds both the synopsis and the genre list.
    storyline = soup.find_all(id="titleStoryLine")[0].text.replace("\n", "")
    story_body = storyline.replace('EditStoryline    ', '')

    # Cut at whichever trailer comes first, so an author credit
    # ("Written by ...") never leaks into the synopsis when the
    # "Plot Summary" link is also present.
    synopsis = ""
    if "Written by" in story_body or "Plot Summary" in story_body:
        synopsis = story_body.split("Plot Summary")[0].split("Written by")[0]

    genres_text = (
        storyline.split("Genres:")[1].replace("\xa0", "").strip().replace("|", ",")
    )
    if "Certificate" in genres_text:
        # The genre list runs straight into the "Certificate" label.
        genres = genres_text.split("Certificate")[0].strip()
    else:
        # NOTE(review): kept from the original — drops the final character of
        # the remaining storyline text; confirm what that character is on
        # pages without a "Certificate" entry.
        genres = genres_text[:-1]

    rating = soup.find_all(itemprop="ratingValue")[0].text
    num_rate = int(soup.find_all(itemprop="ratingCount")[0].text.replace(",", ""))

    # Details section: one "txt-block" per labelled fact. A single pass is
    # enough; if a label occurs in several blocks the last one wins.
    country = ""
    language = ""
    budget = ""
    duration = ""
    production = ""
    income = ""

    detail_blocks = soup.find(id='titleDetails').find_all('div', {'class': 'txt-block'})
    for block in detail_blocks:
        text = block.text.replace("\n", "")

        # Test for the exact label (with colon) that is split on, so unrelated
        # text that merely contains the word cannot trigger an IndexError.
        if "Country:" in text:
            country = text.split("Country:")[1].replace("|", ", ")
        if "Language:" in text:
            language = text.split("Language:")[1].replace("|", ", ")
        if "Budget:" in text:
            budget = text.split("Budget:")[1].split("   ")[0]
        if "Runtime:" in text:
            duration = int(text.split("Runtime:")[1].split(" min")[0])
        if "Production Co:" in text:
            companies = text.split("Production Co:")[1]
            # Keep only the first company: up to the first comma, or up to
            # the " See more" link when a single company is listed.
            if "," in text:
                production = companies.split(",")[0].strip()
            else:
                production = companies.split(" See more")[0].strip()
        if "Cumulative Worldwide Gross:" in text:
            income = text.split("Cumulative Worldwide Gross:")[1].split("   ")[0].strip()

    # ---- full credits page: casting, director, writers ----
    response = requests.get(base_url + "/fullcredits", timeout=timeout)
    response.raise_for_status()
    credits_soup = BeautifulSoup(response.text, "html.parser")

    # Cast names are the alt texts of the images in the cast table. Reading
    # the attribute (rather than splitting the serialized HTML) keeps names
    # with apostrophes or escaped characters intact, and an empty table
    # simply yields "".
    cast = []
    for table in credits_soup.find_all('table', {'class': 'cast_list'}):
        cast.extend(img["alt"] for img in table.find_all("img", alt=True))
    casting = ", ".join(cast[:max_cast])

    # The first credits table is read as directors and the second as writers.
    # NOTE(review): this is positional — if a title has no writers table the
    # second table is presumably some other department; verify if it matters.
    credit_tables = credits_soup.find_all(
        'table', {'class': 'simpleTable simpleCreditsTable'}
    )
    director = _credit_names(credit_tables[0]) if credit_tables else ""
    writers = _credit_names(credit_tables[1]) if len(credit_tables) > 1 else ""

    return title, year, director, casting, genres, duration, country, language, \
        writers, production, synopsis, rating, num_rate, budget, income

In [825]:
# Try the scraper on a single title and unpack its 15 returned fields.
(
    title, year, director, casting, genres, duration, country, language,
    writers, production, synopsis, rating, num_rate, budget, income,
) = movie_imdb("tt10682266")

In [826]:
# Echo the production company scraped for the sample title.
production

'Happy Madison Productions'

In [686]:
# Read the input spreadsheet into a DataFrame.
df = pd.read_excel('to_scrap.xlsx')

In [690]:
# Take the "imdb" column as an array; each value is later passed to
# movie_imdb (presumably IMDb title ids such as "tt10682266" — confirm).
imdb_id = df["imdb"].values

In [803]:
# Accumulates one result tuple per scraped movie.
movies = []

In [804]:
# Scrape every id in order and collect the resulting tuples.
movies.extend(map(movie_imdb, imdb_id))

In [739]:
# Column labels for the scraped fields, in the order movie_imdb returns them.
columns_name = [
    "title",
    "year",
    "director",
    "casting",
    "genres",
    "duration",
    "country",
    "language",
    "writers",
    "production",
    "synopsis",
    "rating",
    "num_rate",
    "budget",
    "income",
]

In [805]:
# One row per scraped movie tuple, labelled with columns_name.
result = pd.DataFrame(movies, columns = columns_name)

In [806]:
# Place the scraped columns beside the input sheet (rows are matched on
# index). iloc[:, 1:] leaves out the first scraped column, "title".
# NOTE(review): presumably df already carries a title column — confirm.
final_df = pd.concat([df, result.iloc[:, 1:]], axis=1)

In [808]:
# Write the merged table to disk. With the default index setting the row
# index is written as an unnamed first column.
final_df.to_csv("movies_scraped.csv")