In [70]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time

In [2]:
df = pd.read_csv('imdb_ids.csv')
df.head()

Unnamed: 0,name,year,domestic_gross,mojo_id,imdb_id
0,Spider-Man,2002.0,403706375.0,rl678659585,tt0145487
1,Star Wars: Episode II - Attack of the Clones,2002.0,302191252.0,rl2809366017,tt0121765
2,Harry Potter and the Chamber of Secrets,2002.0,261988482.0,rl1433110017,tt0295297
3,Signs,2002.0,227966634.0,rl1685161473,tt0286106
4,My Big Fat Greek Wedding,2002.0,241438208.0,rl342132225,tt0259446


In [3]:
ids = df[-df['imdb_id'].isna()].imdb_id
len(ids)

4194

In [4]:
ids[:5]

0    tt0145487
1    tt0121765
2    tt0295297
3    tt0286106
4    tt0259446
Name: imdb_id, dtype: object

In [7]:
url = "https://www.imdb.com/title/"
response = requests.get(url+ids[1], headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'html.parser')

In [30]:
# extracting budget
budget = soup.find_all('div', class_='sc-f65f65be-0 fVkLRr')[3].find(class_="ipc-metadata-list-item__list-content-item")
budget.text

'$115,000,000 (estimated)'

In [99]:
# extracting director and actor IDs
header = soup.find('div', class_='sc-dffc6c81-3 jFHENY').find_all('li', class_="ipc-inline-list__item")

# initialize empty list
data = {}

# create a dictionary of ids within the list
for item in header:
    data[item.text] = item.find('a').get('href').split('/')[2]
    
data

{'George Lucas': 'nm0000184',
 'Jonathan Hales': 'nm0355054',
 'Hayden Christensen': 'nm0159789',
 'Natalie Portman': 'nm0000204',
 'Ewan McGregor': 'nm0000191'}

In [66]:
# extract data from ombd using API
api_key = '&apikey=6cf40218'
url = 'http://www.omdbapi.com/?i='

In [188]:
response = requests.get(url+ids[0]+api_key)

In [189]:
response.json()

{'Title': 'Spider-Man',
 'Year': '2002',
 'Rated': 'PG-13',
 'Released': '03 May 2002',
 'Runtime': '121 min',
 'Genre': 'Action, Adventure, Sci-Fi',
 'Director': 'Sam Raimi',
 'Writer': 'Stan Lee, Steve Ditko, David Koepp',
 'Actors': 'Tobey Maguire, Kirsten Dunst, Willem Dafoe',
 'Plot': 'After being bitten by a genetically-modified spider, a shy teenager gains spider-like abilities that he uses to fight injustice as a masked superhero and face a vengeful enemy.',
 'Language': 'English',
 'Country': 'United States',
 'Awards': 'Nominated for 2 Oscars. 17 wins & 63 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BZDEyN2NhMjgtMjdhNi00MmNlLWE5YTgtZGE4MzNjMTRlMGEwXkEyXkFqcGdeQXVyNDUyOTg3Njg@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.4/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '90%'},
  {'Source': 'Metacritic', 'Value': '73/100'}],
 'Metascore': '73',
 'imdbRating': '7.4',
 'imdbVotes': '852,452',
 'imdbID': 'tt0145487',
 'T

In [192]:
temp = pd.DataFrame.from_dict(response.json()).head(1)
temp

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response
0,Spider-Man,2002,PG-13,03 May 2002,121 min,"Action, Adventure, Sci-Fi",Sam Raimi,"Stan Lee, Steve Ditko, David Koepp","Tobey Maguire, Kirsten Dunst, Willem Dafoe",After being bitten by a genetically-modified s...,...,73,7.4,852452,tt0145487,movie,25 Apr 2013,"$407,022,860",,,True


In [191]:
data

{'George Lucas': 'nm0000184',
 'Jonathan Hales': 'nm0355054',
 'Hayden Christensen': 'nm0159789',
 'Natalie Portman': 'nm0000204',
 'Ewan McGregor': 'nm0000191'}

In [134]:
for actors in temp['Actors'].str.split(','):
    for actor in actors:
        actor = actor.strip()
        print(actor, data[actor])

Hayden Christensen nm0159789
Natalie Portman nm0000204
Ewan McGregor nm0000191


In [168]:
# create ActorID column
if ',' not in temp['Actors'][0]:
    temp['ActorID'] = temp['Actors'].map(data)
else:
    temp['ActorID'] = [[data[actor.strip()] for actor in actors] for actors in temp['Actors'].str.split(',')]

In [169]:
if ',' not in temp['Director'][0]:
    temp['DirectorID'] = temp['Director'].map(data)
else:
    temp['DirectorID'] = [[data[director.strip()] for director in directors] for directors in temp['Director'].str.split(',')]
temp

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,DirectorID,ActorID
0,Star Wars: Episode II - Attack of the Clones,2002,PG,16 May 2002,142 min,"Action, Adventure, Fantasy",George Lucas,"George Lucas, Jonathan Hales","Hayden Christensen, Natalie Portman, Ewan McGr...","Ten years after initially meeting, Anakin Skyw...",...,738330,tt0121765,movie,10 Apr 2015,"$310,676,740",,,True,nm0000184,"[nm0159789, nm0000204, nm0000191]"


In [179]:
temp.columns

Index(['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director',
       'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster',
       'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'DirectorID',
       'ActorID'],
      dtype='object')

In [463]:
def get_imdb_data(id):
    '''
    gets the data from imdbd
    '''
    url = "https://www.imdb.com/title/"
    response = requests.get(url+id, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # extracting budget
    box_office = soup.find_all('div', class_='sc-f65f65be-0 fVkLRr')

    for i in range(1,len(box_office)):
        text = box_office[i].find(class_="ipc-metadata-list-item__list-content-item")
        if text is None:
            continue
        else:
            text = text.text
            if '$' in text or '£' in text:
                budget = text.split(' ')[0]
                break
            else: 
                budget = 'NA'
            
    # extracting header IDS
    header = soup.find('div', class_='sc-dffc6c81-3 jFHENY').find_all('li', class_="ipc-inline-list__item")

    # initialize empty list
    data = {}

    # create a dictionary of ids within the list
    for item in header:
        data[item.text.title()] = item.find('a').get('href').split('/')[2]
        
    return budget, data

def get_omdb_data(id, data):
    '''
    gets the data from omdb
    '''
    # extract data from ombd using API
    api_key = '&apikey=6cf40218'
    url = 'http://www.omdbapi.com/?i='
    response = requests.get(url+id+api_key)
    
    # create a temporary dataframe - only want the first rwo
    temp = pd.DataFrame.from_dict(response.json()).head(1)
    
    # create ActorID column
    if ',' not in temp['Actors'][0]:
        temp['ActorID'] = temp['Actors'].map(data)
    else:
        temp['ActorID'] = [[data[actor.strip().title()] for actor in actors] for actors in temp['Actors'].str.split(',')]
    
    # create DirectorID column
    if ',' not in temp['Director'][0]:
        temp['DirectorID'] = temp['Director'].map(data)
    else:
        temp['DirectorID'] = [[data[director.strip().title()] for director in directors] for directors in temp['Director'].str.split(',')]

    return temp
    

def movie_data(ids):
    '''
    takes a list of imdb movie IDs, collects data from IMDb and OMDb, and returns a dataframe
    '''
    
    # initialize dataframe
    cols = ['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director',
       'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster',
       'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'DirectorID',
       'ActorID','Budget']
    df = pd.DataFrame(columns = cols)
    
    counter = 1
    total = len(ids)
    
    for id in ids:
        try:
            budget, data = get_imdb_data(id)
            temp = get_omdb_data(id, data)
            temp['Budget'] = budget
            df = pd.concat([df, temp], ignore_index=True)
            print(f"{(counter)/total*100:.2f}% complete", end='\r')
            counter += 1
            time.sleep(0.4)
        except Exception as e:
            try:
                time.sleep(5)
                budget, data = get_imdb_data(id)
                temp = get_omdb_data(id, data)
                temp['Budget'] = budget
                df = pd.concat([df, temp], ignore_index=True)
                print(f"{(counter)/total*100:.2f}% complete", end='\r')
                counter += 1
            except Exception:
                continue
        
    return df

In [464]:
movies_data = movie_data(ids[3576:])

98.71% complete

In [363]:
# movies_a = movies_data (1-1190)

In [422]:
# movies_c = movies_data (3192-3575)

In [392]:
# movies_b = movies_data (1191-3191)

In [466]:
# movies_d = movies_data (3576-)

In [480]:
movies = pd.concat([movies_a, movies_b, movies_c, movies_d], ignore_index=True)
movies.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbID,Type,DVD,BoxOffice,Production,Website,Response,DirectorID,ActorID,Budget
0,Spider-Man,2002,PG-13,03 May 2002,121 min,"Action, Adventure, Sci-Fi",Sam Raimi,"Stan Lee, Steve Ditko, David Koepp","Tobey Maguire, Kirsten Dunst, Willem Dafoe",After being bitten by a genetically-modified s...,...,tt0145487,movie,25 Apr 2013,"$407,022,860",,,True,nm0000600,"[nm0001497, nm0000379, nm0000353]","$139,000,000"
1,Star Wars: Episode II - Attack of the Clones,2002,PG,16 May 2002,142 min,"Action, Adventure, Fantasy",George Lucas,"George Lucas, Jonathan Hales","Hayden Christensen, Natalie Portman, Ewan McGr...","Ten years after initially meeting, Anakin Skyw...",...,tt0121765,movie,10 Apr 2015,"$310,676,740",,,True,nm0000184,"[nm0159789, nm0000204, nm0000191]","$115,000,000"
2,Harry Potter and the Chamber of Secrets,2002,PG,15 Nov 2002,161 min,"Adventure, Family, Fantasy",Chris Columbus,"J.K. Rowling, Steve Kloves","Daniel Radcliffe, Rupert Grint, Emma Watson",An ancient prophecy seems to be coming true wh...,...,tt0295297,movie,15 Jan 2008,"$262,641,637",,,True,nm0001060,"[nm0705356, nm0342488, nm0914612]","$100,000,000"
3,Signs,2002,PG-13,02 Aug 2002,106 min,"Drama, Mystery, Sci-Fi",M. Night Shyamalan,M. Night Shyamalan,"Mel Gibson, Joaquin Phoenix, Rory Culkin",A widowed former reverend living with his chil...,...,tt0286106,movie,11 Aug 2016,"$227,966,634",,,True,nm0796117,"[nm0000154, nm0001618, nm0191412]","$72,000,000"
4,My Big Fat Greek Wedding,2002,PG,02 Aug 2002,95 min,"Comedy, Drama, Romance",Joel Zwick,Nia Vardalos,"Nia Vardalos, John Corbett, Michael Constantine",A young Greek woman falls in love with a non-G...,...,tt0259446,movie,10 Aug 2016,"$241,438,208",,,True,nm0959034,"[nm0889522, nm0179173, nm0176073]","$5,000,000"


In [482]:
movies.to_csv('movies.csv', index=False)

In [356]:
url = "https://www.imdb.com/title/"
response = requests.get(url+ids[121], headers={'User-Agent': 'Mozilla/5.0'})
honey = BeautifulSoup(response.text, 'html.parser')
    
# extracting budget
# box_office = honey.find_all('div', class_='sc-f65f65be-0 fVkLRr')

#     for i in range(1,len(box_office)):
#         text = box_office[i].find(class_="ipc-metadata-list-item__list-content-item")
#         if text is None:
#             continue
#         else:
#             text = text.text
#             if '$' in text:
#                 budget = text.split(' ')[0]
#                 break
#             else: 
#                 budget = 'NA'
            
# extracting header IDS
# header = honey.find('div', class_='sc-dffc6c81-3 jFHENY').find_all('li', class_="ipc-inline-list__item")

# # initialize empty list
# data = {}

# # create a dictionary of ids within the list
# for item in header:
#     data[item.text] = item.find('a').get('href').split('/')[2]

In [333]:
# extract data from ombd using API
api_key = '&apikey=6cf40218'
url = 'http://www.omdbapi.com/?i='
response = requests.get(url+ids[201]+api_key)
    
# # create a temporary dataframe - only want the first rwo
# temp = pd.DataFrame.from_dict(response.json()).head(1)

# # for actors in temp['Actors'].str.split(','):
# #     for actor in actors:
# #         actor = actor.strip().title()
# #         print(data[actor])
    
# # create ActorID column
# if ',' not in temp['Actors'][0]:
#     temp['ActorID'] = temp['Actors'].map(data)
# else:
#     temp['ActorID'] = [[data[actor.strip().title()] for actor in actors] for actors in temp['Actors'].str.split(',')]
    
# # create DirectorID column
# if ',' not in temp['Director'][0]:
#     temp['DirectorID'] = temp['Director'].map(data)
# else:
#     temp['DirectorID'] = [[data[director.strip().title()] for director in directors] for directors in temp['Director'].str.split(',')]


In [334]:
response.text

'{"Response":"False","Error":"Request limit reached!"}'

In [324]:
temp

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,ActorID,DirectorID
0,The Honeymooners,2005,PG-13,10 Jun 2005,90 min,"Comedy, Family, Romance",John Schultz,"Danny Jacobson, David Sheffield, Barry W. Blau...","Cedric the Entertainer, Mike Epps, Gabrielle U...",Working class New York bus driver Ralph Kramde...,...,7491,tt0373908,movie,15 Jun 2011,"$12,843,849",,,True,"[nm0147825, nm0258402, nm0005517]",nm0776271


In [302]:
data

{'John Schultz': 'nm0776271',
 'Danny Jacobson': 'nm0414816',
 'David Sheffield': 'nm0790775',
 'Barry W. Blaustein': 'nm0087904',
 'Cedric The Entertainer': 'nm0147825',
 'Mike Epps': 'nm0258402',
 'Gabrielle Union': 'nm0005517'}

In [290]:
data

{'John Schultz': 'nm0776271',
 'Danny Jacobson': 'nm0414816',
 'David Sheffield': 'nm0790775',
 'Barry W. Blaustein': 'nm0087904',
 'Cedric The Entertainer': 'nm0147825',
 'Mike Epps': 'nm0258402',
 'Gabrielle Union': 'nm0005517'}

In [372]:
url = "https://www.imdb.com/title/"
response = requests.get(url+ids[1192], headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'html.parser')
    
# # extracting budget
# budget = spider.find_all('div', class_='sc-f65f65be-0 fVkLRr')[3].find(class_="ipc-metadata-list-item__list-content-item")
# budget = budget.text

In [383]:
budget = soup.find_all('div', class_='sc-f65f65be-0 fVkLRr').find(class_="ipc-metadata-list-item__list-content-item")
budget = budget.text
budget

AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [382]:
header = soup.find('div', class_='sc-dffc6c81-3 jFHENY').find_all('li', class_="ipc-inline-list__item")
header

[<li class="ipc-inline-list__item" role="presentation"><a aria-disabled="false" class="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link" href="/name/nm0680782/?ref_=tt_ov_dr" role="button" tabindex="0">Sean MacLeod Phillips</a></li>,
 <li class="ipc-inline-list__item" role="presentation"><a aria-disabled="false" class="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link" href="/name/nm0724255/?ref_=tt_ov_wr" role="button" tabindex="0">Mose Richards</a></li>,
 <li class="ipc-inline-list__item" role="presentation"><a aria-disabled="false" class="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link" href="/name/nm0000630/?ref_=tt_ov_st" role="button" tabindex="0">Liev Schreiber</a></li>,
 <li class="ipc-inline-list__item" role="presentation"><a aria-disabled="false" class="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link" href="/name/

In [380]:
box_office = soup.find_all('div', class_='sc-f65f65be-0 fVkLRr')

for i in range(1,len(box_office)):
    text = box_office[i].find(class_="ipc-metadata-list-item__list-content-item")
    if text is None:
        continue
    else:
        text = text.text
        if '$' in text or '£' in text:
            budget = text.split(' ')[0]
            break
        
budget

'$23,746,066'

In [238]:
box_office[1].find(class_="ipc-metadata-list-item__list-content-item").text

'August 2, 2002 (United States)'

In [228]:
box_office[2].find(class_="ipc-metadata-list-item__list-content-item").text

'$5,000,000 (estimated)'