In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import json

In [2]:
def get_imdb():
    try:
        with open('imdb_cache.html', 'rb') as f:
            read = f.read()
    except:
        url = 'http://www.imdb.com/chart/top'
        response = requests.get(url)
        read = response.text
        with open('imdb_cache.html', 'wb') as f:
            f.write(response.content)
    soup = BeautifulSoup(read, "html.parser")
    return soup

In [52]:
soup = get_imdb()
movies = soup.select('td.titleColumn')
IMDB_number = [i.attrs.get('href')[7:16] for i in soup.select('td.titleColumn a')]
crew = [i.attrs.get('title') for i in soup.select('td.titleColumn a')]
ratings = [i.attrs.get('data-value')
        for i in soup.select('td.posterColumn span[name=ir]')]

In [56]:
list = []
for i in range(len(movies)):
    movie_string = movies[i].get_text()
    movie = str().join(movie_string.split()[1:])
    movie_title = movie[:-6]
    year = int(movie[-5:-1])
    place = int(movie_string.split()[0][:-1])
    data = {"place": place,
            "IMDB_number": IMDB_number[i], 
            "title": movie_title,
            "rating": round(float(ratings[i]), 1),
            "year": year,
            "director": re.search('^[^(]+', crew[i]).group()[:-1],
            "stars": [item[1:] for item in crew[i].split(',')[1:]],
            }
    list.append(data)

In [57]:
df = pd.DataFrame(list)
#df.to_csv('imdb_top_250_movies.csv',index=False)

In [58]:
df

Unnamed: 0,place,IMDB_number,title,rating,year,director,stars
0,1,tt0111161,TheShawshankRedemption,9.2,1994,Frank Darabont,"[Tim Robbins, Morgan Freeman]"
1,2,tt0068646,TheGodfather,9.2,1972,Francis Ford Coppola,"[Marlon Brando, Al Pacino]"
2,3,tt0468569,TheDarkKnight,9.0,2008,Christopher Nolan,"[Christian Bale, Heath Ledger]"
3,4,tt0071562,TheGodfatherPartII,9.0,1974,Francis Ford Coppola,"[Al Pacino, Robert De Niro]"
4,5,tt0050083,12AngryMen,9.0,1957,Sidney Lumet,"[Henry Fonda, Lee J. Cobb]"
...,...,...,...,...,...,...,...
245,246,tt0071411,DersuUzala,8.0,1975,Akira Kurosawa,"[Maksim Munzuk, Yuriy Solomin]"
246,247,tt0103639,Aladdin,8.0,1992,Ron Clements,"[Scott Weinger, Robin Williams]"
247,248,tt0129167,TheIronGiant,8.0,1999,Brad Bird,"[Eli Marienthal, Harry Connick Jr.]"
248,249,tt1454029,TheHelp,8.0,2011,Tate Taylor,"[Viola Davis, Emma Stone]"


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   place        250 non-null    int64  
 1   IMDB_number  250 non-null    object 
 2   title        250 non-null    object 
 3   rating       250 non-null    float64
 4   year         250 non-null    int64  
 5   director     250 non-null    object 
 6   stars        250 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 13.8+ KB


In [60]:
df.shape

(250, 7)

In [36]:
def get_omdb():
    try:
        with open("omdb_cache.json", 'r') as f:
            cache_contents = f.read()
            cache_dict = json.loads(cache_contents)
    except:
        cache_dict = {}
        for i in range(len(df)):###
            number = df.iloc[i]['IMDB_number']
            response = requests.get("http://www.omdbapi.com/?i=" + number + "&apikey=2a48f97e")
            json_str = response.text
            json_dict = json.loads(json_str)
            cache_dict[number] = json_dict
        with open("omdb_cache.json", 'w') as f:
            dumped_json_cache = json.dumps(cache_dict)
            f.write(dumped_json_cache)
    return cache_dict

In [37]:
omdb_dict = get_omdb()

In [42]:
lst_omdb = []
for movie in omdb_dict.values():
    IMDB_number = movie['imdbID']
    rated = movie['Rated'] #R, PG-13, Approved
    if movie['Runtime'] == 'N/A':
        runtime = None
    else:
        runtime = int(re.search('\d+', movie['Runtime']).group())
    genre = movie['Genre'].split(', ')
    language = movie['Language'].split(', ')
    country = movie['Country'].split(', ')
    nominated_oscar = ((re.search('Nominated for \d+ Oscars', movie['Awards']) != None) or (re.search('Won \d+ Oscars', movie['Awards']) != None))
    won_oscar = (re.search('Won \d+ Oscars', movie['Awards']) != None)
    if movie['imdbVotes'] == 'N/A':
        IMDB_votes = None
    else:
        IMDB_votes = int(str().join(movie['imdbVotes'].split(',')))
    
    try:
        box_office = int(str().join(movie['BoxOffice'][1:].split(',')))
    except:
        box_office = None
    #if movie['BoxOffice'] == 'N/A':
    #    box_office = None
    #else:
    #    box_office = int(str().join(movie['BoxOffice'][1:].split(',')))
    data = {"IMDB_number": IMDB_number,
            "rated": rated, 
            "runtime": runtime,
            "genre": genre,
            "language": language,
            "country": country,
            "nominated_oscar": nominated_oscar,
            "won_oscar": won_oscar,
            "IMDB_votes": IMDB_votes,
            "box_office":box_office,
            }
    lst_omdb.append(data)

In [43]:
df_omdb = pd.DataFrame(lst_omdb)
df_omdb

Unnamed: 0,IMDB_number,rated,runtime,genre,language,country,nominated_oscar,won_oscar,IMDB_votes,box_office
0,tt0111161,R,142.0,[Drama],[English],[United States],True,False,2662343.0,28767189.0
1,tt0068646,R,175.0,"[Crime, Drama]","[English, Italian, Latin]",[United States],True,True,1847150.0,136381073.0
2,tt0468569,PG-13,152.0,"[Action, Crime, Drama]","[English, Mandarin]","[United States, United Kingdom]",True,True,2638780.0,534987076.0
3,tt0071562,R,202.0,"[Crime, Drama]","[English, Italian, Spanish, Latin, Sicilian]",[United States],True,True,1265087.0,47834595.0
4,tt0050083,Approved,96.0,"[Crime, Drama]",[English],[United States],True,False,786180.0,
...,...,...,...,...,...,...,...,...,...,...
245,tt0071411,G,142.0,"[Adventure, Biography, Drama]","[Russian, Chinese]","[Soviet Union, Japan]",False,False,30160.0,
246,tt0103639,G,90.0,"[Animation, Adventure, Comedy]",[English],[United States],True,True,420013.0,217350219.0
247,tt0129167,PG,86.0,"[Animation, Action, Adventure]",[English],[United States],False,False,198212.0,23315035.0
248,tt1454029,PG-13,146.0,[Drama],[English],"[United States, India]",False,False,459753.0,169708112.0


In [44]:
df_omdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   IMDB_number      250 non-null    object 
 1   rated            250 non-null    object 
 2   runtime          248 non-null    float64
 3   genre            250 non-null    object 
 4   language         250 non-null    object 
 5   country          250 non-null    object 
 6   nominated_oscar  250 non-null    bool   
 7   won_oscar        250 non-null    bool   
 8   IMDB_votes       247 non-null    float64
 9   box_office       218 non-null    float64
dtypes: bool(2), float64(3), object(5)
memory usage: 16.2+ KB


In [45]:
df_omdb.shape

(250, 10)

In [62]:
merge = df.merge(df_omdb, on = 'IMDB_number')
merge

Unnamed: 0,place,IMDB_number,title,rating,year,director,stars,rated,runtime,genre,language,country,nominated_oscar,won_oscar,IMDB_votes,box_office
0,1,tt0111161,TheShawshankRedemption,9.2,1994,Frank Darabont,"[Tim Robbins, Morgan Freeman]",R,142.0,[Drama],[English],[United States],True,False,2662343.0,28767189.0
1,2,tt0068646,TheGodfather,9.2,1972,Francis Ford Coppola,"[Marlon Brando, Al Pacino]",R,175.0,"[Crime, Drama]","[English, Italian, Latin]",[United States],True,True,1847150.0,136381073.0
2,3,tt0468569,TheDarkKnight,9.0,2008,Christopher Nolan,"[Christian Bale, Heath Ledger]",PG-13,152.0,"[Action, Crime, Drama]","[English, Mandarin]","[United States, United Kingdom]",True,True,2638780.0,534987076.0
3,4,tt0071562,TheGodfatherPartII,9.0,1974,Francis Ford Coppola,"[Al Pacino, Robert De Niro]",R,202.0,"[Crime, Drama]","[English, Italian, Spanish, Latin, Sicilian]",[United States],True,True,1265087.0,47834595.0
4,5,tt0050083,12AngryMen,9.0,1957,Sidney Lumet,"[Henry Fonda, Lee J. Cobb]",Approved,96.0,"[Crime, Drama]",[English],[United States],True,False,786180.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,246,tt0071411,DersuUzala,8.0,1975,Akira Kurosawa,"[Maksim Munzuk, Yuriy Solomin]",G,142.0,"[Adventure, Biography, Drama]","[Russian, Chinese]","[Soviet Union, Japan]",False,False,30160.0,
246,247,tt0103639,Aladdin,8.0,1992,Ron Clements,"[Scott Weinger, Robin Williams]",G,90.0,"[Animation, Adventure, Comedy]",[English],[United States],True,True,420013.0,217350219.0
247,248,tt0129167,TheIronGiant,8.0,1999,Brad Bird,"[Eli Marienthal, Harry Connick Jr.]",PG,86.0,"[Animation, Action, Adventure]",[English],[United States],False,False,198212.0,23315035.0
248,249,tt1454029,TheHelp,8.0,2011,Tate Taylor,"[Viola Davis, Emma Stone]",PG-13,146.0,[Drama],[English],"[United States, India]",False,False,459753.0,169708112.0


In [63]:
merge.to_csv('collected_data.csv')