In [2]:
import imdb
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

# Dataset can be found here:
https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?resource=download

In [3]:
# Read the movie metadata CSV and keep only the rows that have an IMDb id.
df = pd.read_csv("archive/movies_metadata.csv")
df2 = df.dropna(subset=['imdb_id'])

# Pull the three columns used by the later cells out as NumPy arrays.
titles, genres, imdbIds = (
    df2[column].to_numpy()
    for column in ('original_title', 'genres', 'imdb_id')
)

# Strip the 'tt' text from every id string (the result is a plain Python list).
imdbIds = [raw_id.replace('tt', '') for raw_id in imdbIds]

In [4]:
# Create the access object from the `imdb` package. It is kept at module level
# because getMovie() calls db.get_movie on this instance.
db = imdb.IMDb()

In [8]:
def getActors(movie):
    """Return the names of up to the first ten cast members of a movie.

    Names are read from each cast entry's 'name' key instead of being
    regex-scraped out of the string representation of the cast list, so
    names containing punctuation, digits or non-ASCII letters
    (e.g. "Samuel L. Jackson", "Penélope Cruz") are no longer dropped.

    Args:
        movie: Mapping-like movie object. movie['cast'] is expected to be a
            sequence whose entries support entry['name'].

    Returns:
        A list of name strings (possibly empty), or False when the movie
        has no 'cast' key. Both an empty list and False are falsy, so
        callers can test the result with a plain `if`.
    """
    try:
        top_actors_list = movie['cast'][:10]
    except KeyError:
        print("Oops!  Movie has no or few actors...")
        return False

    actors = []
    for person in top_actors_list:
        try:
            name = person['name']
        except (KeyError, TypeError):
            # Entry carries no usable name; skip it rather than fail the movie.
            continue
        if name:
            actors.append(str(name))
    return actors

def getReviews(movie):
    """Collect the text of every review attached to a movie.

    Args:
        movie: Mapping-like movie object. movie['reviews'] is expected to be
            an iterable of mappings that each have a 'content' key.

    Returns:
        A list with one 'content' value per review (possibly empty), or
        False when the movie has no 'reviews' key or a review entry cannot
        be read. Both an empty list and False are falsy.
    """
    try:
        # Gather the content of each review into a list (one item per review).
        return [review['content'] for review in movie['reviews']]
    except (KeyError, TypeError):
        # Only missing or malformed review data is treated as "not reviewed".
        # KeyboardInterrupt and other unrelated errors now propagate instead
        # of being swallowed by a bare `except:`.
        print("Oops! Movie has not been reviewed...")
        return False
    
def getMovie(id):
    """Look up a movie through the module-level `db` object.

    Passes `id` and the info list ['main', 'reviews'] to db.get_movie and
    returns whatever that call returns.
    """
    requested_info = ['main', 'reviews']
    return db.get_movie(id, requested_info)


In [9]:
# Look up the first N movies and keep one record for each movie that has both
# reviews and actors. Records are gathered in a list and the frame is built
# once at the end, instead of re-copying a growing frame with pd.concat on
# every iteration.
N = 10
COLUMNS = ['TitleId', 'MovieTitle', 'Actors', 'Genres', 'Reviews']

records = []
for i, (title, genre, imdbId) in enumerate(zip(titles[:N], genres[:N], imdbIds[:N])):
    print(f'{i} + {title}')

    movie = getMovie(imdbId)

    # getReviews returns False or an empty list when nothing is usable.
    reviews = getReviews(movie)
    if not reviews:
        continue

    # getActors returns False or an empty list when nothing is usable.
    actors = getActors(movie)
    if not actors:
        continue

    records.append({
        'TitleId': imdbId,
        'MovieTitle': title,
        'Actors': actors,
        'Genres': genre,
        'Reviews': reviews,
    })

# Passing the column list keeps the schema stable even if no movie qualified.
df = pd.DataFrame(records, columns=COLUMNS)

0 + Toy Story
1 + Jumanji
2 + Grumpier Old Men
3 + Waiting to Exhale
4 + Father of the Bride Part II
5 + Heat
6 + Sabrina
7 + Tom and Huck
8 + Sudden Death
9 + GoldenEye


In [7]:
# Display the frame as this cell's output (a bare last expression is rendered).
# NOTE(review): the saved output below has an 'Unnamed: 0' column and this
# cell's execution count (7) is lower than the loop cell's (9), so the table
# shown may come from an earlier session or a re-read file. Re-run to confirm.
df

Unnamed: 0,TitleId,MovieTitle,Actors,Genres,Reviews
0,114709,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",[Andy's toys live a reasonable life of fun and...
1,113497,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[Among the thousands of films I have viewed, t..."
2,113228,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Sophia Loren, Bu...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",[Sequels are rarely half as good as the origin...
3,114885,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devi...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[This was a good movie, even though I'm not th..."
4,113041,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Geo...","[{'id': 35, 'name': 'Comedy'}]","[As much as I want to rag this movie, make fun..."
5,113277,Heat,"[Al Pacino, Robert De Niro, Val Kilmer, Jon Vo...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",[I have very little interest in most action fi...
6,114319,Sabrina,"[Harrison Ford, Julia Ormond, Greg Kinnear, Na...","[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",[I was surprised at how good this movie is. A ...
7,112302,Tom and Huck,"[Jonathan Taylor Thomas, Brad Renfro, Eric Sch...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",[We all know the story. There have been some p...
8,114576,Sudden Death,"[Powers Boothe, Whittni Wright, Ross Malinger,...","[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",[I found this to be a very interesting action ...
9,113189,GoldenEye,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...","[Brosnan has the look, the style, the intellig..."


In [33]:
#df.to_pickle("moviesDfv2.pkl")