In [7]:
import pandas as pd
import numpy as np
import spacy as sp
from imdb import Cinemagoer
import matplotlib.pyplot as plt
import datetime
from collections import Counter
import warnings

# Silence every warning for the rest of the session. No category is given, so
# this also hides pandas/NumPy warnings that might point at real problems.
warnings.filterwarnings("ignore")

# Star import of the project's helper module. load_data, text_preprocessing and
# output_data are used below but not defined in this notebook -- presumably they
# come from here (TODO confirm; explicit imports would make that visible).
from cleaning_funcs import *

# Shared Cinemagoer client used for every IMDb lookup in this notebook.
ia = Cinemagoer()

%matplotlib inline

#IMDb Docs: https://imdbpy.readthedocs.io/en/latest/index.html

In [60]:
# Load the ratings sheet and keep only the columns the rest of the notebook uses.
data = load_data('movie_list_current.csv')
data = data[['Month', 'Pick', 'IMDb ID', 'Movie', 'Year', 'IMDb Score', 'Sebastian', 'Jon', 'Dazraf', 'Average', 'Notes']]

# Note: this count is taken BEFORE the unrated rows are dropped below.
print("Number of Movies:", data.shape[0])

# Keep only movies Sebastian has rated. The explicit .copy() gives the next
# cell a standalone frame to add columns to, instead of a filtered selection
# of the loaded frame (the SettingWithCopy pattern, which is invisible here
# only because all warnings are suppressed).
data = data[data.Sebastian.notnull()].copy()
data.head(3)

Number of Movies: 93


Unnamed: 0,Month,Pick,IMDb ID,Movie,Year,IMDb Score,Sebastian,Jon,Dazraf,Average,Notes
0,3/2019,Farzad,947798,Black Swan,2010,8.0,8.0,9.0,8.0,8.3,Inagural movie!
1,4/2019,Jon,156887,Perfect Blue,1997,8.0,9.0,9.0,7.0,8.3,"Black Swan, but make it anime."
2,5/2019,Steven,311113,Master and Commander: The Far Side of the World,2003,7.4,9.5,8.0,8.0,8.5,https://www.youtube.com/watch?v=mq1YthGFjRI


In [61]:
# Enrich every movie in `data` with fields fetched from IMDb (one network
# lookup per row via the shared `ia` client), then expand the genre lists into
# genre_1..genre_N columns.

# Exceptions that mean "IMDb has no such field for this title": .get() gave
# None (TypeError on subscripting, AttributeError on .replace), the list was
# empty (IndexError), or the dict entry was absent (KeyError). Anything else,
# including KeyboardInterrupt, now propagates instead of being silently
# recorded as a missing value.
MISSING_FIELD_ERRORS = (TypeError, KeyError, IndexError, AttributeError)

title_list = []
year_list = []
rating_list = []
plot_outline_list = []
plot_outline = []
synopsis_list = []
director_list = []
runtime_list = []
gross_list = []
budget_list = []

genre_list = []

for imdb_id in data['IMDb ID']:
    movie = ia.get_movie(imdb_id)

    title_list.append(movie.get('title'))
    year_list.append(movie.get('year'))
    rating_list.append(movie.get('rating'))

    # A title without a runtime used to abort the whole loop on the [0]
    # lookup; record NaN for it instead.
    runtimes = movie.get('runtime')
    runtime_list.append(runtimes[0] if runtimes else np.nan)

    try:
        director_list.append(movie.get('director')[0]['name'])
    except MISSING_FIELD_ERRORS:
        director_list.append('No director found')

    box_office = movie.get('box office')

    try:
        gross = box_office['Cumulative Worldwide Gross'].replace('$', '').replace(',', '')
    except MISSING_FIELD_ERRORS:
        gross = np.nan
    gross_list.append(gross)

    try:
        # split(' ')[0] keeps only the first space-separated token
        # (presumably dropping a trailing qualifier -- TODO confirm).
        # NOTE(review): only '$' is stripped, so non-USD budgets keep their
        # currency code (e.g. 'JPY3000000') and the column is not purely numeric.
        budget = box_office['Budget'].replace('$', '').replace(',', '').split(' ')[0]
    except MISSING_FIELD_ERRORS:
        budget = np.nan
    budget_list.append(budget)

    # Preprocess the outline once; it feeds both the 'plot_outline' and the
    # 'plot' columns, which hold the same content.
    outline = text_preprocessing(movie.get('plot outline'))
    plot_outline_list.append(outline)
    plot_outline.append(outline)
    synopsis_list.append(text_preprocessing(movie.get('synopsis')))
    # Same key the spot-check cell uses.
    genre_list.append(movie.get('genres'))


# assign() returns a new frame, so this works the same whether or not `data`
# is a filtered selection of another frame. Column order matches the keyword
# order below.
data = data.assign(
    title=title_list,
    year=year_list,
    rating=rating_list,
    runtime=runtime_list,
    director=director_list,
    plot_outline=plot_outline_list,
    plot=plot_outline,
    synopsis=synopsis_list,
    genres=genre_list,
    gross=gross_list,
    budget=budget_list,
)

# One column per genre position (genre_1, genre_2, ...), blank where a movie
# has fewer genres. A movie with no genres at all contributes an empty row
# rather than a None entry in the list-of-lists constructor.
genres = pd.DataFrame(
    [entry or [] for entry in genre_list],
    index=data.index,
    dtype=object,
).fillna('').rename(columns=lambda position: 'genre_{}'.format(position + 1))

# Drop genre_N columns left over from a previous run of this cell so that
# re-running it does not append a duplicate set.
data = pd.concat(
    [data.loc[:, ~data.columns.str.startswith('genre_')], genres],
    axis=1,
)

In [30]:
# Spot-check: fetch the genres of the first three movies straight from IMDb
# and print them.
# .copy() makes `subset` a standalone frame, so adding a column below is not
# an assignment onto a slice of `data`.
subset = data.iloc[:3].copy()

# NOTE: this rebinds the notebook-level `genre_list` built by the enrichment
# cell; after this cell runs it holds only these three entries.
genre_list = []

for imdb_id in subset['IMDb ID']:
    movie_genres = ia.get_movie(imdb_id).get('genres')
    genre_list.append(movie_genres)
    print(movie_genres)

subset['genres'] = genre_list

['Drama', 'Thriller']
['Animation', 'Crime', 'Drama', 'Mystery', 'Thriller']
['Action', 'Adventure', 'Drama', 'War']


In [62]:
# Display the enriched frame; the rendered output elides the middle rows and
# columns with "...".
data

Unnamed: 0,Month,Pick,IMDb ID,Movie,Year,IMDb Score,Sebastian,Jon,Dazraf,Average,...,synopsis,genres,gross,budget,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6
0,3/2019,Farzad,947798,Black Swan,2010,8.0,8.0,9.0,8.0,8.3,...,"[movie, open, nina, sayers, natalie, portman, ...","[Drama, Thriller]",329398046,13000000,Drama,Thriller,,,,
1,4/2019,Jon,156887,Perfect Blue,1997,8.0,9.0,9.0,7.0,8.3,...,,"[Animation, Crime, Drama, Mystery, Thriller]",781197,JPY3000000,Animation,Crime,Drama,Mystery,Thriller,
2,5/2019,Steven,311113,Master and Commander: The Far Side of the World,2003,7.4,9.5,8.0,8.0,8.5,...,"[film, take, place, 1805, napoleonic, war, cap...","[Action, Adventure, Drama, War]",212011111,150000000,Action,Adventure,Drama,War,,
3,6/2019,Farzad,2492296,Show Me A Hero,2015,8.1,7.0,8.0,8.0,7.7,...,,"[Drama, History]",,,Drama,History,,,,
4,7/2019,Jon,147800,10 Things I Hate About You,1999,7.3,7.0,7.0,5.0,6.3,...,"[cameron, james, joseph, gordonlevitt, new, ki...","[Comedy, Drama, Romance]",,30000000,Comedy,Drama,Romance,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,3/2022,Farzad,93870,Robocop,1987,7.6,6.5,7.5,9.5,7.8,...,"[movie, open, news, report, advertise, way, li...","[Action, Crime, Sci-Fi, Thriller]",,13700000,Action,Crime,Sci-Fi,Thriller,,
88,4/2022,Steven,46911,Diabolique,1955,8.1,7.0,7.0,6.5,6.8,...,"[story, take, place, secondrate, boarding, sch...","[Crime, Drama, Horror, Mystery, Thriller]",,,Crime,Drama,Horror,Mystery,Thriller,
89,4/2022,Jon,14039582,Drive My Car,2021,7.6,7.0,9.5,7.0,7.8,...,,[Drama],,,Drama,,,,,
90,5/2022,Farzad,416449,300,2006,7.6,5.0,4.0,3.0,4.0,...,"[spartan, custom, harsh, spartan, inspect, inf...","[Action, Drama]",456068181,65000000,Action,Drama,,,,


In [63]:
# Write the enriched frame out to CSV.
# output_data is not defined in this notebook -- presumably it comes from the
# cleaning_funcs star import (TODO confirm). It prints the destination path
# once the file is written.
output_data(data, 'movie_list_current_clean.csv')

Data output to /Users/jonat/Desktop/code/movie_analysis/data/output/movie_list_current_clean.csv
