# Analysis of our data

1. Data collecting and data cleaning 
2. Computations for the creation of the adjacency matrix and the graph
3. Analysis

In [28]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from pandas.io.json import json_normalize
import pickle
import re
import scipy as sp
from scipy import stats
import seaborn as sns
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.datasets.samples_generator import make_blobs
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
pd.options.mode.chained_assignment = None

import omdb

In [2]:
%matplotlib inline

# 1. Data collecting and data cleaning

In [14]:
#import dataset
FILE_PATH_CREW = "../Data/tmdb_5000_credits.csv"

features_crew = pd.read_csv(FILE_PATH_CREW)

# Dataframe with info about the movies
features_crew.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [22]:
#specify your API_KEY used to set client
API_y_KEY = '3a7e30ce'
omdb.set_default('apikey', API_y_KEY)
client = omdb.OMDBClient(apikey=API_y_KEY)

In [70]:
#We have a 1000 request limit so you can decide here at which film you want to start and where to end 
id_end = 4803
id_start = id_end - 1000

In [74]:
#creates empty dictionary then updates it using the movie_id as a key and the output of the
#request as value (note that that res.content isn't actually a dict but a byte object, we may
#need to convert it later)

collected = dict()

for i in range(id_start, id_end):
    title = features_crew.title[i]
    res = omdb.request(t=title, tomatoes=True)
    movie_json = res.content
    collected[str(features_crew.movie_id[i])] = movie_json

In [83]:
# saves collected as pickle
with open('metacrit.p', 'wb') as fp:
    pickle.dump(collected, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# loads collected
with open('metacrit.p', 'rb') as fp:
    test_data = pickle.load(fp)

# 2. Computations for the creation of the adjacency matrix and the graph

In [4]:
df = pd.DataFrame.from_dict(test_data, orient='index')
df.head()

Unnamed: 0,0
19995,"b'{""Title"":""Avatar"",""Year"":""2009"",""Rated"":""PG-..."
285,"b'{""Title"":""Pirates of the Caribbean: At World..."
206647,"b'{""Title"":""Spectre"",""Year"":""2015"",""Rated"":""PG..."
49026,"b'{""Title"":""The Dark Knight Rises"",""Year"":""201..."
49529,"b'{""Title"":""John Carter"",""Year"":""2012"",""Rated""..."


In [79]:
dataframe = pd.read_json(df.iloc[0].values[0])

for i in range(1, len(df)):
    if len(df.iloc[i].values[0]) > 100:
        movie = pd.read_json(df.iloc[i].values[0])
    dataframe = pd.concat([dataframe, movie])

In [80]:
# We decided to drop the useless columns that did not contain information about the movie
# These columns are: - urls (Poster, Website, tomatoImage)
#                    - columns related to the API (Response, Type)
#                    - columns where all entries are NaN

dataframe = dataframe.drop(columns=['Poster', 'Response', 'Type', 'Website', 'tomatoConsensus', 
                                    'tomatoFresh', 'tomatoImage', 'tomatoMeter', 'tomatoRating', 
                                    'tomatoReviews', 'tomatoRotten', 'tomatoURL', 'tomatoUserMeter', 
                                    'tomatoUserRating', 'tomatoUserReviews', 'totalSeasons'])
dataframe.head()

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,...,Rated,Ratings,Released,Runtime,Title,Writer,Year,imdbID,imdbRating,imdbVotes
0,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",Won 3 Oscars. Another 85 wins & 128 nominations.,"$749,700,000","UK, USA",22 Apr 2010,James Cameron,"Action, Adventure, Fantasy, Sci-Fi","English, Spanish",83,A paraplegic marine dispatched to the moon Pan...,...,PG-13,"{'Source': 'Internet Movie Database', 'Value':...",18 Dec 2009,162 min,Avatar,James Cameron,2009,tt0499549,7.8,1013715
1,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",Won 3 Oscars. Another 85 wins & 128 nominations.,"$749,700,000","UK, USA",22 Apr 2010,James Cameron,"Action, Adventure, Fantasy, Sci-Fi","English, Spanish",83,A paraplegic marine dispatched to the moon Pan...,...,PG-13,"{'Source': 'Rotten Tomatoes', 'Value': '82%'}",18 Dec 2009,162 min,Avatar,James Cameron,2009,tt0499549,7.8,1013715
2,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",Won 3 Oscars. Another 85 wins & 128 nominations.,"$749,700,000","UK, USA",22 Apr 2010,James Cameron,"Action, Adventure, Fantasy, Sci-Fi","English, Spanish",83,A paraplegic marine dispatched to the moon Pan...,...,PG-13,"{'Source': 'Metacritic', 'Value': '83/100'}",18 Dec 2009,162 min,Avatar,James Cameron,2009,tt0499549,7.8,1013715
0,"Johnny Depp, Geoffrey Rush, Orlando Bloom, Kei...",Nominated for 2 Oscars. Another 20 wins & 44 n...,"$309,404,152",USA,04 Dec 2007,Gore Verbinski,"Action, Adventure, Fantasy",English,50,"Captain Barbossa, Will Turner and Elizabeth Sw...",...,PG-13,"{'Source': 'Internet Movie Database', 'Value':...",25 May 2007,169 min,Pirates of the Caribbean: At World's End,"Ted Elliott, Terry Rossio, Ted Elliott (charac...",2007,tt0449088,7.1,542125
1,"Johnny Depp, Geoffrey Rush, Orlando Bloom, Kei...",Nominated for 2 Oscars. Another 20 wins & 44 n...,"$309,404,152",USA,04 Dec 2007,Gore Verbinski,"Action, Adventure, Fantasy",English,50,"Captain Barbossa, Will Turner and Elizabeth Sw...",...,PG-13,"{'Source': 'Rotten Tomatoes', 'Value': '44%'}",25 May 2007,169 min,Pirates of the Caribbean: At World's End,"Ted Elliott, Terry Rossio, Ted Elliott (charac...",2007,tt0449088,7.1,542125


We now need to transform the types so that it is easier to analyze the features. As of now, almost all the entries of our dataframe are strings. We have a few entries that are integer numbers that need to be converted: BoxOffice, Metascore, Runtime, Year and imdbVotes.

Metascore and Year are already in an appropriate format, we just need to cast the type. However, BoxOffice, Runtime and imdbVotes need a little bit of tinkering.

In [81]:
# For BoxOffice we need to remove the $ and commas
dataframe['BoxOffice'] = dataframe['BoxOffice'].apply(lambda x: x.replace("$", ""))
dataframe['BoxOffice'] = dataframe['BoxOffice'].apply(lambda x: x.replace(",", ""))
dataframe['BoxOffice'] = dataframe['BoxOffice'].replace('N/A', 0)
dataframe['BoxOffice'] = dataframe['BoxOffice'].astype(float)
dataframe['BoxOffice'] = dataframe['BoxOffice'].astype(int)
# For imdbVotes we need to remove the commas
dataframe['imdbVotes'] = dataframe['imdbVotes'].astype(str)
dataframe['imdbVotes'] = dataframe['imdbVotes'].apply(lambda x: x.replace(",", ""))
dataframe['imdbVotes'] = dataframe['imdbVotes'].replace('N/A', 0)
dataframe['imdbVotes'] = dataframe['imdbVotes'].astype(int)
# For Runtime we need to remove the " min"
dataframe['Runtime'] = dataframe['Runtime'].apply(lambda x: x.replace(" min", ""))
dataframe['Runtime'] = dataframe['Runtime'].replace('N/A', 0)
dataframe['Runtime'] = dataframe['Runtime'].astype(int)
# Metascore
dataframe['Metascore'] = dataframe['Metascore'].replace('N/A', 0)
dataframe['Metascore'] = dataframe['Metascore'].astype(int)
# Year
dataframe['Year'] = dataframe['Year'].astype(str)
dataframe['Year'] = dataframe['Year'].apply(lambda x: x.replace("-1969", ""))
dataframe['Year'] = dataframe['Year'].apply(lambda x: x.replace("–", ""))
dataframe['Year'] = dataframe['Year'].replace('N/A', 0)
dataframe['Year'] = dataframe['Year'].astype(int)

dataframe = dataframe.reset_index(drop=True)

In [82]:
dataframe.head()

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,...,Rated,Ratings,Released,Runtime,Title,Writer,Year,imdbID,imdbRating,imdbVotes
0,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",Won 3 Oscars. Another 85 wins & 128 nominations.,749700000,"UK, USA",22 Apr 2010,James Cameron,"Action, Adventure, Fantasy, Sci-Fi","English, Spanish",83,A paraplegic marine dispatched to the moon Pan...,...,PG-13,"{'Source': 'Internet Movie Database', 'Value':...",18 Dec 2009,162,Avatar,James Cameron,2009,tt0499549,7.8,1013715
1,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",Won 3 Oscars. Another 85 wins & 128 nominations.,749700000,"UK, USA",22 Apr 2010,James Cameron,"Action, Adventure, Fantasy, Sci-Fi","English, Spanish",83,A paraplegic marine dispatched to the moon Pan...,...,PG-13,"{'Source': 'Rotten Tomatoes', 'Value': '82%'}",18 Dec 2009,162,Avatar,James Cameron,2009,tt0499549,7.8,1013715
2,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",Won 3 Oscars. Another 85 wins & 128 nominations.,749700000,"UK, USA",22 Apr 2010,James Cameron,"Action, Adventure, Fantasy, Sci-Fi","English, Spanish",83,A paraplegic marine dispatched to the moon Pan...,...,PG-13,"{'Source': 'Metacritic', 'Value': '83/100'}",18 Dec 2009,162,Avatar,James Cameron,2009,tt0499549,7.8,1013715
3,"Johnny Depp, Geoffrey Rush, Orlando Bloom, Kei...",Nominated for 2 Oscars. Another 20 wins & 44 n...,309404152,USA,04 Dec 2007,Gore Verbinski,"Action, Adventure, Fantasy",English,50,"Captain Barbossa, Will Turner and Elizabeth Sw...",...,PG-13,"{'Source': 'Internet Movie Database', 'Value':...",25 May 2007,169,Pirates of the Caribbean: At World's End,"Ted Elliott, Terry Rossio, Ted Elliott (charac...",2007,tt0449088,7.1,542125
4,"Johnny Depp, Geoffrey Rush, Orlando Bloom, Kei...",Nominated for 2 Oscars. Another 20 wins & 44 n...,309404152,USA,04 Dec 2007,Gore Verbinski,"Action, Adventure, Fantasy",English,50,"Captain Barbossa, Will Turner and Elizabeth Sw...",...,PG-13,"{'Source': 'Rotten Tomatoes', 'Value': '44%'}",25 May 2007,169,Pirates of the Caribbean: At World's End,"Ted Elliott, Terry Rossio, Ted Elliott (charac...",2007,tt0449088,7.1,542125


Now, we need to regroup the different sources of ratings so that the movies are not repeated three times in the dataframe.

- The first rating comes from imdb, its value is on a scale from 0 to 10
- The second rating comes from Rotten Tomatoes, its value is a percentage
- The third rating comes from Metacritic, its value is on a scale from 0 to 100

We wish to separate this column into 3 columns, drop the duplicate rows and convert the ratings to percentages for a more uniform notation.

In [140]:
dataframe_ = dataframe.copy()

In [141]:
# Create columns
dataframe_['imdbGrade'] = 0
dataframe_['tomatoGrade'] = 0
dataframe_['metacriticGrade'] = 0

# Fill columns
for i in range(0, int(len(dataframe)/3)):
    dataframe_['imdbGrade'][3*i] = dataframe_['Ratings'][3*i]['Value']
    dataframe_['tomatoGrade'][3*i] = dataframe_['Ratings'][3*i+1]['Value']
    dataframe_['metacriticGrade'][3*i] = dataframe_['Ratings'][3*i+2]['Value']

# Drop rows
dataframe_ = dataframe_[dataframe_['imdbGrade'] != 0]

In [142]:
dataframe_ = dataframe_.reset_index(drop=True)

In [143]:
# Transform grades
dataframe_['imdbGrade'] = dataframe_['imdbRating']*10
dataframe_ = dataframe_.drop(columns=['imdbRating'])

for i in range(0, len(dataframe_)):
    if '/10' in dataframe_['tomatoGrade'][i]:
        grade_ = dataframe_['tomatoGrade'][i].replace('/10', '').replace('.', '')
        dataframe_['tomatoGrade'][i] = int(grade)
    elif '%' in dataframe_['tomatoGrade'][i]:
        grade = dataframe_['tomatoGrade'][i].replace('%', '')
        dataframe_['tomatoGrade'][i] = int(grade)
    elif '/100' in dataframe_['metacriticGrade'][i]:
        grade = dataframe_['metacriticGrade'][i].replace('/100', '')
        dataframe_['metacriticGrade'][i] = int(grade)
    else:
        grade = float(dataframe_['tomatoGrade'][i])
        if grade > 100:
            grade = grade / 10
        dataframe_['tomatoGrade'][i] = int(grade)

dataframe_['metacriticGrade'] = dataframe_['metacriticGrade'].astype(str)
for i in range(0, len(dataframe_)):
    if '/10' in dataframe_['metacriticGrade'][i]:
        grade_ = dataframe_['metacriticGrade'][i].replace('/10', '').replace('.', '')
        dataframe_['metacriticGrade'][i] = int(grade)
    elif '%' in dataframe_['metacriticGrade'][i]:
        grade = dataframe_['metacriticGrade'][i].replace('%', '')
        dataframe_['metacriticGrade'][i] = int(grade)
    elif '/100' in dataframe_['metacriticGrade'][i]:
        grade = dataframe_['metacriticGrade'][i].replace('/100', '')
        dataframe_['metacriticGrade'][i] = int(grade)
    else:
        grade = float(dataframe_['metacriticGrade'][i])
        if grade > 100:
            grade = grade / 10
        dataframe_['metacriticGrade'][i] = int(grade)

In [144]:
dataframe_['imdbGrade'] = dataframe_['imdbGrade'].astype(int)
dataframe_['tomatoGrade'] = dataframe_['tomatoGrade'].astype(int)
dataframe_['metacriticGrade'] = dataframe_['metacriticGrade'].astype(int)

In [145]:
dataframe_.head()

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,...,Released,Runtime,Title,Writer,Year,imdbID,imdbVotes,imdbGrade,tomatoGrade,metacriticGrade
0,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",Won 3 Oscars. Another 85 wins & 128 nominations.,749700000,"UK, USA",22 Apr 2010,James Cameron,"Action, Adventure, Fantasy, Sci-Fi","English, Spanish",83,A paraplegic marine dispatched to the moon Pan...,...,18 Dec 2009,162,Avatar,James Cameron,2009,tt0499549,1013715,78,82,15
1,"Johnny Depp, Geoffrey Rush, Orlando Bloom, Kei...",Nominated for 2 Oscars. Another 20 wins & 44 n...,309404152,USA,04 Dec 2007,Gore Verbinski,"Action, Adventure, Fantasy",English,50,"Captain Barbossa, Will Turner and Elizabeth Sw...",...,25 May 2007,169,Pirates of the Caribbean: At World's End,"Ted Elliott, Terry Rossio, Ted Elliott (charac...",2007,tt0449088,542125,71,44,15
2,"Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...",Won 1 Oscar. Another 7 wins & 32 nominations.,208777731,"UK, USA",09 Feb 2016,Sam Mendes,"Action, Adventure, Thriller","English, Spanish, Italian, German, French",60,A cryptic message from 007's past sends him pi...,...,06 Nov 2015,148,Spectre,"John Logan (screenplay by), Neal Purvis (scree...",2015,tt2379713,343533,68,64,15
3,"Christian Bale, Gary Oldman, Tom Hardy, Joseph...",Nominated for 1 BAFTA Film Award. Another 38 w...,448130642,"UK, USA",03 Dec 2012,Christopher Nolan,"Action, Thriller","English, Arabic",78,Eight years after the Joker's reign of anarchy...,...,20 Jul 2012,164,The Dark Knight Rises,"Jonathan Nolan (screenplay), Christopher Nolan...",2012,tt1345836,1344474,84,87,15
4,"Taylor Kitsch, Lynn Collins, Samantha Morton, ...",2 wins & 8 nominations.,73058679,USA,05 Jun 2012,Andrew Stanton,"Action, Adventure, Sci-Fi",English,51,"Transported to Barsoom, a Civil War vet discov...",...,09 Mar 2012,132,John Carter,"Andrew Stanton (screenplay by), Mark Andrews (...",2012,tt0401729,234717,66,51,15


Now, we need to deal with the string attributes that are in form of a list. We want a set of strings, each string corresponding to an individual characteristic (country, actor, etc).

In order to do so, we split the string at the commas. For the Writer attribute, we decided to remove the specifications and only keep the names of the writers, taking off who wrote the story, screenplay or characters.

In [147]:
dataframe_['Actors'] = dataframe_['Actors'].apply(lambda x: set(x.split(', ')))
dataframe_['Country'] = dataframe_['Country'].apply(lambda x: set(x.split(', ')))
dataframe_['Director'] = dataframe_['Director'].apply(lambda x: set(x.split(', ')))
dataframe_['Genre'] = dataframe_['Genre'].apply(lambda x: set(x.split(', ')))
dataframe_['Language'] = dataframe_['Language'].apply(lambda x: set(x.split(', ')))

In [148]:
dataframe_['Writer'] = dataframe_['Writer'].apply(lambda x: re.sub(" [\(\[].*?[\)\]]", "", x))
dataframe_['Writer'] = dataframe_['Writer'].apply(lambda x: set(x.split(', ')))

In [149]:
dataframe_

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,...,Released,Runtime,Title,Writer,Year,imdbID,imdbVotes,imdbGrade,tomatoGrade,metacriticGrade
0,"{Sigourney Weaver, Stephen Lang, Sam Worthingt...",Won 3 Oscars. Another 85 wins & 128 nominations.,749700000,"{USA, UK}",22 Apr 2010,{James Cameron},"{Adventure, Fantasy, Sci-Fi, Action}","{English, Spanish}",83,A paraplegic marine dispatched to the moon Pan...,...,18 Dec 2009,162,Avatar,{James Cameron},2009,tt0499549,1013715,78,82,15
1,"{Orlando Bloom, Johnny Depp, Geoffrey Rush, Ke...",Nominated for 2 Oscars. Another 20 wins & 44 n...,309404152,{USA},04 Dec 2007,{Gore Verbinski},"{Adventure, Fantasy, Action}",{English},50,"Captain Barbossa, Will Turner and Elizabeth Sw...",...,25 May 2007,169,Pirates of the Caribbean: At World's End,"{Ted Elliott, Terry Rossio, Stuart Beattie, Ja...",2007,tt0449088,542125,71,44,15
2,"{Daniel Craig, Ralph Fiennes, Christoph Waltz,...",Won 1 Oscar. Another 7 wins & 32 nominations.,208777731,"{USA, UK}",09 Feb 2016,{Sam Mendes},"{Adventure, Action, Thriller}","{Spanish, English, Italian, French, German}",60,A cryptic message from 007's past sends him pi...,...,06 Nov 2015,148,Spectre,"{Jez Butterworth, Robert Wade, John Logan, Nea...",2015,tt2379713,343533,68,64,15
3,"{Joseph Gordon-Levitt, Christian Bale, Gary Ol...",Nominated for 1 BAFTA Film Award. Another 38 w...,448130642,"{USA, UK}",03 Dec 2012,{Christopher Nolan},"{Action, Thriller}","{English, Arabic}",78,Eight years after the Joker's reign of anarchy...,...,20 Jul 2012,164,The Dark Knight Rises,"{Jonathan Nolan, Bob Kane, David S. Goyer, Chr...",2012,tt1345836,1344474,84,87,15
4,"{Willem Dafoe, Samantha Morton, Lynn Collins, ...",2 wins & 8 nominations.,73058679,{USA},05 Jun 2012,{Andrew Stanton},"{Adventure, Sci-Fi, Action}",{English},51,"Transported to Barsoom, a Civil War vet discov...",...,09 Mar 2012,132,John Carter,"{Mark Andrews, Michael Chabon, Edgar Rice Burr...",2012,tt0401729,234717,66,51,15
5,"{Kirsten Dunst, Tobey Maguire, Thomas Haden Ch...",Nominated for 1 BAFTA Film Award. Another 3 wi...,336530303,{USA},30 Oct 2007,{Sam Raimi},"{Adventure, Sci-Fi, Action}","{English, French}",59,A strange black entity from another world bond...,...,04 May 2007,139,Spider-Man 3,"{Stan Lee, Sam Raimi, Alvin Sargent, Steve Dit...",2007,tt0413300,445631,62,63,15
6,"{Donna Murphy, Ron Perlman, Zachary Levi, Mand...",Nominated for 1 Oscar. Another 9 wins & 40 nom...,200803309,{USA},29 Mar 2011,"{Byron Howard, Nathan Greno}","{Fantasy, Family, Adventure, Musical, Comedy, ...",{English},71,The magically long-haired Rapunzel has spent h...,...,24 Nov 2010,100,Tangled,"{Jacob Grimm, Dan Fogelman, Wilhelm Grimm}",2010,tt0398286,353478,78,89,15
7,"{Chris Evans, Chris Hemsworth, Robert Downey J...",7 wins & 45 nominations.,429113729,{USA},02 Oct 2015,{Joss Whedon},"{Adventure, Sci-Fi, Action}","{English, Korean}",66,When Tony Stark and Bruce Banner try to jump-s...,...,01 May 2015,141,Avengers: Age of Ultron,"{Stan Lee, Jack Kirby, Joss Whedon, Joe Simon,...",2015,tt2395427,619549,74,75,15
8,"{Dave Legeno, Daniel Radcliffe, Elarica Johnso...",Nominated for 1 Oscar. Another 8 wins & 35 nom...,301920409,"{USA, UK}",08 Dec 2009,{David Yates},"{Adventure, Family, Fantasy, Mystery}",{English},78,As Harry Potter begins his sixth year at Hogwa...,...,15 Jul 2009,153,Harry Potter and the Half-Blood Prince,"{J.K. Rowling, Steve Kloves}",2009,tt0417741,397592,76,83,15
9,"{Jesse Eisenberg, Henry Cavill, Ben Affleck, A...",14 wins & 30 nominations.,293792936,{USA},19 Jul 2016,{Zack Snyder},"{Adventure, Fantasy, Sci-Fi, Action}",{English},44,Fearing that the actions of Superman are left ...,...,25 Mar 2016,151,Batman v Superman: Dawn of Justice,"{David S. Goyer, Jerry Siegel, Bill Finger, Wi...",2016,tt2975590,553233,65,27,15


In [150]:
dataframe_.to_pickle("df.pkl")

In [200]:
dataframe_ = pd.read_pickle("df.pkl")

Now, we deal with the Production company. We have observed that a same company has different names within this dataset, sometimes the names diferring only by a space or an add-on (like "distribution"). We want to uniformize this.

In [205]:
dataframe_['Production'] = dataframe_['Production'].str.replace('-', ' ')
dataframe_['Production'] = dataframe_['Production'].str.replace(' / ', '/')
dataframe_['Production'] = dataframe_['Production'].str.replace(' /', '/')
dataframe_['Production'] = dataframe_['Production'].str.replace('/ ', '/')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Corporation', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Distribution', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Corporat', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Production', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Films', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Film', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Pictures', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Picutres', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Internationa', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' International', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Industries', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Compa', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Co', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Distribu', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Studios', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Animation', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Feature', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' SKG', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' LLC', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Recommends', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Digital', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Media', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Video', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Inc', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Home Entertainment', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Entertainment', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Lorber', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Releasing', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Cinema.', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Cinema', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Classics', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Classic', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Group', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Europacorp', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Pvt. Ltd.', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Ventures', '')
dataframe_['Production'] = dataframe_['Production'].str.replace('DreamWorks', 'Dreamworks')
dataframe_['Production'] = dataframe_['Production'].str.replace('Dream Works', 'Dreamworks')
dataframe_['Production'] = dataframe_['Production'].str.replace("Lion's Gate", 'Lionsgate')
dataframe_['Production'] = dataframe_['Production'].str.replace("Liongate", 'Lionsgate')
dataframe_['Production'] = dataframe_['Production'].str.replace("Lions Gate", 'Lionsgate')
dataframe_['Production'] = dataframe_['Production'].str.replace("LionsGate", 'Lionsgate')
dataframe_['Production'] = dataframe_['Production'].str.replace('Metro Goldwyn Mayer (MGM)', 'MGM')
dataframe_['Production'] = dataframe_['Production'].str.replace('Metro Goldwyn Mayer', 'MGM')
dataframe_['Production'] = dataframe_['Production'].str.replace('WARNER BROTHERS PICTURES', 'Warner Bros')
dataframe_['Production'] = dataframe_['Production'].str.replace('WB', 'Warner Bros')
dataframe_['Production'] = dataframe_['Production'].str.replace('Weinsteinny', 'Weinstein')
dataframe_['Production'] = dataframe_['Production'].str.replace('Warner Brothers', 'Warner Bros')
dataframe_['Production'] = dataframe_['Production'].str.replace('Warner Home', 'Warner Bros')
dataframe_['Production'] = dataframe_['Production'].str.replace('Warner Independent', 'Warner Bros')
dataframe_['Production'] = dataframe_['Production'].str.replace('PIXAR', 'Pixar')
dataframe_['Production'] = dataframe_['Production'].str.replace(' [us]', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Pictur', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Home', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Focus', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' City', '')
dataframe_['Production'] = dataframe_['Production'].str.replace('Warners Bros', 'Warner Bros')
dataframe_['Production'] = dataframe_['Production'].str.replace('Universall', 'Universal')
dataframe_['Production'] = dataframe_['Production'].str.replace('The Weinstein', 'Weinstein')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Vantage', 'Weinstein')
dataframe_['Production'] = dataframe_['Production'].str.replace('/lumbia', '/ Columbia')
dataframe_['Production'] = dataframe_['Production'].str.replace('MGM (MGM)', 'MGM')
dataframe_['Production'] = dataframe_['Production'].str.replace(' First Take', '/ Columbia')
dataframe_['Production'] = dataframe_['Production'].str.replace('Goodbye Cruel', 'Goodbye Cruel World')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Atomic', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Faith', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Searchlight', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Walden', '')
dataframe_['Production'] = dataframe_['Production'].str.replace('FoxFaith', 'Fox')
dataframe_['Production'] = dataframe_['Production'].str.replace('Fine Line', 'Fine Lines')
dataframe_['Production'] = dataframe_['Production'].str.replace('Erosl', 'Eros')
dataframe_['Production'] = dataframe_['Production'].str.replace('Walt Disneys', 'Disney')
dataframe_['Production'] = dataframe_['Production'].str.replace('Walt Disney', 'Disney')
dataframe_['Production'] = dataframe_['Production'].str.replace('.', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' TriStarl', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' TriStar', '')
dataframe_['Production'] = dataframe_['Production'].str.replace(' Tristar', '')

In [207]:
dataframe_['Production'] = dataframe_['Production'].apply(lambda x: set(x.split('/')))

In [211]:
dataframe_.to_pickle("df.pkl")

Now, we deal with the plot. For this NLP task, we will keep it simple. We only kept the words that are not determinants or adverbs and store them as a set of separate words.

In [212]:
dataframe_ = pd.read_pickle("df.pkl")

In [261]:
common_words = [' a', ' the', ' his', ' her', ' hers', ' on', ' to', ' and', ' is', ' was', 
                ' be', ' must', ' of', ' for', ' one', ' theirs', ' their', ' our', ' they', ' off', ' from',
                ' him', " it's", ' it', ' its', ' in', ' most', ' after', ' with', ' by', ' herself', 
                ' himself', ' only', ' as', ' has', ' but', ' that', ' he', ' she', ' about', ' who', 
                ' where', ' go', "'s", ' at', ' more', ' takes', ' take', ' what', ' after', ' into', " weren't",
                ' own', ' when', ' by', ' an', 'A ', 'The ', 'One ', ' once', ' off', ' if', ' too', ' are', 
                ' were', ' had', ' out', ' will', ' which', ' back', ' comes', ' come', ' under', ' down', 
                ' how', ' those', ' these', ' between', ' during', ",", '.', "'", 'On ']

In [265]:
for word in common_words:
    dataframe_['Plot'] = dataframe_['Plot'].str.replace(word, '')
dataframe_['Plot'] = dataframe_['Plot'].apply(lambda x: set(x.split(' ')))

In [266]:
dataframe_['Plot']

0       {ordersnd, unique, protecting, world, feels, p...
1       {makeir, navigate, sailf, Will, Turnernd, find...
2       {007, cryptic, organization, previous, mission...
3       {save, City, Gotham, enigmatic, brutal, Catwom...
4       {barren, Transported, princess, barbarians, ne...
5       {world, black, temptationsnd, villains, new, t...
6       {world, first, lifewer, timend, really, discov...
7       {Banner, called, When, dormant, stop, jump-sta...
8       {sixth, Pottergins, dark, past, Voldemort, Pri...
9       {while, world, Fearingctions, Batman, Steel, r...
10      {world, foe, Kryptonian, reappearsfter, old, u...
11      {stop, organization, James, resource, mysterys...
12      {Jones, Sparrow, foes, friendsnd, Jonesvoid, s...
13      {legend, Tonto, recounts, justice, lawto, warr...
14      {Clark, survivors, brink, destruction, humance...
15      {enlistedcegainlp, land, throne, kingnd, Caspi...
16      {fights, stop, learn, mightiestoesgethernd, te...
17      {Black

In [267]:
dataframe_.to_pickle("df.pkl")

In [268]:
dataframe_

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,...,Released,Runtime,Title,Writer,Year,imdbID,imdbVotes,imdbGrade,tomatoGrade,metacriticGrade
0,"{Sigourney Weaver, Stephen Lang, Sam Worthingt...",Won 3 Oscars. Another 85 wins & 128 nominations.,749700000,"{USA, UK}",22 Apr 2010,{James Cameron},"{Adventure, Fantasy, Sci-Fi, Action}","{English, Spanish}",83,"{ordersnd, unique, protecting, world, feels, p...",...,18 Dec 2009,162,Avatar,{James Cameron},2009,tt0499549,1013715,78,82,15
1,"{Orlando Bloom, Johnny Depp, Geoffrey Rush, Ke...",Nominated for 2 Oscars. Another 20 wins & 44 n...,309404152,{USA},04 Dec 2007,{Gore Verbinski},"{Adventure, Fantasy, Action}",{English},50,"{makeir, navigate, sailf, Will, Turnernd, find...",...,25 May 2007,169,Pirates of the Caribbean: At World's End,"{Ted Elliott, Terry Rossio, Stuart Beattie, Ja...",2007,tt0449088,542125,71,44,15
2,"{Daniel Craig, Ralph Fiennes, Christoph Waltz,...",Won 1 Oscar. Another 7 wins & 32 nominations.,208777731,"{USA, UK}",09 Feb 2016,{Sam Mendes},"{Adventure, Action, Thriller}","{Spanish, English, Italian, French, German}",60,"{007, cryptic, organization, previous, mission...",...,06 Nov 2015,148,Spectre,"{Jez Butterworth, Robert Wade, John Logan, Nea...",2015,tt2379713,343533,68,64,15
3,"{Christian Bale, Gary Oldman, Tom Hardy, Josep...",Nominated for 1 BAFTA Film Award. Another 38 w...,448130642,"{USA, UK}",03 Dec 2012,{Christopher Nolan},"{Action, Thriller}","{English, Arabic}",78,"{save, City, Gotham, enigmatic, brutal, Catwom...",...,20 Jul 2012,164,The Dark Knight Rises,"{Jonathan Nolan, David S. Goyer, Bob Kane, Chr...",2012,tt1345836,1344474,84,87,15
4,"{Willem Dafoe, Samantha Morton, Lynn Collins, ...",2 wins & 8 nominations.,73058679,{USA},05 Jun 2012,{Andrew Stanton},"{Adventure, Sci-Fi, Action}",{English},51,"{barren, Transported, princess, barbarians, ne...",...,09 Mar 2012,132,John Carter,"{Mark Andrews, Michael Chabon, Edgar Rice Burr...",2012,tt0401729,234717,66,51,15
5,"{Kirsten Dunst, Tobey Maguire, Thomas Haden Ch...",Nominated for 1 BAFTA Film Award. Another 3 wi...,336530303,{USA},30 Oct 2007,{Sam Raimi},"{Adventure, Sci-Fi, Action}","{English, French}",59,"{world, black, temptationsnd, villains, new, t...",...,04 May 2007,139,Spider-Man 3,"{Stan Lee, Sam Raimi, Alvin Sargent, Steve Dit...",2007,tt0413300,445631,62,63,15
6,"{Donna Murphy, Ron Perlman, Zachary Levi, Mand...",Nominated for 1 Oscar. Another 9 wins & 40 nom...,200803309,{USA},29 Mar 2011,"{Byron Howard, Nathan Greno}","{Fantasy, Family, Adventure, Musical, Comedy, ...",{English},71,"{world, first, lifewer, timend, really, discov...",...,24 Nov 2010,100,Tangled,"{Jacob Grimm, Dan Fogelman, Wilhelm Grimm}",2010,tt0398286,353478,78,89,15
7,"{Chris Evans, Chris Hemsworth, Robert Downey J...",7 wins & 45 nominations.,429113729,{USA},02 Oct 2015,{Joss Whedon},"{Adventure, Sci-Fi, Action}","{English, Korean}",66,"{Banner, called, When, dormant, stop, jump-sta...",...,01 May 2015,141,Avengers: Age of Ultron,"{Stan Lee, Jack Kirby, Joss Whedon, Joe Simon,...",2015,tt2395427,619549,74,75,15
8,"{Dave Legeno, Daniel Radcliffe, Elarica Johnso...",Nominated for 1 Oscar. Another 8 wins & 35 nom...,301920409,"{USA, UK}",08 Dec 2009,{David Yates},"{Adventure, Family, Fantasy, Mystery}",{English},78,"{sixth, Pottergins, dark, past, Voldemort, Pri...",...,15 Jul 2009,153,Harry Potter and the Half-Blood Prince,"{J.K. Rowling, Steve Kloves}",2009,tt0417741,397592,76,83,15
9,"{Jesse Eisenberg, Henry Cavill, Ben Affleck, A...",14 wins & 30 nominations.,293792936,{USA},19 Jul 2016,{Zack Snyder},"{Adventure, Fantasy, Sci-Fi, Action}",{English},44,"{while, world, Fearingctions, Batman, Steel, r...",...,25 Mar 2016,151,Batman v Superman: Dawn of Justice,"{David S. Goyer, Bill Finger, William Moulton ...",2016,tt2975590,553233,65,27,15


# 3. Analysis