In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import json
import matplotlib.pyplot as plt

In [2]:
# movies = pd.read_csv('./data/tmdb_5000_movies.csv').head(n=1)
# credits = pd.read_csv('./data/tmdb_5000_credits.csv').head(n=1)
movies = pd.read_csv('./data/tmdb_5000_movies.csv')
credits = pd.read_csv('./data/tmdb_5000_credits.csv')

In [3]:
movies.head(n=1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
movies = pd.merge(left=movies, right=credits, left_on='id', right_on='movie_id')

In [6]:
# turn json into python objects
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages', 'cast', 'crew']
for column in json_columns:
    movies[column] = movies[column].apply(json.loads, encoding="utf-8")

In [7]:
movies.genres[0][0]['name']

'Action'

In [8]:
def actor_to_id_string(actor):
    return '{} - {}'.format(actor['name'], actor['id'])

In [9]:
def cast_to_actors(cast):
    actors = []
    for person in cast:
        actors.append(actor_to_id_string(person))
    return actors

In [10]:
movies.actors = movies.cast.apply(cast_to_actors)

### Create list of all actors

In [11]:
all_actors = []
for actors in movies.actors:
    for actor in actors:
        all_actors.append(actor)

In [12]:
actors = pd.Series(all_actors)

In [13]:
unique_actors_count = pd.crosstab(actors, columns='count')

unique_actors_count \
.sort_values(by='count', ascending=False).head(12)

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,67
Robert De Niro - 380,57
Bruce Willis - 62,51
Matt Damon - 1892,48
Morgan Freeman - 192,46
Steve Buscemi - 884,43
Liam Neeson - 3896,41
Johnny Depp - 85,40
Owen Wilson - 887,40
John Goodman - 1230,39


#### Total Number of unique actors

In [14]:
len(unique_actors_count)

54588

In [15]:
#### Total Number of actors that show up more than once

In [16]:
len(unique_actors_count[unique_actors_count.values > 1])

15661

In [17]:
#### Total Number of actors that show up more than three times

In [18]:
len(unique_actors_count[unique_actors_count.values > 3])

5326

In [19]:
sam_count = unique_actors_count[unique_actors_count.index == 'Samuel L. Jackson - 2231']
sam_count

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,67


In [20]:
sam_count.values[0][0]

67

### Create Actors Features

In [21]:
actors_string_id_by_id = {}
actor_counts_by_string_id = {}
unique_actors_count_dict = unique_actors_count.to_dict()['count']

def actor_has_more_than_x_movies(actor, number_of_movies = 3):
    string_id = str(actor['id'])

    # First, get the unique id created for the actor
    actor_string_id = ''
    if string_id in actors_string_id_by_id:
        actor_string_id = actors_string_id_by_id[string_id]
    else:
        actor_string_id = actor_to_id_string(actor)
        actors_string_id_by_id[string_id] = actor_string_id
    
    # Now let's see how many movies this actor has played in
    actor_count = unique_actors_count_dict[actor_string_id]
    return actor_count > number_of_movies

In [22]:
actor_has_more_than_x_movies(movies.cast[0][2], 1)

True

#### Create a new feature that represents all actors that show up in more than 3 movies (and are not sam l jackson)

In [23]:
movies_actors_ids = []
for actors in movies.cast:
    movie_actors_ids = []
    for actor in actors:
        if (not actor_has_more_than_x_movies(actor, 3)):
            continue
        movie_actors_ids.append(str(actor['id']))
    movies_actors_ids.append(' '.join(movie_actors_ids))

In [24]:
movies['actor_ids'] = movies_actors_ids
cleaned = movies.set_index('id').actor_ids.str.split(' ', expand=True).stack()
actor_features = pd.get_dummies(cleaned, prefix='actor').groupby(level=0).sum()

actor_features['id'] = actor_features.index
actor_features = actor_features.reset_index(drop=True)

### label records as having Samuel L. Jackson

In [25]:
samuel_label = 'Samuel L. Jackson - 2231'
def record_has_samuel (actors):
    return samuel_label in actors


In [26]:
movies.samuel = movies.actors.apply(record_has_samuel)

In [27]:
movies_with_samuel = movies[movies.samuel == True]

#### There should be 67 movies with Samuel L Jackson

In [28]:
len(movies_with_samuel)

67

In [29]:
baseline_prediction = len(movies_with_samuel) / len(movies)
1 - baseline_prediction

0.9860503851759317

# Create x/y

In [30]:
unimportant_features = [
    'genres',
    'homepage',
    'keywords',
    'original_language',
    'original_title',
    'overview',
    'production_companies',
    'spoken_languages',
    'status',
    'tagline',
    'title',
    'cast',
    'crew',
    'movie_id',
    'actor_ids'
]
important_features = [
    'id',
#    'budget',
#    'popularity',
#    'vote_average',
#    'vote_count'
]

movies_with_important_features = movies[important_features]

In [31]:
y = movies.samuel
x = pd.merge(left=movies_with_important_features, right=actor_features, left_on='id', right_on='id')

#### We can create an x variable with what we want directly, but can not assign it to the dataframe

In [32]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(x,y)

GaussianNB(priors=None)

## Get Accuracy

In [33]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [34]:
predict_true = clf.predict(x)

In [35]:
len(predict_true[predict_true == True])

0

In [45]:
confusion_matrix(y, predict_true)

array([[4736,    0],
       [  67,    0]])

In [46]:
accuracy_score(y, predict_true)

0.9860503851759317

In [55]:
x['actor_65731'][0:8]

0    1
1    0
2    0
3    0
4    0
5    0
6    0
7    0
Name: actor_65731, dtype: uint8

In [54]:
y[0:8]

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
Name: cast, dtype: bool

# TODO

Perform deap learning prediction https://www.kaggle.com/liwste/simple-deep-mlp-with-keras

Create cross validation