In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import json
import matplotlib.pyplot as plt

In [2]:
# Read the TMDB movie and credit tables and keep only the first 1000 rows of
# each.  (Switch n to 1 for a quick single-row run.)
movies = pd.read_csv('./data/tmdb_5000_movies.csv')
movies = movies.head(n=1000)
credits = pd.read_csv('./data/tmdb_5000_credits.csv')
credits = credits.head(n=1000)

In [3]:
# Display the first movie row to inspect the available columns.
movies.head(n=1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
# Display the first credit rows (movie_id, title, cast, crew).
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [None]:
# Join every movie with its credits row (movies.id == credits.movie_id);
# DataFrame.merge performs an inner join by default.
movies = movies.merge(credits, left_on='id', right_on='movie_id')

In [None]:
# Turn the JSON-encoded text columns into Python objects (lists of dicts).
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages', 'cast', 'crew']
for column in json_columns:
    # json.loads no longer accepts an `encoding` keyword (it raises TypeError on
    # Python 3.9+), so nothing extra is forwarded through Series.apply.
    # Only str values are parsed, which makes the cell safe to re-run after the
    # columns have already been converted.
    movies[column] = movies[column].apply(
        lambda value: json.loads(value) if isinstance(value, str) else value
    )

In [None]:
# Display the parsed cast column (one list of cast entries per movie).
movies.cast

In [None]:
# Spot-check the parsed JSON: name of the first genre of the row labelled 0.
movies.genres[0][0]['name']

In [None]:
def actor_to_id_string(actor):
    """Build the "<name> - <id>" label for a single cast entry.

    `actor` is a mapping that provides the keys 'name' and 'id'.
    """
    return f"{actor['name']} - {actor['id']}"

In [None]:
def cast_to_actors(cast):
    """Map a list of cast entries to their "<name> - <id>" labels, keeping order."""
    return [actor_to_id_string(member) for member in cast]

In [None]:
# Create a real column with item assignment.  `movies.actors = ...` only sets a
# Python attribute on this DataFrame object (pandas warns about it), so the
# data would be missing from any copy, filter or merge of `movies`.
movies['actors'] = movies['cast'].apply(cast_to_actors)

### Create list of all actors

In [None]:
# Flatten the per-movie label lists into one long list (one entry per credit).
all_actors = []
for movie_cast_labels in movies.actors:
    all_actors.extend(movie_cast_labels)

In [None]:
# Wrap the flat list of "<name> - <id>" labels in a Series so it can be tabulated.
actors = pd.Series(all_actors)

In [None]:
# One row per distinct actor label, with a single 'count' column holding how
# often that label occurs in `actors`.
unique_actors_count = pd.crosstab(actors, columns='count')

# Show the 12 most frequently credited actors.
unique_actors_count.sort_values(by='count', ascending=False).head(12)

#### Total Number of unique actors

In [None]:
# Number of rows in the count table, i.e. number of distinct actor labels.
len(unique_actors_count)

In [None]:
#### Total Number of actors that show up more than once

In [None]:
# Rows whose 'count' exceeds 1, selected with a 1-D mask on that column.
len(unique_actors_count[unique_actors_count['count'] > 1])

In [None]:
#### Total Number of actors that show up more than three times

In [None]:
# Rows whose 'count' exceeds 3, selected with a 1-D mask on that column.
len(unique_actors_count[unique_actors_count['count'] > 3])

In [None]:
# Pick the count row whose index label is Samuel L. Jackson's "<name> - <id>".
sam_count = unique_actors_count.loc[unique_actors_count.index == 'Samuel L. Jackson - 2231']
sam_count

In [None]:
# Pull the scalar count out of the one-row, one-column selection above.
sam_count.values[0][0]

### Create Actors Features

In [None]:
actors_string_id_by_id = {}      # cache: str(actor id) -> "<name> - <id>" label
actor_counts_by_string_id = {}   # not used anywhere in this cell
unique_actors_count_dict = unique_actors_count.to_dict()['count']  # label -> count

def actor_has_more_than_x_movies(actor, number_of_movies = 3):
    """Return True when `actor` has more than `number_of_movies` credits.

    `actor` is a cast entry (mapping with 'id' and 'name').  Its label is
    computed once per id and cached in `actors_string_id_by_id`; the credit
    count is then read from `unique_actors_count_dict`, which raises KeyError
    for a label that was never counted.
    """
    cache_key = str(actor['id'])

    # Compute the label only the first time this id is seen.
    if cache_key not in actors_string_id_by_id:
        actors_string_id_by_id[cache_key] = actor_to_id_string(actor)
    label = actors_string_id_by_id[cache_key]

    return unique_actors_count_dict[label] > number_of_movies

In [None]:
# Try the helper on the third cast entry of the row labelled 0, with a threshold of 1.
actor_has_more_than_x_movies(movies.cast[0][2], 1)

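#### Create a new feature that represents all actors that show up in more than 3 movies (and are not Samuel L. Jackson). Note: the "more than 3 movies" filter is currently commented out in the cell below, so every actor is included.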

In [None]:
# One space-separated string of actor ids per movie (used further down as the
# text input of the vectorizer).  Every cast member is included: the
# actor_has_more_than_x_movies(actor, 3) filter is currently disabled.
movies_actors_ids = [
    ' '.join(str(member['id']) for member in cast)
    for cast in movies.cast
]

In [None]:
movies['actor_ids'] = movies_actors_ids

# Split every id string into one token per cell, then stack the cells into a
# long Series indexed by (movie id, position).
cleaned = (
    movies.set_index('id')['actor_ids']
    .str.split(' ', expand=True)
    .stack()
)

# One indicator column per actor ('actor_<id>'), summed back to one row per movie.
actor_features = (
    pd.get_dummies(cleaned, prefix='actor')
    .groupby(level=0)
    .sum()
)

# Expose the movie id as a regular (last) column and use a plain positional index.
actor_features['id'] = actor_features.index
actor_features = actor_features.reset_index(drop=True)

## Actor Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Configure the vectorizer that is actually used below.  Previously the
# configured instance was created and thrown away, leaving `vectorizer` with
# default settings.
#  - binary=True: each feature is 1/0 ("this actor id occurs in the movie").
#  - token_pattern: the default pattern only keeps tokens of two or more
#    characters, which silently drops single-digit actor ids.
vectorizer = CountVectorizer(analyzer='word', binary=True, token_pattern=r'(?u)\b\w+\b')
vectorizer

In [None]:
# Fit the vectorizer on the per-movie id strings and convert the sparse result
# to a dense array (one row per input string, one column per token).
movie_vector = vectorizer.fit_transform(movies_actors_ids).toarray()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    actor_feature_names = list(vectorizer.get_feature_names_out())
else:
    actor_feature_names = vectorizer.get_feature_names()

In [None]:
# Wrap the dense matrix in a DataFrame with one column per vocabulary token (actor id).
actor_vector_works = pd.DataFrame(movie_vector, columns=actor_feature_names)

In [None]:
# Row i of actor_vector_works was built from row i of `movies`, so attach the
# ids by position (.values) instead of relying on the two indexes lining up.
actor_vector_works['id'] = movies['id'].values

### Label records as having Samuel L. Jackson

In [None]:
samuel_label = 'Samuel L. Jackson - 2231'

def record_has_samuel(actors):
    """Tell whether the Samuel L. Jackson label occurs in `actors`."""
    return samuel_label in actors


In [None]:
# Store the label as a real column.  `movies.samuel = ...` only sets a Python
# attribute on this DataFrame object (pandas warns about it), so the label
# would not travel with filtered or copied frames.
movies['samuel'] = movies.actors.apply(record_has_samuel)

In [None]:
# Keep only the movies whose boolean label is True.
movies_with_samuel = movies[movies.samuel]

#### There should be 67 movies with Samuel L. Jackson

In [None]:
# Number of loaded movies labelled as containing Samuel L. Jackson.
len(movies_with_samuel)

In [None]:
# Fraction of the loaded movies that are labelled as containing Samuel L. Jackson.
baseline_prediction = len(movies_with_samuel) / len(movies)
# Share of movies without the label, i.e. the accuracy of always predicting False.
1 - baseline_prediction

# Create x/y

In [None]:
# Columns listed as unimportant; this list is not referenced elsewhere in the
# visible notebook.
unimportant_features = [
    'genres', 'homepage', 'keywords',
    'original_language', 'original_title', 'overview',
    'production_companies', 'spoken_languages',
    'status', 'tagline', 'title',
    'cast', 'crew', 'movie_id', 'actor_ids',
]

# Columns carried over from `movies`.  Only the join key is active; the numeric
# candidates 'budget', 'popularity', 'vote_average' and 'vote_count' are
# currently disabled.
important_features = ['id']

movies_with_important_features = movies[important_features]

In [None]:
# Target: the boolean Samuel L. Jackson label of every movie.
y = movies.samuel

# Features: the selected movie columns joined with the vectorized actor ids on
# the movie id.  (`actor_features`, the get_dummies-based table, is an
# alternative right-hand side that is not used here; earlier experiments also
# tried only the '2231' column or only the last 40% of the columns.)
x = movies_with_important_features.merge(actor_vector_works, on='id')

In [None]:
# Number of columns in x at this point (the 'id' column has not been removed yet).
len(x.columns)

In [None]:
# remove id from dataframe
x = x[x.columns.difference(['id'])]

In [None]:
# Check that column '2231' (the id used in the Samuel L. Jackson label) exists
# in x; the value shown is the number of rows.
len(x['2231'])

#### We can create an x variable with what we want directly, but cannot assign it to the DataFrame

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on every row of x / y (no train/test
# split is made in this notebook).
# NOTE(review): x still contains column '2231', the id used in the Samuel L.
# Jackson label, so the target looks directly encoded in the features —
# confirm whether that is intended.
clf = GaussianNB()
clf.fit(x,y)

## Get Accuracy

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Predict on the same rows the classifier was fitted on (training-set predictions).
predict_true = clf.predict(x)

In [None]:
# How many rows were predicted as True.
int(np.count_nonzero(predict_true))

In [None]:
# Confusion matrix of the true labels against the training-set predictions.
confusion_matrix(y, predict_true)

In [None]:
# Accuracy on the rows used for fitting (not a held-out estimate).
accuracy_score(y, predict_true)

In [None]:
# First eight values of the feature column for actor id '65731'.
x['65731'][0:8]

In [None]:
# First eight target labels, for comparison with the feature values above.
y[0:8]

# TODO

Perform deep learning prediction https://www.kaggle.com/liwste/simple-deep-mlp-with-keras

Create cross validation