In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import json
import matplotlib.pyplot as plt

In [2]:
# movies = pd.read_csv('./data/tmdb_5000_movies.csv').head(n=1)
# credits = pd.read_csv('./data/tmdb_5000_credits.csv').head(n=1)
movies = pd.read_csv('./data/tmdb_5000_movies.csv').head(n=1000)
credits = pd.read_csv('./data/tmdb_5000_credits.csv').head(n=1000)

In [3]:
movies.head(n=1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [53]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
movies = pd.merge(left=movies, right=credits, left_on='id', right_on='movie_id')

In [6]:
# turn json into python objects
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages', 'cast', 'crew']
for column in json_columns:
    movies[column] = movies[column].apply(json.loads, encoding="utf-8")

In [7]:
movies.cast[0][:2]

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1}]

In [8]:
movies.genres[0][0]['name']

'Action'

In [9]:
def actor_to_id_string(actor):
    return '{} - {}'.format(actor['name'], actor['id'])

In [10]:
def cast_to_actors(cast):
    actors = []
    for person in cast:
        actors.append(actor_to_id_string(person))
    return actors

In [11]:
movies.actors = movies.cast.apply(cast_to_actors)

### Create list of all actors

In [12]:
all_actors = []
for actors in movies.actors:
    for actor in actors:
        all_actors.append(actor)

In [13]:
actors = pd.Series(all_actors)

In [14]:
unique_actors_count = pd.crosstab(actors, columns='count')

unique_actors_count \
.sort_values(by='count', ascending=False).head(12)

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,34
Bruce Willis - 62,29
Matt Damon - 1892,27
Morgan Freeman - 192,26
Stan Lee - 7624,25
Owen Wilson - 887,25
Tom Cruise - 500,22
Arnold Schwarzenegger - 1100,21
Will Smith - 2888,21
Frank Welker - 15831,20


#### Total Number of unique actors

In [15]:
len(unique_actors_count)

19965

In [16]:
#### Total Number of actors that show up more than once

In [17]:
len(unique_actors_count[unique_actors_count.values > 1])

4913

In [18]:
#### Total Number of actors that show up more than three times

In [19]:
len(unique_actors_count[unique_actors_count.values > 3])

1356

In [20]:
sam_count = unique_actors_count[unique_actors_count.index == 'Samuel L. Jackson - 2231']
sam_count

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,34


In [21]:
sam_count.values[0][0]

34

### Create Actors Features

In [22]:
actors_string_id_by_id = {}
actor_counts_by_string_id = {}
unique_actors_count_dict = unique_actors_count.to_dict()['count']

def actor_has_more_than_x_movies(actor, number_of_movies = 3):
    string_id = str(actor['id'])

    # First, get the unique id created for the actor
    actor_string_id = ''
    if string_id in actors_string_id_by_id:
        actor_string_id = actors_string_id_by_id[string_id]
    else:
        actor_string_id = actor_to_id_string(actor)
        actors_string_id_by_id[string_id] = actor_string_id
    
    # Now let's see how many movies this actor has played in
    actor_count = unique_actors_count_dict[actor_string_id]
    return actor_count > number_of_movies

In [23]:
actor_has_more_than_x_movies(movies.cast[0][2], 1)

True

#### Create a new feature that represents all actors that show up in more than 3 movies (and are not sam l jackson)

In [24]:
movies_actors_ids = []
for actors in movies.cast:
    movie_actors_ids = []
    for actor in actors:
        # if (not actor_has_more_than_x_movies(actor, 3)):
            # continue
        movie_actors_ids.append(str(actor['id']))
    movies_actors_ids.append(' '.join(movie_actors_ids))

movies['actor_ids'] = movies_actors_ids
cleaned = movies.set_index('id').actor_ids.str.split(' ', expand=True).stack()
actor_features = pd.get_dummies(cleaned, prefix='actor').groupby(level=0).sum()

actor_features['id'] = actor_features.index
actor_features = actor_features.reset_index(drop=True)

## Actor Vectorization

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
CountVectorizer(analyzer='word', binary=True)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
movie_vector = vectorizer.fit_transform(movies_actors_ids).toarray()

In [27]:
actor_feature_names = vectorizer.get_feature_names()

In [28]:
actor_vector_works = pd.DataFrame(movie_vector, columns=actor_feature_names)

In [29]:
actor_vector_works['id'] = movies.id

### label records as having Samuel L. Jackson

In [30]:
movies.samuel = actor_vector_works['2231'] == 1

In [31]:
movies_with_samuel = movies[movies.samuel == True]

#### There should be 67 movies with Samuel L Jackson

In [32]:
len(movies_with_samuel)

34

In [33]:
baseline_prediction = len(movies_with_samuel) / len(movies)
1 - baseline_prediction

0.966

# Create x/y

In [34]:
unimportant_features = [
    'genres',
    'homepage',
    'keywords',
    'original_language',
    'original_title',
    'overview',
    'production_companies',
    'spoken_languages',
    'status',
    'tagline',
    'title',
    'cast',
    'crew',
    'movie_id',
    'actor_ids'
]
important_features = [
    'id',
#    'budget',
#    'popularity',
#    'vote_average',
#    'vote_count'
]

movies_with_important_features = movies[important_features]

In [35]:
y = movies.samuel
x = pd.merge(left=movies_with_important_features, right=actor_vector_works, left_on='id', right_on='id')

In [36]:
len(x.columns)

19960

In [37]:
# remove id from dataframe
x = x[x.columns.difference(['id'])]

In [54]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)

In [55]:
X_train.shape, y_train.shape

((800, 19959), (800,))

#### We can create an x variable with what we want directly, but can not assign it to the dataframe

In [56]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [60]:
clf.score(X_train, y_train)

1.0

In [61]:
clf.score(X_test, y_test)

0.98

## Get Accuracy

In [99]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import itertools

In [116]:
def show_confusion_matrix(y, predict_true, x):
    cm = confusion_matrix(y, predict_true)
    cm = pd.DataFrame(cm)
    cm.columns = ['True', 'False']
    cm.index = ['True', 'False']
    print(cm)
    
    return
    cmap=plt.cm.Blues
    classes = x.columns
    tick_marks = np.arange(len(classes))
    title = 'Confusion Matrix'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    
    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [117]:
def get_accuracy(model, x, y):
    predict_true = model.predict(x)
    print('Number of positive predictions {}'. format(len(predict_true[predict_true == True])))
    show_confusion_matrix(y, predict_true, x)

In [118]:
get_accuracy(clf, X_train, y_train)

Number of positive predictions 30
       True  False
True    770      0
False     0     30


array([[770,   0],
       [  0,  30]])

In [66]:
accuracy_score(y, predict_true)

0.996

In [45]:
x['65731'][0:8]

0    1
1    0
2    0
3    0
4    0
5    0
6    0
7    0
Name: 65731, dtype: int64

In [46]:
y[0:8]

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
Name: 2231, dtype: bool

# TODO

Perform deap learning prediction https://www.kaggle.com/liwste/simple-deep-mlp-with-keras