In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import json
import matplotlib.pyplot as plt

In [None]:
# Load the full TMDB 5000 movies and credits tables from ./data.
# The two commented-out reads below keep only the first row of each file,
# which is handy for a quick smoke run of the notebook.
# movies = pd.read_csv('./data/tmdb_5000_movies.csv').head(n=1)
# credits = pd.read_csv('./data/tmdb_5000_credits.csv').head(n=1)
movies = pd.read_csv('./data/tmdb_5000_movies.csv')
credits = pd.read_csv('./data/tmdb_5000_credits.csv')

In [None]:
movies.head(n=1)

In [None]:
credits.head()

In [None]:
# Attach each movie's credits row by matching movies.id to credits.movie_id.
# pd.merge defaults to an inner join, so rows without a match are dropped.
movies = pd.merge(left=movies, right=credits, left_on='id', right_on='movie_id')

In [None]:
# Turn the JSON-encoded text columns into Python objects.
# json.loads is called without an `encoding` keyword: that argument was ignored
# throughout Python 3 and removed in Python 3.9, where passing it raises
# TypeError. The values read from the CSV are already `str`, so it is not needed.
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages', 'cast', 'crew']
for column in json_columns:
    movies[column] = movies[column].apply(json.loads)

In [None]:
movies.cast[0][:2]

In [None]:
movies.genres[0][0]['name']

In [None]:
def actor_to_id_string(actor):
    """Build the 'name - id' label that identifies one cast member.

    `actor` is a mapping providing 'name' and 'id' entries; a missing key
    raises KeyError.
    """
    label_template = '{name} - {ident}'
    return label_template.format(name=actor['name'], ident=actor['id'])

In [None]:
def cast_to_actors(cast):
    """Convert one movie's cast list into a list of 'name - id' labels.

    Every cast entry is passed through actor_to_id_string; the order of the
    cast is preserved and an empty cast yields an empty list.
    """
    return [actor_to_id_string(member) for member in cast]

In [None]:
# Store the per-movie label lists as a real DataFrame column.
# Attribute-style assignment (`movies.actors = ...`) does not create a column in
# pandas; it only sets a Python attribute on this one object, which is not
# carried over to frames derived from it. Reading `movies.actors` still works.
movies['actors'] = movies.cast.apply(cast_to_actors)

### Create list of all actors

In [None]:
# Flatten the per-movie label lists into one list holding an entry for every
# (movie, cast member) appearance, in movie order.
all_actors = [label for labels in movies.actors for label in labels]

In [None]:
actors = pd.Series(all_actors)

In [None]:
# Tally how often each 'name - id' label occurs; the result is a frame indexed
# by label with a single 'count' column.
unique_actors_count = pd.crosstab(index=actors, columns='count')

# Preview the twelve most frequent cast members.
most_frequent_first = unique_actors_count.sort_values(by='count', ascending=False)
most_frequent_first.head(12)

#### Total Number of unique actors

In [None]:
len(unique_actors_count)

In [None]:
#### Total Number of actors that show up more than once

In [None]:
# Select rows through the 'count' column rather than the frame's 2-D .values.
appears_more_than_once = unique_actors_count['count'] > 1
len(unique_actors_count.loc[appears_more_than_once])

In [None]:
#### Total Number of actors that show up more than three times

In [None]:
# Select rows through the 'count' column rather than the frame's 2-D .values.
appears_more_than_three_times = unique_actors_count['count'] > 3
len(unique_actors_count.loc[appears_more_than_three_times])

In [None]:
# Pull the crosstab row whose label is Samuel L. Jackson's 'name - id' string
# (id 2231). The boolean mask yields an empty frame if the label is absent.
sam_count = unique_actors_count[unique_actors_count.index == 'Samuel L. Jackson - 2231']
sam_count

In [None]:
sam_count.values[0][0]

### Create Actors Features

In [None]:
# Cache: actor id (as a string) -> 'name - id' label, filled lazily by the
# function below.
actors_string_id_by_id = {}
# Not referenced by any other cell visible in this notebook.
actor_counts_by_string_id = {}
# 'name - id' label -> number of cast entries carrying that label.
unique_actors_count_dict = unique_actors_count.to_dict()['count']

def actor_has_more_than_x_movies(actor, number_of_movies = 3):
    """Tell whether `actor` has strictly more than `number_of_movies` cast entries.

    `actor` is a cast dict with 'id' and 'name'. Its label is looked up in (or
    added to) the module-level id -> label cache, and the label's tally is read
    from `unique_actors_count_dict`. An unknown label raises KeyError.
    """
    cache_key = str(actor['id'])

    # Reuse the label built the first time this id was seen, if any.
    label = actors_string_id_by_id.get(cache_key)
    if label is None:
        label = actor_to_id_string(actor)
        actors_string_id_by_id[cache_key] = label

    return unique_actors_count_dict[label] > number_of_movies

In [None]:
actor_has_more_than_x_movies(movies.cast[0][2], 1)

#### Create a new feature that represents all actors that show up in more than 3 movies (Samuel L. Jackson's own column is removed later, before modelling)

In [None]:
# One space-separated string of actor ids per movie, keeping only the cast
# members whose label has more than 3 entries in the tally built earlier.
movies_actors_ids = [
    ' '.join(
        str(member['id'])
        for member in cast
        if actor_has_more_than_x_movies(member, 3)
    )
    for cast in movies.cast
]

movies['actor_ids'] = movies_actors_ids

# Split each id string into separate tokens (one row per movie id / token),
# one-hot encode the tokens and sum them back to one row per movie id.
cleaned = movies.set_index('id')['actor_ids'].str.split(' ', expand=True).stack()
one_hot_per_movie = pd.get_dummies(cleaned, prefix='actor').groupby(level=0).sum()

# Keep the movie id as an ordinary column and fall back to a positional index.
actor_features = one_hot_per_movie.assign(id=one_hot_per_movie.index).reset_index(drop=True)

## Actor Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bind the *configured* vectorizer. Previously a default CountVectorizer() was
# assigned to `vectorizer` and the configured one was built and thrown away.
#   binary=True   -> every actor becomes a 0/1 presence flag, even if the same
#                    id is listed more than once in a cast.
#   token_pattern -> the default pattern only keeps tokens of two or more
#                    characters, which would silently drop single-digit ids.
vectorizer = CountVectorizer(analyzer='word', binary=True, token_pattern=r'(?u)\b\w+\b')
vectorizer

In [None]:
# Dense movie x actor-id matrix built from the per-movie id strings.
movie_vector = vectorizer.fit_transform(movies_actors_ids).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    actor_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    actor_feature_names = vectorizer.get_feature_names()

actor_vector_works = pd.DataFrame(movie_vector, columns=actor_feature_names)
# Row i of the matrix was built from row i of `movies`, so attach the ids by
# position instead of relying on the two frames sharing the same index labels.
actor_vector_works['id'] = movies.id.values

### label records as having Samuel L. Jackson

In [None]:
# Boolean target: True where actor id 2231 (Samuel L. Jackson) is in the cast.
# - Stored as a real column: `movies.samuel = ...` would only set a Python
#   attribute on this object. Reading `movies.samuel` still works afterwards.
# - Compared with `> 0` so a movie still counts when the vectorizer reports a
#   count above 1 for the same id.
# - Assigned by position, since the rows of both frames follow the same order.
movies['samuel'] = (actor_vector_works['2231'] > 0).values

In [None]:
actor_vector_works.shape

In [None]:
movies_with_samuel = movies[movies.samuel == True]

#### There should be 67 movies with Samuel L Jackson

In [None]:
len(movies_with_samuel)

In [None]:
# Share of movies that include Samuel L. Jackson. One minus that share is the
# accuracy of always predicting "not in the cast", i.e. the baseline to beat.
baseline_prediction = len(movies_with_samuel) / len(movies)
1 - baseline_prediction

# Create x/y

In [None]:
# Columns of `movies` that are not used as model inputs. This list is not
# referenced by any other cell visible in this notebook; presumably it is kept
# as a record of what is left out.
unimportant_features = [
    'genres',
    'homepage',
    'keywords',
    'original_language',
    'original_title',
    'overview',
    'production_companies',
    'spoken_languages',
    'status',
    'tagline',
    'title',
    'cast',
    'crew',
    'movie_id',
    'actor_ids'
]
# Columns carried into the feature matrix. Only 'id' is active here (it is the
# key used to merge with the actor vectors); the numeric columns are commented
# out for this first model and enabled in a later cell.
important_features = [
    'id',
#    'budget',
#    'popularity',
#    'vote_average',
#    'vote_count'
]

movies_with_important_features = movies[important_features]

In [None]:
# Target: the Samuel L. Jackson flag. Features: the selected movie columns
# joined to the actor vectors on the shared 'id' key.
y = movies.samuel
x = movies_with_important_features.merge(actor_vector_works, on='id')

In [None]:
len(x.columns)

In [None]:
# remove id from dataframe
x = x[x.columns.difference(['id'])]

### Create train/test split (20% holdout)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=3)

In [None]:
X_train.shape, y_train.shape

#### We can create an x variable with what we want directly, but can not assign it to the dataframe

In [None]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

In [None]:
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

## Get Accuracy

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import itertools

In [None]:
def show_confusion_matrix(y, predict_true, x):
    """Print the confusion matrix of `predict_true` against the true labels `y`.

    The matrix comes from sklearn's confusion_matrix (rows: true class,
    columns: predicted class) and is labelled Negative/Positive on both axes,
    so it must be 2x2; any other shape raises ValueError.

    Parameters
    ----------
    y : array-like
        True labels.
    predict_true : array-like
        Predicted labels, same length as `y`.
    x : object
        Unused. Kept so existing callers that pass the feature frame keep working.

    Returns
    -------
    None
        The matrix is printed, not returned.
    """
    # The matplotlib heat-map code that used to follow an early `return` was
    # unreachable (and indexed the DataFrame as cm[i, j]), so it is not kept.
    class_names = ['Negative', 'Positive']
    cm = pd.DataFrame(
        confusion_matrix(y, predict_true),
        index=class_names,
        columns=class_names,
    )
    print(cm)

In [None]:
def get_accuracy(model, x, y):
    """Print the positive-prediction count, accuracy and confusion matrix.

    Predictions come from model.predict(x) when the model has a `predict`
    method, otherwise from model.fit_predict(x). Everything is written to
    stdout; nothing is returned.
    """
    predictor = model.predict if hasattr(model, 'predict') else model.fit_predict
    predict_true = predictor(x)

    positive_predictions = predict_true[predict_true == True]
    print('Number of positive predictions {}'.format(len(positive_predictions)))
    print('')

    print('Accuracy {}'.format(accuracy_score(y, predict_true)))
    print('')

    show_confusion_matrix(y, predict_true, x)

In [None]:
print('Accuracy of Training Data')
get_accuracy(clf, X_train, y_train)

In [None]:
print('Accuracy of Test Data')
get_accuracy(clf, X_test, y_test)

#### Remove Samuel from list of actors

In [None]:
def remove_samuel(X_train, X_test):
    """Return [X_train, X_test] with the '2231' (Samuel L. Jackson) column left out.

    The inputs are not modified. Columns are selected through
    Index.difference, so the returned frames have their remaining columns in
    sorted order; a frame without a '2231' column is returned with all of its
    columns.
    """
    def without_samuel(frame):
        kept_columns = frame.columns.difference(['2231'])
        return frame[kept_columns]

    return [without_samuel(split) for split in (X_train, X_test)]
X_train, X_test = remove_samuel(X_train, X_test)

## Add Additional Features

In [None]:

# Second feature set: the merge key plus four numeric movie attributes.
important_features = ['id', 'budget', 'popularity', 'vote_average', 'vote_count']

movies_with_important_features = movies[important_features]
y = movies.samuel

# Join the movie attributes to the actor vectors on 'id', then drop the key so
# it is not used as a model input.
x = (
    movies_with_important_features
    .merge(actor_vector_works, on='id')
    .pipe(lambda merged: merged[merged.columns.difference(['id'])])
)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=3)

In [None]:
X_train, X_test = remove_samuel(X_train, X_test)

# Naive Bayes Classifications

In [None]:
clf = GaussianNB()
clf.fit(X_train, y_train)

In [None]:
x.shape

In [None]:
print('Accuracy of Training Data')
get_accuracy(clf, X_train, y_train)

In [None]:
print('Accuracy of Test Data')
get_accuracy(clf, X_test, y_test)

### Hierarchical Clustering

In [None]:
# from sklearn.cluster import AgglomerativeClustering
# cluster_model = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='complete')
# cluster_model.fit(X_train, y_train)

In [None]:
# print('Accuracy of Training Data')
# get_accuracy(cluster_model, X_train, y_train)

In [None]:
# print('Accuracy of Test Data')
# get_accuracy(cluster_model, X_test, y_test)

# TODO

Perform deep learning prediction https://www.kaggle.com/liwste/simple-deep-mlp-with-keras