In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import json
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

In [2]:
# Load the TMDB 5000 movie metadata and the matching credits (cast/crew) table.
# Tip: append `.head(n=1)` to either call for a quick one-row smoke test.
movies = pd.read_csv('./data/tmdb_5000_movies.csv')
credits = pd.read_csv('./data/tmdb_5000_credits.csv')

In [3]:
movies.shape

(4803, 20)

In [4]:
movies.head(n=1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [6]:
# Attach each movie's cast and crew by joining the credits table on the movie id.
movies = movies.merge(credits, left_on='id', right_on='movie_id')

In [7]:
# Decode the JSON-encoded string columns into Python lists of dicts.
# json.loads is called without `encoding=`: that keyword was ignored on
# Python 3 and is rejected with a TypeError from Python 3.9 onwards.
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages', 'cast', 'crew']
for column in json_columns:
    movies[column] = movies[column].apply(json.loads)

In [8]:
movies.cast[0][:2]

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1}]

In [9]:
movies.genres[0][0]['name']

'Action'

In [10]:
def actor_to_id_string(actor):
    """Build the 'name - id' key for a cast or crew record (a dict with 'name' and 'id')."""
    name = actor['name']
    person_id = actor['id']
    return '{} - {}'.format(name, person_id)

In [11]:
def cast_to_actors(cast):
    """Return the 'name - id' key of every person in `cast`, in the same order."""
    return [actor_to_id_string(person) for person in cast]

In [12]:
# Store the per-movie actor keys as a real DataFrame column. Plain attribute
# assignment (`movies.actors = ...`) only sets a Python attribute: pandas
# warns about it and the data does not survive a copy, slice or merge.
# `movies.actors` still works for reading the column afterwards.
movies['actors'] = movies.cast.apply(cast_to_actors)

### Create list of all actors

In [13]:
# Flatten every movie's actor keys into one long list and tally how often
# each key appears; the final expression shows the most frequent ones.
all_actors = [actor for actors in movies.actors for actor in actors]
actors = pd.Series(all_actors)
unique_actors_count = pd.crosstab(actors, columns='count')

unique_actors_count.sort_values(by='count', ascending=False).head(12)

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,67
Robert De Niro - 380,57
Bruce Willis - 62,51
Matt Damon - 1892,48
Morgan Freeman - 192,46
Steve Buscemi - 884,43
Liam Neeson - 3896,41
Johnny Depp - 85,40
Owen Wilson - 887,40
John Goodman - 1230,39


#### Total Number of unique actors

In [14]:
len(unique_actors_count)

54588

In [15]:
#### Total Number of actors that show up more than once

In [16]:
len(unique_actors_count[unique_actors_count.values > 1])

15661

In [17]:
#### Total Number of actors that show up more than three times

In [18]:
len(unique_actors_count[unique_actors_count.values > 3])

5326

In [19]:
sam_count = unique_actors_count[unique_actors_count.index == 'Samuel L. Jackson - 2231']
sam_count

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,67


In [20]:
sam_count.values[0][0]

67

### Create Actors Features

In [21]:
actors_string_id_by_id = {}
actor_counts_by_string_id = {}
unique_actors_count_dict = unique_actors_count.to_dict()['count']

def actor_has_more_than_x_movies(actor, number_of_movies = 3):
    """Return True when `actor` appears more than `number_of_movies` times in the cast tally.

    The actor's 'name - id' key is read from the module-level
    `actors_string_id_by_id` cache (and stored there on a miss), then used to
    look up the appearance count in `unique_actors_count_dict`.
    """
    cache_key = str(actor['id'])

    # Reuse the key built the first time this id was seen.
    actor_string_id = actors_string_id_by_id.get(cache_key)
    if actor_string_id is None:
        actor_string_id = actor_to_id_string(actor)
        actors_string_id_by_id[cache_key] = actor_string_id

    return unique_actors_count_dict[actor_string_id] > number_of_movies

In [22]:
actor_has_more_than_x_movies(movies.cast[0][2], 1)

True

#### Create a new feature that represents all actors that show up in more than 3 movies

In [23]:
# For each movie, build one space-separated string holding the ids of its
# frequent actors (more than 3 appearances); movies with none get ''.
movies_actors_ids = []
for actors in movies.cast:
    movie_actors_ids = [
        str(actor['id'])
        for actor in actors
        if actor_has_more_than_x_movies(actor, 3)
    ]
    movies_actors_ids.append(' '.join(movie_actors_ids))

## Actor Vectorization

In [24]:
# One row per movie, one count column per frequent actor id.
# CountVectorizer's default token_pattern only keeps tokens of two or more
# characters, which silently drops single-digit ids; accept any run of word
# characters so every id becomes a feature.
vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')

movie_vector = vectorizer.fit_transform(movies_actors_ids).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    actor_feature_names = vectorizer.get_feature_names_out()
else:
    actor_feature_names = vectorizer.get_feature_names()
actor_vector_works = pd.DataFrame(movie_vector, columns=actor_feature_names)
# Rows are in the same order as `movies`, so the movie id can be attached by position/index.
actor_vector_works['id'] = movies.id

# Crew Vectorization

### Determine if crew member was in more than 3 movies

In [25]:
# Store the per-movie crew keys as a real DataFrame column. Plain attribute
# assignment (`movies.crew_people = ...`) only sets a Python attribute: pandas
# warns about it and the data does not survive a copy, slice or merge.
# `movies.crew_people` still works for reading the column afterwards.
movies['crew_people'] = movies.crew.apply(cast_to_actors)

In [26]:
# Flatten every movie's crew keys into one long list and tally how often
# each key appears; the final expression shows the most frequent ones.
all_crews = [crew for crews in movies.crew_people for crew in crews]
crews = pd.Series(all_crews)
unique_crews_count = pd.crosstab(crews, columns='count')

unique_crews_count.sort_values(by='count', ascending=False).head(12)

col_0,count
row_0,Unnamed: 1_level_1
Robert Rodriguez - 2294,104
Steven Spielberg - 488,84
Avy Kaufman - 2952,83
Mary Vernieu - 5914,82
Deborah Aquila - 3965,75
Hans Zimmer - 947,71
James Newton Howard - 1213,69
Harvey Weinstein - 59839,68
Tricia Wood - 1034748,67
Bob Weinstein - 1307,67


In [None]:
crew_string_id_by_id = {}
crew_counts_by_string_id = {}
unique_crew_count_dict = unique_crews_count.to_dict()['count']

def crew_has_more_than_x_movies(crew, number_of_movies = 3):
    """Return True when `crew` appears more than `number_of_movies` times in the crew tally.

    The 'name - id' key for the person is cached in the module-level
    `crew_string_id_by_id` dict, keyed by the stringified id, and is then
    used to look up the credit count in `unique_crew_count_dict`.
    """
    string_id = str(crew['id'])

    # Build and remember the key the first time this id is seen.
    if string_id not in crew_string_id_by_id:
        crew_string_id_by_id[string_id] = actor_to_id_string(crew)
    crew_string_id = crew_string_id_by_id[string_id]

    # Number of crew credits recorded for this person across all movies.
    crew_count = unique_crew_count_dict[crew_string_id]
    return crew_count > number_of_movies

In [None]:
# For each movie, build one space-separated string holding the ids of its
# frequent crew members (more than 3 credits); movies with none get ''.
# Ids are collected in the per-movie list `movie_crews_ids` (singular) so the
# outer `movies_crews_ids` receives exactly one string per movie and stays
# aligned row-for-row with `movies`.
movies_crews_ids = []
for crews in movies.crew:
    movie_crews_ids = [
        str(crew['id'])
        for crew in crews
        if crew_has_more_than_x_movies(crew, 3)
    ]
    movies_crews_ids.append(' '.join(movie_crews_ids))

In [None]:
# One row per movie, one count column per frequent crew id.
# CountVectorizer's default token_pattern only keeps tokens of two or more
# characters, which silently drops single-digit ids; accept any run of word
# characters so every id becomes a feature.
vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')

movie_vector = vectorizer.fit_transform(movies_crews_ids).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    crew_feature_names = vectorizer.get_feature_names_out()
else:
    crew_feature_names = vectorizer.get_feature_names()
crew_vector_works = pd.DataFrame(movie_vector, columns=crew_feature_names)
# Rows are in the same order as `movies`, so the movie id can be attached by position/index.
crew_vector_works['id'] = movies.id

### label records as having Any Star (top 5 high performers)

In [None]:
# Ids of the five actors with the most appearances in the tally above.
stars = [
    '2231', # Samuel L. Jackson
    '380',  # Robert De Niro
    '62',   # Bruce Willis
    '1892', # Matt Damon
    '192',  # Morgan Freeman
]
# A movie is flagged when at least one star has a non-zero count in its row.
movies['stars'] = actor_vector_works.loc[:, stars].any(axis=1)
movies_with_stars = movies.loc[movies['stars']]
len(movies_with_stars)

### label records as having Samuel L. Jackson

In [None]:
# Flag movies whose cast includes Samuel L. Jackson (id 2231).
# The vectorizer stores occurrence counts, so test for "> 0" rather than
# "== 1": a person listed more than once in one cast would otherwise be missed.
movies['samuel'] = actor_vector_works['2231'] > 0

In [None]:
actor_vector_works.shape

In [None]:
# Keep only the rows flagged as featuring Samuel L. Jackson.
movies_with_samuel = movies.loc[movies['samuel']]

#### There should be 67 movies with Samuel L Jackson

In [None]:
len(movies_with_samuel)

In [None]:
# Share of movies with at least one star. Its complement is the accuracy of
# always predicting "no star", i.e. the baseline a classifier has to beat.
baseline_prediction = movies_with_stars.shape[0] / movies.shape[0]
1 - baseline_prediction

# Genres

In [None]:
movies.genres.head()

In [None]:
movies.genres[0][0]

In [None]:
def unique_genres(movies_genres):
    """Count how often each genre name occurs across `movies_genres`.

    `movies_genres` is an iterable of per-movie genre lists, where each genre
    is a dict with a 'name' key. Returns a Series indexed by genre name with
    the occurrence counts, sorted from most to least frequent.
    """
    tally = {}
    names = (genre['name'] for genres in movies_genres for genre in genres)
    for name in names:
        tally[name] = tally.get(name, 0) + 1
    return pd.Series(tally).sort_values(ascending=False)

### All Unique Genres

In [None]:
unique_genres(movies.genres)

### Genres which Samuel is in

In [None]:
# Join the actor count columns onto the movie rows, then list the genres of
# the movies that feature Samuel L. Jackson (id 2231).
movies_merge = pd.merge(left=movies, right=actor_vector_works, left_on='id', right_on='id')
# The column holds occurrence counts, so "> 0" is used instead of "== 1" to
# keep movies where the same person is listed more than once.
movies_with_samuel = movies_merge[movies_merge['2231'] > 0]
unique_genres(movies_with_samuel['genres'])

### Genres which Stars are in

In [None]:
# Genres of the movies in which at least one of the `stars` has a non-zero count.
movies_with_stars = movies_merge.loc[movies_merge[stars].any(axis=1)]
unique_genres(movies_with_stars['genres'])

### Number of genres in movie

In [None]:
# Histogram of how many genres each movie carries, using as many bins as the
# largest per-movie genre count.
number_genres = movies.genres.map(len)
number_genres.plot.hist(bins=number_genres.max())
plt.show()

## Genre Vectorization

In [None]:
# For each movie, build one space-separated string of its genre ids.
movies_genres_ids = []
for genres in movies.genres:
    movie_genres_ids = [str(genre['id']) for genre in genres]
    movies_genres_ids.append(' '.join(movie_genres_ids))

In [None]:
# One row per movie, one count column per genre id.
# Ids are opaque tokens, so accept any run of word characters instead of the
# default pattern, which would discard an id that is a single character long.
genre_vectorization = CountVectorizer(token_pattern=r'(?u)\b\w+\b')

movie_vector = genre_vectorization.fit_transform(movies_genres_ids).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(genre_vectorization, 'get_feature_names_out'):
    genre_feature_names = genre_vectorization.get_feature_names_out()
else:
    genre_feature_names = genre_vectorization.get_feature_names()
genre_vector = pd.DataFrame(movie_vector, columns=genre_feature_names)
# Rows are in the same order as `movies`, so the movie id can be attached by position/index.
genre_vector['id'] = movies.id

### Normalize Movie Features

In [None]:
# Numeric movie columns to rescale before they are used as model features.
features_to_normalize = [
    'budget',
    'popularity',
    'vote_average',
    'vote_count',
    'revenue'
]

In [None]:
# Rescale each numeric column in place. The whole column is passed to
# `normalize` as a single row vector, so the column as a whole is scaled by
# one norm (not each movie individually).
for feature in features_to_normalize:
    feature_array = movies[feature].values.astype('float64').reshape(1, -1)
    scaled_row = normalize(feature_array, axis=1)
    movies[feature] = scaled_row[0]

In [None]:
movies[features_to_normalize].head(n=1)

# Create x/y

In [None]:
# Model inputs: the rescaled numeric columns plus the actor count columns,
# joined on the movie id. The target is the per-movie `stars` flag.
important_features = [*features_to_normalize, 'id']

movies_with_important_features = movies.loc[:, important_features]
y = movies['stars']
x = movies_with_important_features.merge(actor_vector_works, left_on='id', right_on='id')
x.shape[1]

#### Merge in Genres

In [None]:
# Add the genre count columns to the feature frame, joined on the movie id.
# NOTE(review): actor ids and genre ids are both bare numeric strings, so a
# name present in both frames would come out of the merge with _x/_y suffixes
# -- confirm that is acceptable for the downstream column lookups.
x = pd.merge(left=x, right=genre_vector, left_on='id', right_on='id')
len(x.columns)

### Remove id as a feature

In [None]:
# Drop the join key so the movie id is not used as a model feature.
# NOTE(review): Index.difference returns the remaining labels sorted, so this
# presumably also reorders the feature columns -- confirm that is intended.
x = x[x.columns.difference(['id'])]
len(x.columns)

### Remove "Stars" from actors

In [None]:
def remove_samuel(x):
    """Return `x` without the columns named in the module-level `stars` list.

    These columns are what the target was derived from, so leaving them in
    would leak the label into the features.
    """
    kept_columns = x.columns.difference(stars)
    return x[kept_columns]
x = remove_samuel(x)
len(x.columns)

### Create train/test split (20% holdout)

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is repeatable.
# NOTE(review): x and y are paired by position, which assumes the merges that
# built x kept the row order of `movies` -- confirm.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=3)

In [None]:
X_train.shape, y_train.shape

## Get Accuracy

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import itertools

In [None]:
def show_confusion_matrix(y, predict_true, x):
    """Print the confusion matrix of `predict_true` against `y` as a labelled table.

    Rows are the true class and columns the predicted class (the layout
    returned by scikit-learn's `confusion_matrix`), both labelled
    'Negative' / 'Positive'.

    Parameters:
        y: true binary labels.
        predict_true: predicted binary labels, same length as `y`.
        x: unused; kept so existing callers that pass the feature frame
            continue to work.

    Returns None. The plotting code that used to follow the early `return`
    was unreachable and has been removed.
    """
    cm = pd.DataFrame(
        confusion_matrix(y, predict_true),
        index=['Negative', 'Positive'],
        columns=['Negative', 'Positive'],
    )
    print(cm)

In [None]:
def get_accuracy(model, x, y):
    """Print the positive-prediction count, accuracy and confusion matrix of `model` on (x, y).

    Predictions come from `model.predict(x)` when the model has a `predict`
    method, otherwise from `model.fit_predict(x)`.
    """
    predictor = model.predict if hasattr(model, 'predict') else model.fit_predict
    predict_true = predictor(x)

    positives = predict_true[predict_true == True]
    print('Number of positive predictions {}'.format(len(positives)))
    print('')
    print('Accuracy {}'.format(accuracy_score(y, predict_true)))
    print('')
    show_confusion_matrix(y, predict_true, x)

# Naive Bayes Classifications

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
bayes_model = MultinomialNB()
bayes_model.fit(X_train, y_train)

In [None]:
print('Accuracy of Training Data')
get_accuracy(bayes_model, X_train, y_train)

In [None]:
print('Accuracy of Test Data')
get_accuracy(bayes_model, X_test, y_test)

# Deep Learning (Dense Layers)

In [None]:
from keras import models, layers

In [None]:
# Small fully-connected binary classifier with a single sigmoid output.
# Only the first layer declares the input width; Keras infers the input of
# every later layer from the layer before it, so `input_shape` is not
# repeated on the hidden layers.
model = models.Sequential()
model.add(layers.Dense(8, activation='relu', input_shape=(X_train.shape[1],)))
for units in (8, 16, 32, 32, 64):
    model.add(layers.Dense(units, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [None]:
model.fit(X_train, y_train, epochs=10, batch_size=64)

In [None]:
model.evaluate(X_test, y_test)

In [None]:
# Sequential.predict_classes() is no longer available in current
# Keras/TensorFlow. Threshold the sigmoid output at 0.5 instead (the rule it
# applied for a single-unit output) and flatten it to one label per row.
print(confusion_matrix(y_test, model.predict(X_test).ravel() > 0.5))

### Hierarchical Clustering

In [None]:
# from sklearn.cluster import AgglomerativeClustering
# cluster_model = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='complete')
# cluster_model.fit(X_train, y_train)

In [None]:
# print('Accuracy of Training Data')
# get_accuracy(cluster_model, X_train, y_train)

In [None]:
# print('Accuracy of Test Data')
# get_accuracy(cluster_model, X_test, y_test)

# TODO

Perform deep learning prediction https://www.kaggle.com/liwste/simple-deep-mlp-with-keras