In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import json
import matplotlib.pyplot as plt

In [2]:
# movies = pd.read_csv('./data/tmdb_5000_movies.csv').head(n=1)
# credits = pd.read_csv('./data/tmdb_5000_credits.csv').head(n=1)
movies = pd.read_csv('./data/tmdb_5000_movies.csv')
credits = pd.read_csv('./data/tmdb_5000_credits.csv')

In [3]:
movies.head(n=1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
movies = pd.merge(left=movies, right=credits, left_on='id', right_on='movie_id')

In [6]:
# turn json into python objects
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages', 'cast', 'crew']
for column in json_columns:
    movies[column] = movies[column].apply(json.loads, encoding="utf-8")

In [7]:
movies.cast[0][:2]

[{'cast_id': 242,
  'character': 'Jake Sully',
  'credit_id': '5602a8a7c3a3685532001c9a',
  'gender': 2,
  'id': 65731,
  'name': 'Sam Worthington',
  'order': 0},
 {'cast_id': 3,
  'character': 'Neytiri',
  'credit_id': '52fe48009251416c750ac9cb',
  'gender': 1,
  'id': 8691,
  'name': 'Zoe Saldana',
  'order': 1}]

In [8]:
movies.genres[0][0]['name']

'Action'

In [9]:
def actor_to_id_string(actor):
    return '{} - {}'.format(actor['name'], actor['id'])

In [10]:
def cast_to_actors(cast):
    actors = []
    for person in cast:
        actors.append(actor_to_id_string(person))
    return actors

In [11]:
movies.actors = movies.cast.apply(cast_to_actors)

### Create list of all actors

In [12]:
all_actors = []
for actors in movies.actors:
    for actor in actors:
        all_actors.append(actor)

In [13]:
actors = pd.Series(all_actors)

In [14]:
unique_actors_count = pd.crosstab(actors, columns='count')

unique_actors_count \
.sort_values(by='count', ascending=False).head(12)

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,67
Robert De Niro - 380,57
Bruce Willis - 62,51
Matt Damon - 1892,48
Morgan Freeman - 192,46
Steve Buscemi - 884,43
Liam Neeson - 3896,41
Johnny Depp - 85,40
Owen Wilson - 887,40
John Goodman - 1230,39


#### Total Number of unique actors

In [15]:
len(unique_actors_count)

54588

In [16]:
#### Total Number of actors that show up more than once

In [17]:
len(unique_actors_count[unique_actors_count.values > 1])

15661

In [18]:
#### Total Number of actors that show up more than three times

In [19]:
len(unique_actors_count[unique_actors_count.values > 3])

5326

In [20]:
sam_count = unique_actors_count[unique_actors_count.index == 'Samuel L. Jackson - 2231']
sam_count

col_0,count
row_0,Unnamed: 1_level_1
Samuel L. Jackson - 2231,67


In [21]:
sam_count.values[0][0]

67

### Create Actors Features

In [22]:
actors_string_id_by_id = {}
actor_counts_by_string_id = {}
unique_actors_count_dict = unique_actors_count.to_dict()['count']

def actor_has_more_than_x_movies(actor, number_of_movies = 3):
    string_id = str(actor['id'])

    # First, get the unique id created for the actor
    actor_string_id = ''
    if string_id in actors_string_id_by_id:
        actor_string_id = actors_string_id_by_id[string_id]
    else:
        actor_string_id = actor_to_id_string(actor)
        actors_string_id_by_id[string_id] = actor_string_id
    
    # Now let's see how many movies this actor has played in
    actor_count = unique_actors_count_dict[actor_string_id]
    return actor_count > number_of_movies

In [23]:
actor_has_more_than_x_movies(movies.cast[0][2], 1)

True

#### Create a new feature that represents all actors that show up in more than 3 movies (and are not sam l jackson)

In [24]:
movies_actors_ids = []
for actors in movies.cast:
    movie_actors_ids = []
    for actor in actors:
        # if (not actor_has_more_than_x_movies(actor, 3)):
            # continue
        movie_actors_ids.append(str(actor['id']))
    movies_actors_ids.append(' '.join(movie_actors_ids))

movies['actor_ids'] = movies_actors_ids
cleaned = movies.set_index('id').actor_ids.str.split(' ', expand=True).stack()
actor_features = pd.get_dummies(cleaned, prefix='actor').groupby(level=0).sum()

actor_features['id'] = actor_features.index
actor_features = actor_features.reset_index(drop=True)

## Actor Vectorization

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
CountVectorizer(analyzer='word', binary=True)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [26]:
movie_vector = vectorizer.fit_transform(movies_actors_ids).toarray()

In [27]:
actor_feature_names = vectorizer.get_feature_names()

In [28]:
actor_vector_works = pd.DataFrame(movie_vector, columns=actor_feature_names)

In [29]:
actor_vector_works['id'] = movies.id

### label records as having Samuel L. Jackson

In [30]:
movies.samuel = actor_vector_works['2231'] == 1

In [31]:
movies_with_samuel = movies[movies.samuel == True]

#### There should be 67 movies with Samuel L Jackson

In [32]:
len(movies_with_samuel)

67

In [33]:
baseline_prediction = len(movies_with_samuel) / len(movies)
1 - baseline_prediction

0.9860503851759317

# Create x/y

In [34]:
unimportant_features = [
    'genres',
    'homepage',
    'keywords',
    'original_language',
    'original_title',
    'overview',
    'production_companies',
    'spoken_languages',
    'status',
    'tagline',
    'title',
    'cast',
    'crew',
    'movie_id',
    'actor_ids'
]
important_features = [
    'id',
#    'budget',
#    'popularity',
#    'vote_average',
#    'vote_count'
]

movies_with_important_features = movies[important_features]

In [35]:
y = movies.samuel
x = pd.merge(left=movies_with_important_features, right=actor_vector_works, left_on='id', right_on='id')

In [36]:
len(x.columns)

54581

In [37]:
# remove id from dataframe
x = x[x.columns.difference(['id'])]

### Create Cross-validation (20% holdout)

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=3)

In [72]:
X_train.shape, y_train.shape

((3842, 54580), (3842,))

#### We can create an x variable with what we want directly, but can not assign it to the dataframe

In [40]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [41]:
clf.score(X_train, y_train)

1.0

In [42]:
clf.score(X_test, y_test)

0.9885535900104059

## Get Accuracy

In [54]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import itertools

In [73]:
def show_confusion_matrix(y, predict_true, x):
    cm = confusion_matrix(y, predict_true)
    cm = pd.DataFrame(cm)
    cm.columns = ['Negative', 'Positive']
    cm.index = ['Negative', 'Positive']
    print(cm)
    
    return
    cmap=plt.cm.Blues
    classes = x.columns
    tick_marks = np.arange(len(classes))
    title = 'Confusion Matrix'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    
    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [74]:
def get_accuracy(model, x, y):
    predict_true = model.predict(x)
    print('Number of positive predictions {}'.format(len(predict_true[predict_true == True])))
    print('')
    print('Accuracy {}'.format(accuracy_score(y, predict_true)))
    print('')
    show_confusion_matrix(y, predict_true, x)

In [75]:
print('Accuracy of Training Data')
get_accuracy(clf, X_train, y_train)

Accuracy of Training Data
Number of positive predictions 56

Accuracy 1.0

          Negative  Positive
Negative      3786         0
Positive         0        56


In [76]:
print('Accuracy of Test Data')
get_accuracy(clf, X_test, y_test)

Accuracy of Test Data
Number of positive predictions 0

Accuracy 0.9885535900104059

          Negative  Positive
Negative       950         0
Positive        11         0


#### Remove Samuel from list of actors

In [88]:
def remove_samuel(X_train, X_test):
    return [
        X_train[X_train.columns.difference(['2231'])]
        ,X_test[X_test.columns.difference(['2231'])]
    ]
X_train, X_test = remove_samuel(X_train, X_test)

In [78]:
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [79]:
print('Accuracy of Training Data')
get_accuracy(clf, X_train, y_train)

Accuracy of Training Data
Number of positive predictions 94

Accuracy 0.9901093180635085

          Negative  Positive
Negative      3748        38
Positive         0        56


In [80]:
print('Accuracy of Test Data')
get_accuracy(clf, X_test, y_test)

Accuracy of Test Data
Number of positive predictions 29

Accuracy 0.9583766909469302

          Negative  Positive
Negative       921        29
Positive        11         0


## Add Additional Features

In [101]:

important_features = [
    'id',
    'budget',
    'popularity',
    'vote_average',
    'vote_count'
]

movies_with_important_features = movies[important_features]
y = movies.samuel
x = pd.merge(left=movies_with_important_features, right=actor_vector_works, left_on='id', right_on='id')
x = x[x.columns.difference(['id'])]

In [102]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=3)

In [103]:
X_train, X_test = remove_samuel(X_train, X_test)

In [104]:
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB(priors=None)

In [105]:
x

Unnamed: 0,10,100,10000,1000083,1000089,1000227,1000228,1000241,100040,100062,...,999771,999772,999790,9998,999817,9999,budget,popularity,vote_average,vote_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,237000000,150.437577,7.2,11800
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,300000000,139.082615,6.9,4500
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,245000000,107.376788,6.3,4466
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,250000000,112.312950,7.6,9106
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,260000000,43.926995,6.1,2124
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,258000000,115.699814,5.9,3576
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,260000000,48.681969,7.4,3330
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,280000000,134.279229,7.3,6767
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,250000000,98.885637,7.4,5293
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,250000000,155.790452,5.7,7004


In [106]:
print('Accuracy of Training Data')
get_accuracy(clf, X_train, y_train)

Accuracy of Training Data
Number of positive predictions 104

Accuracy 0.9635606454971369

          Negative  Positive
Negative      3692        94
Positive        46        10


In [107]:
print('Accuracy of Test Data')
get_accuracy(clf, X_test, y_test)

Accuracy of Test Data
Number of positive predictions 21

Accuracy 0.9667013527575442

          Negative  Positive
Negative       929        21
Positive        11         0


# Support Vector Machine

In [108]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [109]:
print('Accuracy of Training Data')
get_accuracy(clf, X_train, y_train)

Accuracy of Training Data
Number of positive predictions 15

Accuracy 0.989328474752733

          Negative  Positive
Negative      3786         0
Positive        41        15


In [110]:
print('Accuracy of Test Data')
get_accuracy(clf, X_test, y_test)

Accuracy of Test Data
Number of positive predictions 0

Accuracy 0.9885535900104059

          Negative  Positive
Negative       950         0
Positive        11         0


# TODO

Perform deap learning prediction https://www.kaggle.com/liwste/simple-deep-mlp-with-keras