In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import os, shutil

from sklearn.model_selection import train_test_split

### Load Movies

In [2]:
# %load load-movies.py
import pandas as pd

movies = pd.read_csv('./data/tmdb_5000_movies.csv')
credits = pd.read_csv('./data/tmdb_5000_credits.csv')

In [3]:
# %load clean-movies.py
import pandas as pd
import numpy as np
import sklearn as sk
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# merge two datasets
movies = pd.merge(left=movies, right=credits, left_on='id', right_on='movie_id')

# turn json into python objects
json_columns = ['genres', 'keywords', 'production_countries', 'spoken_languages', 'cast', 'crew']
for column in json_columns:
  movies[column] = movies[column].apply(json.loads, encoding="utf-8")



######## Actors ########

# create functions that will help with extracting actor information
def actor_to_id_string(actor):
  return '{} - {}'.format(actor['name'], actor['id'])

def cast_to_actors(cast):
  actors = []
  for person in cast:
    actors.append(actor_to_id_string(person))
  return actors

# Create a new column (array) from cast
movies.actors = movies.cast.apply(cast_to_actors)

# Create a list of all actors
all_actors = []
for actors in movies.actors:
  for actor in actors:
    all_actors.append(actor)
actors = pd.Series(all_actors)
unique_actors_count = pd.crosstab(actors, columns='count')

# Create function to determine if actor has been in more than 3 films
actors_string_id_by_id = {}
actor_counts_by_string_id = {}
unique_actors_count_dict = unique_actors_count.to_dict()['count']

def actor_has_more_than_x_movies(actor, number_of_movies = 3):
    string_id = str(actor['id'])

    # First, get the unique id created for the actor
    actor_string_id = ''
    if string_id in actors_string_id_by_id:
        actor_string_id = actors_string_id_by_id[string_id]
    else:
        actor_string_id = actor_to_id_string(actor)
        actors_string_id_by_id[string_id] = actor_string_id
    
    # Now let's see how many movies this actor has played in
    actor_count = unique_actors_count_dict[actor_string_id]
    return actor_count > number_of_movies

# Create a list of all actors that have been in more than 3 movies
movies_actors_ids = []
for actors in movies.cast:
  movie_actors_ids = []
  for actor in actors:
    if (not actor_has_more_than_x_movies(actor, 3)):
      continue
    movie_actors_ids.append(str(actor['id']))
  movies_actors_ids.append(' '.join(movie_actors_ids))

# Vectorize the list of actors
def get_actor_feature_name(id):
  return actors_string_id_by_id[id] + ' (actor)'
vectorizer = CountVectorizer()
movie_vector = vectorizer.fit_transform(movies_actors_ids).toarray()

actor_feature_ids = vectorizer.get_feature_names()
actor_feature_names = []
for id in actor_feature_ids:
  actor_name = get_actor_feature_name(id)
  actor_feature_names.append(actor_name)

actor_vector_works = pd.DataFrame(movie_vector, columns=actor_feature_names)
actor_vector_works['id'] = movies.id

# Create the label for whether Samuel L Jackson was in the film (samuel)
movies['samuel'] = actor_vector_works[get_actor_feature_name('2231')] == 1





######## GENRES ########

def unique_genres(movies_genres):
  genre_map = {}
  for genres in movies_genres:
    for genre in genres:
      name = genre['name']
      if not name in genre_map:
        genre_map[name] = 0
      genre_map[name]+=1
  return pd.Series(genre_map).sort_values(ascending=False)

def get_genres_feature_name(genre):
  return genre['name'] + ' (genre)'

movies_genres_ids = []
movie_genres_names_by_id = {}
for genres in movies.genres:
  movie_genres_ids = []
  for genre in genres:
    genre_feature_name = get_genres_feature_name(genre)
    movie_genres_ids.append(str(genre['id']))
    movie_genres_names_by_id[str(genre['id'])] = genre_feature_name
  movies_genres_ids.append(' '.join(movie_genres_ids))

genre_vectorization = CountVectorizer()

movie_vector = genre_vectorization.fit_transform(movies_genres_ids).toarray()
genre_feature_ids = genre_vectorization.get_feature_names()

def get_feature_names_by_ids(ids):
  genre_feature_names = []

  for id in ids:
    genre_feature_names.append(movie_genres_names_by_id[id])
  return genre_feature_names

genre_feature_names = get_feature_names_by_ids(genre_feature_ids)

genre_vector = pd.DataFrame(movie_vector, columns=genre_feature_names)
genre_vector['id'] = movies.id

features_to_normalize = [
  'budget',
  'popularity',
  'vote_average',
  'vote_count',
  'revenue'
]

for feature in features_to_normalize:
  feature_array = movies[feature].values.astype('float64').reshape(1, -1)
  movies[feature + '_original'] =     movies[feature]
  movies[feature] = normalize(feature_array, axis=1, norm='max')[0]






### Load Images

In [4]:
posters_dir = 'data/posters'
poster_filenames = os.listdir(posters_dir)

### Create DataFrame from filenames and movie ids

In [5]:
def get_movie_id_from_filename(filename):
    movie_id = filename.split('.')[0]
    return movie_id

file_movie_ids = []

for filename in poster_filenames:
    movie_id = get_movie_id_from_filename(filename)
    file_movie_ids.append(movie_id)

In [6]:
poster_df = pd.DataFrame({
    'movie_id': pd.Series(file_movie_ids).astype('int64'),
    'filenames': poster_filenames
})

In [7]:
type(file_movie_ids[0])

str

In [8]:
type(movies.id[0])

numpy.int64

In [9]:
movies_poster_merge = pd.merge(left=movies, right=poster_df, left_on='id', right_on='movie_id')

In [10]:
movies_poster_merge.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,cast,crew,samuel,budget_original,popularity_original,vote_average_original,vote_count_original,revenue_original,filenames,movie_id_y
0,0.623684,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.avatarmovie.com/,19995,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",0.171815,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",False,237000000,150.437577,7.2,11800,2787965087,19995.jpg,19995
1,0.789474,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://disney.go.com/disneypictures/pirates/,285,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",0.158846,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",False,300000000,139.082615,6.9,4500,961000000,285.jpg,285
2,0.644737,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...",en,Spectre,A cryptic message from Bond’s past sends him o...,0.122635,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...",False,245000000,107.376788,6.3,4466,880674609,206647.jpg,206647
3,0.657895,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",http://www.thedarkknightrises.com/,49026,"[{'id': 849, 'name': 'dc comics'}, {'id': 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,0.128272,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",False,250000000,112.31295,7.6,9106,1084939099,49026.jpg,49026
4,0.684211,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://movies.disney.com/john-carter,49529,"[{'id': 818, 'name': 'based on novel'}, {'id':...",en,John Carter,"John Carter is a war-weary, former military ca...",0.050169,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...",False,260000000,43.926995,6.1,2124,284139100,49529.jpg,49529


### Define features and labels

In [11]:
movies_poster_merge.size

153376

In [12]:
y = movies_poster_merge.samuel
X = movies_poster_merge[['filenames', 'samuel']]

### Create stratified Cross Validation

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=42)

In [14]:
X_train.size

6710

In [15]:
X_test.size

2876

In [16]:
y_train[y_train].size

47

In [17]:
y_test[y_test].size

20

### Create Directories

In [18]:
posters_test_dir = 'data/posters_test'
posters_train_dir = 'data/posters_train'
posters_sam_train_dir = os.path.join(posters_train_dir, 'sam')
posters_not_sam_train_dir = os.path.join(posters_train_dir, 'not_sam')
posters_sam_test_dir = os.path.join(posters_test_dir, 'sam')
posters_not_sam_test_dir = os.path.join(posters_test_dir, 'not_sam')

directories_to_create = [
    posters_test_dir,
    posters_train_dir,
    posters_sam_train_dir,
    posters_not_sam_train_dir,
    posters_sam_test_dir,
    posters_not_sam_test_dir,
]

In [19]:
for directory in directories_to_create:
    try:
        os.mkdir(directory)
    except:
        print('{} already exists'.format([directory]))

### Copy Files over to new directories

In [20]:
train_filenames = X_train

In [21]:
train_filenames = X_train
for index, row in train_filenames.iterrows():
    filename = row.filenames
    file_from = os.path.join(posters_dir, filename)
    file_to = ''
    if row.samuel:
        file_to = os.path.join(posters_sam_train_dir, filename)
    else:
        file_to = os.path.join(posters_not_sam_train_dir, filename)        
    shutil.copyfile(file_from, file_to)
    
test_filenames = X_test
for index, row in test_filenames.iterrows():
    filename = row.filenames
    file_from = os.path.join(posters_dir, filename)
    file_to = ''
    if row.samuel:
        file_to = os.path.join(posters_sam_test_dir, filename)
    else:
        file_to = os.path.join(posters_not_sam_test_dir, filename)        
    shutil.copyfile(file_from, file_to)