In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from glob import glob
import os
import json
import re
from collections import defaultdict
import itertools

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Data pre-processing

In [2]:
df_movies = pd.read_csv('dataset/tmdb_5000_movies.csv')
df_movies = df_movies.dropna(subset=['overview', 'release_date', 'runtime'])
df_movies.isnull().sum()
df_movies.to_csv("dataset/tmdb_5000_movies_nonull.csv")

# Data Shape

In [3]:
df_movies = pd.read_csv('dataset/tmdb_5000_movies_nonull.csv')
print(df_movies.shape)
df_movies.head()

(4799, 21)


Unnamed: 0.1,Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
df_credits = pd.read_csv('dataset/tmdb_5000_credits.csv')
print(df_credits.shape)
df_credits.head()

(4803, 4)


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Load two datasets and merge them by movie_id

In [5]:
df = pd.read_csv('dataset/tmdb_5000_movies_nonull.csv')
# df_movies.loc[:,['id', 'keywords', 'genres', 'title']]
df_credits = pd.read_csv('dataset/tmdb_5000_credits.csv')
credits_sub = df_credits.loc[:, ['movie_id', 'cast']].rename(columns={
                                                             'movie_id': 'id'})

df = df[['id', 'budget', 'genres', 'keywords', 'original_language',
         'overview', 'popularity', 'production_companies',
         'production_countries', 'release_date', 'revenue', 'runtime',
         'spoken_languages', 'tagline', 'title', 'vote_average', 'vote_count']]
df = df.merge(credits_sub)

# df.sort_values(by='release_date', ascending=False).head(10)

In [6]:
print(df.shape)
df.head()

(4799, 18)


Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast
0,19995,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""..."
1,285,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]","At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa..."
2,206647,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",A Plan No One Escapes,Spectre,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr..."
3,49026,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",The Legend Ends,The Dark Knight Rises,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba..."
4,49529,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]","Lost in our world, found in another.",John Carter,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."


## Add subtitles (US movies only)

In [7]:
with open('dataset/subtitles/subtitles.json', 'r') as f:
    sub_dict = json.load(f)
sub_dict = defaultdict(str, sub_dict)

In [8]:
#df["subtitles"] = df.title.apply(lambda title: sub_dict[title]) # keep blocks separated
df["subtitles"] = df.title.apply(lambda title: "\n\n".join(sub_dict[title])) # single string

In [9]:
df[df["subtitles"] == ''].shape

(1359, 19)

In [10]:
df['subtitles'].head(10)

0    When I was lying there\nin the VA hospital,\n\...
1    [man] "ln order to affect a timely halt\nto de...
2    Where are you going?\n\nI won't be long.\n\nWe...
3    I knew Harvey Dent.\n\nI was his friend.\n\nAn...
4    Subtitles downloaded from Podnapisi.NET\n\nMar...
5                                                     
6    OK, Mr. Kline?\n\nDo not move.\n\nWhat happene...
7    Report\nto your stations immediately.\n\nThis ...
8    Subtitles downloaded from Podnapisi.NET\n\nI k...
9    KRYPTONIAN SHIP VOICE: Welcome.\n\nAnalysis re...
Name: subtitles, dtype: object

## Drop movies with no genres information

In [11]:
# drop movies with no genre info
for i, row in df.iterrows():
    if row['genres'] == '[]':
        df.drop(i, inplace=True)
df = df.dropna()

## Movies  Genres Information

In [12]:
all_genres = defaultdict(int)

for row in df.genres:
    for item in json.loads(row):
#         print(item)
        all_genres[item['name']] += 1

In [13]:
genres_list = sorted(all_genres.items(), key=lambda x: x[1], reverse=True)
genres_list

[('Drama', 1801),
 ('Comedy', 1442),
 ('Thriller', 1151),
 ('Action', 1025),
 ('Romance', 726),
 ('Adventure', 720),
 ('Crime', 627),
 ('Science Fiction', 488),
 ('Horror', 470),
 ('Family', 437),
 ('Fantasy', 388),
 ('Mystery', 316),
 ('Animation', 201),
 ('History', 153),
 ('Music', 149),
 ('War', 115),
 ('Western', 69),
 ('Documentary', 56),
 ('Foreign', 14),
 ('TV Movie', 3)]

## Column transformation

**Steps:**  
+ Filter columns with specific filtering values
+ Convert json-format cells to list
+ Extract cast gender and calculate the proportion of female cast and male cast for each movie

In [14]:
def convert_list(cell):
    """convert the json format to a list of categories"""
    kw_list = []
    for kw in json.loads(cell):
        kw_list.append(kw['name'])
    return kw_list


def larger_n(col, n):
    """filter the column"""
    keywords = defaultdict(int)
    for row in df[col]:
        row = json.loads(row)
        for entry in row:
            keywords[entry['name']] += 1
    kw_cnt = sorted(keywords.items(), key=lambda x: -x[1])
    return [kw[0] for kw in kw_cnt if kw[1] >= n]


def extract_gender(cell):
    """Extract cast gender"""
    female = 0
    male = 0
    for item in json.loads(cell):
        if item['gender'] == 1:
            female += 1
        elif item['gender'] == 2:
            male += 1
        else:
            continue
    return female, male


def concat_names(cell):
    """Concatenate first names and last names"""
    names = []
    for name in cell:
        names.append(name.replace(' ', ''))
    return names


def list2str(cell):
    """Convert list to string"""
    return ' '.join(cell)


def transform_cols(df, cols_to_transform):
    """Transform columns of a dataframe.
    cols_to_transform should be a dict(col_name: filter value n)
    """
    for col_name in cols_to_transform.keys():
        larger_col = larger_n(col_name, cols_to_transform[col_name])
        if col_name == 'cast':
            gen = df[col_name].apply(extract_gender)
            df['female_pct'] = gen.apply(lambda x: x[0]/(x[0]+x[1]+0.001))
            df['male_pct'] = gen.apply(lambda x: x[1]/(x[0]+x[1]+0.001))

        df[col_name] = df[col_name].apply(convert_list)\
            .apply(lambda cell: [kw for kw in cell if kw in larger_col])
    return df

def tokenize(text):
    """
    A tokenizer, remove special characters punctuation and replace them with a space, stem the word
    """
    text = text.lower()
    cleanString = re.sub('[^a-zA-Z]',' ', text)
    words = nltk.word_tokenize(cleanString)
    english = list(ENGLISH_STOP_WORDS)
    goodwords = [w for w in words if w not in english]
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in goodwords]
    return stemmed

In [15]:
cols_to_transform = {'keywords': 30,
                     'genres': 0,
                     'production_companies': 5,
                     'production_countries': 3,
                     'spoken_languages': 10,
                     'cast': 2}

df_movies = transform_cols(df, cols_to_transform)

In [16]:
df_movies = df_movies[df_movies["subtitles"] != ''] # use only movies with subtitles
df_movies = df_movies.reset_index(drop=True)
df_movies.shape

(3037, 21)

# Feature Engineering

## Runtime - drop 0 runtime

In [17]:
df_movies.loc[df_movies.runtime == 0]

Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,...,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast,subtitles,female_pct,male_pct
2966,285743,0,"[Music, Horror]","[musical, sequel]",en,The Devil's Carnival: Alleluia! is the second ...,0.674398,[],[United States of America],2016-03-29,...,0.0,[],Hell ain't got a prayer.,Alleluia! The Devil's Carnival,6.0,2,"[Terrance Zdunich, Paul Sorvino, Nivek Ogre, A...",Advertise your product or brand here\ncontact ...,0.363603,0.636306


Checking the movies that have no runtime data, we can see that these six movies also lack revenue data, production companies, keywords, and all have very few vote counts. So we can think that removing these six rows will not have significant influence on our model.  

In [18]:
df_movies = df_movies.drop(df_movies.loc[df_movies.runtime == 0].index)

In [19]:
df_movies = df_movies.reset_index(drop=True)

## Overview -- TfidfVectorizer

In [20]:
import re
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import *

In [None]:
import re
import nltk
nltk.download()
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import *

df_movies = df_movies.loc[df_movies.overview.apply(lambda x: len(x) > 10)]
# df_movies = df_movies.reset_index(drop=True)

tfidf = TfidfVectorizer(tokenizer=tokenize,stop_words='english', binary=True,
                        ngram_range=(1, 2), min_df=0.001)
overview_vec = tfidf.fit_transform(df_movies['overview'])

overview = pd.DataFrame(overview_vec.toarray(), columns=tfidf.get_feature_names()).add_prefix('ov_')
# overview_vec = overview_vec.todense()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [None]:
overview.head(5)

## Subtitles -- Word2Vec

### Tfidf

Tfidf-ing all the subtitles will take some time. Be prapared when you run it!

In [None]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', binary=True,
                        ngram_range=(1, 1), min_df=0.001)
subtitles_vec = tfidf.fit_transform(df_movies['subtitles'])
subtitles = pd.DataFrame(subtitles_vec.toarray(), columns=tfidf.get_feature_names()).add_prefix('sub_')

In [None]:
subtitles.head(5)

### Doc2Vec

In [None]:
def text_to_words(text):
    # remove punctuation and whitespace
    # but keep hyphens and apostrophes
    filtered_text = re.sub(r'[^\w\'\s-]',
                           '', text) 
    return word_tokenize(filtered_text.lower())

In [None]:
idxs = df_movies.id.tolist()
%time sub_words = [text_to_words(text) for text in df_movies.subtitles.tolist()]
subs = dict(zip(idxs, sub_words))

In [None]:
%time tagged_data = [TaggedDocument(words=word_list, tags=[index]) for index, word_list in subs.items()]

#### Model 1: not pretrained

In [None]:
model = Doc2Vec(vector_size=50, min_count=2, workers=4)
%time model.build_vocab(tagged_data)

In [None]:
%time model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

#### Model 2: using pretrained word2vec weights

In [None]:
#download glove.6B.50d.txt from
#https://www.kaggle.com/rtatman/glove-global-vectors-for-word-representation/version/1
# run following command to convert to word2vec format (extra line at the top)
#python -m gensim.scripts.glove2word2vec -i glove.6B.50d.txt -o glove.6B.50d.word2vec.txt
#https://github.com/RaRe-Technologies/gensim/blob/4543646d3fe3496e11bc935e72cbf9b18504442e/gensim/models/word2vec.py

In [None]:
word2vec = Word2Vec(size=50)
%time word2vec.build_vocab([word for text in tagged_data for word in text])

In [None]:
# lockf=0 doesn't train word vectors any further, 1.0 does.
glove_path = "/Users/xhan/OneDrive/dataset/glove/glove.6B.50d.word2vec.txt"
%time word2vec.intersect_word2vec_format(glove_path, lockf=1.0, binary=False, encoding='utf8', unicode_errors='strict')

In [None]:
model_pretrained = Doc2Vec(vector_size=50, min_count=2, workers=4)
model_pretrained.build_vocab(tagged_data)
model_pretrained.wv = word2vec.wv
%time model_pretrained.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
def infer_docvecs(df):
    docvecs = []
    for index in tqdm(df.id.tolist()):
        word_list = subs[index]
        vec = model_pretrained.infer_vector(word_list, steps=20)
        docvecs.append(vec)
    docvecs = np.array(docvecs, dtype=np.float32)
    return docvecs

In [None]:
movies_docvecs = infer_docvecs(df_movies)
movies_docvecs.shape

## Cast names -- CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# concatenate names and join the list of names to a string
df_movies['cast'] = df_movies['cast'].apply(concat_names).apply(list2str)

vectorizer = CountVectorizer(ngram_range=(1, 1))
cast_vect = vectorizer.fit_transform(df_movies['cast'])

cast = pd.DataFrame(cast_vect.todense(), columns=vectorizer.get_feature_names()).add_prefix('cast_')

## Budget + Revenue -- deal with abnormal values

There are some movies in the data set has less than $100 budget, which obviously does not make any sense. So we need to deal with these abnormal values.

In [None]:
# number of movies with less than $100 budget
df_movies.loc[(df_movies.budget<100)].shape[0]

For these 107 movies with less than \\$100 budget and more than \\$10000 revenue, we use `revenue + overview + popularity + release_date` to predict the actual budget.

In [None]:
df_movies.loc[(df_movies.budget<100) & (df_movies.revenue>10000)].shape

In [None]:
from sklearn import ensemble


def predict_budget1(dataset):
    """Predict budget of movies with less than $100 budget and more than $10000 revenue"""
    data_p = dataset[['revenue', 'popularity', 'budget']]
    data_p = pd.concat([data_p, overview], axis=1)

    x_train = data_p.loc[~(data_p.budget < 100) & (
        data_p.revenue > 10000)].drop('budget', 1)
    y_train = data_p.loc[~(data_p.budget < 100) &
                         (data_p.revenue > 10000)]['budget']
    x_test = data_p.loc[(data_p.budget < 100) & (
        data_p.revenue > 10000)].drop('budget', 1)

    rfr = ensemble.RandomForestRegressor(random_state=42)
    rfr.fit(x_train, y_train)

    y_test = rfr.predict(x_test)
    return y_test


df_movies.loc[(df_movies.budget < 100) & 
              (df_movies.revenue > 10000), 'budget'] = predict_budget1(df_movies)

In [None]:
# check
df_movies.loc[(df_movies.budget<100) & (df_movies.revenue>10000)].shape

For others whose revenue are below $10000, use `overview + popularity + release_date` to predict budget

In [None]:
df_movies.loc[(df_movies.budget<100)].shape

In [None]:
def predict_budget2(dataset):
    """Predict budget of movies with less than $100 budget and less than $10000 revenue"""
    data_p = dataset[['popularity', 'budget']]
    data_p = pd.concat([data_p, overview], axis=1)

    x_train = data_p.loc[~(data_p.budget < 100)].drop('budget', 1)
    y_train = data_p.loc[~(data_p.budget < 100)]['budget']
    x_test = data_p.loc[(data_p.budget < 100) & (
        df_movies.revenue <= 10000)].drop('budget', 1)

    rfr = ensemble.RandomForestRegressor(random_state=42)
    rfr.fit(x_train, y_train)
#     print(x_test.shape)
    y_test = rfr.predict(x_test)
    return y_test


df_movies.loc[(df_movies.budget < 100) & (df_movies.revenue <=
                                          10000), 'budget'] = predict_budget2(df_movies)

In [None]:
df_movies.loc[(df_movies.budget<100)].shape

## Keywords + tagline

**Maybe we should not use tagline as a feature, two reasons:**  
+ tagline is too short for each movie, and has almost no overlap across movies
+ Even, as a human, I cannot have a good guess of the movie genres reading only the tagline when I already know the movie.

In [None]:
# check all keywords and their count
kw_all = defaultdict(int)

for cell in df_movies.keywords:
    for kw in cell:
        kw_all[kw] += 1

print(len(kw_all))
sorted(kw_all.items(), key=lambda x: x[1], reverse=True)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

keywords_dummies = pd.DataFrame(mlb.fit_transform(
    df_movies['keywords']), columns=mlb.classes_).add_prefix('kw_')

## production_companies

In [None]:
prod_com = pd.DataFrame(mlb.fit_transform(
    df_movies['production_companies']), columns=mlb.classes_).add_prefix('prodComp_')

## production_countries

In [None]:
prod_coun = pd.DataFrame(mlb.fit_transform(
    df_movies['production_countries']), columns=mlb.classes_).add_prefix('prodComp_')

## release_date

In [None]:
# df_movies['year_bucket'] = 
years = pd.get_dummies(df_movies.apply(lambda row: str(row.release_date)[:3], axis=1))
years['before97'] = years.iloc[:,:6].sum(axis=1)
years = years.iloc[:,6:]

## popularity + vote_average + vote_count

In [None]:
# df_movies.loc[df_movies['vote_average']!=0].vote_average.mean()
df_movies.loc[df_movies['vote_average']==0, 'vote_average'] = df_movies.loc[df_movies['vote_average']!=0].vote_average.mean()
df_movies.loc[df_movies['vote_count']==0, 'vote_count'] = df_movies.loc[df_movies['vote_count']!=0].vote_count.mean()

# Result-dataset

In [None]:
df_movies = df_movies.reset_index(drop=True)

In [None]:
df_movies.columns

Besides the modified dataframe, we also have three vectorized features: overview, cast names, and keywords.  


**Features** (so far ready to use):  
+ **numeric**: budget, revenue, female_pct, male_pct, runtime 
+ **text(vectorized)**: keywords, overview, cast(names)

**Labels**:  
20 genres, i.e., `genre_*`

In [None]:
# labels
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genres_dummies = pd.DataFrame(mlb.fit_transform(
    df_movies['genres']), columns=mlb.classes_).add_prefix('genre_')

In [None]:
df_movies.head(5)

In [None]:
# features (subtitles doc2vec)
movies_docvecs = pd.DataFrame(movies_docvecs)
features = pd.concat((df_movies.loc[:, ['budget', 'revenue', 'female_pct', 'male_pct', 'runtime', 
                                        'vote_average', 'vote_count']],
                      overview, cast, keywords_dummies, prod_com, prod_coun, years, movies_docvecs), axis=1)

In [None]:
features.head(5)

# Modelling

The result is very interesting.  

When I use only cast names and gender proportion, the prediction tends to give less genres for each movie than the ground truth (please refer to the `cast_vs_genres` notebook); however, using the eight features I mentioned above, the model tends to give more genres than the ground truth.

In [None]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_labels, test_labels = train_test_split(features, genres_dummies,
                                                                    test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier
    
clf = OneVsRestClassifier(MultinomialNB(alpha=1.0))
# clf = RandomForestClassifier(n_estimators=100)

clf.fit(train_data, train_labels)
predictions = clf.predict(test_data)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [None]:
accuracy = accuracy_score(y_pred=predictions, y_true=np.array(test_labels))
f1 = f1_score(y_pred=predictions, y_true=np.array(test_labels), average='weighted')
precision = precision_score(y_pred=predictions, y_true=np.array(test_labels), average='weighted')
recall = recall_score(y_pred=predictions, y_true=np.array(test_labels), average='weighted')

print("Subset Accuracy: {}".format(accuracy)) # subset accuracy
print("Global F1 score: {}".format(f1))
print("Global Precision: {}".format(precision))
print("Global Recall: {}".format(recall))

In [None]:
print(classification_report(test_labels, predictions, target_names=mlb.classes_))

In [None]:
from sklearn.metrics import hamming_loss

train_predictions = clf.predict(train_data)

print(f"Train Hamming Loss: {hamming_loss(train_labels, train_predictions)}\n")
print(f"Test Hamming Loss: {hamming_loss(test_labels, predictions)}\n")

# Visualize predicted genres

In [None]:
classes = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
           'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
           'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
           'TV Movie', 'Thriller', 'War', 'Western']

In [None]:
def undummy_genres(genres_dummies):
    """Match dummy-ized predictions with genres."""
    genres = []
    for row in genres_dummies:
        indices = [i for i, x in enumerate(row) if x == 1]
        genres.append([classes[i] for i in indices])

    return genres


def list2pairs(genres_list):
    """Create a list of paired genres"""
    pairs_list = []
    for genres in genres_list:
        for pair in list(itertools.combinations(genres, 2)):
            pairs_list.append(pair)

    return pairs_list


def create_visGrid(classes, genres_list):
    """Create a grid of pair counts."""
    all_pairs = list(itertools.combinations(classes, 2))
    pairs_list = list2pairs(genres_list)

    nr_ids = np.unique(all_pairs)
    visGrid = np.zeros((len(nr_ids), len(nr_ids)))
    for p in pairs_list:
        visGrid[np.argwhere(nr_ids == p[0]), np.argwhere(nr_ids == p[1])] += 1
        if p[1] != p[0]:
            visGrid[np.argwhere(nr_ids == p[1]),
                    np.argwhere(nr_ids == p[0])] += 1
    visGrid = visGrid.astype(int)

    return nr_ids, visGrid


def make_heatmap(nr_ids, visGrid, ax=None, cbar=True, cbar_size=1.0):
    # shows how often the genres occur together
    annot_lookup = []
    for i in range(len(nr_ids)):
        annot_lookup.append(nr_ids[i])

    # create mask
    mask = np.zeros_like(visGrid)
    mask[np.triu_indices_from(mask)] = True

    # make the heatmap
    sns.set(rc={'figure.figsize': (10, 8)})
    with sns.axes_style("white"):
        sns.heatmap(visGrid, mask=mask, square=True, vmin=0, vmax=visGrid.max(),
                    xticklabels=annot_lookup, yticklabels=annot_lookup, ax=ax, cbar=cbar,
                    cbar_kws={"shrink": cbar_size})

## Heatmap (Tfdif subtitles)

### Genres pair occurence heatmap - predicted genres

In [None]:
pred_classes = undummy_genres(predictions)
nr_ids, visGrid_pred = create_visGrid(classes, pred_classes)

### Genres pair occurence heatmap - test labels

In [None]:
test_labels_notdummy = undummy_genres(np.asarray(test_labels))
nr_ids, visGrid_test = create_visGrid(classes, test_labels_notdummy)

### Genres pair occurence heatmap - train labels

In [None]:
train_labels_notdummy = undummy_genres(np.asarray(train_labels))
nr_ids, visGrid_train = create_visGrid(classes, train_labels_notdummy)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(20, 20))
make_heatmap(nr_ids, visGrid_pred, ax=ax1, cbar_size=0.2)
ax1.set_title("Predicted Genres")
make_heatmap(nr_ids, visGrid_test, ax=ax2, cbar_size=0.2)
ax2.set_title("Test Genres (Groundtruth)")
make_heatmap(nr_ids, visGrid_train, ax=ax3, cbar_size=0.2)
ax3.set_title("Train Data Genres")
plt.show()

## Heatmap (Doc2Vec subtitles)

### Genres pair occurence heatmap - predicted genres

In [None]:
pred_classes = undummy_genres(predictions)
nr_ids, visGrid_pred = create_visGrid(classes, pred_classes)

### Genres pair occurence heatmap - test labels

In [None]:
test_labels_notdummy = undummy_genres(np.asarray(test_labels))
nr_ids, visGrid_test = create_visGrid(classes, test_labels_notdummy)

### Genres pair occurence heatmap - train labels

In [None]:
train_labels_notdummy = undummy_genres(np.asarray(train_labels))
nr_ids, visGrid_train = create_visGrid(classes, train_labels_notdummy)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(20, 20))
make_heatmap(nr_ids, visGrid_pred, ax=ax1, cbar_size=0.2)
ax1.set_title("Predicted Genres")
make_heatmap(nr_ids, visGrid_test, ax=ax2, cbar_size=0.2)
ax2.set_title("Test Genres (Groundtruth)")
make_heatmap(nr_ids, visGrid_train, ax=ax3, cbar_size=0.2)
ax3.set_title("Train Data Genres")
plt.show()