<center><h2>Predict Movie Genre for TMDB Database</h2></center>

_____


<center><img src="images/poster.jpg" width="30%"/></center>

In [22]:
import itertools
import json
import os
import re
from collections import defaultdict
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from IPython.display import display
from ipywidgets import interact, IntSlider
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tqdm import tqdm

In [84]:
def freeze_header(df, num_rows=30, num_columns=10, step_rows=1,
                  step_columns=1):
    """
    Freeze the headers (column and index names) of a Pandas DataFrame. A widget
    enables to slide through the rows and columns.

    Parameters
    ----------
    df : DataFrame to browse.
    num_rows, num_columns : size of the window displayed at once.
    step_rows, step_columns : increment of the row / column sliders.

    The function returns nothing; it renders two sliders and the current
    window of `df` through IPython's `display`.
    """
    def _slider(label, window, total, step):
        # The slider value is the exclusive END of the window, so it starts
        # at `window` (or at `total` when the frame is smaller than that).
        return IntSlider(min=min(window, total),
                         max=total,
                         step=step,
                         description=label,
                         readout=False,
                         disabled=False,
                         continuous_update=True,
                         orientation='horizontal',
                         # `slider_color` is not an IntSlider trait in
                         # ipywidgets 7+; the handle colour lives on `style`.
                         style={'handle_color': 'purple'})

    @interact(last_row=_slider('rows', num_rows, df.shape[0], step_rows),
              last_column=_slider('columns', num_columns, df.shape[1],
                                  step_columns))
    def _freeze_header(last_row, last_column):
        display(df.iloc[max(0, last_row-num_rows):last_row,
                        max(0, last_column-num_columns):last_column])

# Outline

______
    -- Purpose
    -- Introduce the dataset
    -- Feature selection and data engineering
    -- Models and Results
    -- Discussion

# The Purpose 
    -- Predicting movie genre for TMDB movie
    -- Auto tagging new movie
    -- Possibly fix existing problem

# Dataset - features and observations

______

    Dataset: TMDB(The Movie Database) and Kaggle
    General info : 
        1.cast
        2.budget
        3.overview
        4.production company
        5.voting
    Observations : 4799
    Additional data: 
        1. 3440 of them have subtitles
        2. subtitles were retrieved with an API

## Dataset 

-----

    - Ground Truth is Messy
        -- user contributed database, everyone can edit it
        -- some movies get little attention
_____

<img src="images/edit_genres.jpg" width="60%"/>

In [24]:
"""
Load data
"""
# Movie metadata and the credits table (only its cast column is used).
df = pd.read_csv('dataset/tmdb_5000_movies_nonull.csv')
df_credits = pd.read_csv('dataset/tmdb_5000_credits.csv')

# Rename the credits key so that both frames share the `id` column.
credits_sub = df_credits[['movie_id', 'cast']].rename(
    columns={'movie_id': 'id'})

df = df.loc[:, ['id', 'budget', 'genres', 'keywords', 'original_language',
                'overview', 'popularity', 'production_companies',
                'production_countries', 'release_date', 'revenue', 'runtime',
                'spoken_languages', 'tagline', 'title', 'vote_average',
                'vote_count']]
# Inner join on the columns the two frames have in common.
df = pd.merge(df, credits_sub)

In [25]:
"""
Add subtitles
"""
with open('dataset/subtitles/subtitles.json', 'r') as f:
    # Titles missing from the file fall back to '' via the defaultdict.
    sub_dict = defaultdict(str, json.load(f))

# Join every movie's subtitle pieces into a single string.
df["subtitles"] = df.title.apply(
    lambda title: "\n\n".join(sub_dict[title]))

In [27]:
# drop movies with no genre info (an empty JSON list), then every row that
# still has a missing value; one boolean mask replaces the row-by-row drop
df = df.loc[df['genres'] != '[]'].dropna()

# Count how many of the remaining movies carry each genre name.
all_genres = defaultdict(int)
for row in df.genres:
    for item in json.loads(row):
        all_genres[item['name']] += 1

# (genre, count) pairs, most frequent genre first.
genres_list = sorted(all_genres.items(), key=lambda x: x[1], reverse=True)

In [28]:
def convert_list(cell):
    """Parse a JSON array of objects and return their ``name`` values in order."""
    return [entry['name'] for entry in json.loads(cell)]


def larger_n(col, n, frame=None):
    """Return the names that occur at least `n` times in a JSON column.

    Parameters
    ----------
    col : name of a column whose cells are JSON arrays of objects with a
        ``name`` key.
    n : minimum number of occurrences for a name to be kept.
    frame : table to read `col` from. When omitted, the notebook-level `df`
        is used, which is what the original two-argument call did.

    Returns
    -------
    list of names, most frequent first; names with equal counts keep the
    order in which they were first seen.
    """
    source = df if frame is None else frame
    counts = defaultdict(int)
    for cell in source[col]:
        for entry in json.loads(cell):
            counts[entry['name']] += 1
    # sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    return [name for name, count in ranked if count >= n]


def extract_gender(cell):
    """Count cast members by gender code in a JSON cast list.

    Returns a ``(female, male)`` tuple: the number of entries whose
    ``gender`` equals 1 and the number whose ``gender`` equals 2. Any other
    code is ignored.
    """
    codes = [member['gender'] for member in json.loads(cell)]
    return codes.count(1), codes.count(2)


def concat_names(cell):
    """Remove the spaces inside each name so a full name becomes one token."""
    return [full_name.replace(' ', '') for full_name in cell]


def list2str(cell):
    """Join the strings in `cell` into one space-separated string."""
    separator = ' '
    return separator.join(cell)


def transform_cols(df, cols_to_transform):
    """Transform columns of a dataframe.
    cols_to_transform should be a dict(col_name: filter value n)

    For every listed column the JSON cell is converted to a list of names
    and only the names that occur at least `n` times are kept. For the
    ``cast`` column the share of female and male cast members is stored in
    the new ``female_pct`` / ``male_pct`` columns first.

    The frame is modified in place and also returned.
    """
    for col_name, min_count in cols_to_transform.items():
        # NOTE(review): larger_n(col, n) counts on the notebook-level `df`,
        # not on this function's argument; they are the same object in the
        # call made by this notebook.
        # A set makes the per-keyword membership test below constant time.
        frequent = set(larger_n(col_name, min_count))
        if col_name == 'cast':
            gen = df[col_name].apply(extract_gender)
            # +0.001 keeps the ratio defined when no gender is recorded.
            df['female_pct'] = gen.apply(lambda x: x[0]/(x[0]+x[1]+0.001))
            df['male_pct'] = gen.apply(lambda x: x[1]/(x[0]+x[1]+0.001))

        df[col_name] = df[col_name].apply(convert_list)\
            .apply(lambda cell: [kw for kw in cell if kw in frequent])
    return df

def tokenize(text):
    """
    A tokenizer: lower-case the text, replace every non-letter character
    with a space, drop English stop words and stem the remaining words.

    Returns the list of stemmed tokens.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text.lower())
    stemmer = PorterStemmer()
    # ENGLISH_STOP_WORDS is a frozenset, so it is tested directly instead of
    # being copied into a list (linear look-ups) on every call.
    return [stemmer.stem(word)
            for word in word_tokenize(letters_only)
            if word not in ENGLISH_STOP_WORDS]

In [29]:
# Minimum number of occurrences a value needs in order to be kept, per column.
cols_to_transform = dict(
    keywords=30,
    genres=0,
    production_companies=5,
    production_countries=3,
    spoken_languages=10,
    cast=2,
)

df_movies = transform_cols(df, cols_to_transform)

In [30]:
df_movies = (
    df_movies
    .loc[df_movies["subtitles"] != '']  # use only movies with subtitles
    .reset_index(drop=True)
)
#df_movies.shape

(3037, 21)

In [32]:
df_movies.loc[df_movies.title.str.lower().str.contains("kung fu panda"),
              ["title", "release_date", "genres"]]

Unnamed: 0,title,release_date,genres
120,Kung Fu Panda 2,2011-05-25,"[Animation, Family]"
134,Kung Fu Panda 3,2016-01-23,"[Action, Adventure, Animation, Comedy, Family]"
159,Kung Fu Panda,2008-06-04,"[Adventure, Animation, Family, Comedy]"


In [39]:
df_movies.loc[df_movies.title.str.lower().str.contains("harry potter"),
              ["title", "release_date", "genres"]]

Unnamed: 0,title,release_date,genres
7,Harry Potter and the Half-Blood Prince,2009-07-07,"[Adventure, Fantasy, Family]"
100,Harry Potter and the Order of the Phoenix,2007-06-28,"[Adventure, Fantasy, Family, Mystery]"
169,Harry Potter and the Prisoner of Azkaban,2004-05-31,"[Adventure, Fantasy, Family]"
174,Harry Potter and the Philosopher's Stone,2001-11-16,"[Adventure, Fantasy, Family]"
246,Harry Potter and the Chamber of Secrets,2002-11-13,"[Adventure, Fantasy, Family]"


## Dataset 

______
    - missing data
        -- label missing
        -- feature missing / abnormal

In [45]:
# (genre, movie count) pairs, most frequent genre first.
genres_list

[('Drama', 1801),
 ('Comedy', 1442),
 ('Thriller', 1151),
 ('Action', 1025),
 ('Romance', 726),
 ('Adventure', 720),
 ('Crime', 627),
 ('Science Fiction', 488),
 ('Horror', 470),
 ('Family', 437),
 ('Fantasy', 388),
 ('Mystery', 316),
 ('Animation', 201),
 ('History', 153),
 ('Music', 149),
 ('War', 115),
 ('Western', 69),
 ('Documentary', 56),
 ('Foreign', 14),
 ('TV Movie', 3)]

In [40]:
# Remove the movies whose recorded runtime is 0.
df_movies = df_movies.drop(index=df_movies.index[df_movies.runtime == 0])

In [41]:
# Renumber the rows 0..n-1 and discard the old index labels.
df_movies = df_movies.reset_index(drop=True)

# Data processing and Feature engineering -- Text

______




## NLP for overview and sub-titles

+ tokenization / lemmatizing
+ TfidfVectorizer
+ Doc2vec (Not pretrained vs Pretrained) PAINFUL

## Tagline - do not use

+ Tagline is too short for each movie, and has almost no overlap across movies
+ Even as a human, I cannot make a good guess at a movie's genres from the tagline alone, even when I already know the movie.

## Cast names -- CountVectorizer

+ Concatenate First and Last Name




# Data processing and Feature engineering -- Numeric

______


## Budget & Revenue - predict
+ Deal with abnormal values by predicting them from other information
+ For the 107 movies with less than \\$100 budget and more than \\$10000 revenue, we use `revenue + overview + popularity + release_date` to predict the actual budget.
+ For the other movies, whose revenue is below \\$10000, we use `overview + popularity + release_date` to predict the budget.

## Column transformation

**Steps:**  
+ Filter columns with specific filtering values
+ **Convert** json-format cells to list
+ **Create feature** Extract cast gender and calculate the proportion of female cast and male cast for each movie

## Other features
+ Production company
+ Release date
+ Popularity -- MinMax
+ vote count -- MinMax
+ vote


In [43]:
# Preview the first three rows after the column transformations.
df_movies.head(3)

Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,...,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast,subtitles,female_pct,male_pct
0,19995,237000000,"[Action, Adventure, Fantasy, Science Fiction]","[future, alien, soldier, 3d]",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]",2009-12-10,...,162.0,"[English, Español]",Enter the World of Pandora.,Avatar,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","When I was lying there\nin the VA hospital,\n\...",0.374988,0.62498
1,285,300000000,"[Adventure, Fantasy, Action]","[love of one's life, aftercreditsstinger]",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films]",[United States of America],2007-05-19,...,169.0,[English],"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[man] ""ln order to affect a timely halt\nto de...",0.166661,0.833306
2,206647,245000000,"[Action, Adventure, Crime]","[spy, based on novel, sequel]",en,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq]","[United Kingdom, United States of America]",2015-10-26,...,148.0,"[Français, English, Español, Italiano, Deutsch]",A Plan No One Escapes,Spectre,6.3,4466,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Where are you going?\n\nI won't be long.\n\nWe...,0.191174,0.808812


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only movies whose overview is longer than 10 characters.
df_movies = df_movies.loc[df_movies.overview.apply(lambda x: len(x) > 10)]
# Renumber the rows: the TF-IDF frame built below gets a fresh 0..n-1 index,
# and later cells combine it with df_movies through index-aligned pd.concat.
df_movies = df_movies.reset_index(drop=True)

# Binary term presence, unigrams and bigrams, terms in at least 0.1% of docs.
tfidf = TfidfVectorizer(stop_words='english', binary=True,
                        ngram_range=(1, 2), min_df=0.001)
overview_vec = tfidf.fit_transform(df_movies['overview'])

# NOTE: get_feature_names() was removed in scikit-learn 1.2; on newer
# versions use get_feature_names_out() instead.
overview = pd.DataFrame(overview_vec.toarray(), columns=tfidf.get_feature_names()).add_prefix('ov_')

In [49]:
import re
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.porter import *

In [50]:
# Keep only movies whose overview is longer than 10 characters.
df_movies = df_movies.loc[df_movies.overview.apply(lambda x: len(x) > 10)]
# Renumber the rows: the TF-IDF frame built below gets a fresh 0..n-1 index,
# and later cells combine it with df_movies through index-aligned pd.concat.
df_movies = df_movies.reset_index(drop=True)

# Same vectoriser settings as the previous cell, but with the stemming
# `tokenize` function as tokenizer.
tfidf = TfidfVectorizer(tokenizer=tokenize,stop_words='english', binary=True,
                        ngram_range=(1, 2), min_df=0.001)
overview_vec = tfidf.fit_transform(df_movies['overview'])

# Despite the name, this frame holds the TF-IDF matrix of the overviews.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; on newer
# versions use get_feature_names_out() instead.
overview_docvecs = pd.DataFrame(overview_vec.toarray(), columns=tfidf.get_feature_names()).add_prefix('ov_')

In [52]:
def text_to_words(text):
    """Strip punctuation from `text`, lower-case it and split it into tokens.

    Every character that is not a word character, an apostrophe, whitespace
    or a hyphen is deleted; the result is lower-cased and passed to
    `word_tokenize`.
    """
    not_kept = r'[^\w\'\s-]'
    cleaned = re.sub(not_kept, '', text)
    return word_tokenize(cleaned.lower())

In [53]:
# Tokenise every movie's subtitle text once and key the token lists by movie id.
idxs = df_movies.id.tolist()
%time sub_words = [text_to_words(text) for text in df_movies.subtitles.tolist()]
# NOTE(review): if an id occurred twice, dict() would keep only the last entry.
subs = dict(zip(idxs, sub_words))

CPU times: user 1min 13s, sys: 797 ms, total: 1min 14s
Wall time: 1min 15s


In [54]:
# One TaggedDocument per movie: the subtitle tokens, tagged with the movie id.
%time tagged_data = [TaggedDocument(words=word_list, tags=[index]) for index, word_list in subs.items()]

CPU times: user 309 ms, sys: 3.98 ms, total: 313 ms
Wall time: 320 ms


In [55]:
# Doc2Vec trained from scratch on the subtitles (50-dimensional vectors).
# NOTE(review): no seed is passed, so results may differ between runs.
model = Doc2Vec(vector_size=50, min_count=2, workers=4)
%time model.build_vocab(tagged_data)

CPU times: user 9.4 s, sys: 60.3 ms, total: 9.46 s
Wall time: 9.5 s


In [56]:
# Train on the tagged subtitles using the model's own corpus size and epoch count.
%time model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 3min 31s, sys: 1.26 s, total: 3min 32s
Wall time: 1min 9s


In [57]:
word2vec = Word2Vec(size=50)
%time word2vec.build_vocab([word for text in tagged_data for word in text])

CPU times: user 5.56 s, sys: 45.2 ms, total: 5.6 s
Wall time: 5.71 s


In [58]:
glove_path = "/Users/jessica/GitHub_repo/tmdb/dataset/glove.6B.50d.word2vec.txt"
%time word2vec.intersect_word2vec_format(glove_path, lockf=1.0, binary=False, encoding='utf8', unicode_errors='strict')

CPU times: user 12.9 s, sys: 130 ms, total: 13 s
Wall time: 13.2 s


In [59]:
model_pretrained = Doc2Vec(vector_size=50, min_count=2, workers=4)
model_pretrained.build_vocab(tagged_data)
model_pretrained.wv = word2vec.wv
%time model_pretrained.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 3min 25s, sys: 1.22 s, total: 3min 26s
Wall time: 1min 7s


In [60]:
def infer_docvecs(df):
    """Infer one subtitle vector per row of `df` with `model_pretrained`.

    The tokens are looked up in `subs` by the row's ``id``. Returns a
    float32 array with one vector per movie, in row order.
    """
    vectors = [
        model_pretrained.infer_vector(subs[movie_id], steps=20)
        for movie_id in tqdm(df.id.tolist())
    ]
    return np.array(vectors, dtype=np.float32)

In [61]:
# Subtitle document vectors for every remaining movie; show their shape.
movies_docvecs = infer_docvecs(df_movies)
movies_docvecs.shape

100%|██████████| 3036/3036 [08:10<00:00,  6.19it/s]


(3036, 50)

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

# concatenate names and join the list of names to a string
df_movies['cast'] = df_movies['cast'].apply(
    lambda names: list2str(concat_names(names)))

# Count single tokens only.
# NOTE(review): the default token pattern may split names that contain
# punctuation into several tokens -- confirm.
vectorizer = CountVectorizer(ngram_range=(1, 1))
cast_vect = vectorizer.fit_transform(df_movies['cast'])

cast = pd.DataFrame(
    cast_vect.todense(),
    columns=vectorizer.get_feature_names(),
).add_prefix('cast_')

In [63]:
from sklearn import ensemble


def predict_budget1(dataset):
    """Predict budget of movies with less than $100 budget and more than $10000 revenue

    A random forest is fitted on the movies with revenue above 10000 whose
    budget is not below 100, using revenue, popularity and the overview
    TF-IDF columns (`overview_docvecs`) as features.

    Returns the predicted budgets for the rows with budget below 100 and
    revenue above 10000, in row order (an empty array when there are none).
    """
    data_p = dataset[['revenue', 'popularity', 'budget']]
    # Column-wise concat aligns on the index, so `dataset` and
    # `overview_docvecs` must share the same row labels.
    data_p = pd.concat([data_p, overview_docvecs], axis=1)

    has_revenue = data_p.revenue > 10000
    missing_budget = data_p.budget < 100

    train = data_p.loc[~missing_budget & has_revenue]
    # `axis=1` is passed by keyword: the positional form was removed in
    # pandas 2.0.
    x_test = data_p.loc[missing_budget & has_revenue].drop('budget', axis=1)
    if x_test.empty:
        # Nothing to predict (e.g. when the cell is run a second time).
        return np.array([])

    rfr = ensemble.RandomForestRegressor(random_state=42)
    rfr.fit(train.drop('budget', axis=1), train['budget'])

    return rfr.predict(x_test)


# Overwrite the implausibly small budgets of movies that do report revenue
# with the model's predictions.
df_movies.loc[(df_movies.budget < 100) & 
              (df_movies.revenue > 10000), 'budget'] = predict_budget1(df_movies)



In [64]:
def predict_budget2(dataset):
    """Predict budget of movies with less than $100 budget and less than $10000 revenue

    A random forest is fitted on every movie whose budget is not below 100,
    using popularity and the overview TF-IDF columns (`overview_docvecs`)
    as features.

    Returns the predicted budgets for the rows with budget below 100 and
    revenue of at most 10000, in row order (an empty array when there are
    none).
    """
    data_p = dataset[['popularity', 'budget']]
    # Column-wise concat aligns on the index, so `dataset` and
    # `overview_docvecs` must share the same row labels.
    data_p = pd.concat([data_p, overview_docvecs], axis=1)

    missing_budget = data_p.budget < 100
    # Revenue is read from the `dataset` argument rather than from the
    # notebook-level df_movies.
    low_revenue = dataset.revenue <= 10000

    train = data_p.loc[~missing_budget]
    # `axis=1` is passed by keyword: the positional form was removed in
    # pandas 2.0.
    x_test = data_p.loc[missing_budget & low_revenue].drop('budget', axis=1)
    if x_test.empty:
        # Nothing to predict (e.g. when the cell is run a second time).
        return np.array([])

    rfr = ensemble.RandomForestRegressor(random_state=42)
    rfr.fit(train.drop('budget', axis=1), train['budget'])

    return rfr.predict(x_test)


# Overwrite the remaining implausibly small budgets (movies with revenue of
# at most 10000) with the model's predictions.
df_movies.loc[(df_movies.budget < 100) & (df_movies.revenue <=
                                          10000), 'budget'] = predict_budget2(df_movies)



In [65]:
# Frequency of every keyword that survived the filtering.
kw_all = defaultdict(int)
for kw in itertools.chain.from_iterable(df_movies.keywords):
    kw_all[kw] += 1

# Most frequent keywords first.
sorted(kw_all.items(), key=lambda pair: pair[1], reverse=True)

[('duringcreditsstinger', 252),
 ('woman director', 182),
 ('independent film', 155),
 ('based on novel', 146),
 ('aftercreditsstinger', 138),
 ('murder', 136),
 ('violence', 111),
 ('dystopia', 109),
 ('revenge', 89),
 ('sport', 86),
 ('3d', 81),
 ('sequel', 79),
 ('friendship', 78),
 ('teenager', 74),
 ('sex', 73),
 ('musical', 70),
 ('suspense', 67),
 ('los angeles', 66),
 ('love', 63),
 ('new york', 62),
 ('high school', 62),
 ('alien', 60),
 ('superhero', 59),
 ('biography', 58),
 ('family', 54),
 ('police', 53),
 ('remake', 50),
 ('prison', 48),
 ('nudity', 48),
 ('drug', 48),
 ('based on comic book', 47),
 ('dying and death', 47),
 ('corruption', 46),
 ('serial killer', 46),
 ('airplane', 43),
 ('wedding', 43),
 ('magic', 42),
 ('father son relationship', 42),
 ('fbi', 42),
 ('friends', 41),
 ('london england', 41),
 ('daughter', 40),
 ('time travel', 40),
 ('party', 40),
 ('lawyer', 39),
 ('based on young adult novel', 38),
 ('cia', 38),
 ('brother brother relationship', 37),
 

In [66]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# One 0/1 indicator column per keyword, named kw_<keyword>.
keywords_dummies = pd.DataFrame(mlb.fit_transform(
    df_movies['keywords']), columns=mlb.classes_).add_prefix('kw_')

In [67]:
# One 0/1 indicator column per production company; `mlb` is re-fitted here.
prod_com = pd.DataFrame(mlb.fit_transform(
    df_movies['production_companies']), columns=mlb.classes_).add_prefix('prodComp_')

In [68]:
# One 0/1 indicator column per production country. The columns get their own
# prefix so they are not labelled like the production-company columns.
prod_coun = pd.DataFrame(mlb.fit_transform(
    df_movies['production_countries']), columns=mlb.classes_).add_prefix('prodCoun_')

In [69]:
# df_movies['year_bucket'] = 
years = pd.get_dummies(df_movies.apply(lambda row: str(row.release_date)[:3], axis=1))
years['before97'] = years.iloc[:,:6].sum(axis=1)
years = years.iloc[:,6:]

In [70]:
# Replace zero vote values by the mean of the non-zero values of that column.
for col in ('vote_average', 'vote_count'):
    is_zero = df_movies[col] == 0
    df_movies.loc[is_zero, col] = df_movies.loc[~is_zero, col].mean()

In [71]:
# Renumber the rows 0..n-1 so df_movies shares the default index of the
# feature frames it is concatenated with later.
df_movies = df_movies.reset_index(drop=True)

In [72]:
# Preview the first five rows after budget prediction and vote imputation.
df_movies.head(5)

Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,...,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast,subtitles,female_pct,male_pct
0,19995,237000000.0,"[Action, Adventure, Fantasy, Science Fiction]","[future, alien, soldier, 3d]",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]",2009-12-10,...,162.0,"[English, Español]",Enter the World of Pandora.,Avatar,7.2,11800.0,SamWorthington ZoeSaldana SigourneyWeaver Step...,"When I was lying there\nin the VA hospital,\n\...",0.374988,0.62498
1,285,300000000.0,"[Adventure, Fantasy, Action]","[love of one's life, aftercreditsstinger]",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films]",[United States of America],2007-05-19,...,169.0,[English],"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500.0,JohnnyDepp OrlandoBloom KeiraKnightley Stellan...,"[man] ""ln order to affect a timely halt\nto de...",0.166661,0.833306
2,206647,245000000.0,"[Action, Adventure, Crime]","[spy, based on novel, sequel]",en,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq]","[United Kingdom, United States of America]",2015-10-26,...,148.0,"[Français, English, Español, Italiano, Deutsch]",A Plan No One Escapes,Spectre,6.3,4466.0,DanielCraig ChristophWaltz LéaSeydoux RalphFie...,Where are you going?\n\nI won't be long.\n\nWe...,0.191174,0.808812
3,49026,250000000.0,"[Action, Crime, Drama, Thriller]","[terrorist, secret identity, superhero]",en,Following the death of District Attorney Harve...,112.31295,"[Legendary Pictures, Warner Bros., DC Entertai...",[United States of America],2012-07-16,...,165.0,[English],The Legend Ends,The Dark Knight Rises,7.6,9106.0,ChristianBale MichaelCaine GaryOldman AnneHath...,I knew Harvey Dent.\n\nI was his friend.\n\nAn...,0.130951,0.869037
4,49529,260000000.0,"[Action, Adventure, Science Fiction]","[based on novel, alien, escape, 3d]",en,"John Carter is a war-weary, former military ca...",43.926995,[Walt Disney Pictures],[United States of America],2012-03-07,...,132.0,[English],"Lost in our world, found in another.",John Carter,6.1,2124.0,TaylorKitsch LynnCollins SamanthaMorton Willem...,Subtitles downloaded from Podnapisi.NET\n\nMar...,0.23999,0.75997


## Data processing/Feature engineering 

**At the end, we got**:


**Features** :  
+ **numeric**: budget, revenue, female_pct, male_pct, runtime 
+ **text(vectorized)**: keywords, overview, cast(names)

**Labels**:  
+ 20 genres, i.e., `genre_*`

In [74]:
from sklearn.preprocessing import MultiLabelBinarizer

# Labels: one 0/1 indicator column per genre, named genre_<name>.
mlb = MultiLabelBinarizer()
genres_dummies = pd.DataFrame(mlb.fit_transform(
    df_movies['genres']), columns=mlb.classes_).add_prefix('genre_')

In [75]:
# Assemble the feature matrix column-wise; all parts are aligned on the index.
movies_docvecs = pd.DataFrame(movies_docvecs)
features = pd.concat(
    [
        df_movies[['budget', 'revenue', 'female_pct', 'male_pct', 'runtime',
                   'vote_average', 'vote_count']],
        cast,
        keywords_dummies,
        prod_com,
        prod_coun,
        years,
        movies_docvecs,
        pd.DataFrame(overview_docvecs),
    ],
    axis=1,
)

In [77]:
# Preview the first five rows of the assembled feature matrix.
features.head(5)

Unnamed: 0,budget,revenue,female_pct,male_pct,runtime,vote_average,vote_count,cast_50cent,cast_aakomonjones,cast_aaliyah,...,ov_younger,ov_younger brother,ov_youngest,ov_youngster,ov_youth,ov_zebra,ov_zeu,ov_zombi,ov_zone,ov_zoo
0,237000000.0,2787965087,0.374988,0.62498,162.0,7.2,11800.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300000000.0,961000000,0.166661,0.833306,169.0,6.9,4500.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,245000000.0,880674609,0.191174,0.808812,148.0,6.3,4466.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,250000000.0,1084939099,0.130951,0.869037,165.0,7.6,9106.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,260000000.0,284139100,0.23999,0.75997,132.0,6.1,2124.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Leave blank for model and result

Train-test split: We used two ways to split the train/test data, the first one is by random 80/20 and the second one is by release_date

Regularization: We use random forest and MB, so there is not much regularization we could do. We used a MinMax scaler to transform the data and limited the maximum depth (MaxDepth) of the RF.



## Discussion

______

    -- Choose MB after a long journey (reason)
    -- Also provide less sophisticated model but might be better in practice.
    -- The movies whose genre most needs predicting might be the ones with lots of missing features (?)
    -- Terrible for some genre (Cartoon, reason)
    --
    --

## Review
______

    -- Purpose
    -- Introduce the dataset
    -- Feature selection and data engineering
    -- Models and Results
    -- Discussion

In [78]:
# Reload the untouched movie table together with the cast column.
df_n = pd.read_csv('dataset/tmdb_5000_movies_nonull.csv')
df_credits = pd.read_csv('dataset/tmdb_5000_credits.csv')

# Rename the credits key so that both frames share the `id` column.
credits_sub = df_credits[['movie_id', 'cast']].rename(
    columns={'movie_id': 'id'})

df_n = df_n.loc[:, ['id', 'budget', 'genres', 'keywords', 'original_language',
                    'overview', 'popularity', 'production_companies',
                    'production_countries', 'release_date', 'revenue',
                    'runtime', 'spoken_languages', 'tagline', 'title',
                    'vote_average', 'vote_count']]
# Inner join on the columns the two frames have in common.
df_n = pd.merge(df_n, credits_sub)

In [79]:
# Row positions of the movies whose `genres` cell is an empty JSON list.
# Positions (not index labels) are collected because `j` is used with .iloc.
j = np.flatnonzero((df_n['genres'] == '[]').values).tolist()
df_n.iloc[j]

Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast
3970,191229,0,[],[],en,A grotesquely disfigured harpooner called Igua...,0.214704,[],[],1988-04-01,0,88.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",,Iguana,6.0,1,"[{""cast_id"": 6, ""character"": ""Oberlus"", ""credi..."
3991,346081,0,[],[],en,A ghost hunter uses bottles to capture trouble...,0.296981,[],"[{""iso_3166_1"": ""IN"", ""name"": ""India""}]",2015-06-26,0,0.0,[],,Sardaarji,9.5,2,[]
4067,371085,0,[],[],en,The Post War II story of Manhattan born Mike E...,0.027801,[],[],2015-01-01,0,0.0,[],,Sharkskin,0.0,0,[]
4104,48382,2000000,[],[],en,The story of Lehi and his wife Sariah and thei...,0.031947,[],[],2003-09-12,1672730,120.0,[],"2600 years ago, one family began a remarkable ...","The Book of Mormon Movie, Volume 1: The Journey",5.0,2,"[{""cast_id"": 1, ""character"": ""Sam"", ""credit_id..."
4117,325140,0,[],[],en,"Raju, a waiter, is in love with the famous TV ...",0.001186,[],[],2000-05-26,0,0.0,[],,Hum To Mohabbat Karega,0.0,0,[]
4291,357834,1,[],[],en,The Algerian is an international political thr...,0.025364,"[{""name"": ""Zelko Films"", ""id"": 62451}]","[{""iso_3166_1"": ""DZ"", ""name"": ""Algeria""}, {""is...",2015-08-07,0,99.0,[],,The Algerian,0.0,0,"[{""cast_id"": 8, ""character"": ""Ali"", ""credit_id..."
4312,137955,1200000,[],[],en,"In late summer of 2011, five young friends on ...",0.057564,[],"[{""iso_3166_1"": ""CA"", ""name"": ""Canada""}]",2012-01-01,0,84.0,[],,Crowsnest,4.8,12,[]
4383,206412,0,[],[],en,A documentarian decides to follow the career o...,0.690089,[],[],2000-05-14,0,90.0,[],,Lisa Picard Is Famous,4.0,1,"[{""cast_id"": 2, ""character"": """", ""credit_id"": ..."
4398,219716,0,[],[],en,Melba is a Californian trailer-park girl who i...,0.547654,[],[],1999-03-19,0,96.0,[],,Sparkler,0.0,1,"[{""cast_id"": 2, ""character"": ""Melba May"", ""cre..."
4411,335874,0,[],[],en,Katherine is a typical teenager. Today's her f...,0.243853,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2015-04-03,0,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",,Childless,4.5,2,"[{""cast_id"": 1, ""character"": """", ""credit_id"": ..."


In [81]:
def freeze_header(df, num_rows=30, num_columns=10, step_rows=1,
                  step_columns=1):
    """
    Freeze the headers (column and index names) of a Pandas DataFrame. A widget
    enables to slide through the rows and columns.

    Parameters
    ----------
    df : DataFrame to browse.
    num_rows, num_columns : size of the window displayed at once.
    step_rows, step_columns : increment of the row / column sliders.

    The function returns nothing; it renders two sliders and the current
    window of `df` through IPython's `display`.

    NOTE: a function with this name is also defined in an earlier cell of
    this notebook; when the cells run top to bottom this later definition is
    the one in effect for the calls that follow it.
    """
    def _slider(label, window, total, step):
        # The slider value is the exclusive END of the window, so it starts
        # at `window` (or at `total` when the frame is smaller than that).
        return IntSlider(min=min(window, total),
                         max=total,
                         step=step,
                         description=label,
                         readout=False,
                         disabled=False,
                         continuous_update=True,
                         orientation='horizontal',
                         # `slider_color` is not an IntSlider trait in
                         # ipywidgets 7+; the handle colour lives on `style`.
                         style={'handle_color': 'purple'})

    @interact(last_row=_slider('rows', num_rows, df.shape[0], step_rows),
              last_column=_slider('columns', num_columns, df.shape[1],
                                  step_columns))
    def _freeze_header(last_row, last_column):
        display(df.iloc[max(0, last_row-num_rows):last_row,
                        max(0, last_column-num_columns):last_column])

In [85]:
# Browse the genre-less movies (rows at positions j) five rows at a time.
freeze_header(df=df_n.iloc[j], num_rows=5)

interactive(children=(IntSlider(value=5, description='rows', max=27, min=5, readout=False), IntSlider(value=10…