In [10]:
import pandas as pd 
import numpy as np  
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt 
import seaborn as sns 
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

movie_df = pd.read_csv('movie_master_dataset.csv', encoding = "ISO-8859-1")

top_genres = ['Action', 'Drama', 'Thriller', 'Comedy', 'Romance']
unpopular_genres = ['Documentary', 'Musical', 'History']

#Get the first 500 successful movies based on gross 
hg = movie_df.sort(['gross'], ascending = False)
hg = hg.reset_index(drop=True)
hg = hg.head(500)

directors = set(movie_df['director_name'].tolist()) 
director_hits = {} 
for director in directors: 
    df = hg[hg['director_name'] == director]
    director_hits[director] = len(df)

def get_movies(movie_df): 
    movies = [] 
    action = movie_df[movie_df['genres'].str.contains('Action')]
    movies.append(action)

    drama = movie_df[movie_df['genres'].str.contains('Drama')]
    movies.append(drama)

    thriller = movie_df[movie_df['genres'].str.contains('Thriller')]
    movies.append(thriller)
    
    comedy = movie_df[movie_df['genres'].str.contains('Comedy')]
    movies.append(comedy)
    
    romance = movie_df[movie_df['genres'].str.contains('Romance')]
    movies.append(romance)
    
    documentary = movie_df[movie_df['genres'].str.contains('Documentary')]
    movies.append(documentary)
    
    musical = movie_df[movie_df['genres'].str.contains('Musical')]
    movies.append(musical)
    
    history = movie_df[movie_df['genres'].str.contains('History')]
    movies.append(history)
    
    return movies

init_movies = get_movies(movie_df)

def get_hits(row): 
    return director_hits[row['director_name']] 

def get_title_length(row): 
    return len(row['movie_title'].split())
    
def get_actors_similarity(row): 
    actor_1 = row['actor_1_name']
    actor_2 = row['actor_2_name']
    actor_3 = row['actor_3_name']
    
    actor_1_vals = []
    actor_2_vals = [] 
    actor_3_vals = [] 
    for i in range(5): 
        ct_1 = init_movies[i][init_movies[i]['actor_1_name'] == actor_1]
        ct_2 = init_movies[i][init_movies[i]['actor_2_name'] == actor_2]
        ct_3 = init_movies[i][init_movies[i]['actor_3_name'] == actor_3]
        
        actor_1_vals.append(len(ct_1)) 
        actor_2_vals.append(len(ct_2)) 
        actor_3_vals.append(len(ct_3)) 
        
    d12, path = fastdtw(actor_1_vals, actor_2_vals, dist=euclidean) 
    d13, path = fastdtw(actor_1_vals, actor_3_vals, dist=euclidean)
    d23, path = fastdtw(actor_2_vals, actor_3_vals, dist=euclidean)
    avg = (d12 + d13 + d23)/3
    return avg 
    
#Retrieve hits for the director and format the movie duration 
movie_df['director_hits'] = movie_df.apply(lambda row: get_hits(row),axis=1)
movie_df['duration'] = movie_df['duration'].apply(lambda x: str(x))
movie_df['duration'] = movie_df['duration'].apply(lambda x: int(x[:3]) if x[0] == '1' else int(x[:2]))  
movie_df['title_len'] = movie_df.apply(lambda row: get_title_length(row), axis=1)  
movie_df['actors_similarity'] = movie_df.apply(lambda row: get_actors_similarity(row), axis=1)

movies = get_movies(movie_df)

movies[0]

ValueError: invalid literal for int() with base 10: '1.2'