# Movie Data
You are a Data Scientist for a top movie studio. After a series of box office flops, the producers of your studio are starting to question their strategy and need some direction. You suggest a new approach - using data to determine what factors go into making a successful film. Luckily, you have a dataset of over 5000 films to mine for insights. Your producers ask you to spend some time analyzing the data and present a report detailing your findings, along with recommendations on how to revamp the studio’s strategy. 

In [2]:
import pandas as pd
import numpy as np
from functools import reduce

In [3]:
movies = pd.read_csv('movie_data.csv')
#print(movies.columns)
#print(movies.info())

movies

Unnamed: 0,color,director_name,num_critic_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,movie_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


## Explore and Clean Dataset

In [4]:
print(movies['title_year'].min())
print(movies['title_year'].max())

1916.0
2016.0


In [5]:
# remove all movies from before the year 1996 in order to focus on the most recent data points
movies_f = movies[movies['title_year']>=1996]

## Success Metrics
Which of the metrics that we have access to demonstrate the success of a film?

Potential measures of success:
1. num_critic_review
2. gross
3. num_user_reviews
4. movie_score

## Analyze director success

Do certain director's tend to make more successful movies?

In [6]:
#group by director; average gross income
directors_gross = movies_f[['director_name','gross']].groupby(['director_name']).mean().sort_values(by='gross', ascending=False)
directors_gross['gross_millions'] = round(directors_gross['gross'] / 1000000, 2)

#directors_gross.head(10)

In [7]:
#group by director; count number of movies
directors_count = movies_f[['director_name','color']].groupby(['director_name']).count().sort_values(by='color',ascending = False)
directors_count = directors_count.rename(columns = {'color': 'num_movies'})

#directors_count

In [8]:
#group by director; average movie score
directors_score = movies_f[['director_name','movie_score']].groupby(['director_name']).mean().sort_values(by='movie_score', ascending = False)

#directors_score.head(10)

In [9]:
#group by director; average budget
directors_budget = movies_f[['director_name','budget']].groupby(['director_name']).mean().sort_values(by='budget', ascending = False)
directors_budget['budget_millions'] = round(directors_budget['budget'] / 1000000, 2)

#directors_score.head(10)

In [10]:
#group by director; average facebook likes
director_fblikes = movies_f[['director_name','director_facebook_likes']].groupby(['director_name']).mean().sort_values(by='director_facebook_likes', ascending = False)

#director_fblikes.head(10)

In [11]:
#group by director; average movie facebook likes
movie_fblikes = movies_f[['director_name','movie_facebook_likes']].groupby(['director_name']).mean().sort_values(by='movie_facebook_likes', ascending = False)

#movie_fblikes.head(10)

In [12]:
# merge director data into one df
directors_df_lst = [directors_gross, directors_budget, directors_count, directors_score, director_fblikes, movie_fblikes]
directors_df = reduce(lambda left,right: pd.merge(left,right,on='director_name'), directors_df_lst)
directors_df = directors_df.drop(columns=['gross', 'budget'])
directors_df = directors_df.rename(columns = {'gross_millions': 'avg_gross_millions',
                                             'budget_millions': 'avg_budget_million',
                                             'movie_score': 'avg_movie_score',
                                             'director_facebook_likes':'avg_director_fb_likes',
                                             'movie_facebook_likes':'avg_movie_fb_likes'})

directors_df['avg_movie_score'] = round(directors_df['avg_movie_score'],2)
directors_df['avg_movie_fb_likes'] = round(directors_df['avg_movie_fb_likes'],2)

# filter df to focus on most successful directors
directors_df_filter = directors_df[directors_df['avg_gross_millions']>=100]

In [13]:
# top grossing directors
directors_df_filter.head(10)

Unnamed: 0_level_0,avg_gross_millions,avg_budget_million,num_movies,avg_movie_score,avg_director_fb_likes,avg_movie_fb_likes
director_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
James Cameron,709.59,218.5,2,7.8,0.0,29500.0
Joss Whedon,432.72,182.5,4,7.92,0.0,94500.0
Lee Unkrich,414.98,200.0,1,8.3,125.0,30000.0
Chris Buck,400.74,150.0,1,7.6,69.0,58000.0
George Lucas,388.49,114.33,3,6.93,0.0,7666.67
Tim Miller,363.02,58.0,1,8.1,84.0,117000.0
Kyle Balda,336.03,74.0,1,6.4,22.0,70000.0
Colin Trevorrow,328.09,75.38,2,7.0,365.0,90500.0
Yarrow Cheney,323.51,75.0,1,6.8,11.0,36000.0
Pete Docter,313.11,155.0,3,8.23,0.0,48333.33


In [14]:
# directors with top average movie scores
directors_df_filter.sort_values(by = 'avg_movie_score', ascending = False).head(10)

Unnamed: 0_level_0,avg_gross_millions,avg_budget_million,num_movies,avg_movie_score,avg_director_fb_likes,avg_movie_fb_likes
director_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Christopher Nolan,226.65,125.62,8,8.43,22000.0,103625.0
Lee Unkrich,414.98,200.0,1,8.3,125.0,30000.0
Pete Docter,313.11,155.0,3,8.23,0.0,48333.33
Tim Miller,363.02,58.0,1,8.1,84.0,117000.0
Joss Whedon,432.72,182.5,4,7.92,0.0,94500.0
Don Hall,222.49,165.0,1,7.9,38.0,41000.0
James Cameron,709.59,218.5,2,7.8,0.0,29500.0
Rich Moore,189.41,165.0,1,7.8,66.0,40000.0
Alfonso Cuarón,143.09,77.0,4,7.8,0.0,43750.0
Josh Boone,124.87,12.0,1,7.8,131.0,93000.0


In [15]:
# directors whose movies average the most facebook likes
directors_df_filter.sort_values(by = 'avg_movie_fb_likes', ascending = False).head(10)

Unnamed: 0_level_0,avg_gross_millions,avg_budget_million,num_movies,avg_movie_score,avg_director_fb_likes,avg_movie_fb_likes
director_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tim Miller,363.02,58.0,1,8.1,84.0,117000.0
Christopher Nolan,226.65,125.62,8,8.43,22000.0,103625.0
Joss Whedon,432.72,182.5,4,7.92,0.0,94500.0
Josh Boone,124.87,12.0,1,7.8,131.0,93000.0
Colin Trevorrow,328.09,75.38,2,7.0,365.0,90500.0
Robert Stromberg,241.41,180.0,1,7.0,171.0,89000.0
Alan Taylor,148.05,162.5,2,6.85,230.0,72500.0
Noam Murro,106.37,110.0,1,6.2,263.0,71000.0
Kyle Balda,336.03,74.0,1,6.4,22.0,70000.0
Ben Affleck,114.1,40.75,2,7.65,0.0,59000.0


## Analzye Genre Success

In [32]:
print(range(len(movies['genres'])))

range(0, 5043)


In [41]:
genre_action = movies[movies['genres'].str.contains('Action')]
genre_adventure = movies[movies['genres'].str.contains('Adventure')]
genre_fantasy = movies[movies['genres'].str.contains('Fantasy')]
genre_thriller = movies[movies['genres'].str.contains('Thriller')]
genre_scifi = movies[movies['genres'].str.contains('Sci-Fi')]
genre_crime = movies[movies['genres'].str.contains('Crime')]
genre_comedy = movies[movies['genres'].str.contains('Comedy')]
genre_drama = movies[movies['genres'].str.contains('Drama')]
genre_doc = movies[movies['genres'].str.contains('Documentary')]
genre_romance = movies[movies['genres'].str.contains('Romance')]
genre_horror = movies[movies['genres'].str.contains('Horror')]

genre_lst = ['Action','Adventure','Fantasy','Thriller','Sci-Fi','Crime','Comedy','Drama','Documentary','Romance','Horror']

for x in genre_lst:
    movies['genre_' + x.lower()] = ""

In [45]:
for x in range(len(movies['genres'])):
    for y in genre_lst:
        if y in movies['genres'][x]:
            movies['genre_' + y.lower()][x] = True
        else:
            movies['genre_' + y.lower()][x] = False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
