In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from bokeh.plotting import figure, output_file, show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker, LogTicker, ContinuousTicker, AdaptiveTicker,
    PrintfTickFormatter,
    ColorBar,
    FactorRange
)
import bokeh.palettes
from bokeh.transform import factor_cmap
from wordcloud import WordCloud, STOPWORDS
from math import floor
from bokeh.io import output_notebook

from bokeh.models import NumeralTickFormatter

In [3]:
# Load the cleaned movie dataset from a path relative to the working directory.
data = pd.read_csv(filepath_or_buffer="../data/cleaned_data.csv")

In [4]:
# Profit per film: revenue minus budget, stored as a new column.
data["profit"] = data["revenue"].sub(data["budget"])

In [5]:
# Preview the first three films, transposed so each column name becomes a row.
data.head(3).T

Unnamed: 0,0,1,2
budget,237000000,300000000,245000000
genres,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['Adventure', 'Fantasy', 'Action']","['Action', 'Adventure', 'Crime']"
keywords,"['culture clash', 'future', 'space war', 'spac...","['ocean', 'drug abuse', 'exotic island', 'east...","['spy', 'based on novel', 'secret agent', 'seq..."
original_language,en,en,en
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha...",A cryptic message from Bond’s past sends him o...
popularity,150.438,139.083,107.377
production_companies,"['Ingenious Film Partners', 'Twentieth Century...","['Walt Disney Pictures', 'Jerry Bruckheimer Fi...","['Columbia Pictures', 'Danjaq', 'B24']"
production_countries,"['United States of America', 'United Kingdom']",['United States of America'],"['United Kingdom', 'United States of America']"
release_date,2009-12-10,2007-05-19,2015-10-26
revenue,2787965087,961000000,880674609


### Test out some actor and director analysis

In [6]:
# `df` is a second name for the same DataFrame object as `data`; no copy is made.
df = data

In [198]:
# Per-director averages of every numeric column, one row per director
# (as_index=False keeps 'director' as a regular column).
# numeric_only=True is passed explicitly: the frame also holds text columns
# (genres, keywords, overview, ...) and pandas >= 2.0 raises a TypeError when
# asked to average those instead of silently dropping them.
director_df = df.groupby('director', as_index=False).mean(numeric_only=True)

In [199]:
# Number of rows (films) per director, most prolific first ...
films_per_director = df['director'].value_counts()
# ... converted to a {director: film count} dictionary for later lookups.
name_counts = films_per_director.to_dict()
name_counts

{'Steven Spielberg': 27,
 'Woody Allen': 21,
 'Martin Scorsese': 20,
 'Clint Eastwood': 20,
 'Ridley Scott': 16,
 'Robert Rodriguez': 16,
 'Spike Lee': 16,
 'Steven Soderbergh': 15,
 'Renny Harlin': 15,
 'Tim Burton': 14,
 'Oliver Stone': 14,
 'Joel Schumacher': 13,
 'Ron Howard': 13,
 'Robert Zemeckis': 13,
 'Barry Levinson': 13,
 'Kevin Smith': 12,
 'Francis Ford Coppola': 12,
 'Tony Scott': 12,
 'Michael Bay': 12,
 'Brian De Palma': 12,
 'Richard Donner': 11,
 'Chris Columbus': 11,
 'Shawn Levy': 11,
 'Joel Coen': 11,
 'Richard Linklater': 11,
 'Sam Raimi': 11,
 'Bobby Farrelly': 11,
 'Rob Reiner': 11,
 'John McTiernan': 10,
 'John Carpenter': 10,
 'David Fincher': 10,
 'Wes Craven': 10,
 'Paul W.S. Anderson': 10,
 'Stephen Frears': 10,
 'Lasse Hallström': 9,
 'Dennis Dugan': 9,
 'M. Night Shyamalan': 9,
 'Rob Cohen': 9,
 'Peter Jackson': 9,
 'Tyler Perry': 9,
 'Phillip Noyce': 9,
 'Brett Ratner': 9,
 'Christopher Nolan': 8,
 'F. Gary Gray': 8,
 'Ivan Reitman': 8,
 'James Mangold': 

In [200]:
# Attach each director's film count by looking the name up in name_counts.
director_df = director_df.assign(
    film_count=director_df['director'].map(name_counts)
)

In [201]:
# Check the per-director averages together with the new film_count column.
director_df.head(n=5)

Unnamed: 0,director,budget,popularity,revenue,runtime,vote_average,vote_count,year,month,day,dow,profit,film_count
0,Aaron Hann,0.0,6.752141,0.0,87.0,6.0,333.0,2015.0,10.0,2.0,4.0,0.0,1
1,Aaron Schneider,7500000.0,6.557643,0.0,103.0,6.5,100.0,2010.0,1.0,22.0,4.0,-7500000.0,1
2,Abel Ferrara,12500000.0,2.950618,1227324.0,99.0,7.3,31.0,1996.0,1.0,26.0,4.0,-11272676.0,1
3,Adam Brooks,0.0,29.173266,55447968.0,112.0,6.7,620.0,2008.0,2.0,8.0,4.0,55447968.0,1
4,Adam Carolla,1500000.0,0.859014,0.0,97.0,6.6,8.0,2015.0,3.0,6.0,4.0,-1500000.0,1


In [202]:
# Order directors from highest to lowest average profit.
director_df = director_df.sort_values(by='profit', ascending=False)

In [203]:
# Build a display label of the form "Name (film count)" for chart axes.
director_names = director_df['director'].map(str)
film_totals = director_df['film_count'].map(str)
director_df['director+count'] = director_names + " (" + film_totals + ")"

In [204]:
# Five directors with the highest average profit, including the new label column.
director_df.head(n=5)

Unnamed: 0,director,budget,popularity,revenue,runtime,vote_average,vote_count,year,month,day,dow,profit,film_count,director+count
318,Chris Buck,150000000.0,165.125366,1274219000.0,102.0,7.3,5295.0,2013.0,11.0,27.0,2.0,1124219000.0,1,Chris Buck (1)
1281,Kyle Balda,74000000.0,875.581305,1156731000.0,91.0,6.4,4571.0,2015.0,6.0,17.0,2.0,1082731000.0,1,Kyle Balda (1)
1305,Lee Unkrich,200000000.0,59.995418,1066970000.0,103.0,7.6,4597.0,2010.0,6.0,16.0,2.0,866969700.0,1,Lee Unkrich (1)
1172,Joss Whedon,179666700.0,108.038933,987943700.0,134.333333,7.366667,6602.333333,2010.666667,5.333333,24.0,2.333333,808277000.0,3,Joss Whedon (3)
332,Chris Renaud,75000000.0,31.482872,875958300.0,87.0,5.9,3462.0,2016.0,6.0,18.0,5.0,800958300.0,1,Chris Renaud (1)


In [205]:
# Keep only the label, the film count and the average profit.
dir_means = director_df.loc[:, ['director+count', 'film_count', 'profit']]

In [206]:
# Preview the trimmed frame.
dir_means.head(n=5)

Unnamed: 0,director+count,film_count,profit
318,Chris Buck (1),1,1124219000.0
1281,Kyle Balda (1),1,1082731000.0
1305,Lee Unkrich (1),1,866969700.0
1172,Joss Whedon (3),3,808277000.0
332,Chris Renaud (1),1,800958300.0


In [207]:
# Restrict to directors with more than 5 films so one-off hits do not dominate.
has_enough_films = director_df['film_count'].gt(5)
dir_subset = director_df.loc[has_enough_films]

### Average profit and movie rating for directors who have directed more than 5 movies

In [208]:
# Ten directors (among those with more than 5 films) with the highest average profit.
top10pro = (
    dir_subset
    .loc[:, ['director+count', 'profit']]
    .sort_values(by='profit', ascending=False)
    .head(10)
)
top10pro

Unnamed: 0,director+count,profit
880,James Cameron (7),733809900.0
1731,Peter Jackson (9),578404800.0
360,Christopher Nolan (8),402810400.0
1475,Michael Bay (12),368543700.0
1110,Jon Favreau (6),300309000.0
907,James Wan (7),298802100.0
765,Gore Verbinski (7),295716200.0
1965,Sam Mendes (7),292792700.0
320,Chris Columbus (11),281693800.0
2102,Steven Spielberg (27),277033100.0


In [209]:
# Ten directors (among those with more than 5 films) with the highest average rating.
top10rat = (
    dir_subset
    .loc[:, ['director+count', 'vote_average']]
    .sort_values(by='vote_average', ascending=False)
    .head(10)
)
top10rat

Unnamed: 0,director+count,vote_average
360,Christopher Nolan (8),7.8
1772,Quentin Tarantino (8),7.775
2049,Stanley Kubrick (6),7.683333
2286,Wes Anderson (7),7.414286
486,David Fincher (10),7.34
1731,Peter Jackson (9),7.333333
880,James Cameron (7),7.328571
1426,Martin Scorsese (20),7.295
39,Alejandro González Iñárritu (6),7.233333
1698,Paul Thomas Anderson (6),7.216667


In [210]:
# Director labels of the ten most profitable directors.
y = top10pro.loc[:, 'director+count']

In [211]:
# Show the selected director labels.
y

880         James Cameron (7)
1731        Peter Jackson (9)
360     Christopher Nolan (8)
1475         Michael Bay (12)
1110          Jon Favreau (6)
907             James Wan (7)
765        Gore Verbinski (7)
1965           Sam Mendes (7)
320       Chris Columbus (11)
2102    Steven Spielberg (27)
Name: director+count, dtype: object

In [212]:
# Plotly setup for the charts below.
# pandas, numpy and math.floor are already imported in the first cell;
# they are re-imported here unchanged.
import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
import datetime

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff

# Enable offline plotly rendering inside the notebook.
py.offline.init_notebook_mode(connected=True)

from math import floor
from plotly import tools
# NOTE(review): star import puts every plotly.graph_objs name into the notebook
# namespace; later cells may rely on it, so it is left as is. Prefer `go.<Name>`.
from plotly.graph_objs import *

In [213]:
# Horizontal bar chart: average profit of the ten most profitable directors
# with more than 5 films (`top10pro` is built in an earlier cell).
#
# `go` and `iplot` come from the plotly import cell above. The former
# `import plotly.plotly as py` was dropped: nothing in this cell used it, and
# the module was removed from plotly in version 4 (moved to the separate
# chart-studio package), so it raises ImportError on current installs.

# Named `profit_traces` rather than `data` so the DataFrame loaded into `data`
# at the top of the notebook is not overwritten by a list of traces.
profit_traces = [
    go.Bar(
        x=top10pro['profit'],
        y=top10pro['director+count'],
        orientation='h',
        marker=dict(color='rgb(127,205,187)'),
    )
]

layout = dict(
    title='Average Profit for Directors who have directed more than 5 movies',
    # Plain dict instead of the deprecated go.Margin class.
    margin=dict(l=210, r=100, pad=1),
    xaxis=dict(title='Average Profit'),
    yaxis=dict(
        # The leading &nbsp; entities pad the axis title with non-breaking spaces.
        title='&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Director (Number of Movies)',
        tickfont=dict(size=12),
    ),
)

fig = go.Figure(data=profit_traces, layout=layout)

iplot(fig)

In [214]:
# Horizontal bar chart: average rating of the ten best-rated directors
# with more than 5 films (`top10rat` is built in an earlier cell).
#
# `go` and `iplot` come from the plotly import cell above. The former
# `import plotly.plotly as py` was dropped: nothing in this cell used it, and
# the module was removed from plotly in version 4 (moved to the separate
# chart-studio package), so it raises ImportError on current installs.

# Named `rating_traces` rather than `data` so the DataFrame loaded into `data`
# at the top of the notebook is not overwritten by a list of traces.
rating_traces = [
    go.Bar(
        x=top10rat['vote_average'],
        y=top10rat['director+count'],
        orientation='h',
        marker=dict(color='rgb(67,162,202)'),
    )
]

layout = dict(
    title='Average Movie Rating for Directors who have directed more than 5 movies',
    # Plain dict instead of the deprecated go.Margin class.
    margin=dict(l=210, r=100, b=100, t=100, pad=1),
    xaxis=dict(title='Average Rating'),
    yaxis=dict(
        # The leading &nbsp; entities pad the axis title with non-breaking spaces.
        title='&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Director (Number of Movies)',
        tickfont=dict(size=12),
    ),
)

fig = go.Figure(data=rating_traces, layout=layout)

iplot(fig)