### These plots were created using the Plotly library.

In [1]:
import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
import datetime
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
py.offline.init_notebook_mode(connected=True)
from math import floor
from plotly import tools
from plotly.graph_objs import *
from IPython.display import display
from plotly.widgets import GraphWidget

<IPython.core.display.Javascript object>

In [2]:
# Load the cleaned movie dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/cleaned_data.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(4790, 25)

In [4]:
# Profit is defined throughout this notebook as revenue minus budget, row by row.
data["profit"] = data["revenue"] - data["budget"]

In [5]:
# Summary statistics of the derived profit column.
data["profit"].describe()

count    4.790000e+03
mean     5.336400e+07
std      1.361221e+08
min     -1.657101e+08
25%     -7.985852e+05
50%      2.602056e+06
75%      5.546026e+07
max      2.550965e+09
Name: profit, dtype: float64

In [6]:
# Show the row(s) whose profit equals the largest profit in the dataset.
top_profit = data["profit"].max()
data.loc[data["profit"].eq(top_profit)]

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,...,crew,actor1,actor2,actor3,director,year,month,day,dow,profit
0,237000000,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['culture clash', 'future', 'space war', 'spac...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"['Ingenious Film Partners', 'Twentieth Century...","['United States of America', 'United Kingdom']",2009-12-10,2787965087,...,"[{'credit_id': '52fe48009251416c750aca23', 'de...",Sam Worthington,Zoe Saldana,Sigourney Weaver,James Cameron,2009,12,10,3,2550965087


### Average Profit by Day of Week

In [7]:
profit_by_dow = data[data["year"]>1990].groupby(["dow"]).budget.mean().reset_index()

In [8]:
bar_data = [go.Bar(x=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']\
                   , y=profit_by_dow["budget"])]

py.offline.iplot({ 'data': bar_data,
            'layout': {
               'title': 'Average Revenue by Day of Week',
               'xaxis': {
                 'title': 'Day of Week'},
               'yaxis': {
                'title': 'Profit'}
        }})

### Violin Plots for Movie Ratings by Decade

In [9]:
def extract_decade(x):
    """Return the decade label for a year, e.g. 1994 -> "1990s".

    The year is rounded down to the nearest multiple of ten and suffixed
    with "s".
    """
    decade_start = floor(x / 10) * 10
    return "{}s".format(decade_start)

In [10]:
# Label every movie with its decade string (e.g. "1990s"), derived from the `year` column.
data["decade"] = data["year"].apply(extract_decade)

In [11]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
df = data

In [12]:
# Order rows by decade label; sort_values returns a new frame, so `df` no longer aliases `data`.
df = df.sort_values(by=['decade'], ascending=True)

In [13]:
# Number of movies in each decade.
df.groupby(['decade']).size()

decade
1910s       1
1920s       4
1930s      15
1940s      25
1950s      27
1960s      71
1970s     109
1980s     278
1990s     778
2000s    2044
2010s    1438
dtype: int64

In [14]:
# Inspect the rows labelled with the 1910s decade.
df[df['decade'] == '1910s']

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,...,actor1,actor2,actor3,director,year,month,day,dow,profit,decade
4581,385907,['Drama'],"['usa', 'naivety', 'intolerance', 'mill', 'mar...",en,"The story of a poor young woman, separated by ...",3.232447,"['Triangle Film Corporation', 'Wark Producing ...",['United States of America'],1916-09-04,8394751,...,Lillian Gish,Mae Marsh,Robert Harron,D.W. Griffith,1916,9,4,0,8008844,1910s


Drop the 1910s decade, since it contains only one movie.

In [15]:
# Keep every row except those labelled '1910s'.
df = df[df['decade'] != '1910s']

In [16]:
# One violin trace per decade, showing the distribution of `vote_average`.
# pd.unique() scans the whole column, so it is evaluated once here; the
# original re-evaluated it three times on every loop iteration.
decades = pd.unique(df['decade'])

da = []
for decade in decades:
    in_decade = df['decade'] == decade
    da.append({
        "type": 'violin',
        "x": df['decade'][in_decade],
        "y": df['vote_average'][in_decade],
        "name": decade,
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    })


fig = {
    "data": da,
    "layout" : {
        "title": "Average Movie Ratings by Decade",
            "xaxis" : dict(title = 'Decade', autotick=False, showticklabels=True),
            "yaxis" : dict(title = 'Average Rating')
    }
}

# validate=False skips plotly's Python-side figure validation.
# NOTE(review): presumably required because of the legacy `autotick` axis attribute — confirm.
iplot(fig, validate = False)

In [17]:
# List the column labels of df.
df.columns

Index(['budget', 'genres', 'keywords', 'original_language', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages',
       'vote_average', 'vote_count', 'title', 'cast', 'crew', 'actor1',
       'actor2', 'actor3', 'director', 'year', 'month', 'day', 'dow', 'profit',
       'decade'],
      dtype='object')

### Bubble Chart for Profit vs Budget

In [18]:
# Smallest budget value in df.
min(df.budget.values)

0

In [19]:
# Smallest budget among movies whose budget exceeds 10,000.
min(df[df.budget>10000].budget.values)

12000

In [20]:
layout = go.Layout(
    title='Profit vs. Budget',
    xaxis=dict(
        title='Budget',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 4e8],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)

# Only movies with a budget above 5000 are plotted. The filter is evaluated
# once instead of being repeated for every trace attribute.
plotted = df[df.budget > 5000]

# Named `bubble_traces` rather than `data` so the DataFrame bound to `data`
# is not overwritten by a list of traces.
bubble_traces = [go.Scatter(
    x=plotted["budget"].values,   # Budget
    y=plotted["profit"].values,   # Profit (revenue - budget)
    mode='markers',
    text=plotted["title"].values,  # Movie titles shown on hover
    marker=dict(
        # Bubble size and colour both encode the average rating
        size=3*plotted["vote_average"],
        sizeref=1.0,
        color=plotted["vote_average"].values,
        # A plain dict replaces the deprecated star-imported ColorBar wrapper
        colorbar=dict(title='Average Rating<br> &nbsp;', tickvals=[0,1.5,3,5,7,8.5]),
        showscale=True,
        colorscale='Viridis'
    ))]
fig = go.Figure(data=bubble_traces, layout=layout)
iplot(fig)

In [21]:
# List the column labels of df.
df.columns

Index(['budget', 'genres', 'keywords', 'original_language', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages',
       'vote_average', 'vote_count', 'title', 'cast', 'crew', 'actor1',
       'actor2', 'actor3', 'director', 'year', 'month', 'day', 'dow', 'profit',
       'decade'],
      dtype='object')

In [22]:
# First five rows, transposed so that each movie becomes a column.
df.head().transpose()

Unnamed: 0,2638,4650,4448,4583,3802
budget,92620000,245000,0,379000,3950000
genres,"['Drama', 'Science Fiction']","['Drama', 'Romance', 'War']","['Drama', 'Thriller', 'Romance']","['Drama', 'Music', 'Romance']","['Action', 'Drama', 'History']"
keywords,"['man vs machine', 'underground world', 'inven...","['world war i', 'silent film']","['london england', 'casino', 'irony', 'forbidd...","['musical', 'singer', 'pre-code', 'wisecrack h...","['world war i', 'zeppelin', 'royal air force',..."
original_language,de,en,de,en,en
overview,In a futuristic city sharply divided between t...,The story of an idle rich boy who joins the US...,The rise and inevitable fall of an amoral but ...,"Harriet and Queenie Mahoney, a vaudeville act,...",Two brothers attending Oxford enlist with the ...
popularity,32.3515,0.785744,1.82418,0.968865,8.48412
production_companies,"['Paramount Pictures', 'Universum Film (UFA)']",['Metro-Goldwyn-Mayer (MGM)'],['Nero Films'],['Metro-Goldwyn-Mayer (MGM)'],['The Caddo Company']
production_countries,['Germany'],['United States of America'],['Germany'],['United States of America'],['United States of America']
release_date,1927-01-10,1925-11-05,1929-01-30,1929-02-08,1930-11-15
revenue,650422,22000000,0,4358000,8000000


Keep only movies with a runtime greater than 20; this also removes movies whose runtime is recorded as 0.

### Bubble Chart for Profit vs Runtime

In [23]:
# Keep only rows whose runtime exceeds 20 (rows with a runtime of 0 are dropped as well).
df2 = df[df.runtime > 20]

In [24]:
layout = go.Layout(
    title='Profit vs. Runtime',
    xaxis=dict(
        title='Runtime',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 250],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)

# Named `runtime_traces` rather than `data` so the DataFrame bound to `data`
# is not overwritten by a list of traces.
runtime_traces = [go.Scatter(
    x=df2["runtime"].values,  # Runtime
    y=df2["profit"].values,   # Profit (revenue - budget)
    mode='markers',
    text=df2["title"].values,  # Movie titles shown on hover
    marker=dict(
        # Bubble size and colour both encode the average rating
        size=3*df2["vote_average"],
        sizeref=1.0,
        color=df2["vote_average"].values,
        # A plain dict replaces the deprecated star-imported ColorBar wrapper
        colorbar=dict(title='Average Rating<br> &nbsp;'),
        showscale=True,
        colorscale='Viridis'
    ))]
fig = go.Figure(data=runtime_traces, layout=layout)
iplot(fig)

### Directors with the highest average profit and movie ratings

In [25]:
# Reload the dataset from disk and recompute profit. This yields a fresh
# DataFrame without the `decade` column or the row filtering applied to `df` earlier.
data = pd.read_csv("../data/cleaned_data.csv")
data["profit"] = data["revenue"] - data["budget"]

In [26]:
# Point `df` at the reloaded frame (same object as `data`, not a copy).
df = data

In [27]:
# Per-director mean of every numeric column.
# numeric_only=True makes the restriction to numeric columns explicit; recent
# pandas versions raise a TypeError when asked to average text columns such as
# `genres` or `overview`.
director_df = df.groupby('director', as_index=False).mean(numeric_only=True)

In [28]:
# Map each director name to the number of rows (movies) they have in df.
name_counts = df['director'].value_counts().to_dict() # dictionary of director and number of rows/movies per
name_counts

{'Steven Spielberg': 27,
 'Woody Allen': 21,
 'Martin Scorsese': 20,
 'Clint Eastwood': 20,
 'Ridley Scott': 16,
 'Spike Lee': 16,
 'Robert Rodriguez': 16,
 'Steven Soderbergh': 15,
 'Renny Harlin': 15,
 'Oliver Stone': 14,
 'Tim Burton': 14,
 'Ron Howard': 13,
 'Barry Levinson': 13,
 'Robert Zemeckis': 13,
 'Joel Schumacher': 13,
 'Brian De Palma': 12,
 'Francis Ford Coppola': 12,
 'Michael Bay': 12,
 'Tony Scott': 12,
 'Kevin Smith': 12,
 'Shawn Levy': 11,
 'Richard Donner': 11,
 'Rob Reiner': 11,
 'Bobby Farrelly': 11,
 'Richard Linklater': 11,
 'Joel Coen': 11,
 'Chris Columbus': 11,
 'Sam Raimi': 11,
 'Wes Craven': 10,
 'John Carpenter': 10,
 'John McTiernan': 10,
 'Paul W.S. Anderson': 10,
 'David Fincher': 10,
 'Stephen Frears': 10,
 'Lasse Hallström': 9,
 'Brett Ratner': 9,
 'Tyler Perry': 9,
 'Phillip Noyce': 9,
 'Rob Cohen': 9,
 'Dennis Dugan': 9,
 'Peter Jackson': 9,
 'M. Night Shyamalan': 9,
 'Christopher Nolan': 8,
 'Donald Petrie': 8,
 'Roland Emmerich': 8,
 'Martin Campb

In [29]:
# Attach each director's movie count by looking their name up in name_counts.
director_df['film_count'] = director_df['director'].map(name_counts) # match the film count to the director

In [30]:
# Peek at the first three directors.
director_df.head(3)

Unnamed: 0,director,budget,popularity,revenue,runtime,vote_average,vote_count,year,month,day,dow,profit,film_count
0,Aaron Hann,0.0,6.752141,0.0,87.0,6.0,333.0,2015.0,10.0,2.0,4.0,0.0,1
1,Aaron Schneider,7500000.0,6.557643,0.0,103.0,6.5,100.0,2010.0,1.0,22.0,4.0,-7500000.0,1
2,Abel Ferrara,12500000.0,2.950618,1227324.0,99.0,7.3,31.0,1996.0,1.0,26.0,4.0,-11272676.0,1


In [31]:
# Order directors from highest to lowest mean profit.
director_df = director_df.sort_values(by='profit', ascending=False)

In [32]:
# Display label of the form "<director> (<film_count>)", used as the tick label in the bar charts.
director_df['director+count'] = director_df['director'].map(str) + " (" + director_df['film_count'].map(str) + ")"

In [33]:
# Top three directors by mean profit after sorting.
director_df.head(3)

Unnamed: 0,director,budget,popularity,revenue,runtime,vote_average,vote_count,year,month,day,dow,profit,film_count,director+count
318,Chris Buck,150000000.0,165.125366,1274219000.0,102.0,7.3,5295.0,2013.0,11.0,27.0,2.0,1124219000.0,1,Chris Buck (1)
1281,Kyle Balda,74000000.0,875.581305,1156731000.0,91.0,6.4,4571.0,2015.0,6.0,17.0,2.0,1082731000.0,1,Kyle Balda (1)
1305,Lee Unkrich,200000000.0,59.995418,1066970000.0,103.0,7.6,4597.0,2010.0,6.0,16.0,2.0,866969700.0,1,Lee Unkrich (1)


In [34]:
# Narrow view holding only the label, the film count and the mean profit.
dir_means = director_df[['director+count',  'film_count', 'profit']]

In [35]:
# Peek at the first three rows of the narrow view.
dir_means.head(3)

Unnamed: 0,director+count,film_count,profit
318,Chris Buck (1),1,1124219000.0
1281,Kyle Balda (1),1,1082731000.0
1305,Lee Unkrich (1),1,866969700.0


In [36]:
# Restrict to directors with more than 5 movies in the dataset.
dir_subset = director_df[director_df['film_count'] > 5]

Average profit and movie rating for directors who have directed more than 5 movies

In [37]:
# Ten directors with the highest mean profit among those in dir_subset.
top10pro = dir_subset.sort_values(ascending = False, by = 'profit')[['director+count','profit']].head(10)
top10pro.head(3)

Unnamed: 0,director+count,profit
880,James Cameron (7),733809900.0
1731,Peter Jackson (9),578404800.0
360,Christopher Nolan (8),402810400.0


In [38]:
# Ten directors with the highest mean rating among those in dir_subset.
top10rat = dir_subset.sort_values(ascending = False, by = 'vote_average')[['director+count', 'vote_average']].head(10)
top10rat.head(3)

Unnamed: 0,director+count,vote_average
360,Christopher Nolan (8),7.8
1772,Quentin Tarantino (8),7.775
2049,Stanley Kubrick (6),7.683333


In [39]:
# NOTE(review): `y` does not appear to be read by any later cell in this view — candidate for removal.
y=top10pro['director+count']

In [40]:
# Horizontal bar chart: mean profit for the ten most profitable directors
# with more than 5 movies.
profit_bar = go.Bar(
    x=top10pro['profit'],
    y=top10pro['director+count'],
    orientation='h',
    marker={'color': 'rgb(127,205,187)'},
)
data = [profit_bar]

# Wide left margin leaves room for the long director labels.
chart_margin = go.Margin(l=210, r=100, pad=1)

# The leading &nbsp; entities pad the y-axis title text.
director_axis = {
    'title': '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Director (Number of Movies)',
    'tickfont': {'size': 12},
}

layout = {
    'title': 'Average Profit for Directors who have directed more than 5 movies',
    'margin': chart_margin,
    'xaxis': {'title': 'Average Profit'},
    'yaxis': director_axis,
}

fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [41]:
import plotly.plotly as py
import plotly.graph_objs as go

# Horizontal bar chart: mean rating for the ten best-rated directors
# with more than 5 movies.
rating_bar = go.Bar(
    x=top10rat['vote_average'],
    y=top10rat['director+count'],
    orientation='h',
    marker={'color': 'rgb(67,162,202)'},
)
data = [rating_bar]

# Wide left margin leaves room for the long director labels.
chart_margin = go.Margin(l=210, r=100, b=100, t=100, pad=1)

# The leading &nbsp; entities pad the y-axis title text.
director_axis = {
    'title': '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Director (Number of Movies)',
    'tickfont': {'size': 12},
}

layout = {
    'title': 'Average Movie Rating for Directors who have directed more than 5 movies',
    'margin': chart_margin,
    'xaxis': {'title': 'Average Rating'},
    'yaxis': director_axis,
}

fig = go.Figure(data=data, layout=layout)

iplot(fig)

### Network Graph of main actors

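Notes on the reference snippet below: `title_year` is the year the movie was released (the `year` column in this notebook),
and `actor_1_name` corresponds to `actor1`, and so on for the other actors.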

df_appearance = df_reduced[['actor_1_name', 'title_year']].groupby('actor_1_name').count()
`df_appearance` groups the (actor name, release year) rows by actor name and counts them, i.e. it gives the number of movies each first-billed actor appears in.

We only want actors who have appeared in more than 4 movies (note that the code cells below use a threshold of more than 3):
selection = df_appearance['title_year'] > 4

most_prolific = df_actors[selection] # these actors



In [42]:
# Slimmed-down copy with only the lead actor, rating, year and title columns.
# NOTE(review): df_reduced does not appear to be read by any later cell in this view.
df_reduced = df[['actor1', 'vote_average',
                 'year', 'title']].reset_index(drop = True)

In [43]:
# After the groupby/count, the `year` column holds the number of rows with a
# non-null year for each first-billed actor, i.e. their movie count as `actor1`.
df_appearance = df[['actor1', 'year']].groupby('actor1').count().reset_index()

In [44]:
# Names of actors billed first in more than 3 movies.
appears_often = (df_appearance['year'] > 3).values
most_prolific = df_appearance.loc[appears_often, 'actor1'].tolist()

In [45]:
# How many actors passed the threshold.
len(most_prolific)

277

In [46]:
# Movies in which all three billed actors are in the prolific list.
billed_columns = ['actor1', 'actor2', 'actor3']
all_prolific = df[billed_columns].isin(most_prolific).all(axis=1)
subset1 = df[all_prolific].reset_index(drop=True)

In [47]:
# Movies in which at least one of the three billed actors is in the prolific list.
billed_columns = ['actor1', 'actor2', 'actor3']
any_prolific = df[billed_columns].isin(most_prolific).any(axis=1)
subset2 = df[any_prolific].reset_index(drop=True)

In [48]:
# Display the all-prolific subset.
# NOTE(review): this renders the whole frame; subset1.head() would keep the output short.
subset1

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,...,crew,actor1,actor2,actor3,director,year,month,day,dow,profit
0,250000000,"['Action', 'Crime', 'Drama', 'Thriller']","['dc comics', 'crime fighter', 'terrorist', 's...",en,Following the death of District Attorney Harve...,112.312950,"['Legendary Pictures', 'Warner Bros.', 'DC Ent...",['United States of America'],2012-07-16,1084939099,...,"[{'credit_id': '52fe4781c3a36847f81398c3', 'de...",Christian Bale,Michael Caine,Gary Oldman,Christopher Nolan,2012,7,16,0,834939099
1,258000000,"['Fantasy', 'Action', 'Adventure']","['dual identity', 'amnesia', 'sandstorm', ""lov...",en,The seemingly invincible Spider-Man goes up ag...,115.699814,"['Columbia Pictures', 'Laura Ziskin Production...",['United States of America'],2007-05-01,890871626,...,"[{'credit_id': '52fe4252c3a36847f80151a5', 'de...",Tobey Maguire,Kirsten Dunst,James Franco,Sam Raimi,2007,5,1,1,632871626
2,225000000,"['Action', 'Comedy', 'Science Fiction']","['time travel', 'time machine', 'alien', 'fict...",en,Agents J (Will Smith) and K (Tommy Lee Jones) ...,52.035179,"['Amblin Entertainment', 'Media Magik Entertai...",['United States of America'],2012-05-23,624026776,...,"[{'credit_id': '52fe45b7c3a36847f80d68c7', 'de...",Will Smith,Tommy Lee Jones,Josh Brolin,Barry Sonnenfeld,2012,5,23,2,399026776
3,207000000,"['Adventure', 'Drama', 'Action']","['film business', 'screenplay', 'show business...",en,"In 1933 New York, an overly ambitious movie pr...",61.226010,"['WingNut Films', 'Universal Pictures', 'Big P...","['New Zealand', 'United States of America', 'G...",2005-12-14,550000000,...,"[{'credit_id': '52fe422ec3a36847f800a1d7', 'de...",Naomi Watts,Jack Black,Adrien Brody,Peter Jackson,2005,12,14,2,343000000
4,250000000,"['Adventure', 'Action', 'Science Fiction']","['civil war', 'war', 'marvel comic', 'sequel',...",en,"Following the events of Age of Ultron, the col...",198.372395,"['Studio Babelsberg', 'Marvel Studios', 'Walt ...",['United States of America'],2016-04-27,1153304495,...,"[{'credit_id': '569443d59251414b67000428', 'de...",Chris Evans,Robert Downey Jr.,Scarlett Johansson,Anthony Russo,2016,4,27,2,903304495
5,200000000,"['Action', 'Adventure', 'Thriller']","['spy', 'secret agent', 'sociopath', 'killer',...",en,When Bond's latest assignment goes gravely wro...,93.004993,['Columbia Pictures'],"['United Kingdom', 'United States of America']",2012-10-25,1108561013,...,"[{'credit_id': '52fe46689251416c910537ad', 'de...",Daniel Craig,Judi Dench,Javier Bardem,Sam Mendes,2012,10,25,3,908561013
6,200000000,"['Action', 'Adventure', 'Fantasy']","['dual identity', ""love of one's life"", 'pizza...",en,Peter Parker is going through a major identity...,35.149586,"['Columbia Pictures', 'Laura Ziskin Production...",['United States of America'],2004-06-25,783766341,...,"[{'credit_id': '52fe4252c3a36847f8015039', 'de...",Tobey Maguire,Kirsten Dunst,James Franco,Sam Raimi,2004,6,25,4,583766341
7,200000000,"['Action', 'Science Fiction', 'Thriller']","['saving the world', 'artificial intelligence'...",en,"All grown up in post-apocalyptic 2018, John Co...",71.862892,"['Columbia Pictures', 'The Halcyon Company', '...","['Germany', 'Italy', 'United Kingdom', 'United...",2009-05-20,371353001,...,"[{'credit_id': '52fe424ec3a36847f8013ffd', 'de...",Christian Bale,Sam Worthington,Anton Yelchin,McG,2009,5,20,2,171353001
8,190000000,['Action'],"['car race', 'speed', 'revenge', 'suspense', '...",en,Deckard Shaw seeks revenge against Dominic Tor...,102.322217,"['Universal Pictures', 'Original Film', 'Fuji ...","['Japan', 'United States of America']",2015-04-01,1506249360,...,"[{'credit_id': '52fe4cc8c3a36847f823e699', 'de...",Vin Diesel,Paul Walker,Dwayne Johnson,James Wan,2015,4,1,2,1316249360
9,250000000,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['1970s', 'mutant', 'time travel', 'marvel com...",en,The ultimate X-Men ensemble fights a war for t...,118.078691,"['Twentieth Century Fox Film Corporation', ""Do...","['United Kingdom', 'United States of America']",2014-05-15,747862775,...,"[{'credit_id': '54408fa5c3a3686078000858', 'de...",Hugh Jackman,James McAvoy,Michael Fassbender,Bryan Singer,2014,5,15,3,497862775


In [49]:
import plotly.plotly as py
from plotly.graph_objs import *
import networkx as nx

In [50]:
# For every movie in subset1, emit the three co-star pairs in a fixed order:
# (actor1, actor2), (actor1, actor3), (actor2, actor3).
pair = [
    duo
    for lead, second, third in zip(subset1["actor1"], subset1["actor2"], subset1["actor3"])
    for duo in ((lead, second), (lead, third), (second, third))
]

In [51]:
# Sanity check: three pairs are produced per movie in subset1.
len(pair), subset1.shape[0]

(681, 227)

In [52]:
from collections import *
# Sort each pair so (A, B) and (B, A) collapse to the same key, then count
# how often each unordered pair occurs.
counter = Counter(tuple(sorted(tup)) for tup in pair)

In [53]:
# Number of distinct unordered actor pairs.
len(counter)

619

In [54]:
# Weighted edge list of (actor_a, actor_b, number_of_shared_movies).
# Iterating items() once replaces the original loop, which rebuilt
# list(counter.keys()) and list(counter.values()) on every iteration.
edges = [(actor_a, actor_b, weight) for (actor_a, actor_b), weight in counter.items()]

In [55]:
# Build an undirected graph from the raw actor pairs; the weighted edge list
# built above is not used here (see the commented-out alternative).
G=nx.Graph()
# G.add_weighted_edges_from(edges)
G.add_edges_from(pair)
# G.add_nodes_from(most_prolific)

In [56]:
# Print a short summary of the graph. nx.info() was removed in networkx 3.0,
# so the same fields are produced here from accessors that exist in every
# networkx release.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print("Name: {}".format(G.name))
print("Type: {}".format(type(G).__name__))
print("Number of nodes: {}".format(node_count))
print("Number of edges: {}".format(edge_count))
if node_count:
    # In an undirected graph every edge adds 2 to the total degree.
    print("Average degree: {:8.4f}".format(2.0 * edge_count / node_count))

Name: 
Type: Graph
Number of nodes: 216
Number of edges: 619
Average degree:   5.7315


In [57]:
# Node names in the graph's iteration order. G.nodes() replaces the G.node
# attribute, which was removed in networkx 2.4.
nodes = list(G.nodes())
# Map each actor to their number of distinct co-stars within the graph.
d = dict(nx.degree(G))

In [58]:
# The best-connected actors, ordered from fewest to most connections.
# Slicing from the end keeps the count at 15 whatever the graph size; the
# original start index of 201 only gave 15 actors for exactly 216 nodes.
N_TOP_ACTORS = 15
imp_actors = sorted(d, key=d.get)[-N_TOP_ACTORS:]

In [59]:
# Node positions from the Kamada-Kawai layout; read below as pos[node][0] (x) and pos[node][1] (y).
pos=nx.kamada_kawai_layout(G)  

In [60]:
# Node coordinates, in the same order as `nodes`.
Xv = [pos[k][0] for k in nodes]
Yv = [pos[k][1] for k in nodes]

# Edge segments: each edge contributes its two endpoints followed by None,
# which makes plotly break the line between consecutive edges.
Xed = []
Yed = []
for source, target, _weight in edges:
    Xed += [pos[source][0], pos[target][0], None]
    Yed += [pos[source][1], pos[target][1], None]

# Plain dicts replace the deprecated star-imported wrappers (Line, Marker,
# Font, XAxis, YAxis, Margin, Data, ColorBar).
trace3 = go.Scatter(
    x=Xed,
    y=Yed,
    mode='lines',
    line=dict(color='rgb(210,210,210)', width=1),
    hoverinfo='none',
)
trace4 = go.Scatter(
    x=Xv,
    y=Yv,
    mode='markers',
    name='net',
    marker=dict(
        # 'dot' is not one of plotly's marker symbols; 'circle' is the
        # default shape it fell back to.
        symbol='circle',
        size=10,
        showscale=True,
        colorscale='Viridis',
        reversescale=False,
        colorbar=dict(title='Number of Connections<br> &nbsp;', tickvals=[2,4,6,8,10,12,14,16,18,20,22,24]),
        # Look degrees up per node so colours cannot drift out of step with
        # the node order.
        color=[d[k] for k in nodes],
        line=dict(color='rgb(50,50,50)', width=0.5),
    ),
    text=nodes,
    hoverinfo='text',
)

# Axes are hidden because layout coordinates carry no meaning of their own.
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
layout = go.Layout(
    title="Leading Actors and their Connections",
    font=dict(size=12),
    showlegend=False,
    xaxis=hidden_axis,
    yaxis=hidden_axis,
    margin=dict(
        l=40,
        r=40,
        b=85,
        t=100,
    ),
)

data1 = [trace3, trace4]
fig1 = go.Figure(data=data1, layout=layout)
iplot(fig1)

### Top Actor Comparison

In [61]:
# One tuple per important actor:
# (name, mean profit, mean rating, mean popularity, number of connections).
# NOTE(review): only movies where the actor is billed first or second are
# averaged; `actor3` is ignored here although it was used to build the graph — confirm intent.
features = []
for actor in imp_actors:
    billed_first_or_second = df['actor1'].eq(actor) | df['actor2'].eq(actor)
    films = df.loc[billed_first_or_second]
    summary = (
        actor,
        films['profit'].mean(),
        films['vote_average'].mean(),
        films['popularity'].mean(),
        d[actor],
    )
    features.append(summary)

In [62]:
# Turn the per-actor tuples into a frame; columns are positional (0-4) until renamed in a later cell.
top15 = pd.DataFrame(features)

In [63]:
# Display the per-actor summary.
top15

Unnamed: 0,0,1,2,3,4
0,Scarlett Johansson,29165150.0,6.4375,33.564579,13
1,Russell Crowe,76908230.0,6.6,33.467027,13
2,Cameron Diaz,120614500.0,5.994118,38.642755,14
3,Tom Cruise,236936500.0,6.646154,48.156016,14
4,Brad Pitt,135349500.0,6.813793,48.674368,14
5,Matt Damon,98843630.0,6.636364,36.726481,14
6,Meryl Streep,83846530.0,6.482609,25.091077,14
7,Christian Bale,138891700.0,6.935,55.938757,15
8,Jude Law,59987340.0,6.371429,29.928448,15
9,Kate Winslet,144835600.0,7.007143,30.104512,15


In [64]:
# Give the positional columns readable names.
# NOTE(review): the last label ends with a stray ")". A later cell looks the
# column up by this exact string, so both places must be changed together.
top15.columns = ["Actor","Avg. Profit", "Avg. Vote", "Avg. Popularity", "Connections with Other Important Actors)"]


In [65]:
import plotly.plotly as py
import plotly.graph_objs as go

# Horizontal bar chart: mean profit for each important actor, in top15 row order.
profit_bar = go.Bar(
    x=top15['Avg. Profit'],
    y=top15['Actor'],
    orientation='h',
    marker={'color': 'rgb(127,205,187)'},
)
data = [profit_bar]

# Wide left margin leaves room for the actor names.
chart_margin = go.Margin(l=210, r=100, pad=1)

# The leading &nbsp; entities pad the y-axis title text.
actor_axis = {
    'title': '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Actors',
    'tickfont': {'size': 12},
}

layout = {
    'title': 'Average Profit for Important Actors (Ordered by Connections)',
    'margin': chart_margin,
    'xaxis': {'title': 'Average Profit'},
    'yaxis': actor_axis,
}

fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [66]:
import plotly.plotly as py
import plotly.graph_objs as go

# Horizontal bar chart: mean popularity for each important actor, in top15 row order.
data = [go.Bar(
            x=top15['Avg. Popularity'],
            y=top15['Actor'],
            orientation = 'h',
            marker=dict(
            color='rgb(127,205,187)'
        )
)]

layout = dict(
        title='Average Popularity for Important Actors (Ordered by Connections)',
        margin=go.Margin(
        l=210,
        r=100,
        pad=1),
        xaxis=dict(
            # The bars show 'Avg. Popularity'; the original axis title said
            # 'Average Vote', a leftover from the copied cell.
            title='Average Popularity'
        ),
    
        yaxis=dict(
            title='&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Actors',
            tickfont=dict(
                size=12,
            )
        )
    
    )
    

fig = go.Figure(data = data, layout = layout)

iplot(fig)

In [67]:
# Tick label of the form "<actor> (<connections>)" for the dropdown chart.
top15['Actor1'] = top15['Actor']+ " ("+ top15['Connections with Other Important Actors)'].astype('str') + ")"

In [68]:
# All three bar traces share the same tick labels: "<actor> (<connections>)".
actor_axis = top15["Actor1"]

# Profit is rescaled to millions, matching its dropdown label.
trace1 = go.Bar(
    name='Profit',
    x=top15["Avg. Profit"] / 1000000,
    y=actor_axis,
    orientation='h',
    marker={'color': 'rgb(161,215,106)'},
)

trace2 = go.Bar(
    name='Popularity',
    x=top15["Avg. Popularity"],
    y=actor_axis,
    orientation='h',
    marker={'color': 'rgb(37,52,148)'},
)

# The average vote is multiplied by 10 before plotting.
trace3 = go.Bar(
    name='Vote',
    x=top15["Avg. Vote"] * 10,
    y=actor_axis,
    orientation='h',
    marker={'color': 'rgb(65,182,196)'},
)

data = [trace1, trace2, trace3]

# (dropdown label, figure title) for each trace, in trace order.
menu_entries = [
    ('Average Profit (in Millions)', 'Average Profit'),
    ('Average Popularity', 'Average Popularity'),
    ('Average Vote', 'Average Vote'),
]

# Each button shows exactly one trace and retitles the figure.
buttons = [
    dict(
        label=label,
        method='update',
        args=[
            {'visible': [slot == shown for slot in range(len(menu_entries))]},
            {'title': title},
        ],
    )
    for shown, (label, title) in enumerate(menu_entries)
]

# active=-1 leaves no button pre-selected, so all three traces are drawn initially.
updatemenus = [dict(active=-1, x=-0.3, buttons=buttons)]

layout = dict(
    title='Average Measures for Important Actors (Select from Dropdown)',
    showlegend=False,
    updatemenus=updatemenus,
)

fig = dict(data=data, layout=layout)

iplot(fig)