In [1]:
import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
import datetime

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff

py.offline.init_notebook_mode(connected=True)

from math import floor
from plotly import tools
from plotly.graph_objs import *

In [2]:
# Load the cleaned movie table from a CSV file; the path is relative to the
# directory the notebook is run from.
data = pd.read_csv("../data/cleaned_data.csv")

In [3]:
data.shape

(4790, 25)

In [4]:
# Profit is the plain difference revenue - budget, stored as a new column.
# NOTE(review): the transposed df.head() output later in this notebook shows
# rows with budget and/or revenue equal to 0; these look like missing values
# rather than true zeros — confirm before trusting profit for such rows.
data["profit"] = data["revenue"] - data["budget"]

In [5]:
data["profit"].describe()

count    4.790000e+03
mean     5.336400e+07
std      1.361221e+08
min     -1.657101e+08
25%     -7.985852e+05
50%      2.602056e+06
75%      5.546026e+07
max      2.550965e+09
Name: profit, dtype: float64

In [6]:
data[data["profit"]==data["profit"].max()]

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,...,crew,actor1,actor2,actor3,director,year,month,day,dow,profit
0,237000000,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['culture clash', 'future', 'space war', 'spac...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"['Ingenious Film Partners', 'Twentieth Century...","['United States of America', 'United Kingdom']",2009-12-10,2787965087,...,"[{'credit_id': '52fe48009251416c750aca23', 'de...",Sam Worthington,Zoe Saldana,Sigourney Weaver,James Cameron,2009,12,10,3,2550965087


In [7]:
profit_by_dow = data[data["year"]>1990].groupby(["dow"]).budget.mean().reset_index()

In [8]:
bar_data = [go.Bar(x=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']\
                   , y=profit_by_dow["budget"])]

py.offline.iplot({ 'data': bar_data,
            'layout': {
               'title': 'Average Revenue by Day of Week',
               'xaxis': {
                 'title': 'Day of Week'},
               'yaxis': {
                'title': 'Profit'}
        }})

#### Release movies on Tuesday 

### Violin Plots

In [9]:
def extract_decade(x):
    """Return the decade label for a year, e.g. 1995 -> "1990s".

    The year is rounded down to the nearest multiple of ten and the result is
    returned as a string with an "s" suffix.
    """
    decade_start = 10 * floor(x / 10)
    return "{}s".format(decade_start)

In [10]:
# Label every film with its decade string (e.g. "1990s"), derived from `year`.
data["decade"] = data["year"].map(extract_decade)

In [11]:
# Bind a second name to the same DataFrame object; no copy is made here.
df = data

In [12]:
# Order rows by decade label (ascending) so per-decade traces come out chronologically.
df = df.sort_values('decade')

In [13]:
df.groupby(['decade']).size()

decade
1910s       1
1920s       4
1930s      15
1940s      25
1950s      27
1960s      71
1970s     109
1980s     278
1990s     778
2000s    2044
2010s    1438
dtype: int64

In [14]:
df[df['decade'] == '1910s']

Unnamed: 0,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,...,actor1,actor2,actor3,director,year,month,day,dow,profit,decade
4581,385907,['Drama'],"['usa', 'naivety', 'intolerance', 'mill', 'mar...",en,"The story of a poor young woman, separated by ...",3.232447,"['Triangle Film Corporation', 'Wark Producing ...",['United States of America'],1916-09-04,8394751,...,Lillian Gish,Mae Marsh,Robert Harron,D.W. Griffith,1916,9,4,0,8008844,1910s


### Drop the 1910s decade, since it contains only one movie

In [15]:
# Keep every row except those labelled '1910s'.
is_1910s = df['decade'] == '1910s'
df = df[~is_1910s]

In [16]:
# One violin trace per decade, in the order the decade labels first appear in df.
# The unique labels are computed once, not on every access inside the loop.
decades = pd.unique(df['decade'])

da = []
for decade in decades:
    in_decade = df['decade'] == decade
    da.append({
        "type": 'violin',
        "x": df.loc[in_decade, 'decade'],
        "y": df.loc[in_decade, 'vote_average'],
        "name": decade,
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    })

fig = {
    "data": da,
    "layout" : {
        "title": "Average Movie Ratings by Decade",
        # tickmode='linear' is the documented replacement for the deprecated
        # axis attribute autotick=False.
        "xaxis" : dict(title = 'Decade', tickmode='linear', showticklabels=True),
        "yaxis" : dict(title = 'Average Rating')
    }
}

# validate=False is kept from the original call.
iplot(fig, validate = False)

#### Movies were rated higher in the 1990s than in the 2010s

In [17]:
df.columns

Index(['budget', 'genres', 'keywords', 'original_language', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages',
       'vote_average', 'vote_count', 'title', 'cast', 'crew', 'actor1',
       'actor2', 'actor3', 'director', 'year', 'month', 'day', 'dow', 'profit',
       'decade'],
      dtype='object')

### Bubble Chart

In [18]:
# Axis and grid styling for the budget/profit bubble chart.
layout = go.Layout(
    title='Profit vs. Budget',
    xaxis=dict(
        title='Budget',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 4e8],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)
# The trace list gets its own name: assigning it to `data` would overwrite the
# movie DataFrame loaded at the top of the notebook and break re-running the
# earlier cells.
bubble_traces = [go.Scatter(
    x=df.budget.values,   # budget
    y=df.profit.values,   # profit (revenue - budget)
    mode='markers',
    text=df.title.values,  # movie titles shown on hover
    marker=dict(
        # Marker size and colour both encode the average rating.
        size=3 * df.vote_average.values,
        sizeref=1.0,
        color=df.vote_average.values,
        # A plain dict replaces the deprecated graph_objs.ColorBar class.
        colorbar=dict(title='Average Rating'),
        showscale=True,
        colorscale='Viridis'
    ))]
fig = go.Figure(data=bubble_traces, layout=layout)
iplot(fig)

### Runtime vs Profit

### Actors vs Profit

### Directors vs Profit

### Month vs Profit

In [19]:
df.columns

Index(['budget', 'genres', 'keywords', 'original_language', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages',
       'vote_average', 'vote_count', 'title', 'cast', 'crew', 'actor1',
       'actor2', 'actor3', 'director', 'year', 'month', 'day', 'dow', 'profit',
       'decade'],
      dtype='object')

In [20]:
df.head().transpose()

Unnamed: 0,2638,4650,4448,4583,3802
budget,92620000,245000,0,379000,3950000
genres,"['Drama', 'Science Fiction']","['Drama', 'Romance', 'War']","['Drama', 'Thriller', 'Romance']","['Drama', 'Music', 'Romance']","['Action', 'Drama', 'History']"
keywords,"['man vs machine', 'underground world', 'inven...","['world war i', 'silent film']","['london england', 'casino', 'irony', 'forbidd...","['musical', 'singer', 'pre-code', 'wisecrack h...","['world war i', 'zeppelin', 'royal air force',..."
original_language,de,en,de,en,en
overview,In a futuristic city sharply divided between t...,The story of an idle rich boy who joins the US...,The rise and inevitable fall of an amoral but ...,"Harriet and Queenie Mahoney, a vaudeville act,...",Two brothers attending Oxford enlist with the ...
popularity,32.3515,0.785744,1.82418,0.968865,8.48412
production_companies,"['Paramount Pictures', 'Universum Film (UFA)']",['Metro-Goldwyn-Mayer (MGM)'],['Nero Films'],['Metro-Goldwyn-Mayer (MGM)'],['The Caddo Company']
production_countries,['Germany'],['United States of America'],['Germany'],['United States of America'],['United States of America']
release_date,1927-01-10,1925-11-05,1929-01-30,1929-02-08,1930-11-15
revenue,650422,22000000,0,4358000,8000000


### Runtime vs Profit

### Remove movies with a missing, zero, or very short runtime (keep only runtime > 20)

In [21]:
# Keep only rows whose runtime is strictly greater than 20.
has_real_runtime = df["runtime"] > 20
df2 = df[has_real_runtime]

In [22]:
# Axis and grid styling for the runtime/profit scatter.
layout = go.Layout(
    title='Profit vs. Runtime',
    xaxis=dict(
        title='Runtime',
        gridcolor='rgb(255, 255, 255)',
        # No fixed range: the original [0, 4e8] was copied from the budget
        # chart and squeezed every runtime against the left edge.
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)
# The trace list gets its own name so the `data` DataFrame loaded at the top of
# the notebook is not overwritten by this cell.
runtime_traces = [go.Scatter(
    x=df2.runtime.values,  # runtime
    y=df2.profit.values,   # profit (revenue - budget)
    mode='markers',
    text=df2.title.values,  # movie titles shown on hover
    marker=dict(
        # Size and colour must come from df2, the same filtered frame as x/y;
        # taking them from df pairs points with other films' ratings.
        size=3 * df2.vote_average.values,
        sizeref=1.0,
        color=df2.vote_average.values,
        # A plain dict replaces the deprecated graph_objs.ColorBar class.
        colorbar=dict(title='Average Rating'),
        showscale=True,
        colorscale='Viridis'
    ))]
fig = go.Figure(data=runtime_traces, layout=layout)
iplot(fig)

### Network Attempt ...

In [23]:
df.columns

Index(['budget', 'genres', 'keywords', 'original_language', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages',
       'vote_average', 'vote_count', 'title', 'cast', 'crew', 'actor1',
       'actor2', 'actor3', 'director', 'year', 'month', 'day', 'dow', 'profit',
       'decade'],
      dtype='object')

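Notes on the reference code below: `title_year` is the year the movie was released (the `year` column in this dataset),
and `actor_1_name` corresponds to `actor1` here, and so on for the other actor columns.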

df_appearance = df_reduced[['actor_1_name', 'title_year']].groupby('actor_1_name').count()
`df_appearance` groups the (actor 1 name, release year) pairs by actor name and counts the rows in each group, i.e. the number of movies each actor has appeared in as actor 1 (across all years, not within a single year).

We only want actors who have appeared in more than 4 movies in total:
selection = df_appearance['title_year'] > 4

most_prolific = df_actors[selection] # these actors



In [66]:
# Narrow view of df with just the columns named below, re-indexed from 0.
reduced_columns = ['actor1', 'vote_average', 'year', 'title']
df_reduced = df.loc[:, reduced_columns].reset_index(drop = True)

In [72]:
# Per actor1: the number of rows with a non-null `year`, returned as a frame
# with columns actor1 and year (the `year` column holds the count).
df_appearance = df.groupby('actor1')['year'].count().reset_index()

In [172]:
# Boolean mask over df_appearance rows: True where the count in `year` exceeds 3.
selection = (df_appearance['year'] > 3).values

In [173]:
# Names of the actors picked out by `selection`, as a plain Python list.
most_prolific = df_appearance.loc[selection, 'actor1'].tolist()

In [174]:
# Films whose actor1, actor2 AND actor3 are all in most_prolific.
all_three_prolific = (
    df["actor1"].isin(most_prolific)
    & df["actor2"].isin(most_prolific)
    & df["actor3"].isin(most_prolific)
)
subset1 = df[all_three_prolific].reset_index(drop = True)

In [99]:
# Films where at least one of actor1, actor2, actor3 is in most_prolific.
any_prolific = (
    df["actor1"].isin(most_prolific)
    | df["actor2"].isin(most_prolific)
    | df["actor3"].isin(most_prolific)
)
subset2 = df[any_prolific].reset_index(drop = True)

### Kishan Solution

In [104]:
# Co-occurrence table: rows are actor1 values, columns are actor2 values, and
# each cell counts the films in df with that (actor1, actor2) combination.
test = pd.crosstab(index=df['actor1'], columns=df['actor2'])

In [115]:
# Build the list of [actor_1, actor_2] edges from the positive cells of the
# crosstab `test`.
# NOTE(review): `actors_list` is not defined in the cells visible here; it must
# already exist in the kernel for this cell to run — confirm where it comes from.
edge = []
for actor_1, actor_2 in test[test > 0].stack().index:
    # Skip any pair with an actor outside actors_list. The original tested this
    # three times; the third test could never fire after the first two.
    if actor_1 not in actors_list or actor_2 not in actors_list:
        continue
    # Ignore self-pairs (same name in both positions).
    if actor_1 != actor_2:
        edge.append([actor_1, actor_2])

In [116]:
edge

[['Aaron Abrams', 'Alan Alda'],
 ['Aaron Eckhart', 'Hilary Swank'],
 ['Aaron Eckhart', 'Jennifer Aniston'],
 ['Aaron Paul', 'Dominic Cooper'],
 ['Aaron Stanford', 'Kate Mara'],
 ['Aaron Taylor-Johnson', 'Chloë Grace Moretz'],
 ['Abbie Cornish', 'Ben Whishaw'],
 ['Abigail Breslin', "Chris O'Donnell"],
 ['Abigail Breslin', 'James Purefoy'],
 ['Abigail Breslin', 'Jodie Foster'],
 ['Adam Brody', 'Michael Ealy'],
 ['Adam Butcher', 'Campbell Scott'],
 ['Adam Sandler', 'Andy Samberg'],
 ['Adam Sandler', 'Ben Stiller'],
 ['Adam Sandler', 'Chris Rock'],
 ['Adam Sandler', 'Don Cheadle'],
 ['Adam Sandler', 'Drew Barrymore'],
 ['Adam Sandler', 'Emily Watson'],
 ['Adam Sandler', 'Jennifer Aniston'],
 ['Adam Sandler', 'John Turturro'],
 ['Adam Sandler', 'Kate Beckinsale'],
 ['Adam Sandler', 'Katie Holmes'],
 ['Adam Sandler', 'Keri Russell'],
 ['Adam Sandler', 'Kevin James'],
 ['Adam Sandler', 'Michelle Monaghan'],
 ['Adam Sandler', 'Patricia Arquette'],
 ['Adam Sandler', 'Salma Hayek'],
 ['Adam Sand

Build a list with one entry per actor, holding that actor's number of connections (edges).

In [117]:
# One integer zero per element of df_actors. np.zeros avoids building a
# throwaway Python list and keeps an integer dtype even when df_actors is empty.
# NOTE(review): df_actors is not defined in the cells visible here — confirm its origin.
num_of_adjacencies = np.zeros(len(df_actors), dtype=int)

In [121]:
# Debug cell: walk df_actors and print the first element of every pair it yields.
# NOTE(review): df_actors is not defined in the cells visible here; the
# two-value unpacking and the actor names in the output suggest a pandas
# GroupBy keyed by actor name — confirm.
d = {}  # created but not filled in by this cell
for ind, col in df_actors:
    print(ind)

"Weird Al" Yankovic
50 Cent
AJ Michalka
Aamir Khan
Aaron Abrams
Aaron Eckhart
Aaron Kwok
Aaron Paul
Aaron Stanford
Aaron Taylor-Johnson
Aasheekaa Bathija
Abbie Cornish
Abhishek Bachchan
Abigail Breslin
Adam Beach
Adam Bousdoukos
Adam Brody
Adam Butcher
Adam Carolla
Adam Goldberg
Adam Greaves-Neal
Adam Sandler
Adam Scott
Adam West
Adelaide Clemens
Adriana Barraza
Adriana Caselotti
Adrien Brody
Adrienne Barbeau
Adrienne Pickering
Agata Trzebuchowska
Agnes Bruckner
Aidan Quinn
Aimee Garcia
Aimee Teegarden
Aishwarya Rai Bachchan
Aksel Hennie
Akshay Kumar
Al Gore
Al Pacino
Alain Moussi
Alan Alda
Alan Arkin
Alan King
Alan Rickman
Alan Tudyk
Alan van Sprang
Albert Brooks
Albert Finney
Alden Ehrenreich
Alec Baldwin
Alessandro Gassman
Alex Briley
Alex D. Linz
Alex Karpovsky
Alex Kendrick
Alex Michaeletos
Alex Pettyfer
Alex Vincent
Alexa PenaVega
Alexander Siddig
Alexander Skarsgård
Alexandra Daddario
Alexandre Rodrigues
Alexis Bledel
Alexis Kendra
Alfre Woodard
Ali Larter
Alice Eve
Alicia Silve

Hank Azaria
Hans Matheson
Harriet Andersson
Harriet Owen
Harrison Ford
Harry Dean Stanton
Harry Styles
Harvey Keitel
Hayden Christensen
Hayden Panettiere
Hayley Atwell
Hayley Orrantia
Heath Ledger
Heather Langenkamp
Heather Matarazzo
Heike Makatsch
Helen Hayes
Helen Hunt
Helen Mirren
Helena Bonham Carter
Henri Garcin
Henry Cavill
Henry Fonda
Henry Thomas
Hilary Duff
Hilary Swank
Hiroki Hasegawa
Hiroyuki Ikeuchi
Holly Hunter
Hope Davis
Howard Keel
Hristos Passalis
Hrithik Roshan
Huang Bo
Hugh Dancy
Hugh Grant
Hugh Jackman
Humphrey Bogart
Ian Bannen
Ian Holm
Ian McKellen
Ian Ziering
Ice Cube
Idris Elba
Iko Uwais
Imelda Staunton
Imogen Poots
Indira Varma
Ingrid Bergman
Ingvar Eggert Sigurðsson
Ioan Gruffudd
Irene Bedard
Irrfan Khan
Isabelle Adjani
Ivan Barnev
Ivan Dixon
Ivan Okhlobystin
Ivana Baquero
Izzy Diaz
J.D. Williams
J.J. Johnson
J.P. Davis
Ja Rule
Jack Black
Jack Huston
Jack Lemmon
Jack Nance
Jack Nicholson
Jack O'Connell
Jack Warden
Jackie Chan
Jacob Zachar
Jacques Gamblin
Jada P

In [None]:
# For every (ind, col) pair yielded by df_actors, count the entries of `edge`
# for which either endpoint compares equal to all of col["actor1"], and store
# that count under `ind`.
# NOTE(review): df_actors is not defined in the cells visible here — confirm its type.
d = {}
for ind, col in df_actors:
    actor = col["actor1"]
    touches = [(i == actor).all() or (j == actor).all() for i, j in edge]
    d[ind] = sum(1 for hit in touches if hit)

### Alvira Solution

In [207]:
import plotly.plotly as py
from plotly.graph_objs import *

In [208]:
# For each row of subset1, record its three actor pairings:
# (actor1, actor2), (actor1, actor3) and (actor2, actor3), in that order.
pair = []
for first, second, third in zip(subset1["actor1"], subset1["actor2"], subset1["actor3"]):
    pair.extend([(first, second), (first, third), (second, third)])

In [209]:
from collections import *
# Tally each pairing regardless of order: the two names are sorted first so
# (a, b) and (b, a) land on the same key.
counter = Counter()
for tup in pair:
    counter[tuple(sorted(tup))] += 1

In [210]:
# Flatten the pair counter into (actor_a, actor_b, weight) triples, in the
# counter's own order. Iterating items() once replaces the original's
# rebuilding of list(counter.keys()) / list(counter.values()) on every
# iteration, which made the loop quadratic.
edges = [(actor_a, actor_b, weight) for (actor_a, actor_b), weight in counter.items()]

In [211]:
edges[:3]

[('Al Pacino', 'Robert Duvall', 1),
 ('Al Pacino', 'Diane Keaton', 1),
 ('Diane Keaton', 'Robert Duvall', 1)]

In [212]:
import plotly.plotly as py
from plotly.graph_objs import *
import networkx as nx

In [213]:
# Undirected graph: weighted edges come from `edges`, then every name in
# most_prolific is added as a node so names without any edge still appear.
# The original's remove_edges_from(G.edges()) call was dropped: on a freshly
# created, empty graph it removes nothing.
G = nx.Graph()
G.add_weighted_edges_from(edges)
G.add_nodes_from(most_prolific)

In [214]:
# Compute 2-D coordinates for every node of G; the displayed result maps each
# node name to an array of two floats.
# NOTE(review): no seed is passed, so the coordinates presumably change from
# run to run — confirm against the installed networkx version and pin a seed
# if reproducible figures are needed.
pos=nx.fruchterman_reingold_layout(G) 
pos

{'Aaron Eckhart': array([0.11442842, 0.03057507]),
 'Adam Sandler': array([ 0.08175938, -0.13853118]),
 'Adrien Brody': array([0.04459496, 0.09836826]),
 'Akshay Kumar': array([-0.46934756, -0.87544684]),
 'Al Pacino': array([0.04365443, 0.05095653]),
 'Alec Baldwin': array([0.03865273, 0.02709411]),
 'Alex Pettyfer': array([0.03267367, 0.23893538]),
 'Alexis Bledel': array([ 0.94761244, -0.20016023]),
 'Amy Adams': array([ 0.02210807, -0.02103649]),
 'Angelina Jolie': array([0.10905857, 0.07479387]),
 'Anna Faris': array([ 0.16431165, -0.02364984]),
 'Anna Kendrick': array([0.09888402, 0.18454165]),
 'Anne Hathaway': array([ 0.1355488 , -0.03863659]),
 'Anthony Hopkins': array([0.12654407, 0.1225715 ]),
 'Anton Yelchin': array([0.06213822, 0.07766908]),
 'Antonio Banderas': array([0.12835203, 0.07815711]),
 'Arnold Schwarzenegger': array([0.0948259 , 0.16688772]),
 'Ashley Judd': array([0.09250246, 0.12570852]),
 'Ashton Kutcher': array([ 0.03276458, -0.08762283]),
 'Ben Affleck': arr

In [None]:
# Node coordinates, listed in the same order as most_prolific so they line up
# with the hover text passed to the marker trace below.
Xv = [pos[k][0] for k in most_prolific]
Yv = [pos[k][1] for k in most_prolific]

# Edge segments as flat coordinate lists; the None after each pair of points
# breaks the line so consecutive edges are not joined together.
# The loop variable is deliberately not called `edge`, so the `edge` list built
# earlier in the notebook is not overwritten.
Xed = []
Yed = []
for link in edges:
    source, target = link[0], link[1]
    Xed += [pos[source][0], pos[target][0], None]
    Yed += [pos[source][1], pos[target][1], None]

# Plain dicts replace the deprecated graph_objs helper classes (Line, Marker,
# Font, Margin, Annotations, Annotation, Data).
trace3 = go.Scatter(x=Xed,
                    y=Yed,
                    mode='lines',
                    line=dict(color='rgb(210,210,210)', width=1),
                    hoverinfo='none'
                    )
trace4 = go.Scatter(x=Xv,
                    y=Yv,
                    mode='markers',
                    name='net',
                    # 'circle' replaces 'dot', which is not a valid marker symbol.
                    marker=dict(symbol='circle',
                                size=5,
                                color='#6959CD',
                                line=dict(color='rgb(50,50,50)', width=0.5)
                                ),
                    text=most_prolific,
                    hoverinfo='text'
                    )

# Caption text. It is placed in the annotation directly instead of first
# writing a placeholder and then patching annotations[0] afterwards.
annot="This networkx.Graph has the Fruchterman-Reingold layout<br>Code:"+\
"<a href='http://nbviewer.ipython.org/gist/empet/07ea33b2e4e0b84193bd'> [2]</a>"

layout = go.Layout(
    title="trial1",
    font=dict(size=12),
    showlegend=False,
    autosize=False,
    margin=dict(
        l=40,
        r=40,
        b=85,
        t=100,
    ),
    hovermode='closest',
    annotations=[
        dict(
            showarrow=False,
            text=annot,
            xref='paper',
            yref='paper',
            x=0,
            y=-0.1,
            xanchor='left',
            yanchor='bottom',
            font=dict(size=14)
        )
    ],
)

data1 = [trace3, trace4]
fig1 = go.Figure(data=data1, layout=layout)
# Render with the offline iplot imported at the top of the notebook, like every
# other figure here; the original py.iplot call goes through the online
# plotly.plotly module bound to `py` a few cells earlier.
iplot(fig1, filename='Coautorship-network-nx')

In [219]:
fig1['layout']['annotations']

[]

In [222]:
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# go_offline has to be called: the original bare `cf.go_offline` only
# referenced the function and had no effect.
cf.go_offline()
init_notebook_mode(connected=True)