In [2]:
import numpy as np, pandas as pd
import warnings
warnings.filterwarnings('ignore')
from bokeh.plotting import figure, output_file, show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
    FactorRange
)
import bokeh.palettes
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook

In [3]:
output_notebook()

In [7]:
data = pd.read_csv("../data/cleaned_data.csv")

In [9]:
data["profit"] = data["revenue"] - data["budget"]

In [40]:
data.head(3).transpose()

Unnamed: 0,0,1,2
budget,237000000,300000000,245000000
genres,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['Adventure', 'Fantasy', 'Action']","['Action', 'Adventure', 'Crime']"
keywords,"['culture clash', 'future', 'space war', 'spac...","['ocean', 'drug abuse', 'exotic island', 'east...","['spy', 'based on novel', 'secret agent', 'seq..."
original_language,en,en,en
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha...",A cryptic message from Bond’s past sends him o...
popularity,150.438,139.083,107.377
production_companies,"['Ingenious Film Partners', 'Twentieth Century...","['Walt Disney Pictures', 'Jerry Bruckheimer Fi...","['Columbia Pictures', 'Danjaq', 'B24']"
production_countries,"['United States of America', 'United Kingdom']",['United States of America'],"['United Kingdom', 'United States of America']"
release_date,2009-12-10,2007-05-19,2015-10-26
revenue,2787965087,961000000,880674609


### Temporal Pattern in Profit

In [56]:
profit_by_month_year = data[data["year"]>1990].groupby(["year","month"]).profit.mean().reset_index()
count_by_month_year = data[data["year"]>1990].groupby(["year","month"]).size().reset_index()

In [57]:
profit_by_month_year[:3]

Unnamed: 0,year,month,profit
0,1991,1,-7103420.0
1,1991,2,253742922.0
2,1991,3,24708446.5


In [58]:
newdata = ColumnDataSource(profit_by_month_year)

mapper = LinearColorMapper(palette=bokeh.palettes.RdBu[9], 
                           low=profit_by_month_year["profit"].min(), high=profit_by_month_year["profit"].max())

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title = "Profit",
           x_axis_label = 'Year', 
           y_axis_label = 'Month',
           tools=TOOLS,
           plot_width = 900)

p.rect(x="year", y="month",width=1, height=1,source = newdata,
       fill_color={'field': 'profit', 'transform': mapper})

color_bar = ColorBar(color_mapper=mapper, location=(18, 0),
                     ticker=BasicTicker(desired_num_ticks=9))

p.add_layout(color_bar, 'right')
show(p)

W-1005 (SNAPPED_TOOLBAR_ANNOTATIONS): Snapped toolbars and annotations on the same side MAY overlap visually: Figure(id='9881e639-5b15-412a-8468-84bc0a0d3dee', ...)


In [60]:
profit_by_dow= data[data["year"]>1990].groupby(["dow"]).profit.mean().reset_index()

In [62]:
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title = "Profit vs Day_of_Week", 
           x_axis_label = 'DOW', 
           y_axis_label = 'Profit',
           tools=TOOLS)

p.vbar(x=profit_by_dow["dow"],
       top=profit_by_dow["profit"], 
       width = 1,
       line_color="white" )

show(p)

### Genre Exploration

In [127]:
df1 = pd.concat([pd.Series(row['year'], row['genres'].replace('[','').replace(']','').replace("'","").split(','))              
                    for _, row in data[["year","genres"]].iterrows()]).reset_index()
df2 = pd.concat([pd.Series(row['profit'], row['genres'].replace('[','').replace(']','').replace("'","").split(','))              
                    for _, row in data[["profit","genres"]].iterrows()]).reset_index()

In [128]:
df1.columns = ["genres","year"]
df2.columns = ["genres","profit"]

In [129]:
df1.shape, df2.shape

((12165, 2), (12165, 2))

In [130]:
df1["profit"] = df2["profit"]
df1["genres"] = df1["genres"].apply(lambda x: x.strip())
del df2

In [131]:
df1.head(3)

Unnamed: 0,genres,year,profit
0,Action,2009,2550965087
1,Adventure,2009,2550965087
2,Fantasy,2009,2550965087


In [138]:
df1.groupby(["genres"]).size().sort_values(ascending=False)[:10]

genres
Drama              2290
Comedy             1717
Thriller           1272
Action             1153
Romance             893
Adventure           790
Crime               696
Science Fiction     534
Horror              519
Family              511
dtype: int64

In [145]:
genres = ["Drama","Comedy","Thriller","Action","Romance","Adventure","Crime","Science", "Fiction","Horror","Family"]

In [149]:
grouped_data = df1[df1["genres"].isin(genres)].groupby(["genres","year"]).profit.mean().reset_index()

In [154]:
grouped_data.head(3)

Unnamed: 0,genres,year,profit
0,Action,1930,4050000.0
1,Action,1936,1536000.0
2,Action,1947,0.0


In [160]:
dict_group_data = {"xs":[grouped_data.loc[(grouped_data["genres"]==name),"year"].values \
                         for name in grouped_data['genres'].unique()],
                  "ys": [grouped_data.loc[(grouped_data["genres"]==name),"profit"].values\
                         for name in grouped_data['genres'].unique()],
                  "labels": grouped_data["genres"].unique(),
                  "color": bokeh.palettes.Category10[9]}

source = ColumnDataSource(dict_group_data)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(x_axis_label = 'year', 
           y_axis_label = 'profit', 
           width = 1000,
           title = "Profit for each Genre across years",
           tools=TOOLS) 

p.multi_line(xs= "xs", ys= "ys", line_color= "color", source = source, legend = "labels",line_width=3)
p.legend.location = "top_left"

show(p)