In [1]:
import numpy as np, pandas as pd
import warnings
# Suppresses every Python warning for the rest of the kernel session (no
# category or module filter is given).
# NOTE(review): this also hides library deprecation/validation warnings;
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')
from bokeh.plotting import figure, output_file, show
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LinearColorMapper,
    BasicTicker,
    PrintfTickFormatter,
    ColorBar,
    FactorRange
)
import bokeh.palettes
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook

In [2]:
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [3]:
# Load the cleaned dataset; the path is relative to the notebook's working
# directory, so the kernel must be started from this notebook's folder.
data = pd.read_csv("../data/cleaned_data.csv")

In [4]:
# Profit is revenue minus budget, computed row-wise and stored as a new column.
data["profit"] = data["revenue"].sub(data["budget"])

In [5]:
# Show the first three rows with columns laid out vertically for readability.
data.head(3).T

Unnamed: 0,0,1,2
budget,237000000,300000000,245000000
genres,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['Adventure', 'Fantasy', 'Action']","['Action', 'Adventure', 'Crime']"
keywords,"['culture clash', 'future', 'space war', 'spac...","['ocean', 'drug abuse', 'exotic island', 'east...","['spy', 'based on novel', 'secret agent', 'seq..."
original_language,en,en,en
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha...",A cryptic message from Bond’s past sends him o...
popularity,150.438,139.083,107.377
production_companies,"['Ingenious Film Partners', 'Twentieth Century...","['Walt Disney Pictures', 'Jerry Bruckheimer Fi...","['Columbia Pictures', 'Danjaq', 'B24']"
production_countries,"['United States of America', 'United Kingdom']",['United States of America'],"['United Kingdom', 'United States of America']"
release_date,2009-12-10,2007-05-19,2015-10-26
revenue,2787965087,961000000,880674609


### Temporal Pattern in Profit

In [6]:
# Mean profit for every (year, month) pair, restricted to years after 1990.
profit_by_month_year = (
    data.loc[data["year"] > 1990]
    .groupby(["year", "month"])["profit"]
    .mean()
    .reset_index()
)

# Number of rows for every (year, month) pair over the same subset.
count_by_month_year = (
    data.loc[data["year"] > 1990]
    .groupby(["year", "month"])
    .size()
    .reset_index()
)

In [7]:
# Peek at the first three aggregated rows.
profit_by_month_year.head(3)

Unnamed: 0,year,month,profit
0,1991,1,-7103420.0
1,1991,2,253742922.0
2,1991,3,24708446.5


In [9]:
# Heatmap of mean profit: one rectangle per (year, month) row of
# profit_by_month_year, coloured by the "profit" column.
newdata = ColumnDataSource(profit_by_month_year)

# RdBu is a diverging palette, so the colour range is made symmetric around
# zero. With a plain [min, max] range the palette's middle colour would sit at
# an arbitrary profit value and the sign of a cell could not be read from its
# colour; with [-bound, +bound] the middle colour marks break-even.
profit_bound = profit_by_month_year["profit"].abs().max()

mapper = LinearColorMapper(palette=bokeh.palettes.RdBu[9],
                           low=-profit_bound, high=profit_bound)

# "hover" is intentionally left out of the tool string: a configured HoverTool
# is added below so the tooltip shows the plotted values instead of the
# default index/coordinate readout.
TOOLS = "save,pan,box_zoom,reset,wheel_zoom"

p = figure(title = "Profit Heatmap by Year and Month",
           x_axis_label = 'Year', 
           y_axis_label = 'Month',
           tools=TOOLS,
           plot_width = 900)

p.rect(x="year", y="month",width=1, height=1,source = newdata,
       fill_color={'field': 'profit', 'transform': mapper})

# Tooltip fields refer to the columns of the ColumnDataSource above.
p.add_tools(HoverTool(tooltips=[("year", "@year"),
                                ("month", "@month"),
                                ("mean profit", "@profit{0,0}")]))

color_bar = ColorBar(color_mapper=mapper, location=(18, 0),
                     ticker=BasicTicker(desired_num_ticks=9))

p.add_layout(color_bar, 'right')
show(p)

In [10]:
# Mean profit per "dow" value, again limited to years after 1990.
profit_by_dow = (
    data.loc[data["year"] > 1990]
    .groupby("dow")["profit"]
    .mean()
    .reset_index()
)

In [12]:
# Bar chart: one bar per "dow" value, bar height = mean profit.
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(
    title="Profit vs the Day of Week",
    x_axis_label="DOW",
    y_axis_label="Profit",
    tools=TOOLS,
)

dow_values = profit_by_dow["dow"]
mean_profit = profit_by_dow["profit"]

# White outlines visually separate neighbouring bars of width 1.
p.vbar(x=dow_values, top=mean_profit, width=1, line_color="white")

show(p)

### Genre Exploration

In [13]:
def _explode_genres(frame, value_col):
    """Return one row per (genre label, value) pair.

    Each row's ``genres`` string (a stringified list such as
    ``"['Action', 'Adventure']"``) is stripped of brackets and quotes and
    split on commas; the row's ``value_col`` value is repeated for every
    resulting label. Labels are not whitespace-trimmed here. The result has
    the labels in the first column and the values in the second, as produced
    by ``reset_index()``.
    """
    pieces = []
    for _, row in frame[[value_col, "genres"]].iterrows():
        labels = row["genres"].replace('[','').replace(']','').replace("'","").split(',')
        pieces.append(pd.Series(row[value_col], labels))
    return pd.concat(pieces).reset_index()


# Same post-1990 subset for both frames, so their rows line up one-to-one.
df1 = _explode_genres(data[data["year"]>1990], "year")
df2 = _explode_genres(data[data["year"]>1990], "profit")

In [14]:
# Give the exploded frames meaningful column names: the genre label first,
# then the value that was repeated for each label.
for frame, value_name in ((df1, "year"), (df2, "profit")):
    frame.columns = ["genres", value_name]

In [15]:
# Both frames must have the same shape for the positional merge that follows.
tuple(frame.shape for frame in (df1, df2))

((10723, 2), (10723, 2))

In [16]:
# Copy profit across (both frames share the same default index), then trim the
# whitespace left on genre labels by the comma split.
df1["profit"] = df2["profit"]
df1["genres"] = df1["genres"].str.strip()

# df2 is no longer needed once its profit column lives in df1.
del df2

In [17]:
# First three rows of the exploded (genre, year, profit) frame.
df1.iloc[:3]

Unnamed: 0,genres,year,profit
0,Action,2009,2550965087
1,Adventure,2009,2550965087
2,Fantasy,2009,2550965087


In [18]:
# Ten most frequent genre labels, by number of rows.
df1.groupby("genres").size().sort_values(ascending=False).head(10)

genres
Drama              2040
Comedy             1571
Thriller           1146
Action             1003
Romance             793
Adventure           657
Crime               632
Family              467
Science Fiction     439
Horror              432
dtype: int64

In [19]:
genres = ["Drama","Comedy","Thriller","Action","Romance","Adventure","Crime","Science", "Fiction","Horror","Family"]

In [20]:
grouped_data = df1[df1["genres"].isin(genres)].groupby(["genres","year"]).profit.mean().reset_index()

In [21]:
grouped_data.head(3)

Unnamed: 0,genres,year,profit
0,Action,1991,42690320.0
1,Action,1992,127958800.0
2,Action,1993,33954080.0


In [22]:
dict_group_data = {"xs":[grouped_data.loc[(grouped_data["genres"]==name),"year"].values \
                         for name in grouped_data['genres'].unique()],
                  "ys": [grouped_data.loc[(grouped_data["genres"]==name),"profit"].values\
                         for name in grouped_data['genres'].unique()],
                  "labels": grouped_data["genres"].unique(),
                  "color": bokeh.palettes.Category10[9]}

source = ColumnDataSource(dict_group_data)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(x_axis_label = 'year', 
           y_axis_label = 'profit', 
           width = 1000,
           title = "Profit for each Genre across years",
           tools=TOOLS) 

p.multi_line(xs= "xs", ys= "ys", line_color= "color", source = source, legend = "labels",line_width=3)
p.legend.location = "top_left"

show(p)