## Peace. Love. Bayes. 

Using Bayes to calculate the probability of success given the probability of success (pos ROI) per genre, season, and director.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [3]:
#data import
df = pd.read_csv('final.csv')

In [7]:
#create season column

# catogorize month into season

def season(month):
    if 3.0 <= month <= 5.0:
        return "Spring"
    elif 6.0 <= month <= 8.0:
        return "Summer"
    elif 9.0 <= month <= 11.0:
        return "Autumn"
    else:
        return "Winter"
    
#apply and make new column
df['season'] = df['month'].apply(season)

In [16]:
# create subset with only movies with runtime data
runtime = df[['title','runtime', 'roi']]
runtime = runtime.dropna()

In [17]:
#create runtime column
# catogorize runtime into length

def runtime_cat(runtime):
    if 0.0 <= runtime <= 40.0:
        return "Films less than 40 Minutes"
    elif 41.0 <= runtime <= 150.0:
        return "Films between 40 minutes and 150 minutes"
    else:
        return "Films longer than 150 minutes"
    
#apply and make new column
runtime['runtime_length'] = runtime['runtime'].apply(runtime_cat)

## P(Positive ROI) & P(Negative ROI)

In [8]:
# calc P(Positive ROI) & P(Negative ROI)

# P(pos)
p_pos_roi = (df['roi'] > 0).mean()

# P(neg)
p_neg_roi = (df['roi'] < 0).mean()

print("P(Positive ROI):", p_pos_roi)
print("P(Negative ROI):", p_neg_roi)

P(Positive ROI): 0.6859925788497218
P(Negative ROI): 0.3140074211502783


## P(Pos|Season) & P(Neg|Season)

In [13]:
# prob of each season
total_movies = len(df)
season_counts = df['season'].value_counts()
season_probabilities = season_counts / total_movies

print("Probability of a movie being debuted in each season:")
print(season_probabilities)

Probability of a movie being debuted in each season:
Autumn    0.277829
Summer    0.243506
Spring    0.241651
Winter    0.237013
Name: season, dtype: float64


In [15]:
# 
# conditional prob for season and positive ROI
positive_roi_probabilities = {}
for season in df['season'].unique():
    szn_dat = df[df['season'] == season]
    pos_roi = (szn_dat['roi'] > 0).sum()
    tot_movies = len(szn_dat)
    positive_roi_probabilities[season] = pos_roi / tot_movies

# conditional prob for season and negative ROI
negative_roi_probabilities = {}
for season in df['season'].unique():
    szn_dat = df[df['season'] == season]
    neg_roi = (szn_dat['roi'] < 0).sum()
    total_movies = len(szn_dat)
    negative_roi_probabilities[season] = neg_roi / total_movies

print("Probab of a positive ROI given each season:")
print(positive_roi_probabilities)

print("\nProb of a negative ROI given each season:")
print(negative_roi_probabilities)

Probab of a positive ROI given each season:
{'Winter': 0.6927592954990215, 'Summer': 0.7238095238095238, 'Spring': 0.6660268714011516, 'Autumn': 0.664440734557596}

Prob of a negative ROI given each season:
{'Winter': 0.30724070450097846, 'Summer': 0.2761904761904762, 'Spring': 0.33397312859884837, 'Autumn': 0.335559265442404}


## P(Pos|Runtime)

In [18]:
# count the number of movies in each category
pos_short = len(runtime[(runtime['runtime_length'] == 'Films less than 40 Minutes') & (runtime['roi'] > 0)])
short = len(runtime[runtime['runtime_length'] == 'Films less than 40 Minutes'])
                    
pos_avg = len(runtime[(runtime['runtime_length'] == 'Films between 40 minutes and 150 minutes') & (runtime['roi'] > 0)])
average = len(runtime[runtime['runtime_length'] == 'Films between 40 minutes and 150 minutes'])

pos_long = len(runtime[(runtime['runtime_length'] == 'Films longer than 150 minutes') & (runtime['roi'] > 0)])
long = len(runtime[runtime['runtime_length'] == 'Films longer than 150 minutes'])


# Total number of movies
total_movies = len(runtime)

In [19]:
#P(Pos|Short)
p_given_short = pos_short / short

#P(pos|average)
p_given_avg = pos_avg / average

#P(pos|average)
p_given_long = pos_long / long

print("P(p_positive_roi | p_short):", p_given_short)
print("P(p_positive_roi | p_avg):", p_given_avg)
print("P(p_positive_roi | p_long):", p_given_long)

P(p_positive_roi | p_short): 0.6363636363636364
P(p_positive_roi | p_avg): 0.6807297605473204
P(p_positive_roi | p_long): 0.75


## P(Pos|Genre)

## P(Pos|Director)

### Known Probabilities

P(Positive ROI) = 0.6199664429530202
P(Negative ROI) = 0.3800335570469799

Genre
P(Pos|Animation) = 0.83

Season
P(Pos|Summer) = 0.67

Runtime
P(Pos|Long) = .64

In [4]:
## Missing Director

# probabilities
p_positive_roi = 0.6199664429530202
p_negative_roi = 0.3800335570469799

#highest probabilities
pos_animation = 0.83
pos_summer = 0.67
pos_long = 0.64

# Calculate P(Pos | Genre & Runtime & Season)
trifecta = pos_animation * pos_long * pos_summer * p_positive_roi

print("P(Pos | Genre & Runtime & Season):", trifecta)


P(Pos | Genre & Runtime & Season): 0.22064853691275169
