# Adjust Rating and Genre

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats.outliers_influence as sm_vif
from math import exp

In [2]:
# Load the movie data set from a CSV file located in the working directory.
df_movies = pd.read_csv(filepath_or_buffer="df_movies.csv")

## Adjust Genre

In [3]:
# Row count for each distinct value of `genre_adj`.
df_movies.groupby(by="genre_adj").size()

genre_adj
Action       1210
Adventure     259
Comedy       1263
Drama         665
Others       1009
dtype: int64

## Adjust Rating

In [4]:
# Row count for each distinct value of `rating_adj`.
df_movies.groupby(by="rating_adj").size()

rating_adj
G and PG                863
PG-13                  1587
R, NC-17 and others    1956
dtype: int64

# Profitability Models

### Base without Gender

In [5]:
# Dummy variable: True when a movie's `budget_adj` is strictly above the median `budget_adj`.
# (`.quantile(0.75)` is a possible alternative cut-off.)
df_movies["over_median_budget_adj_flag"] = df_movies["budget_adj"].gt(
    df_movies["budget_adj"].median()
)

# Reference profitability model that leaves gender out, kept for comparison
# with the models that include gender.
model_profit_base = sm.OLS.from_formula(
    formula="profitability_ln ~ over_median_budget_adj_flag + np.log(budget_adj) + year + score + rating_adj + genre_adj",
    data=df_movies,
)
print(model_profit_base.fit().summary())

                            OLS Regression Results                            
Dep. Variable:       profitability_ln   R-squared:                       0.241
Model:                            OLS   Adj. R-squared:                  0.239
Method:                 Least Squares   F-statistic:                     122.9
Date:                Fri, 02 Dec 2022   Prob (F-statistic):          2.74e-223
Time:                        23:16:04   Log-Likelihood:                -4968.8
No. Observations:                3891   AIC:                             9960.
Df Residuals:                    3880   BIC:                         1.003e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

### Gender as a constant

In [6]:
# Profitability model with gender entering only as a main effect
# (no interaction terms).
model_profit_gen_constant = sm.OLS.from_formula(
    formula="profitability_ln ~ gender + np.log(budget_adj) + year + score + rating_adj + genre_adj",
    data=df_movies,
)
print(model_profit_gen_constant.fit().summary())

                            OLS Regression Results                            
Dep. Variable:       profitability_ln   R-squared:                       0.234
Model:                            OLS   Adj. R-squared:                  0.232
Method:                 Least Squares   F-statistic:                     118.7
Date:                Fri, 02 Dec 2022   Prob (F-statistic):          1.96e-216
Time:                        23:16:04   Log-Likelihood:                -4984.8
No. Observations:                3891   AIC:                             9992.
Df Residuals:                    3880   BIC:                         1.006e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

### Gender with budget_adj (over-median flag)

In [7]:
# Profitability model where gender is interacted with the over-median-budget
# flag; `a*b` in the formula expands to both main effects plus `a:b`.
model_profit_gen_budget = sm.OLS.from_formula(
    formula="profitability_ln ~ gender*over_median_budget_adj_flag + np.log(budget_adj) + year + score + rating_adj + genre_adj",
    data=df_movies,
)
print(model_profit_gen_budget.fit().summary())

                            OLS Regression Results                            
Dep. Variable:       profitability_ln   R-squared:                       0.243
Model:                            OLS   Adj. R-squared:                  0.241
Method:                 Least Squares   F-statistic:                     103.9
Date:                Fri, 02 Dec 2022   Prob (F-statistic):          3.27e-224
Time:                        23:16:05   Log-Likelihood:                -4961.8
No. Observations:                3891   AIC:                             9950.
Df Residuals:                    3878   BIC:                         1.003e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

### Gender with year

In [8]:
# Profitability model where gender is interacted with year.
model_profit_gen_year = sm.OLS.from_formula(
    formula="profitability_ln ~ gender*year + np.log(budget_adj) + score + rating_adj + genre_adj",
    data=df_movies,
)
print(model_profit_gen_year.fit().summary())

                            OLS Regression Results                            
Dep. Variable:       profitability_ln   R-squared:                       0.235
Model:                            OLS   Adj. R-squared:                  0.233
Method:                 Least Squares   F-statistic:                     108.2
Date:                Fri, 02 Dec 2022   Prob (F-statistic):          6.93e-216
Time:                        23:16:05   Log-Likelihood:                -4983.6
No. Observations:                3891   AIC:                             9991.
Df Residuals:                    3879   BIC:                         1.007e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

### Gender with both budget_adj and year

In [9]:
# Add Gender * both the over-median-budget flag and year.
# The model gets its own name so that it does not overwrite
# `model_profit_gen_constant`, which holds the gender-as-a-constant model.
model_profit_gen_budget_year = sm.OLS.from_formula('profitability_ln ~ gender*over_median_budget_adj_flag + gender*year + np.log(budget_adj) + score + rating_adj + genre_adj', df_movies)
print(model_profit_gen_budget_year.fit().summary())

                            OLS Regression Results                            
Dep. Variable:       profitability_ln   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.241
Method:                 Least Squares   F-statistic:                     96.09
Date:                Fri, 02 Dec 2022   Prob (F-statistic):          1.34e-223
Time:                        23:16:05   Log-Likelihood:                -4960.9
No. Observations:                3891   AIC:                             9950.
Df Residuals:                    3877   BIC:                         1.004e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

# Revenue Models

In [10]:
# Revenue model with gender as the only regressor (no control variables).
# It is stored under its own name because `model_base_gross` is assigned to
# the no-gender base model in the "Base without Gender" section.
model_gender_only_gross = sm.OLS.from_formula('np.log(gross_adj) ~ gender', df_movies)
print(model_gender_only_gross.fit().summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(gross_adj)   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     13.30
Date:                Fri, 02 Dec 2022   Prob (F-statistic):           0.000269
Time:                        23:16:05   Log-Likelihood:                -6969.6
No. Observations:                4406   AIC:                         1.394e+04
Df Residuals:                    4404   BIC:                         1.396e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          4.2383      0.038    110.

### Base without Gender

In [11]:
# Base revenue model: the control variables only, before gender is added.
model_base_gross = sm.OLS.from_formula(
    formula="np.log(gross_adj) ~ np.log(budget_adj) + year + score + rating_adj + genre_adj",
    data=df_movies,
)
print(model_base_gross.fit().summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(gross_adj)   R-squared:                       0.441
Model:                            OLS   Adj. R-squared:                  0.440
Method:                 Least Squares   F-statistic:                     340.2
Date:                Fri, 02 Dec 2022   Prob (F-statistic):               0.00
Time:                        23:16:05   Log-Likelihood:                -4990.7
No. Observations:                3891   AIC:                         1.000e+04
Df Residuals:                    3881   BIC:                         1.006e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

### Add gender into the base model

In [12]:
# Base revenue model extended with gender as a main effect.
model_1_gross = sm.OLS.from_formula(
    formula="np.log(gross_adj) ~ gender + np.log(budget_adj) + year + score + rating_adj + genre_adj",
    data=df_movies,
)
print(model_1_gross.fit().summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(gross_adj)   R-squared:                       0.443
Model:                            OLS   Adj. R-squared:                  0.441
Method:                 Least Squares   F-statistic:                     308.2
Date:                Fri, 02 Dec 2022   Prob (F-statistic):               0.00
Time:                        23:16:05   Log-Likelihood:                -4984.8
No. Observations:                3891   AIC:                             9992.
Df Residuals:                    3880   BIC:                         1.006e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

### Add Gender *np.log(budget_adj)

In [13]:
# Add Gender * np.log(budget_adj).
# `gender*np.log(budget_adj)` already expands to both main effects plus their
# interaction, so every term is listed exactly once; repeating
# `np.log(budget_adj)` or `year` would be redundant and does not change the fit.
model_2_gross = sm.OLS.from_formula('np.log(gross_adj) ~ gender*np.log(budget_adj) + year + score + rating_adj + genre_adj', df_movies)
model_2_gross_fit = model_2_gross.fit()
print(model_2_gross_fit.summary())

# Largest log(budget_adj) among rows whose gender is "male".
# `.loc` selects rows and column in one step instead of chained indexing.
male_ln_budget_adj_max = np.log(df_movies.loc[df_movies["gender"] == "male", "budget_adj"]).max()
# Main effect of gender == "male" and its slope with respect to log(budget_adj).
male_coeff = model_2_gross_fit.params["gender[T.male]"]
male_budget_coeff = model_2_gross_fit.params["gender[T.male]:np.log(budget_adj)"]

print("\nMaximum value of budget for male movies is {:} millions".format(round(exp(male_ln_budget_adj_max), 2)))
# Male-vs-female gap in log revenue is male_coeff + male_budget_coeff * log(budget);
# exponentiating and subtracting 1 turns it into a relative difference.
print("At maximum budget, male will generate {:.2%} more revenue than female".format(exp(male_coeff + (male_ln_budget_adj_max * male_budget_coeff))-1))

# The gap is zero at log(budget) = -male_coeff / male_budget_coeff. Whether male
# revenue is higher above or below that budget depends on the sign of the slope,
# and no break-even budget exists when the slope is exactly zero.
if male_budget_coeff == 0:
    print("Male revenue differs from female by {:.2%} at every budget (no break-even budget)".format(exp(male_coeff)-1))
else:
    break_even_direction = "higher" if male_budget_coeff > 0 else "lower"
    print("Male will generate more revenue than female when the budget is {:} millions or {}".format(round(exp(-male_coeff/male_budget_coeff), 2), break_even_direction))



                            OLS Regression Results                            
Dep. Variable:      np.log(gross_adj)   R-squared:                       0.444
Model:                            OLS   Adj. R-squared:                  0.443
Method:                 Least Squares   F-statistic:                     281.8
Date:                Fri, 02 Dec 2022   Prob (F-statistic):               0.00
Time:                        23:16:06   Log-Likelihood:                -4979.8
No. Observations:                3891   AIC:                             9984.
Df Residuals:                    3879   BIC:                         1.006e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

### Add Gender * genre

In [14]:
# Revenue model where gender is interacted with the adjusted genre.
model_4_gross = sm.OLS.from_formula(
    formula="np.log(gross_adj) ~ gender + np.log(budget_adj) + year + score + rating_adj + genre_adj*gender",
    data=df_movies,
)
print(model_4_gross.fit().summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(gross_adj)   R-squared:                       0.443
Model:                            OLS   Adj. R-squared:                  0.441
Method:                 Least Squares   F-statistic:                     220.3
Date:                Fri, 02 Dec 2022   Prob (F-statistic):               0.00
Time:                        23:16:06   Log-Likelihood:                -4983.5
No. Observations:                3891   AIC:                             9997.
Df Residuals:                    3876   BIC:                         1.009e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------

### Add Gender * year

In [15]:
# Revenue model where gender is interacted with year.
model_gender_interaction = sm.OLS.from_formula(
    formula="np.log(gross_adj) ~ gender + np.log(budget_adj) + year + score + rating_adj + genre_adj + gender*year",
    data=df_movies,
)
print(model_gender_interaction.fit().summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(gross_adj)   R-squared:                       0.443
Model:                            OLS   Adj. R-squared:                  0.441
Method:                 Least Squares   F-statistic:                     280.5
Date:                Fri, 02 Dec 2022   Prob (F-statistic):               0.00
Time:                        23:16:06   Log-Likelihood:                -4983.6
No. Observations:                3891   AIC:                             9991.
Df Residuals:                    3879   BIC:                         1.007e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

### Add Gender * score

In [16]:
# Add Gender * score on top of the Gender * np.log(budget_adj) interaction
# (this formula carries both interactions, not only Gender * score).
# `gender*np.log(budget_adj)` already contributes the `np.log(budget_adj)` main
# effect, so that term is not listed a second time.
# Stored as `model_5_gross` so that `model_4_gross` (the Gender * genre model)
# is not overwritten.
model_5_gross = sm.OLS.from_formula('np.log(gross_adj) ~ gender*np.log(budget_adj) + year + score + rating_adj + genre_adj + score*gender', df_movies)
print(model_5_gross.fit().summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(gross_adj)   R-squared:                       0.447
Model:                            OLS   Adj. R-squared:                  0.446
Method:                 Least Squares   F-statistic:                     261.7
Date:                Fri, 02 Dec 2022   Prob (F-statistic):               0.00
Time:                        23:16:06   Log-Likelihood:                -4968.2
No. Observations:                3891   AIC:                             9962.
Df Residuals:                    3878   BIC:                         1.004e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4e650921-8e2a-4c48-bd86-ee489799e499' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>