In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import itertools
import time
movies = pd.read_csv('movie_industry_clean.csv')

In [2]:
# data wrangling
def wrangling(df):
    df = df.drop('Unnamed: 0', axis=1)
    df = df.drop('index', axis=1)
    df.replace(' ', '_', regex=True, inplace = True)
    df.replace('-', '_', regex=True, inplace = True)
    return df

movies = wrangling(movies)

In [3]:
# grouping countries into continents
def continent(s):
    if s['country'] in ['United_States','Mexico', 'Canada','Jamaica']:
        return 'North_America'
    if s['country'] in ['United_Kingdom', 'France', 'Germany', 'Belgium','Spain', 'Norway', 'Iceland', 'West_Germany', 'Ireland', 'Italy', 'Finland', 'Czech_Republic', 'Switzerland', 'Sweden', 'Denmark', 'Netherlands','Federal_Republic_of_Yugoslavia','Yugoslavia', 'Hungary', 'Republic_of_Macedonia', 'Austria','Portugal','Malta']:
        return 'Europe'
    if s['country'] in ['Japan','China','South_Korea', 'Hong_Kong','India','United_Arab_Emirates','Taiwan','Lebanon','Indonesia', 'Iran','Israel','Russia','Thailand']:
        return 'Asia'
    if s['country'] in ['Australia','New_Zealand']:
        return 'Oceania'
    if s['country'] in ['South_Africa', 'Kenya']:
        return 'Africa'
    if s['country'] in ['Brazil', 'Argentina', 'Aruba', 'Chile', 'Panama','Colombia']:
        return 'South_America'
    else:
        return 'NA'

movies['continent'] = movies.apply(continent, axis=1)

In [4]:
# split into training & test
np.random.seed(2)
train = movies.sample(round(movies.shape[0]*0.8)) # 80%
test = movies.drop(train.index)

## MODEL WITHOUT INTERACTIONS

In [5]:
# sqrt of the mode
ols_object = smf.ols(formula = 'np.sqrt(gross)~budget*genre+score*votes+rating+I(budget**2)+I(votes**2)+I(score*votes**2)', data = train)
model_sqrt = ols_object.fit()
model_sqrt.summary()

0,1,2,3
Dep. Variable:,np.sqrt(gross),R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.74
Method:,Least Squares,F-statistic:,649.8
Date:,"Thu, 02 Mar 2023",Prob (F-statistic):,0.0
Time:,00:21:38,Log-Likelihood:,-41271.0
No. Observations:,4337,AIC:,82580.0
Df Residuals:,4317,BIC:,82710.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.248e-06,4.45e-08,28.058,0.000,1.16e-06,1.34e-06
genre[T.Adventure],1.318e-06,4.72e-08,27.931,0.000,1.23e-06,1.41e-06
genre[T.Animation],-5.606e-08,2e-09,-28.014,0.000,-6e-08,-5.21e-08
genre[T.Biography],3.744e-08,1.33e-09,28.107,0.000,3.48e-08,4.01e-08
genre[T.Comedy],3.465e-07,1.24e-08,28.057,0.000,3.22e-07,3.71e-07
genre[T.Crime],6.9e-08,2.46e-09,28.069,0.000,6.42e-08,7.38e-08
genre[T.Drama],2.447e-07,8.72e-09,28.075,0.000,2.28e-07,2.62e-07
genre[T.Family],8.949e-10,3.17e-11,28.200,0.000,8.33e-10,9.57e-10
genre[T.Fantasy],3.06e-08,1.09e-09,28.027,0.000,2.85e-08,3.27e-08

0,1,2,3
Omnibus:,430.233,Durbin-Watson:,1.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2352.843
Skew:,0.311,Prob(JB):,0.0
Kurtosis:,6.554,Cond. No.,6.96e+16


In [6]:
# removing outliers/influential points

#Computing the leverage statistic for each observation
influence = model_sqrt.get_influence()
leverage = influence.hat_matrix_diag

out = model_sqrt.outlier_test()

#Average leverage of points
average_leverage = (model_sqrt.df_model+1)/model_sqrt.nobs
average_leverage

#We will remove all observations that have leverage higher than the threshold value.
high_leverage_threshold = 4*average_leverage

#Number of high leverage points in the dataset
np.sum(leverage>high_leverage_threshold)

#Dropping influential points from data
train_filtered = train.drop(np.intersect1d(np.where(np.abs(out.student_resid)>3)[0],
                                           (np.where(leverage>high_leverage_threshold)[0])))

train_filtered.shape

#Number of points removed as they were influential
train.shape[0]-train_filtered.shape[0]

KeyboardInterrupt: 

In [8]:
model_sqrt.outlier_test()

  return self.resid / sigma / np.sqrt(1 - hii)


Unnamed: 0,student_resid,unadj_p,bonf(p)
3496,-0.803135,0.421941,1.0
738,0.308789,0.757497,1.0
2826,0.088204,0.929719,1.0
4048,0.370916,0.710718,1.0
727,0.974128,0.330048,1.0
...,...,...,...
4916,-1.308813,0.190667,1.0
4396,0.338529,0.734981,1.0
1528,0.387610,0.698324,1.0
4155,0.174081,0.861810,1.0


In [9]:
# the start of a model by hand - NO INTERACTIONS/TRANSFORMATIONS
ols_object = smf.ols(formula = 'gross~budget+genre+rating+votes+continent+runtime+score+year', data = train)
model = ols_object.fit()
model.summary()

0,1,2,3
Dep. Variable:,gross,R-squared:,0.676
Model:,OLS,Adj. R-squared:,0.674
Method:,Least Squares,F-statistic:,272.6
Date:,"Thu, 02 Mar 2023",Prob (F-statistic):,0.0
Time:,00:29:09,Log-Likelihood:,-86289.0
No. Observations:,4337,AIC:,172600.0
Df Residuals:,4303,BIC:,172900.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5.257e+08,3.63e+08,-1.448,0.148,-1.24e+09,1.86e+08
genre[T.Adventure],6.217e+06,7.74e+06,0.803,0.422,-8.97e+06,2.14e+07
genre[T.Animation],6.414e+07,1.01e+07,6.336,0.000,4.43e+07,8.4e+07
genre[T.Biography],-1.005e+07,8.08e+06,-1.243,0.214,-2.59e+07,5.79e+06
genre[T.Comedy],1.418e+07,4.74e+06,2.992,0.003,4.89e+06,2.35e+07
genre[T.Crime],-2.856e+06,7.08e+06,-0.404,0.687,-1.67e+07,1.1e+07
genre[T.Drama],1.931e+06,5.63e+06,0.343,0.731,-9.1e+06,1.3e+07
genre[T.Family],4.694e+08,6.16e+07,7.621,0.000,3.49e+08,5.9e+08
genre[T.Fantasy],2.11e+07,2.01e+07,1.050,0.294,-1.83e+07,6.05e+07

0,1,2,3
Omnibus:,3404.351,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,250386.962
Skew:,3.176,Prob(JB):,0.0
Kurtosis:,39.677,Cond. No.,13300000000.0


In [12]:
# removing outliers/influential points

#Computing the leverage statistic for each observation
influence = model.get_influence()
leverage = influence.hat_matrix_diag

out = model.outlier_test()

#Average leverage of points
average_leverage = (model.df_model+1)/model.nobs
average_leverage

#We will remove all observations that have leverage higher than the threshold value.
high_leverage_threshold = 4*average_leverage

#Number of high leverage points in the dataset
print(np.sum(leverage>high_leverage_threshold))

#Dropping influential points from data
np.intersect1d(np.where(np.abs(out.student_resid)>3)[0], (np.where(leverage>high_leverage_threshold)[0]))
#train_filtered = train.drop(np.intersect1d(np.where(np.abs(out.student_resid)>3)[0], (np.where(leverage>high_leverage_threshold)[0])))

train_filtered.shape

#Number of points removed as they were influential
print(train.shape[0]-train_filtered.shape[0])

124


  return self.resid / sigma / np.sqrt(1 - hii)


NameError: name 'train_filtered' is not defined

In [15]:
np.where(leverage>high_leverage_threshold)

(array([   9,   71,  100,  173,  179,  227,  256,  295,  348,  469,  487,
         509,  519,  592,  603,  622,  625,  650,  692,  702,  748,  752,
         886,  892,  949,  950, 1011, 1043, 1049, 1073, 1083, 1103, 1139,
        1179, 1229, 1246, 1296, 1337, 1346, 1401, 1432, 1439, 1441, 1473,
        1541, 1543, 1568, 1586, 1598, 1685, 1696, 1805, 1846, 1856, 1863,
        1902, 1930, 1994, 2000, 2005, 2076, 2126, 2130, 2208, 2215, 2273,
        2281, 2288, 2333, 2353, 2369, 2425, 2426, 2478, 2519, 2524, 2640,
        2642, 2716, 2753, 2800, 2848, 2929, 2952, 2980, 3024, 3032, 3040,
        3043, 3082, 3136, 3157, 3246, 3313, 3351, 3485, 3529, 3540, 3599,
        3620, 3624, 3630, 3726, 3731, 3784, 3797, 3805, 3828, 3844, 3850,
        3910, 3967, 3970, 3971, 3994, 4025, 4049, 4106, 4146, 4223, 4241,
        4295, 4323, 4324]),)

In [14]:
print(np.where(np.abs(out.student_resid)>3))

(array([  35,   67,  172,  323,  348,  433,  619,  650,  663,  721,  753,
        758,  764,  767,  783,  807,  887,  894,  899, 1036, 1152, 1253,
       1307, 1395, 1401, 1456, 1527, 1563, 1568, 1591, 1611, 1652, 1722,
       1824, 1935, 1980, 2007, 2056, 2131, 2177, 2294, 2407, 2420, 2432,
       2459, 2476, 2619, 2629, 2716, 2817, 2829, 2845, 2906, 2931, 3050,
       3082, 3273, 3277, 3321, 3354, 3386, 3514, 3646, 3679, 3748, 3768,
       3850, 4056, 4128, 4132, 4149, 4221, 4236, 4325]),)


In [16]:
influential_pts = []
for high_lev_index in [9,   71,  100,  173,  179,  227,  256,  295,  348,  469,  487,
         509,  519,  592,  603,  622,  625,  650,  692,  702,  748,  752,
         886,  892,  949,  950, 1011, 1043, 1049, 1073, 1083, 1103, 1139,
        1179, 1229, 1246, 1296, 1337, 1346, 1401, 1432, 1439, 1441, 1473,
        1541, 1543, 1568, 1586, 1598, 1685, 1696, 1805, 1846, 1856, 1863,
        1902, 1930, 1994, 2000, 2005, 2076, 2126, 2130, 2208, 2215, 2273,
        2281, 2288, 2333, 2353, 2369, 2425, 2426, 2478, 2519, 2524, 2640,
        2642, 2716, 2753, 2800, 2848, 2929, 2952, 2980, 3024, 3032, 3040,
        3043, 3082, 3136, 3157, 3246, 3313, 3351, 3485, 3529, 3540, 3599,
        3620, 3624, 3630, 3726, 3731, 3784, 3797, 3805, 3828, 3844, 3850,
        3910, 3967, 3970, 3971, 3994, 4025, 4049, 4106, 4146, 4223, 4241,
        4295, 4323, 4324]:
    if high_lev_index in [35,   67,  172,  323,  348,  433,  619,  650,  663,  721,  753,
        758,  764,  767,  783,  807,  887,  894,  899, 1036, 1152, 1253,
       1307, 1395, 1401, 1456, 1527, 1563, 1568, 1591, 1611, 1652, 1722,
       1824, 1935, 1980, 2007, 2056, 2131, 2177, 2294, 2407, 2420, 2432,
       2459, 2476, 2619, 2629, 2716, 2817, 2829, 2845, 2906, 2931, 3050,
       3082, 3273, 3277, 3321, 3354, 3386, 3514, 3646, 3679, 3748, 3768,
       3850, 4056, 4128, 4132, 4149, 4221, 4236, 4325]:
        influential_pts.append(high_lev_index)

In [17]:
influential_pts

[348, 650, 1401, 1568, 2716, 3082, 3850]