In [6]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import itertools
import time
movies = pd.read_csv('movie_industry_clean.csv')

In [7]:
# data wrangling
def wrangling(df):
    df = df.drop('Unnamed: 0', axis=1)
    df = df.drop('index', axis=1)
    df.replace(' ', '_', regex=True, inplace = True)
    df.replace('-', '_', regex=True, inplace = True)
    return df

movies = wrangling(movies)
movies

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The_Shining,R,Drama,1980,"June_13,_1980_(United_States)",8.4,927000.0,Stanley_Kubrick,Stephen_King,Jack_Nicholson,United_Kingdom,19000000.0,46998772.0,Warner_Bros.,146.0
1,The_Blue_Lagoon,R,Adventure,1980,"July_2,_1980_(United_States)",5.8,65000.0,Randal_Kleiser,Henry_De_Vere_Stacpoole,Brooke_Shields,United_States,4500000.0,58853106.0,Columbia_Pictures,104.0
2,Star_Wars:_Episode_V___The_Empire_Strikes_Back,PG,Action,1980,"June_20,_1980_(United_States)",8.7,1200000.0,Irvin_Kershner,Leigh_Brackett,Mark_Hamill,United_States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July_2,_1980_(United_States)",7.7,221000.0,Jim_Abrahams,Jim_Abrahams,Robert_Hays,United_States,3500000.0,83453539.0,Paramount_Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July_25,_1980_(United_States)",7.3,108000.0,Harold_Ramis,Brian_Doyle_Murray,Chevy_Chase,United_States,6000000.0,39846344.0,Orion_Pictures,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5416,Bad_Boys_for_Life,R,Action,2020,"January_17,_2020_(United_States)",6.6,140000.0,Adil_El_Arbi,Peter_Craig,Will_Smith,United_States,90000000.0,426505244.0,Columbia_Pictures,124.0
5417,Sonic_the_Hedgehog,PG,Action,2020,"February_14,_2020_(United_States)",6.5,102000.0,Jeff_Fowler,Pat_Casey,Ben_Schwartz,United_States,85000000.0,319715683.0,Paramount_Pictures,99.0
5418,Dolittle,PG,Adventure,2020,"January_17,_2020_(United_States)",5.6,53000.0,Stephen_Gaghan,Stephen_Gaghan,Robert_Downey_Jr.,United_States,175000000.0,245487753.0,Universal_Pictures,101.0
5419,The_Call_of_the_Wild,PG,Adventure,2020,"February_21,_2020_(United_States)",6.8,42000.0,Chris_Sanders,Michael_Green,Harrison_Ford,Canada,135000000.0,111105497.0,20th_Century_Studios,100.0


In [8]:
# grouping countries into continents
def continent(s):
    if s['country'] in ['United_States','Mexico', 'Canada','Jamaica']:
        return 'North_America'
    if s['country'] in ['United_Kingdom', 'France', 'Germany', 'Belgium','Spain', 'Norway', 'Iceland', 'West_Germany', 'Ireland', 'Italy', 'Finland', 'Czech_Republic', 'Switzerland', 'Sweden', 'Denmark', 'Netherlands','Federal_Republic_of_Yugoslavia','Yugoslavia', 'Hungary', 'Republic_of_Macedonia', 'Austria','Portugal','Malta']:
        return 'Europe'
    if s['country'] in ['Japan','China','South_Korea', 'Hong_Kong','India','United_Arab_Emirates','Taiwan','Lebanon','Indonesia', 'Iran','Israel','Russia','Thailand']:
        return 'Asia'
    if s['country'] in ['Australia','New_Zealand']:
        return 'Oceania'
    if s['country'] in ['South_Africa', 'Kenya']:
        return 'Africa'
    if s['country'] in ['Brazil', 'Argentina', 'Aruba', 'Chile', 'Panama','Colombia']:
        return 'South_America'
    else:
        return 'NA'

movies['continent'] = movies.apply(continent, axis=1)

In [9]:
# split into training & test
np.random.seed(2)
train = movies.sample(round(movies.shape[0]*0.8)) # 80%
test = movies.drop(train.index)

## MODEL WITHOUT INTERACTIONS

In [10]:
# the start of a model by hand - NO INTERACTIONS/TRANSFORMATIONS
ols_object = smf.ols(formula = 'gross~budget+genre+rating+votes+continent+runtime+score+year', data = train)
model = ols_object.fit()
model.summary()

0,1,2,3
Dep. Variable:,gross,R-squared:,0.676
Model:,OLS,Adj. R-squared:,0.674
Method:,Least Squares,F-statistic:,272.6
Date:,"Wed, 01 Mar 2023",Prob (F-statistic):,0.0
Time:,20:17:30,Log-Likelihood:,-86289.0
No. Observations:,4337,AIC:,172600.0
Df Residuals:,4303,BIC:,172900.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5.257e+08,3.63e+08,-1.448,0.148,-1.24e+09,1.86e+08
genre[T.Adventure],6.217e+06,7.74e+06,0.803,0.422,-8.97e+06,2.14e+07
genre[T.Animation],6.414e+07,1.01e+07,6.336,0.000,4.43e+07,8.4e+07
genre[T.Biography],-1.005e+07,8.08e+06,-1.243,0.214,-2.59e+07,5.79e+06
genre[T.Comedy],1.418e+07,4.74e+06,2.992,0.003,4.89e+06,2.35e+07
genre[T.Crime],-2.856e+06,7.08e+06,-0.404,0.687,-1.67e+07,1.1e+07
genre[T.Drama],1.931e+06,5.63e+06,0.343,0.731,-9.1e+06,1.3e+07
genre[T.Family],4.694e+08,6.16e+07,7.621,0.000,3.49e+08,5.9e+08
genre[T.Fantasy],2.11e+07,2.01e+07,1.050,0.294,-1.83e+07,6.05e+07

0,1,2,3
Omnibus:,3404.351,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,250386.961
Skew:,3.176,Prob(JB):,0.0
Kurtosis:,39.677,Cond. No.,13300000000.0


In [11]:
# removing outliers/influential points

#Computing the leverage statistic for each observation
influence = model.get_influence()
leverage = influence.hat_matrix_diag

out = model.outlier_test()

#Average leverage of points
average_leverage = (model.df_model+1)/model.nobs
average_leverage

#We will remove all observations that have leverage higher than the threshold value.
high_leverage_threshold = 4*average_leverage

#Number of high leverage points in the dataset
np.sum(leverage>high_leverage_threshold)

#Dropping influential points from data
train_filtered = train.drop(np.intersect1d(np.where(np.abs(out.student_resid)>3)[0],
                                           (np.where(leverage>high_leverage_threshold)[0])))

train_filtered.shape

#Number of points removed as they were influential
train.shape[0]-train_filtered.shape[0]

  r = _umath_linalg.det(a, signature=signature)
  return self.resid / sigma / np.sqrt(1 - hii)


KeyError: '[348, 1401] not found in axis'