In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
TMDBAPI = pd.read_csv("Data/tmdb_results_combined.csv.gz", low_memory = False)
TMDBAPI = TMDBAPI[['imdb_id', 'revenue', 'budget', 'certification']]
TMDBAPI = TMDBAPI.drop(TMDBAPI.index[0])
TMDBAPI= TMDBAPI.reset_index(drop=True)
TMDBAPI

Unnamed: 0,imdb_id,revenue,budget,certification
0,tt0113026,0.0,10000000.0,
1,tt0113092,0.0,0.0,
2,tt0116391,0.0,0.0,
3,tt0118694,14204632.0,150000.0,PG
4,tt0118852,0.0,0.0,R
...,...,...,...,...
2571,tt7797790,0.0,0.0,
2572,tt8665056,0.0,0.0,
2573,tt8795764,0.0,0.0,NR
2574,tt9071078,0.0,0.0,


In [2]:
TMDBAPI2=TMDBAPI.copy()
TMDBAPI2=TMDBAPI2[TMDBAPI2['certification'].isin(['R', 'PG-13', 'PG', 'G'])]

TMDBAPI2['certification'].value_counts()

R        467
PG-13    182
PG        63
G         25
Name: certification, dtype: int64

The stakeholder's first question is: does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

There are four categories to compare and we are dealing with numerical values, therefore, ANOVA method is used.

Null Hypothesis: There is no difference on revenue produced for ratings of a movie.

Alternate Hypothesis: There is an impact on revenue based on the ratings of a movie.

In [3]:
ratings ={}

for rating in TMDBAPI2['certification'].unique():
    temp = TMDBAPI2.loc[TMDBAPI2['certification']== rating, 'revenue']
    ratings[rating] = temp

ratings.keys()

dict_keys(['PG', 'R', 'G', 'PG-13'])

In [4]:
for certification, rating in ratings.items():

    ## determine if there are any outliers
    outliers = np.abs(stats.zscore(rating)) > 3

    ## print a statement about how many outliers for which group name
    print(f"There were {outliers.sum()} outliers in the {certification} ratings.")

    ## Remove the outiers from data and overwrite the sector data in the dict
    rating = rating.loc[~outliers]
    ratings[certification] = rating

There were 1 outliers in the PG ratings.
There were 12 outliers in the R ratings.
There were 1 outliers in the G ratings.
There were 5 outliers in the PG-13 ratings.


In [5]:
## Running normal test on each group and confirming there are >20 in each group

## Save a list with an inner list of column names
norm_results = [['ratings','n','pval','sig?']]


## loop through group dict
for certification, rating in ratings.items():
    ## calculate normaltest results
    stat, p = stats.normaltest(rating)

    ## Append the right info into norm_resutls (as a list)
    norm_results.append([certification,len(rating), p, p<.05])


## Make norm_results a dataframe (first row is columns, everything else data)
normal_results = pd.DataFrame(norm_results[1:], columns = norm_results[0])
normal_results

Unnamed: 0,ratings,n,pval,sig?
0,PG,62,3.673797e-13,True
1,R,455,2.810209e-75,True
2,G,24,0.01287961,True
3,PG-13,177,1.581495e-18,True


n is large enough to disregard this test.

In [6]:
stats.levene(*ratings.values())

LeveneResult(statistic=34.46387286735003, pvalue=8.26208476292791e-21)

We met the assumption of normality but did NOT meet the assumption of equal variance. We wanted to run an ANOVA test but will need to use a Kurskal-Wallis test instead.

In [7]:
result = stats.kruskal(*ratings.values())
print(result)
result.pvalue<.05

KruskalResult(statistic=86.80820460151303, pvalue=1.0616406407462143e-18)


True

Our Kruskal Wallis test returned a p-value less than .05. We  reject the null hypothesis and conclude that there is a significant proof to say that the four groups have different revenue due to their movie ratings.

In [8]:
mean_values = TMDBAPI2.groupby('certification').mean()
mean_values

  mean_values = TMDBAPI2.groupby('certification').mean()


Unnamed: 0_level_0,revenue,budget
certification,Unnamed: 1_level_1,Unnamed: 2_level_1
G,70418860.0,22880000.0
PG,61488380.0,24904720.0
PG-13,71465440.0,30945920.0
R,16129250.0,9663393.0


In average, PG-13 movies have the highest revenue while R rated movies have the lowest revenue.

The stakeholder's second question is: does the MPAA rating of a movie (G/PG/PG-13/R) affect the budget of the movie?

There are four categories to compare and we are dealing with numerical values, therefore, ANOVA method is used.

Null Hypothesis: There is no difference on budget produced for ratings of a movie.

Alternate Hypothesis: There is an impact on revenue based on the budget of a movie.

In [9]:
TMDBAPI3=TMDBAPI.copy()
TMDBAPI3=TMDBAPI3[TMDBAPI3['certification'].isin(['R', 'PG-13', 'PG', 'G'])]

ratings2 ={}

for rating in TMDBAPI3['certification'].unique():
    temp = TMDBAPI3.loc[TMDBAPI3['certification']== rating, 'budget']
    ratings2[rating] = temp

ratings2.keys()

dict_keys(['PG', 'R', 'G', 'PG-13'])

In [10]:
for certification, rating in ratings2.items():

    ## determine if there are any outliers
    outliers = np.abs(stats.zscore(rating)) > 3

    ## print a statement about how many outliers for which group name
    print(f"There were {outliers.sum()} outliers in the {certification} ratings.")

    ## Remove the outiers from data and overwrite the sector data in the dict
    rating = rating.loc[~outliers]
    ratings2[certification] = rating

There were 0 outliers in the PG ratings.
There were 10 outliers in the R ratings.
There were 0 outliers in the G ratings.
There were 2 outliers in the PG-13 ratings.


In [11]:
## Running normal test on each group and confirming there are >20 in each group

## Save a list with an inner list of column names
norm_results2 = [['ratings','n','pval','sig?']]


## loop through group dict
for certification, rating in ratings2.items():
    ## calculate normaltest results
    stat, p = stats.normaltest(rating)

    ## Append the right info into norm_resutls (as a list)
    norm_results2.append([certification,len(rating), p, p<.05])


## Make norm_results a dataframe (first row is columns, everything else data)
normal_results2 = pd.DataFrame(norm_results2[1:], columns = norm_results2[0])
normal_results2

Unnamed: 0,ratings,n,pval,sig?
0,PG,63,0.0001378867,True
1,R,457,1.1221719999999999e-48,True
2,G,25,0.0006152003,True
3,PG-13,180,2.385245e-05,True


n is large enough to disregard this test.

In [12]:
stats.levene(*ratings.values())

LeveneResult(statistic=34.46387286735003, pvalue=8.26208476292791e-21)

We met the assumption of normality but did NOT meet the assumption of equal variance. We wanted to run an ANOVA test but will need to use a Kurskal-Wallis test instead.

In [13]:
result = stats.kruskal(*ratings.values())
print(result)
result.pvalue<.05

KruskalResult(statistic=86.80820460151303, pvalue=1.0616406407462143e-18)


True

Our Kruskal Wallis test returned a p-value less than .05. We reject the null hypothesis and conclude that there is a significant proof to say that the four groups have different budget based on their movie ratings.

In [14]:
mean_values2 = TMDBAPI3.groupby('certification').mean()
mean_values2

  mean_values2 = TMDBAPI3.groupby('certification').mean()


Unnamed: 0_level_0,revenue,budget
certification,Unnamed: 1_level_1,Unnamed: 2_level_1
G,70418860.0,22880000.0
PG,61488380.0,24904720.0
PG-13,71465440.0,30945920.0
R,16129250.0,9663393.0


PG 13 movies tend to have the highest budget while the rated R movies tend to have the lowest budget. 

The stakeholder's third question is: Did the running time of movies change over the years?

In [15]:
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics['startYear'].value_counts()

2019    5877
2018    5770
2017    5641
2016    5250
2021    5147
2015    5052
2020    5000
2014    4909
2013    4707
2012    4514
2011    4223
2010    3855
2009    3552
2008    2906
2007    2572
2006    2434
2005    2178
2004    1900
2003    1678
2001    1569
2002    1564
2000    1448
Name: startYear, dtype: int64

There are 20 different years to compare and we are dealing with numerical values, therefore, ANOVA method is used.

Null Hypothesis: There is no difference in running time over the years.

Alternate Hypothesis: There is an change in running time over the years.

In [16]:
years ={}

for year in basics['startYear'].unique():
    temp = basics.loc[basics['startYear']== year, 'runtimeMinutes']
    years[year] = temp

years.keys()

dict_keys([2001, 2013, 2020, 2018, 2005, 2002, 2017, 2006, 2004, 2000, 2009, 2008, 2007, 2003, 2011, 2021, 2010, 2012, 2016, 2014, 2015, 2019])

In [17]:
for year, data in years.items():

    ## determine if there are any outliers
    outliers = np.abs(stats.zscore(data)) > 3

    ## print a statement about how many outliers for which group name
    print(f"There were {outliers.sum()} outliers in the year {year}.")

    ## Remove the outiers from data and overwrite the sector data in the dict
    data = data.loc[~outliers]
    years[year] = data

There were 31 outliers in the year 2001.
There were 43 outliers in the year 2013.
There were 24 outliers in the year 2020.
There were 45 outliers in the year 2018.
There were 8 outliers in the year 2005.
There were 24 outliers in the year 2002.
There were 47 outliers in the year 2017.
There were 37 outliers in the year 2006.
There were 14 outliers in the year 2004.
There were 23 outliers in the year 2000.
There were 32 outliers in the year 2009.
There were 25 outliers in the year 2008.
There were 21 outliers in the year 2007.
There were 23 outliers in the year 2003.
There were 8 outliers in the year 2011.
There were 43 outliers in the year 2021.
There were 12 outliers in the year 2010.
There were 27 outliers in the year 2012.
There were 17 outliers in the year 2016.
There were 49 outliers in the year 2014.
There were 30 outliers in the year 2015.
There were 25 outliers in the year 2019.


In [18]:
norm_results = [['group','n','pval','sig?']]


## loop through group dict
for year, data in years.items():
    ## calculate normaltest results
    stat, p = stats.normaltest(data)

    ## Append the right info into norm_resutls (as a list)
    norm_results.append([year,len(data), p, p<.05])


## Make norm_results a dataframe (first row is columns, everything else data)
normal_results = pd.DataFrame(norm_results[1:], columns = norm_results[0])
normal_results

Unnamed: 0,group,n,pval,sig?
0,2001,1538,1.0166050000000001e-22,True
1,2013,4664,1.982401e-45,True
2,2020,4976,9.697430000000001e-39,True
3,2018,5725,8.330371e-64,True
4,2005,2170,1.6417169999999998e-41,True
5,2002,1540,1.247233e-13,True
6,2017,5594,1.0132269999999999e-48,True
7,2006,2397,8.467188e-21,True
8,2004,1886,4.393069e-34,True
9,2000,1425,4.2751659999999997e-26,True


This shows that the groups do not have normal distribution, but our n is large enough to disregard it.



In [19]:
stats.levene(*years.values())


LeveneResult(statistic=41.88534461877514, pvalue=1.1198727996507978e-171)

We met the assumption of normality but did NOT meet the assumption of equal variance. We wanted to run an ANOVA test but will need to use a Kurskal-Wallis test instead.



In [20]:
result = stats.kruskal(*years.values())
print(result)
result.pvalue<.05

KruskalResult(statistic=318.78830180104444, pvalue=4.695236557898575e-55)


True

Our Kruskal Wallis test returned a p-value less than .05. We reject the null hypothesis and conclude that there is a significant change over the years in runtime.