In [None]:
# imports
import os
from getpass import getpass
from urllib.parse import quote_plus as urlquote

import pymysql
from sqlalchemy import create_engine
from sqlalchemy.types import *
from sqlalchemy_utils import create_database, database_exists
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.stats.multicomp as mc

# register PyMySQL as a drop-in replacement for MySQLdb
pymysql.install_as_MySQLdb()

In [None]:
# Build the MySQL connection string.
# Credentials are read from the environment (MYSQL_USER / MYSQL_PASSWORD) so the
# password is never stored in the notebook; if MYSQL_PASSWORD is not set, the
# user is prompted for it instead.
username = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")

# urlquote escapes characters (e.g. '$', '@') that are not valid inside a URL
connection = f"mysql+pymysql://{urlquote(username)}:{urlquote(password)}@localhost/belt-exam"

# create engine
engine = create_engine(connection)

# create the database on first run, otherwise just confirm it is there
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print("The database created!")

In [None]:
# Sanity check: list the tables in the `movies` schema through the engine
show_tables_query = """SHOW TABLES IN movies;
"""

pd.read_sql_query(show_tables_query, con=engine)

In [None]:
# Pull revenue and certification for every movie that has a certification
# other than NC-17 / NR and a non-zero revenue.
# NOTE(review): tmdb_data is not schema-qualified, so it is looked up in the
# connection's default database -- confirm that is where the table lives.
sql = """SELECT t.revenue, t.certification
FROM tmdb_data as t
WHERE t.certification IS NOT NULL
    AND t.certification <> 'NC-17'
    AND t.certification <> 'NR'
    AND t.revenue <> 0;
"""

# save to df
df = pd.read_sql_query(sql, engine)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" to the output.
df.info()
df.head()

In [None]:
# Wrap every label in single quotes before counting so that leading/trailing
# whitespace in a certification value would be visible in the output
quoted_labels = df['certification'].map("'{}'".format)
quoted_labels.value_counts()

# Questions to Answer


## Q1: Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?



### State your Null Hypothesis and Alternative Hypothesis

Null Hypothesis: There is no difference in movie revenue between the MPAA ratings (G, PG, PG-13, R).

Alternative Hypothesis: There is a difference in movie revenue between at least two of the MPAA ratings.

### Select the correct test according to the data type and number of samples

The data are numerical (revenue) with four samples (one per rating), so I will perform a one-way ANOVA and/or Tukey's test.



### Test the assumptions of your selected test

ANOVA Assumptions(One Way ANOVA)

- Normality
- Equal variance
- No significant outliers

In [None]:
## Map each unique certification to a copy of the revenue values of its movies
groups = {
    rating: df.loc[df['certification'] == rating, 'revenue'].copy()
    for rating in df['certification'].unique()
}
groups.keys()

In [None]:
## Run scipy's normaltest on every certification group, recording the group
## size next to the result so we can confirm each group has more than 20 rows
norm_results = {}
for rating, revenues in groups.items():
    outcome = stats.normaltest(revenues)
    norm_results[rating] = {
        'n': len(revenues),
        'p': outcome.pvalue,
        'test stat': outcome.statistic,
    }

## one row per certification
norm_results_df = pd.DataFrame(norm_results).T
## flag the groups whose p-value is below alpha = .05
norm_results_df['sig'] = norm_results_df['p'].lt(.05)
norm_results_df

In [None]:
# Levene's test for equal variance across the certification groups
revenue_samples = list(groups.values())
stats.levene(*revenue_samples)

### Test Execution

In [None]:
# One-way ANOVA with one sample of revenues per certification
anova_samples = tuple(groups.values())
result = stats.f_oneway(*anova_samples)
result

In [None]:
# Build a long-format frame (one row per movie) from the `groups` dictionary,
# so Tukey's test runs on exactly the samples stored in `groups`.
# The frame is assembled with a single concat rather than grown inside a loop.
tukeys_df = pd.concat(
    [pd.DataFrame({'certification': rating, 'revenue': revenues.to_numpy()})
     for rating, revenues in groups.items()],
    ignore_index=True,
)

## values are the revenues, labels are the certifications they belong to.
## They are taken from tukeys_df: df has no 'charges' or 'region' columns,
## so indexing df with those names raised a KeyError.
values = tukeys_df['revenue'].astype(float)
labels = tukeys_df['certification']

## perform tukey's multiple comparison test and display the summary
tukeys_results = pairwise_tukeyhsd(values, labels)
tukeys_results.summary()

### p-value Interpretation

- From the one-way ANOVA, we reject the null hypothesis because the p-value is less than .05.

- In the Tukey summary, `reject = True` marks the pairs of certifications whose revenues differ significantly.

- The equal-variance assumption is not met, so we will consider the groups to have UNEQUAL (different) variances.

- We will run the nonparametric Kruskal-Wallis test instead, which does not require that the data have equal variance.

### Test Execution (Kruskal-Wallis)

In [None]:
# Nonparametric Kruskal-Wallis test on the same per-certification samples
samples_by_rating = list(groups.values())
result = stats.kruskal(*samples_by_rating)
result

### p-value Interpretation

The p-value from the Kruskal-Wallis test is more than 0.05, meaning that we cannot reject the null hypothesis.

Therefore, we conclude that the different MPAA ratings have statistically similar movie revenue.

### supporting visualization 

In [None]:
# Plot the revenue that was actually queried into `df`, so the figure supports
# the test results above instead of showing made-up example numbers.

# Show the ratings in their natural order, keeping only those present in df.
# If none of them match, fall back to seaborn's default ordering.
present_ratings = set(df['certification'])
rating_order = [r for r in ['G', 'PG', 'PG-13', 'R'] if r in present_ratings] or None

fig, ax = plt.subplots()

# barplot draws the mean revenue per certification with an error bar
sns.barplot(data=df, x='certification', y='revenue', order=rating_order, ax=ax)

# Adding labels and titles
ax.set_xlabel('Certification')
ax.set_ylabel('Mean Revenue')
ax.set_title('Movie Certification vs. Revenue')

# Display the chart
plt.show()