## Part 4
Kass Doran

9/12/23

Questions to answer:
    
    1. Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?
    2. Do some movie genres earn more revenue than others?
    3. Do movies that are over 2.5 hours long earn more revenue than movies that are 1.5 hours long (or less)?

### 1. Extract additional data 

In [1]:
import os, time,json
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb 
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title_ratings_final.csv.gz',
 'title_basics_final.csv.gz',
 'title.basics_real.tsv.gz',
 'title.akas_real.csv',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'tmdb_api_results_2001.json',
 'title_akas_final.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints',
 'tmdb_results_combined.csv.gz',
 'title.akas_real.tsv.gz',
 'title.ratings_real.tsv.gz']

In [2]:
with open('/Users/kass/secret/tmbd_api.json','r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [3]:
tmdb.API_KEY = login['api-key']

In [4]:
def get_movie_with_rating(movie_id):
    """Adapted from source = https://github.com/celiao/tmdbsimple"""
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    
    # Save the .info and .release dictionaries
    info = movie.info()
    
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1'] == 'US':
            ## save a "certification key" in info with the certification
            info['certification'] = c['certification']
    
    return info

In [5]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [6]:
import pandas as pd
# Load in the dataframe from part 1
basics = pd.read_csv("Data/title_basics_final.csv.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [7]:
#years we want to explore
YEARS_TO_GET = [2010,2011,2012,2013,2014,2015,2017,2018,2019,2020]

In [8]:
# place to store errors
errors = [ ]

In [None]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

    # If it does not exist: create it
    if file_exists == False:
    # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)

    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])


    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",compression='gzip', index=False)

YEARS:   0%|          | 0/10 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3863 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4224 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4521 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4717 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4914 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/5057 [00:00<?, ?it/s]

Movies from 2017:   0%|          | 0/5639 [00:00<?, ?it/s]

Movies from 2018:   0%|          | 0/5784 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/5874 [00:00<?, ?it/s]

### Combine Data frames

In [None]:
##Load in data sets
#TMDB_2010 movies
df_2010 = pd.read_csv('Data/final_tmdb_data_2010.csv.gz')
df_2010.head()

In [None]:
#TMDB_2011 movies
df_2011 = pd.read_csv('Data/final_tmdb_data_2011.csv.gz')
df_2011.head()

In [None]:
#TMDB_2012 movies
df_2012 = pd.read_csv('Data/final_tmdb_data_2012.csv.gz')
df_2012.head()

In [None]:
#TMDB_2013 movies
df_2013 = pd.read_csv('Data/final_tmdb_data_2013.csv.gz')
df_2013.head()

In [None]:
#TMDB_2014 movies
df_2014 = pd.read_csv('Data/final_tmdb_data_2014.csv.gz')
df_2014.head()

In [None]:
#TMDB_2015 movies
df_2015 = pd.read_csv('Data/final_tmdb_data_2015.csv.gz')
df_2015.head()

In [None]:
#TMDB_2016 movies
df_2016 = pd.read_csv('Data/final_tmdb_data_2016.csv.gz')
df_2016.head()

In [None]:
#TMDB_2017 movies
df_2017 = pd.read_csv('Data/final_tmdb_data_2017.csv.gz')
df_2017.head()

In [None]:
#TMDB_2018 movies
df_2018 = pd.read_csv('Data/final_tmdb_data_2018.csv.gz')
df_2018.head()

In [None]:
#TMDB_2019 movies
df_2019 = pd.read_csv('Data/final_tmdb_data_2019.csv.gz')
df_2019.head()

In [None]:
#TMDB_2020 movies
df_2020 = pd.read_csv('Data/final_tmdb_data_2020.csv.gz')
df_2020.head()

In [None]:
#Concatenate TMDB files into one dataframe
df = pd.concat((df_2010, df_2011,df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019, df_2020))
df.info()

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import scipy.stats as stats

###  1. Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

    - ANOVA
    
    Ho: There is no difference in revenue between ratings
    H1: There is a difference in revenue between ratings

In [None]:
## Create groups dictionary. 
groups = {}
## Loop through all unique categories
for i in df['certification'].unique():
    ## Get series for group and rename
    data = df.loc[df['genres']==i,'revenue'].copy()
    
    # save into the dictionary
    groups[i] = data
groups.keys()

In [None]:
## Running normal test on each group and confirming there are >20 in each group
norm_results = {}
for i, data in groups.items():
    stat, p = stats.normaltest(data)
    ## save the p val, test statistic, and the size of the group
    norm_results[i] = {'n': len(data),
                             'p':p,
                             'test stat':stat,}
## convert to a dataframe
norm_results_df = pd.DataFrame(norm_results).T
norm_results_df

In [None]:
### checking sig with pandas 
norm_results_df['sig'] = norm_results_df['p'] < .05 
norm_results_df

In [None]:
# Testing Assumption of Equal Variance with the * operator 
stats.levene(*groups.values())

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
## save the values as kg_lost and the labels to the Diet
values = df['revenue']
labels = df['certification']

In [None]:
## perform tukey's multiple comparison test and display the summary
tukeys_results = pairwise_tukeyhsd(values,labels)
tukeys_results.summary()

In [None]:
ax = sns.countplot(data=df, x='revenue',hue='generes',)
ax.set_title("Revenure by Genre");

### 2. Do some genres earn more revenue than others?
    - Chi-Square
    
       
    Ho: There is no difference in revenue between genres
    H1: There is a difference in revenue between genres

In [None]:
# calculate the contingency table with pd.cross-tab
table = pd.crosstab(df['genres'], df['revenue'])
table

In [None]:
# Run the Test and Save the Output
result = stats.chi2_contingency(table)
chi2, p, deg_free, expected_vals = result #this labels the results

In [None]:
# What was our p-value?
print(p)

In [None]:
ax = sns.countplot(data=df, x='revenue',hue='generes',)
ax.set_title("Revenure by Genre");

  ### 3. Do movies that are over 2.5 hours long earn more revenue than movies that are 1.5 hours long (or less)?
      - T-test
      
         
    Ho: There is no difference in revenue between movies that are 2.5hours long and movies that are 1.5 hours long and under
    H1: There is a difference in revenue between movies that are 2.5hours long and movies that are 1.5 hours long and under

In [None]:
#split into two groups
# Filtering out those with and without super strength
twohalf_df = df.loc[df['runtimeMinutes'] > 150].copy()
under1half_df = df.loc[df['runtimeMinutes'] < 90].copy()

In [None]:
# define feature of interest
twohalf = strength_df['runtimeMinutes']
under1half = no_strength_df['runtimeMinutes']

In [None]:
#remove outliers for twohalf
# Check for outliers in weight for strength group
zscores= stats.zscore(twohalf)
outliers = abs(zscores)>3
print(np.sum(outliers))

# remove outliers from twohalf group
twohalf= twohalf_df[(np.abs(stats.zscore(twohalf)) < 3)]

In [None]:
#remove outliers for under1half
# Check for outliers in weight for strength group
zscores= stats.zscore(under1half)
outliers = abs(zscores)>3
print(np.sum(outliers))

# remove outliers from under1half group
under1half = twohalf[(np.abs(stats.zscore(under1half)) < 3)]

In [None]:
# test the twohalf group for normality
result_twohalf = stats.normaltest(twohalf)
result_twohalf

In [None]:
# test the twohalf group for normality
result_under1half = stats.normaltest(under1half)
result_under1half

In [None]:
# Test for equal variance
result = stats.levene(under1half, twohalf)
result

In [None]:
# Independent t-test with equal_var set to False
result = stats.ttest_ind(under1half, twohalf, equal_var = False)
result

In [None]:
ax = sns.countplot(data=df, x='runtimeMinutes',hue='generes',)
ax.set_title("Revenure by Genre");