# Producing boxplots using the produce_boxplot function

Note: This notebook requires an up-to-date Python 3 installation. Do not run it in the learn-env environment.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the master movie dataset from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
master_df = pd.read_pickle('movie_master_dataset_with_inflation.pkl')

In [None]:
def produce_boxplot(master, budget, movie_count_thresh=100, ylim=(-1.5,30)):
    '''
    Draw one ROI boxplot per genre for movies above a budget threshold.

    Inputs:
        master - the master dataframe as pandas df; must contain the columns
                 'movie', 'inf_adj_production_budget' and 'ROI', followed by
                 the genre indicator columns
        budget - only movies whose 'inf_adj_production_budget' is strictly
                 greater than this value are plotted
        movie_count_thresh - a genre is plotted only when its indicator
                 column sums to strictly more than this value
        ylim - the limits of the y axis as a tuple -> (min, max)

    Returns None.
    Shows the figure with plt.show() and then prints an empty line.
    '''
    # Keep a single row per (movie, inflation-adjusted budget) pair.
    unique_movies = master.drop_duplicates(
        subset=['movie', 'inf_adj_production_budget'])

    # Read the genre columns from the dataframe that was passed in, not from
    # the notebook-level `master_df`, so the function works on any frame.
    # Assumes the genre indicator columns start at position 10 -- TODO confirm.
    genre_names = master.columns.tolist()[10:]

    # Rows above the budget threshold (strict comparison).
    budget_min_df = unique_movies[
        unique_movies['inf_adj_production_budget'] > budget]

    # Genres with more than movie_count_thresh movies left after filtering.
    remaining_genres = [
        genre for genre in genre_names
        if budget_min_df[genre].sum() > movie_count_thresh
    ]

    # Count the movies per remaining genre once, then rank by count in
    # descending order (ties fall back to reverse name order).
    genre_counts = budget_min_df[remaining_genres].sum()
    ranked = sorted(
        zip(genre_counts.values.tolist(), genre_counts.index.tolist()),
        reverse=True)
    col_names_desc = [name for _, name in ranked]

    # Labels for the x axis: "<genre> - <movie count>".
    plt_x_labels = [f'{name} - {count}' for count, name in ranked]

    # THE PLOT
    fig, ax = plt.subplots(figsize=(12,7))
    # `== True` produces a boolean row mask from each genre indicator column.
    roi_per_genre = [
        budget_min_df[budget_min_df[col] == True]['ROI']
        for col in col_names_desc
    ]
    ax.boxplot(roi_per_genre, labels=plt_x_labels)
    ax.set_ylim(ylim)
    # NOTE(review): the title prints the full budget number followed by the
    # word "Million" (e.g. "10,000,000 Million") -- confirm the intended units.
    ax.set_title('Budget > ' + format(int(budget), ',') + ' Million ROI for each genre boxplot')
    ax.set_ylabel('ROI')
    ax.set_xlabel('Genre')
    plt.xticks(fontsize=12, rotation=90)
    plt.show()

    print()

In [None]:
# ROI per genre for movies with an inflation-adjusted budget above 1e7.
produce_boxplot(
    master=master_df,
    budget=1e7,
    movie_count_thresh=100,
    ylim=(-1.5, 32),
)

In [None]:
# ROI per genre for movies with an inflation-adjusted budget above 5e7.
produce_boxplot(
    master=master_df,
    budget=5e7,
    movie_count_thresh=100,
    ylim=(-1.5, 16),
)