### IS445 Final Project, Part 2
Kay Avila

References:
* https://coderzcolumn.com/tutorials/data-science/interactive-charts-using-matplotlib-and-ipywidgets
* https://stackoverflow.com/questions/7908636/how-to-add-hovering-annotations-in-matplotlib
* https://stackoverflow.com/questions/36104500/pandas-filtering-and-comparing-dates

In [1]:
import numpy as np
import pandas as pd
import ipywidgets as widgets
import matplotlib.pyplot as plt

# Requires ipympl to be installed in the conda environment
#%matplotlib widget   

In [2]:
# Load only the columns used by the charts below.
# The input file needs to be in the local directory.
df = pd.read_csv(
    'Storybook-Bibles.csv',
    usecols=['isbn', 'title', 'first_author', 'illustrator', 'other_authors', 'other_illustrators',
             'publisher', 'location', 'year', 'pages', 'idb_msrp'],
)
df.rename(columns={'idb_msrp': 'msrp'}, inplace=True)

# Treat a zero (or negative) MSRP as missing rather than as a real price
df.loc[df['msrp'] <= 0, 'msrp'] = None

# Treat years below 1900 or above 2020 as missing.  Only the 'year' column is
# cleared, mirroring the MSRP cleanup above: indexing the frame with the bare
# boolean mask would blank every column (title, pages, publisher, ...) of the
# matching rows instead of just the implausible year.
df.loc[(df['year'] > 2020) | (df['year'] < 1900), 'year'] = None

# Convert the remaining years to pandas timestamps
df['year'] = pd.to_datetime(df['year'], format='%Y')

In [3]:
# Controls for the simple 2D scatter plot: one column chooser per axis
data_series = ['year', 'pages', 'msrp']

controls_label = widgets.Label(value='Controls')
x_dropdown = widgets.Dropdown(
    options=data_series,
    description='x values',
)
# The y axis starts on 'pages'; no initial value is given for the x axis
y_dropdown = widgets.Dropdown(
    options=data_series,
    value='pages',
    description='y values',
)

def create_scatter(x_dataset, y_dataset):
    """Draw a scatter plot of one ``df`` column against another.

    Parameters
    ----------
    x_dataset : str
        Name of the ``df`` column plotted on the x axis.
    y_dataset : str
        Name of the ``df`` column plotted on the y axis.
    """
    # Axis labels and the title all use the capitalized column names
    x_label = x_dataset.capitalize()
    y_label = y_dataset.capitalize()

    with plt.style.context('ggplot'):
        plt.figure(figsize=(9, 6))

        plt.scatter(x=df[x_dataset], y=df[y_dataset])

        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title("{} vs {}".format(x_label, y_label))
    
    
# Feed the two dropdown values into create_scatter and capture its output
scatter_plot = widgets.interactive_output(
    create_scatter,
    {'x_dataset': x_dropdown, 'y_dataset': y_dropdown},
)

# Plot and a padded column of controls, side by side
scatter_controls = widgets.VBox(
    [controls_label, x_dropdown, y_dropdown],
    layout=widgets.Layout(padding='30px'),
)
scatter_display = widgets.HBox([scatter_plot, scatter_controls])

In [4]:
#test = df.groupby('year')['msrp']
#median = test.median()
#median.values

In [5]:
#widgets.IntRangeSlider(
#    value=[5, 7],
#    min=0,
#    max=10,
#    step=1,
#    description='Test:',
#    disabled=False,
    #continuous_update=False,
    #orientation='horizontal',
    #readout=True,
    #readout_format='d',
#)

In [6]:
#df['year'].max()
#print(df['year'].sort_values().head())
#(df['year'].sort_values() >= pd.Timestamp(1960, 1, 1)) & (df['year'].sort_values() <= pd.Timestamp(1972, 1, 1))

In [7]:
# Controls for the per-year trend lines.
# The year slider spans the earliest to the latest year present in the data.
min_year, max_year = df['year'].min().year, df['year'].max().year

options = ['books, authors, publishers', 'average number of pages', 'average msrp']
line_dropdown = widgets.Dropdown(
    options=options,
    value='books, authors, publishers',
    description='subject:',
)
year_slider = widgets.IntRangeSlider(
    value=[min_year, max_year],
    min=min_year,
    max=max_year,
    step=1,
    description='years:',
    continuous_update=False,
)
lines_select = widgets.SelectMultiple(
    options=['number of books', 'different authors', 'different publishers'],
    value=['number of books'],
    description='plot values:',
)

def _plot_mean_and_median(values_by_year):
    """Plot the per-year mean and median of a grouped column as two labelled lines."""
    mean = values_by_year.mean()
    median = values_by_year.median()
    plt.plot(mean.index, mean.values, label='mean')
    plt.plot(median.index, median.values, label='median')


def create_line_by_year(plot_type, min_max_years=(min_year, max_year), multiplot_values=()):
    """Draw per-year trend lines for the selected subject.

    Parameters
    ----------
    plot_type : str
        One of the ``line_dropdown`` options: 'books, authors, publishers',
        'average number of pages' or 'average msrp'.  The shorter
        'avg number of pages' / 'avg msrp' spellings are accepted as well.
    min_max_years : sequence of two ints
        First and last year to include (both inclusive).
    multiplot_values : iterable of str
        Which count lines to draw: any of 'number of books',
        'different authors' and 'different publishers'.  Only used when
        ``plot_type`` is 'books, authors, publishers'.
    """
    start_year, end_year = min_max_years[0], min_max_years[1]
    # NOTE: the inclusive upper bound assumes df['year'] holds January 1st
    # timestamps (as produced by the '%Y' conversion when the data is loaded).
    in_range = (df['year'] >= pd.Timestamp(start_year, 1, 1)) & (df['year'] <= pd.Timestamp(end_year, 1, 1))
    df_slice = df[in_range]

    # The multi-select only applies to the counts plot, so hide it otherwise
    show_counts = plot_type == 'books, authors, publishers'
    lines_select.layout.display = 'block' if show_counts else 'none'

    with plt.style.context('ggplot'):
        plt.figure(figsize=(9, 6))

        if show_counts:
            if 'number of books' in multiplot_values:
                books_by_year = df_slice['year'].value_counts().sort_index()
                plt.plot(books_by_year.index, books_by_year.values, label='number of books')

            # nunique() counts distinct names per year; count() would only
            # count the non-missing rows, i.e. roughly the number of books.
            if 'different authors' in multiplot_values:
                authors_by_year = df_slice.groupby('year')['first_author'].nunique()
                plt.plot(authors_by_year.index, authors_by_year.values, label='number of authors')

            if 'different publishers' in multiplot_values:
                publisher_by_year = df_slice.groupby('year')['publisher'].nunique()
                plt.plot(publisher_by_year.index, publisher_by_year.values, label='number of publishers')

        # These must match the strings offered by the dropdown ('average ...');
        # comparing only against 'avg ...' left both branches unreachable.
        elif plot_type in ('average number of pages', 'avg number of pages'):
            _plot_mean_and_median(df_slice.groupby('year')['pages'])

        elif plot_type in ('average msrp', 'avg msrp'):
            _plot_mean_and_median(df_slice.groupby('year')['msrp'])

        # Draw the legend once, and only when at least one labelled line exists
        # (an empty selection would otherwise trigger a matplotlib warning).
        handles, _ = plt.gca().get_legend_handles_labels()
        if handles:
            plt.legend()
                    
# Feed the three control values into create_line_by_year and capture its output
line_plot = widgets.interactive_output(
    create_line_by_year,
    {
        'plot_type': line_dropdown,
        'min_max_years': year_slider,
        'multiplot_values': lines_select,
    },
)

# Plot and a padded column of controls, side by side
line_controls = widgets.VBox(
    [controls_label, line_dropdown, lines_select, year_slider],
    layout=widgets.Layout(padding='30px'),
)
line_display = widgets.HBox([line_plot, line_controls])

#line_display

In [8]:
#books_by_year = df['year'].value_counts().sort_index()
#plt.plot(books_by_year.index, books_by_year.values)
#plt.plot(books_by_year.index, books_by_year.values)
#plt.show()

In [19]:
# Create the Top Ten graphs
# NOTE(review): nothing is plotted yet; each line only computes the ten most
# frequent values of one column. Because these are bare expressions, only the
# last one (location) is echoed as the cell output; the first three results
# are computed and then discarded.
df['first_author'].value_counts().head(10)
df['illustrator'].value_counts().head(10)
df['publisher'].value_counts().head(10)
df['location'].value_counts().head(10)

New York            16
Nashville, TN       15
Grand Rapids, MI    12
Wheaton, IL          6
Chicago              5
Minneapolis, MN      4
Elgin, IL            4
Sisters, OR          4
Colorado Springs     4
St. Louis, MO        3
Name: location, dtype: int64

In [9]:
# Assemble the displays into a tabbed view: one tab per entry, keyed by title
tab_contents = {
    'Simple Scatters': scatter_display,
    'Trend by Year Lines': line_display
}

tab = widgets.Tab()
tab.children = list(tab_contents.values())
# Titles are set by position, so pair each title with its index directly
# instead of rebuilding the key list on every iteration.
for i, title in enumerate(tab_contents):
    tab.set_title(i, title)
tab

Tab(children=(HBox(children=(Output(), VBox(children=(Label(value='Controls'), Dropdown(description='x values'…

In [10]:
# TODO - Add min/max year on each one