# 1.) Import the modules we will need

In [None]:
import pandas as pd
import numpy as np
import glob
import warnings
warnings.filterwarnings("ignore")

# 2.) Take a look at the files we have to work with

In [None]:
#  List the paths under ./input that match the sales-*.xlsx pattern
glob.glob('./input/sales-*.xlsx')

## a.) Now let's grab all the files and concatenate them together

In [None]:
#  Read every matching workbook first, then concatenate once; calling
#  pd.concat inside the loop re-copies all accumulated rows on each pass.
frames = [pd.read_excel(path) for path in glob.glob('./input/sales-*.xlsx')]
#  pd.concat raises ValueError on an empty list, so fall back to an empty
#  DataFrame when no files match the pattern.
all_data = pd.concat(frames) if frames else pd.DataFrame()
all_data.info()

## b.) We can get quick stats on numeric columns just by running a "describe"

In [None]:
#  Display summary statistics for the combined sales data
all_data.describe()

## c.) Let's look at the raw data

In [None]:
#  Peek at the first rows of the combined data
all_data.head()

In [None]:
#  ...and at the last rows
all_data.tail()

# 3.) Time to do a little house cleaning and transformation

## a.) First the clean up

In [None]:
#  Assign the filled column back instead of calling fillna(inplace=True) on
#  all_data['discount']: the chained in-place form acts on an intermediate
#  Series, which pandas 2.x warns about and which no longer updates all_data
#  once Copy-on-Write is enabled.
all_data['discount'] = all_data['discount'].fillna(0)   #  Let's fill missing values with zeros
all_data['ext price'] = all_data['ext price'].abs()     #  Let's make the negative numbers go bye bye
all_data['unit price'] = all_data['unit price'].abs()
all_data['quantity'] = all_data['quantity'].abs()
all_data['date'] = pd.to_datetime(all_data['date'])     #  Let's make this a date object so we can do fancy stuff
all_data.head()

## b.) Now let's add a couple calculated columns

In [None]:
#  total price = unit price x quantity, scaled by (1 - discount)
#  NOTE(review): this presumes 'discount' holds a fraction (0-1) - confirm against the source workbooks
all_data['total price'] = (
    all_data['unit price']
    .mul(all_data['quantity'])
    .mul(1 - all_data['discount'])
)
#  Month number taken from the parsed 'date' column
all_data['month'] = all_data['date'].dt.month
all_data.tail()

# 4.) Let's join our data to another data set

In [None]:
#  Load the customer status workbook into its own DataFrame and preview it
customer_data = pd.read_excel('./input/customer-status.xlsx')
customer_data.head()

In [None]:
#  Just like a SQL left join: every sales row is kept and the customer
#  columns are attached where a match exists.
#  No `on=` is passed, so pandas joins on the column names both frames share.
all_data = all_data.merge(customer_data, how='left')
all_data.head()

## a.) Looks like we have some missing values again, let's fix that

In [None]:
#  Assign each filled column back rather than calling fillna(inplace=True) on
#  all_data[col]: the chained in-place form acts on an intermediate Series,
#  which pandas 2.x warns about and which no longer updates all_data once
#  Copy-on-Write is enabled.
all_data['status'] = all_data['status'].fillna('bronze')    #  Let's replace missing values with 'bronze' as a default
all_data['address'] = all_data['address'].fillna('?')       #  Missing address, city and state get a '?' placeholder
all_data['city'] = all_data['city'].fillna('?')
all_data['state'] = all_data['state'].fillna('?')
all_data.head()

# 5.) Let's try to join another dirty data set to get location info

## a.) Insert a blank column as a placeholder

In [None]:
#  Add an empty 'abbrev' column at position 6 as a placeholder.
#  The guard keeps the cell re-runnable: DataFrame.insert raises ValueError
#  when the column already exists.
if "abbrev" not in all_data.columns:
    all_data.insert(6, "abbrev", np.nan)

## b.) Open a JSON file that has abbreviations for all the states

In [None]:
import json
#  Use a context manager so the file handle is closed once the JSON is parsed
#  (the bare open(...).read() left it to the garbage collector).
with open('./input/state_abbreviations.json', encoding='utf-8') as state_file:
    states = json.load(state_file)

## c.) Create a function we will use to lookup state abbreviations

In [None]:
from fuzzywuzzy import process
def convert_state(row):
    """Look up the `states` value for the key that best matches row["state"].

    The keys of `states` are fuzzy-matched against row["state"] with a score
    cutoff of 80. Returns the value stored under the winning key, or NaN when
    extractOne reports no match.
    """
    best_match = process.extractOne(
        row["state"],
        choices=states.keys(),
        score_cutoff=80,
    )
    if not best_match:
        return np.nan
    matched_key = best_match[0]    #  first element is the matched choice itself
    return states[matched_key]

## d.) Let's apply our function to populate the correct abbreviations

In [None]:
#  Fuzzy matching is the expensive step, so run convert_state once per
#  distinct 'state' value and map the results back onto every row, instead of
#  repeating the same match for each duplicate row.
#  convert_state only reads row["state"], so a one-key dict stands in for a row.
abbrev_by_state = {
    state: convert_state({"state": state})
    for state in all_data['state'].unique()
}
all_data['abbrev'] = all_data['state'].map(abbrev_by_state)
all_data.head()

# 6.) Now that our data is combined and cleaned up, let's pivot!

In [None]:
pt = pd.pivot_table(
    all_data,                            #  This is the DataFrame we will pivot
    index=["name", 'sku'],               #  These are the fields, in order, that we want to pivot on
    columns=['month'],                   #  Let's spread the values across these columns
    values=['quantity', 'total price'],  #  These are the values we want to aggregate
    aggfunc=['sum'],                     #  Aggregation methods, by name: passing np.sum is deprecated in pandas >= 2.1, and 'sum' yields the same column label
    fill_value=0                         #  Since we know gaps will happen, let's fill missing values with zero
)
pt

## a.) Did you know that you can cross section pivots? 

In [None]:
#  xs on level 0 selects the rows whose first index level ('name') equals 'Will LLC'
pt.xs('Will LLC', level=0)    #  Let's cut out just the data for one account

# 7.) Now let's write all this data to the files we need

## a.) An Excel file with the full data set

In [None]:
#  Write .xlsx rather than .xls: pandas 2.0 removed the xlwt engine, so the
#  legacy .xls format can no longer be written. .xlsx is also the format
#  already used for the cross-section workbook later in this notebook.
#  NOTE(review): the output filename changes - update anything that reads all_data.xls.
all_data.to_excel('./output/all_data.xlsx')

## b.) An Excel file of the pivoted data

In [None]:
#  Write .xlsx rather than .xls: pandas 2.0 removed the xlwt engine, so the
#  legacy .xls format can no longer be written.
#  NOTE(review): the output filename changes - update anything that reads pivot_data.xls.
pt.to_excel('./output/pivot_data.xlsx')

## c.) An Excel file with a different cross-section on each tab

In [None]:
#  One sheet per top-level index value of the pivot (the account name).
#  The context manager saves and closes the workbook on exit; the explicit
#  writer.save() call was removed in pandas 2.0.
with pd.ExcelWriter('./output/pivoted_x-sections.xlsx') as writer:
    for name in pt.index.get_level_values(0).unique():
        temp_df = pt.xs(name, level=0)
        #  sheet_name is passed by keyword: positional use is deprecated in pandas 2.x
        temp_df.to_excel(writer, sheet_name=name)

# 8.) Now for the charts

The Bar high-level chart can produce bar charts in various styles. Bar charts are configured with a DataFrame data object, and a column to group. This column will label the x-axis range. Each group is aggregated over the values column and bars are shown for the totals.


In [None]:
#  NOTE(review): bokeh.charts was removed from Bokeh (split out as the
#  separate `bkcharts` package) - confirm the pinned Bokeh version still
#  provides it, or port this cell to bokeh.plotting.
from bokeh.charts import Bar, output_file, output_notebook, show
output_notebook()    #  Render inline in the notebook...
p = Bar(all_data, 'name', values='total price', title="Total Sales by Account")
output_file("./output/bar.html")    #  ...and also target a standalone HTML file

In [None]:
#  Render the bar chart `p` to the outputs configured in the previous cell
show(p)