### Notebook - 1d (Corrected Store Names)

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',95)
pd.set_option('display.max_rows',None)
import pickle
import re

In [2]:
# Opening the pickled file
f = open('df2.pkl', 'rb')
df2 = pickle.load(f)
f.close()

In [3]:
df2.head()

Unnamed: 0,invoice/item_number,date,store_number,category,category_name,item_number,item_description,bottles_sold,sale_dollars,volume_sold_liters,store_name
0,S04763500007,2012-03-27,2534,1012100.0,canadian whiskies,11788,Black Velvet,6,94.02,10.5,Hy-Vee Drugtown / Urbandale
1,S27474100012,2015-08-25,4924,1022100.0,tequila,89194,Jose Cuervo Especial Reposado Flask,4,33.0,1.5,Abby Lea's
2,S10731000040,2013-02-21,4652,1032200.0,imported vodka - misc,34449,Ketel One Citroen,2,40.48,1.5,Brady Mart Food & Liquor
3,S17037900080,2014-01-27,4794,1041100.0,american dry gins,32236,Seagrams Extra Dry Gin,1,8.99,0.75,Smokin' Joe's #17 Tobacco and Liquor Outlet
4,S14396900023,2013-09-09,2647,1012100.0,canadian whiskies,13038,Canadian Reserve Whisky,6,80.94,10.5,Hy-Vee #7 / Edgewood Cedar Rapids


### Methodology for re-classifying liquor categories
There are 133 category_names, to bucket them into a broader category we created 2 functions,

1) the "contains" function which will look at every category_name and see if there is a string match and 2) the "category_cleaner" function which will replace the value of the category_name with a given word if there is a match from the contains function.

In [4]:
# Creating and cleaning the category names into broader categories

def contains(string, match_list):
    for match in match_list:
        if match in string:
            return True
    return False

def category_cleaner(value):
    """
    To be applied to a dataframe column to clean
    alchol categories so they are simpler.
    """
    import pandas as pd
    
    if pd.isnull(value):
        return value
    elif contains(value, ['rum', 'rums']):
        return 'rum'
    elif contains(value, ['whiskies','whiskey', 'whisky', 'scotch', 'rye', 'bourbon', 'iowa distilleries']):
        return 'whiskey'
    elif contains(value, ['anisette', 'triple sec', 'amaretto', 'creme', 'cremes', 'creams', 'cream', 'cordials', 'cordial', 'liqueurs', 'liqueur']):
        return 'cordials liqueurs'
    elif contains(value, ['brandy','brandies']):
        return 'brandy'
    elif contains(value, ['tequila','tequilas', 'mezcal']):
        return 'tequila'
    elif contains(value, ['schnapps','schnapp']):
        return 'schnapps'
    elif contains(value, ['vodka','vodkas']):
        return 'vodka'
    elif contains(value, ['gin','gins']):
        return 'gin'
    elif contains(value, ['cocktails','cocktail']):
        return 'cocktails'
    elif contains(value, ['spirit','spirits']):
        return 'spirits'
    elif contains(value, ['american alcohol']):
        return 'grain alcohol'
    elif contains(value, ['decanters', 'decanter', 'packages', 'accessories']):
        return 'accessories'
    elif contains(value, ['egg', 'egg nog', 'eggnog', 'nog']):
        return 'egg nog'
    else:
        return 'special orders'
    
df2['cat_name2'] = df2['category_name'].apply(category_cleaner)

In [5]:
df2['cat_name2'].value_counts()

whiskey              6321041
vodka                5138390
rum                  2334552
cordials liqueurs    1615077
brandy               1081807
schnapps             1026707
tequila               942921
gin                   720229
cocktails             567225
accessories           161457
spirits               134976
special orders         25647
grain alcohol          24351
egg nog                 1269
Name: cat_name2, dtype: int64

In [6]:
df2['cat_name2'].tolist()

['whiskey',
 'tequila',
 'vodka',
 'gin',
 'whiskey',
 'rum',
 'whiskey',
 'whiskey',
 'whiskey',
 'whiskey',
 'whiskey',
 'whiskey',
 'whiskey',
 'whiskey',
 'brandy',
 'vodka',
 'vodka',
 'whiskey',
 'whiskey',
 'vodka',
 'whiskey',
 'vodka',
 'cordials liqueurs',
 'whiskey',
 'vodka',
 'vodka',
 'cordials liqueurs',
 'cordials liqueurs',
 'rum',
 'rum',
 'whiskey',
 'schnapps',
 'cordials liqueurs',
 'whiskey',
 'cordials liqueurs',
 'whiskey',
 'tequila',
 'vodka',
 'cocktails',
 'whiskey',
 'cocktails',
 'whiskey',
 'cocktails',
 'rum',
 'vodka',
 'schnapps',
 'cordials liqueurs',
 'cocktails',
 'tequila',
 'rum',
 'vodka',
 'cordials liqueurs',
 'cordials liqueurs',
 'whiskey',
 'whiskey',
 'vodka',
 'vodka',
 'rum',
 'vodka',
 'whiskey',
 'whiskey',
 'whiskey',
 'vodka',
 'whiskey',
 'vodka',
 'schnapps',
 'vodka',
 'tequila',
 'rum',
 'brandy',
 'schnapps',
 'whiskey',
 'vodka',
 'cordials liqueurs',
 'cordials liqueurs',
 'cordials liqueurs',
 'vodka',
 'rum',
 'vodka',
 'bran

In [7]:
df2.shape

(20095649, 12)

In [8]:
df2.head()

Unnamed: 0,invoice/item_number,date,store_number,category,category_name,item_number,item_description,bottles_sold,sale_dollars,volume_sold_liters,store_name,cat_name2
0,S04763500007,2012-03-27,2534,1012100.0,canadian whiskies,11788,Black Velvet,6,94.02,10.5,Hy-Vee Drugtown / Urbandale,whiskey
1,S27474100012,2015-08-25,4924,1022100.0,tequila,89194,Jose Cuervo Especial Reposado Flask,4,33.0,1.5,Abby Lea's,tequila
2,S10731000040,2013-02-21,4652,1032200.0,imported vodka - misc,34449,Ketel One Citroen,2,40.48,1.5,Brady Mart Food & Liquor,vodka
3,S17037900080,2014-01-27,4794,1041100.0,american dry gins,32236,Seagrams Extra Dry Gin,1,8.99,0.75,Smokin' Joe's #17 Tobacco and Liquor Outlet,gin
4,S14396900023,2013-09-09,2647,1012100.0,canadian whiskies,13038,Canadian Reserve Whisky,6,80.94,10.5,Hy-Vee #7 / Edgewood Cedar Rapids,whiskey


In [9]:
df2 = df2.drop('category_name', axis=1)

In [10]:
df2 = df2.rename(columns= {'cat_name2':'cat_name'})

In [11]:
df2.drop('category',axis=1,inplace=True)

In [12]:
df2.columns

Index(['invoice/item_number', 'date', 'store_number', 'item_number',
       'item_description', 'bottles_sold', 'sale_dollars',
       'volume_sold_liters', 'store_name', 'cat_name'],
      dtype='object')

In [13]:
df2.head(50)

Unnamed: 0,invoice/item_number,date,store_number,item_number,item_description,bottles_sold,sale_dollars,volume_sold_liters,store_name,cat_name
0,S04763500007,2012-03-27,2534,11788,Black Velvet,6,94.02,10.5,Hy-Vee Drugtown / Urbandale,whiskey
1,S27474100012,2015-08-25,4924,89194,Jose Cuervo Especial Reposado Flask,4,33.0,1.5,Abby Lea's,tequila
2,S10731000040,2013-02-21,4652,34449,Ketel One Citroen,2,40.48,1.5,Brady Mart Food & Liquor,vodka
3,S17037900080,2014-01-27,4794,32236,Seagrams Extra Dry Gin,1,8.99,0.75,Smokin' Joe's #17 Tobacco and Liquor Outlet,gin
4,S14396900023,2013-09-09,2647,13038,Canadian Reserve Whisky,6,80.94,10.5,Hy-Vee #7 / Edgewood Cedar Rapids,whiskey
5,S23690800026,2015-01-28,3825,43333,Captain Morgan Spiced Rum Pet,48,153.6,9.601562,Shop N Save #2 / E 14th,rum
6,S07774100019,2012-09-17,3830,11788,Black Velvet,24,376.08,42.0,Wal-Mart 1435 / Creston,whiskey
7,S23288100058,2015-01-05,3644,11776,Black Velvet,12,94.2,9.0,Wal-Mart 2764 / Altoona,whiskey
8,S19601200008,2014-06-17,4448,64866,Fireball Cinnamon Whiskey,12,161.64,9.0,Kum & Go #572 / URBANDALE,whiskey
9,S17942800002,2014-03-18,2951,86916,Southern Comfort 100 Prf,12,209.64,9.0,Dahl's / Hickman,whiskey


In [14]:
# This is our table with correct store names, saving as df3
df3 = df2.copy()

In [15]:
# Writing/saving as a pickled file 
f = open('df3.pkl', 'wb')
pickle.dump(df3, f)
f.close()