### Notebook - 1c (Filling in Category Missingness)

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning -- problems in later cells will not be reported.
warnings.filterwarnings('ignore')
# Display up to 95 columns and an unlimited number of rows when a frame is rendered
pd.set_option('display.max_columns',95)
pd.set_option('display.max_rows',None)
import pickle

In [2]:
# Opening the pickled file. The context manager closes the handle even when
# unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE: pickle.load can execute arbitrary code -- only load pickles produced by this project.
with open('df1.pkl', 'rb') as f:
    df1 = pickle.load(f)

### Category Cleaning
In order to analyze liquor sales, attention must be given to how each liquor order has been categorized. The original data set contains 133 unique category names, which is too long a list to make sense of, so we re-classify `category_name` into 13 simplified liquor categories (e.g. rum, whiskey, gin, brandy, vodka) to get a clearer, more top-line overview of the types of liquor being ordered and sold.

In [3]:
# Column dtypes and memory footprint of the loaded frame
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20095649 entries, 0 to 20095648
Data columns (total 11 columns):
 #   Column               Dtype         
---  ------               -----         
 0   invoice/item_number  category      
 1   date                 datetime64[ns]
 2   store_number         int64         
 3   category             float64       
 4   category_name        object        
 5   item_number          category      
 6   item_description     category      
 7   bottles_sold         object        
 8   sale_dollars         object        
 9   volume_sold_liters   float16       
 10  store_name           category      
dtypes: category(4), datetime64[ns](1), float16(1), float64(1), int64(1), object(3)
memory usage: 1.4+ GB


In [4]:
# Rows of df1 whose category_name is missing.
# An explicit copy is taken because the following cells assign columns on this
# frame; without it those assignments operate on a slice of df1 and trigger
# pandas' SettingWithCopyWarning.
category_null = df1[df1['category_name'].isna()].copy()

In [None]:
# Lower-case every item description so the lower-case keywords used by the
# matching further down can find them
lowered_descriptions = category_null['item_description'].str.lower()
category_null['item_description'] = lowered_descriptions

In [None]:
# Checking our datatypes (summary of the missing-category subset)
category_null.info()

In [None]:
# Preview the first rows of the missing-category subset
category_null.head() 

In [None]:
# Swap every NaN category_name for an empty string (bracket access instead of attribute access)
blank_filled = category_null['category_name'].fillna('')
category_null['category_name'] = blank_filled

In [None]:
# Checking that category_name is now an empty string rather than NaN
# (it is filled in with the derived category in the next cell)
category_null.head()

In [None]:
# Creating and cleaning the category_name empty cells with a category description using item_description as a proxy

def contains(string, match_list):
    """Return True when at least one entry of match_list occurs as a substring of string."""
    return any(candidate in string for candidate in match_list)

def item_cleaner(value):
    """
    Map an item description to one of the simplified liquor categories.

    To be applied to a dataframe column (e.g. with ``Series.apply``).
    The keywords are lower case, so the description should be lower-cased
    before it is passed in.

    Parameters
    ----------
    value : str
        Item description. A value that is not a string (``None``, ``NaN``,
        a number) cannot be matched and is bucketed as ``'other'`` instead
        of raising ``TypeError``.

    Returns
    -------
    str
        Label of the first rule that has a keyword occurring as a substring
        of ``value``; ``'other'`` when no rule matches.
    """
    # A missing description carries no information to classify on.
    if not isinstance(value, str):
        return 'other'

    # Ordered (label, keywords) rules. ORDER MATTERS: the first rule with a
    # matching keyword wins, exactly like the elif chain this table replaces.
    # NOTE(review): matching is by plain substring, so short keywords are very
    # broad -- e.g. 'ha' makes 'hawkeye vodka' resolve to 'whiskey' before the
    # vodka rule is reached. Confirm this is intended before tightening it.
    rules = (
        ('rum', ('rum', 'rums', 'bacardi', 'cruzan', 'abuelo', 'morgan')),
        ('whiskey', ('whiskies', 'whiskey', 'whisky', 'scotch', 'rye', 'bourbon',
                     'iowa distilleries', 'fireball', 'mark', 'ha', 'medley',
                     'forester', 'jack', 'woodford')),
        ('cordials liqueurs', ('anisette', 'triple sec', 'amaretto', 'creme', 'cremes',
                               'creams', 'cream', 'cordials', 'cordial', 'liqueurs',
                               'liqueur', '99', 'marnier')),
        ('brandy', ('brandy', 'brandies', 'cognac', 'coganc', 'hennessy')),
        ('tequila', ('tequila', 'tequilas', 'mezcal', 'hornitos', 'herradura')),
        ('schnapps', ('schnapps', 'schnapp')),
        ('vodka', ('vodka', 'vodkas', 'eddy', "burnett's", 'burnetts', 'wapsi',
                   'smirnoff', 'cane')),
        ('gin', ('gin', 'gins')),
        ('cocktails', ('cocktails', 'cocktail')),
        ('spirits', ('spirit', 'spirits')),
        ('grain alcohol', ('american alcohol', 'everclear')),
        ('accessories', ('decanters', 'decanter', 'packages', 'glasses', 'coaster',
                         'shot', '/flask', 'flask', 'w/glass')),
        ('egg nog', ('egg', 'egg nog', 'eggnog', 'nog')),
    )
    for label, keywords in rules:
        if contains(value, keywords):
            return label
    return 'other'
    
# Derive a simplified category for every row from its item description
derived_categories = category_null['item_description'].apply(item_cleaner)
category_null['category_name'] = derived_categories

In [None]:
# Checking that category_name now holds a derived category (no longer empty) in the first rows
category_null.head()

In [None]:
# Number of rows assigned to each derived category
category_null.category_name.value_counts() 

In [None]:
# Replace the missing category_name values in df1 with an empty string so they can be overwritten below
df1_names_filled = df1['category_name'].fillna('')
df1['category_name'] = df1_names_filled

In [None]:
# Dropping extra columns to simplify the dataframe. `columns=` already names
# the axis, and the result is re-bound rather than mutated in place.
columns_to_drop = ['date', 'store_number', 'category', 'item_number', 'item_description',
                   'bottles_sold', 'sale_dollars', 'volume_sold_liters', 'store_name']
category_null = category_null.drop(columns=columns_to_drop)

In [None]:
# Confirming the extra columns were dropped
category_null.columns

In [None]:
# Replacing the empty cells in df1 with the category names derived in category_null.
# category_null was sliced out of df1, so both frames share index labels and the
# rows are addressed by index. Selecting via isin() on the invoice number could
# also pick rows that are not in category_null (if an invoice number ever
# repeats); index alignment would then overwrite those rows with NaN.
df1.loc[category_null.index, 'category_name'] = category_null['category_name']

In [None]:
# Spot-check the df1 rows whose invoice number appears in category_null
filled_rows = df1['invoice/item_number'].isin(category_null['invoice/item_number'])
df1.loc[filled_rows, ['category_name']].head()

In [None]:
# Checking the category_null table so it can be compared with the df1 rows (looks good)
category_null.head()

In [None]:
# Test to see that the category_name is correct, using invoice # S06688800087 as a spot check
df1[df1['invoice/item_number'] == 'S06688800087']

In [None]:
# Normalise every category name in df1 to lower case
lowercase_names = df1['category_name'].str.lower()
df1['category_name'] = lowercase_names

In [None]:
# Preview the first rows of df1 after the category clean-up
df1.head()

In [None]:
# Snapshot the cleaned frame under the name 'df2' (independent deep copy of df1)
df2 = df1.copy(deep=True)

In [None]:
# Persist the cleaned frame. The context manager flushes and closes the file
# even when pickling raises, which the manual open()/close() pair did not guarantee.
with open('df2.pkl', 'wb') as f:
    pickle.dump(df2, f)

In [None]:
# TODO (next steps):
#   1. Re-classify below to get the corrected 'category_name' column
#   2. Create the egg nog column
#   3. Add the store-closed column

### Methodology for re-classifying liquor categories
As mentioned above, there are 133 distinct `category_name` values. To bucket them into broader categories we created two functions:

1) the `contains` function, which checks whether any string from a list of keywords occurs in a given `category_name`, and 2) the `category_cleaner` function, which replaces the `category_name` value with a broader category label when `contains` reports a match.

In [None]:
# Creating and cleaning the category names into broader categories

def contains(string, match_list):
    """Tell whether any of the strings in match_list is found inside string."""
    return any(keyword in string for keyword in match_list)

def category_cleaner(value):
    """
    Collapse a detailed alcohol category name into a simpler, broader one.

    To be applied to a dataframe column. Null values are passed through
    unchanged. Otherwise the rules are tried in order and the label of the
    first rule with a keyword occurring as a substring of ``value`` is
    returned; when none matches the result is 'special orders'.
    """
    if pd.isnull(value):
        return value

    # Ordered (label, keywords) pairs -- the first match wins
    buckets = (
        ('rum', ('rum', 'rums')),
        ('whiskey', ('whiskies', 'whiskey', 'whisky', 'scotch', 'rye', 'bourbon',
                     'iowa distilleries')),
        ('cordials liqueurs', ('anisette', 'triple sec', 'amaretto', 'creme', 'cremes',
                               'creams', 'cream', 'cordials', 'cordial', 'liqueurs',
                               'liqueur')),
        ('brandy', ('brandy', 'brandies')),
        ('tequila', ('tequila', 'tequilas', 'mezcal')),
        ('schnapps', ('schnapps', 'schnapp')),
        ('vodka', ('vodka', 'vodkas')),
        ('gin', ('gin', 'gins')),
        ('cocktails', ('cocktails', 'cocktail')),
        ('spirits', ('spirit', 'spirits')),
        ('grain alcohol', ('american alcohol',)),
        ('accessories', ('decanters', 'decanter', 'packages')),
    )
    for simplified, keywords in buckets:
        if contains(value, keywords):
            return simplified
    return 'special orders'
    
# Store the simplified category derived from category_name in a new column, cat_name2.
# NOTE(review): `liquor_cat` is not assigned in any cell visible above, so on a
# fresh kernel this line raises NameError -- confirm which frame is meant here.
liquor_cat['cat_name2'] = liquor_cat['category_name'].apply(category_cleaner)

In [None]:
# Number of rows per simplified category
liquor_cat['cat_name2'].value_counts()

In [None]:
# List the column names as a plain Python list
liquor_cat.columns.tolist()

In [None]:
# The detailed category_name is no longer needed once cat_name2 exists
liquor_cat = liquor_cat.drop(columns='category_name')

In [None]:
# Give the store-name and simplified-category columns their final names
column_renames = {'store_name_y': 'store_name2', 'cat_name2': 'category_name2'}
liquor_cat = liquor_cat.rename(columns=column_renames)

In [None]:
# Remove the 'category' column; re-bind the result instead of dropping in place
liquor_cat = liquor_cat.drop(columns='category')

In [None]:
# Confirm the remaining columns after the drops and rename
liquor_cat.columns

In [None]:
# Write the table with the corrected store names to a gzip-compressed CSV so it does not have to be rebuilt
output_path = './data/main_df2.csv.gz'
liquor_cat.to_csv(output_path, compression='gzip', index=False)