# The Home Depot Decor Case
Getting Started  |  Data Prep  |  Data Exploration  |  Preprocessing  |  Model Tuning  |  **Final Model**

In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing

%matplotlib inline

In [None]:
import sys
# Put the sibling '../modules' directory first on the import search path so the
# imports below can be resolved from it.
# NOTE(review): presumably helpers.py and hierarchy.py live in ../modules - confirm.
sys.path.insert(0, '../modules')

# read in project-local helper functions and the hierarchy module
from helpers import read_in_dataset, get_num_of_levels, flatten_categories
import hierarchy

# Define Recommender Class

In [None]:
class Recommender:
    '''
    Skeleton of the recommendation engine.

    Only the constructor is implemented; the similarity computation is
    still a stub.
    '''

    def __init__(self, data):
        '''
        Keyword Arguments:
        ------------------
        * data - the dataset the recommender will work on (stored as-is)
        '''
        self.data = data

    def create_similarity_matrix(self):
        '''
        Build the similarity matrix used to generate recommendations.

        Raises:
        -------
        NotImplementedError - always; the method has not been written yet.
        '''
        # An explicit failure keeps the class importable while making it
        # obvious that this step is still missing.
        raise NotImplementedError('create_similarity_matrix is not implemented yet')
        
    
    

# Define Hierarchy Class

In [None]:
class Hierarchy:
    '''
    Encode category nodes as integer ids and build a parent -> children
    dictionary from the category level columns of a flattened dataframe.
    '''
    # Class-level default so `prod_map` can be read before
    # get_parent_child_dict() has been called.
    prod_map = None

    def __init__(self, data, num_levels):
        '''
        Keyword Arguments:
        ------------------
        * data - flattened pandas dataframe; the last `num_levels` columns
                 are treated as the category levels (parent first)
        * num_levels - number of product category levels
        '''

        self.data = data
        self.num_levels = num_levels
        self.level_cols = list(data.columns[-self.num_levels:])
        self.was_fit = False
        self.prod_map = None


    def fit_transform(self):
        '''
        Fit (Encode Nodes) and Transform (Categories to Ids in Dataframe)

        The encoded frame is stored in `self.data_encoded`; `self.data` is
        left untouched so the method can safely be called more than once.

        Returns:
        --------
        self
        '''
        # get nodes: every distinct value found in the level columns
        # NOTE(review): missing values in the level columns are passed to the
        # encoder as-is; whether that works depends on the scikit-learn version.
        self.nodes = list(pd.unique(self.data.iloc[:, -self.num_levels:].values.ravel()))

        # encode nodes
        le = preprocessing.LabelEncoder()
        le.fit(self.nodes)

        # save mapping (both directions)
        ids = le.transform(le.classes_)
        self.node2id = dict(zip(le.classes_, ids))
        self.id2node = dict(zip(ids, le.classes_))

        # Work on a copy: encoding in place would overwrite the caller's
        # dataframe and make a second fit_transform() encode ids, not names.
        self.data_encoded = self.data.copy()

        for col in self.level_cols:
            # Assign the raw array (not a new Series) so values are placed
            # positionally instead of being re-aligned on a fresh RangeIndex.
            self.data_encoded[col] = le.transform(self.data_encoded[col].to_list())

        self.was_fit = True

        return self


    def get_mappings(self, map_direction='encode'):
        '''
        Get mapping for nodes and ids (and ids to nodes)

        Keyword Arguments:
        ------------------
        * map_direction - 'encode' for node -> id, 'decode' for id -> node

        Returns:
        --------
        dictionary

        Raises:
        -------
        ValueError - if map_direction is neither 'encode' nor 'decode'
        '''

        if map_direction == 'encode':
            return self.node2id
        if map_direction == 'decode':
            return self.id2node

        raise ValueError('Invalid map_direction. Please specify "encode" or "decode".')


    def get_parent_child_dict(self):
        '''
        Construct encoded parent - child dictionary
        (e.g. {2: {5, 9, 22}...})

        Each pair of adjacent level columns contributes parent -> set of
        children. The result is also stored in `self.prod_map`.

        Returns:
        --------
        dictionary
        '''

        merged = {}

        # Slide a window of two adjacent levels over the level columns.
        for parent_col, child_col in zip(self.level_cols, self.level_cols[1:]):
            level_map = (
                self.data_encoded
                .groupby(parent_col)[child_col]
                .apply(set)
                .to_dict()
            )

            # Union instead of dict.update(): an id that is a parent at more
            # than one level keeps the children from every level.
            for parent, children in level_map.items():
                merged.setdefault(parent, set()).update(children)

        self.prod_map = merged

        return merged


# Define Data Class

**Preprocess Data:**
* Load Data


In [None]:
# class to preprocess data

class Data:
    '''Load a dataset into a pandas dataframe and flatten its category column.'''

    def __init__(self):
        '''init Data class'''
        # Holds the most recently loaded dataframe (None until load_data runs).
        self.data = None

    def load_data(self, dataset, data_folder='raw', data_type='csv', verbose=False):
        '''Read in dataset (csv or excel format) to pandas dataframe

            The dataframe is also stored in `self.data`.

            Keyword Arguments:
            ------------------
            * dataset - string with dataset filename
            * data_folder - string with either raw or processed
            * data_type - string with either csv or excel
            * verbose - True will print information about the dataset

            Returns:
            --------
            a pandas dataframe

            Raises:
            -------
            ValueError - if data_type is neither "csv" nor "excel"
        '''
        path = '../data/{}/{}'.format(data_folder, dataset)

        if data_type == 'csv':
            df = pd.read_csv(path)
        elif data_type == 'excel':
            df = pd.read_excel(path)
        else:
            raise ValueError('Invalid file format. Please specify "excel" or "csv".')

        if verbose:
            print('\n{0:-^80}'.format(' Reading in the following dataset: {0}'.format(dataset)))
            print("\n Shape: {0} rows and {1} columns".format(*df.shape))
            print('\n{0:-^80}\n'.format(' It has the following columns '))
            print(df.columns)
            print('\n{0:-^80}\n'.format(' The first 5 rows look like this '))
            print(df.head())

        # Keep the frame on the instance so `self.data` reflects what was loaded.
        self.data = df

        return df


    def expand_columns(self, category_series, df=None, drop_col=None, sep='>'):
        '''
        Take in Series with categories in string format and flatten into columns

        Keyword Arguments:
        ------------------
        * category_series - series with string of categories
        * df - pandas dataframe to merge the level columns into (optional)
        * drop_col - name of column with nested categories (string); dropped
                     from the merged dataframe when given
        * sep - punctuation that separates categories

        Returns:
        --------
        a pandas dataframe with one column per level (L1, L2, ...); merged
        with `df` on the index when `df` is given
        '''
        # expand=True splits once and keeps the index of category_series, so
        # the later index merge lines up even when the index is not 0..n-1.
        category_levels = category_series.str.split(sep, expand=True)
        category_levels.columns = [
            'L' + str(i) for i in range(1, category_levels.shape[1] + 1)
        ]

        # Rows with fewer levels are padded with None; normalise to NaN.
        category_levels = category_levels.fillna(value=np.nan)

        if df is None:
            return category_levels

        merged_df = pd.merge(df, category_levels, left_index=True, right_index=True)

        # Only drop when a column was actually named; drop(None) raises.
        if drop_col is not None:
            merged_df = merged_df.drop(drop_col, axis=1)

        return merged_df

    def preprocess_text(self):
        '''
        Preprocess text columns.

        Raises:
        -------
        NotImplementedError - always; the method has not been written yet.
        '''
        raise NotImplementedError('preprocess_text is not implemented yet')
       
        
    

# Data Flow

In [None]:
# Toggles for the stages of the data flow; each later cell is guarded by
# one of these flags.
load_and_process_data = True
get_columns = True
run_rec_engine = True

## Load Data into DataFrame and Process

In [None]:
if load_and_process_data:
    # drop orders with few items, one-hot encode l3 category information, drop unnecessary columns,
    # and consolidate unique orders into single records
    # NOTE(review): the Data class defined in this notebook has no `format`
    # keyword on load_data, no drop_small_orders / drop_columns /
    # consolidate_orders methods, and its expand_columns calls `.str` on its
    # first argument (a list has none). These calls look carried over from a
    # different Data API - reconcile before running this cell.
    data = Data()
    # Raw string: '\A' is not a valid escape sequence; the path text is unchanged.
    data.load_data(r'test_data\All Transations - 2 Weeks.txt', format='tsv')
    data.drop_small_orders(order_col='order_number', min_order_size=20)
    data.expand_columns(['l3'])
    data.drop_columns(['l1', 'l2', 'l3', 'sku', 'brand'])
    data.consolidate_orders(order_col='order_number')

## Get Column Names

In [None]:
if get_columns:
    # Column names used by the later stages. They were left blank in the
    # draft; None keeps the cell valid Python until the real names are set.
    search_col = None    # TODO: name of the search column
    prod_col = None      # TODO: name of the product column
    category_col = None  # TODO: name of the category column

## Run Rec Engine and Generate Results

In [None]:
# Build the recommender from the processed data, then run each stage in order:
# similarity matrix, user scoring, recommendation generation, save, print.
# NOTE(review): `user_col` and `item_cols` are not assigned anywhere in the
# visible notebook, and the Recommender class defined above only accepts
# `data` and only declares create_similarity_matrix - confirm the intended API.
if run_rec_engine:
    rec_engine = Recommender(data.data, user_col=user_col, item_cols=item_cols, cf_method='item', similarity='jaccard')
    rec_engine.create_similarity_matrix()
    rec_engine.score_users()
    rec_engine.generate_recs()
    rec_engine.save_recs()
    rec_engine.print_recs()

# Structure

## Create Model Object

## Fit Final Model

## Score Test Data

## Model Evaluation

## Plot Results