In [None]:
# This notebook loads and preprocesses the 2-week dataset provided by E-Corp, trains an
# item-based recommendation engine using collaborative filtering, and then generates the
# recommendations and saves them to Excel.

# Module metadata: author name, contact e-mail and project home page.
__author__ = "Josh Bacher"
__email__ = "bacherjp@gmail.com"
__website__ = "https://github.com/jpbacher"


import numpy as np
import pandas as pd


class Data:
    '''Load a tab-separated order file and optionally reshape it to one row per order.

    The resulting DataFrame is stored on ``self.data``.
    '''

    def __init__(self, file):
        '''Read ``file`` and run the preprocessing pipeline on it.

        :param file: path or file-like object accepted by ``pandas.read_csv``
        '''
        self.data = self._get_data(file)

    def _get_data(self, file, process=True):
        '''Read the raw data and, when ``process`` is true, apply every preprocessing step.

        Each step runs with its own default arguments (order column, basket size,
        columns to encode, columns to drop).

        :param file: path or file-like object accepted by ``pandas.read_csv``
        :param process: when false, the raw frame is returned untouched
        :return: the raw or processed DataFrame
        '''
        data = self._read_data(file)
        if process:
            data = self._discard_small_basket_size(data)
            data = self._ohe(data)
            data = self._remove_features(data)
            data = self._combine_orders(data)
        return data

    def _read_data(self, file):
        '''Read a tab-separated file into a DataFrame.'''
        return pd.read_csv(file, sep='\t')

    def _discard_small_basket_size(self, df, order_feat='order_number', small_basket_size=2):
        '''Drop the rows of orders that have fewer than ``small_basket_size`` rows.

        :param df: frame with one row per order line
        :param order_feat: name of the column identifying the order
        :param small_basket_size: minimum number of rows an order needs to be kept
        :return: a new DataFrame containing only the rows of large-enough orders
        '''
        # Map every row to the row count of its order so the mask lines up with df's index.
        # Rows with a missing order id map to NaN, compare as False and are dropped.
        basket_sizes = df[order_feat].map(df[order_feat].value_counts())
        return df.loc[basket_sizes >= small_basket_size]

    def _ohe(self, df, features=('l3',)):
        '''One-hot encode ``features`` and append the indicator columns to ``df``.

        :param df: input frame
        :param features: iterable of column names to encode
        :return: a new DataFrame: the original columns followed by the indicator columns
        '''
        # Integer indicators so that summing them per order yields plain counts.
        dummies = [pd.get_dummies(df[feat], dtype=int) for feat in features]
        # concat needs a flat list of frames, hence the unpacking.
        return pd.concat([df, *dummies], axis=1)

    def _remove_features(self, df, features=('l1', 'l2', 'l3', 'sku', 'brand')):
        '''Return a copy of ``df`` without the columns listed in ``features``.

        :param df: input frame (left unmodified)
        :param features: iterable of column names to drop
        :return: a new DataFrame without those columns
        :raises KeyError: if any of the columns is not present in ``df``
        '''
        # Not in place: drop(inplace=True) returns None and would mutate the caller's frame.
        return df.drop(columns=list(features))

    def _combine_orders(self, df, order_feat='order_number'):
        '''Collapse ``df`` to one row per order by summing every other column.

        :param df: input frame
        :param order_feat: name of the column identifying the order
        :return: DataFrame with ``order_feat`` first, followed by the summed columns
        '''
        data_feats = [col for col in df.columns if col != order_feat]
        return df.groupby(order_feat)[data_feats].sum().reset_index()
        