In [1]:
# Basic Imports
import sys
import os
import logging
import shutil

In [2]:
# DS utility imports
import pandas as pd
import numpy as np

# DS analysis imports
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler,RobustScaler,Normalizer,MinMaxScaler
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.neural_network import BernoulliRBM

## imprts for clustering
import time
import warnings

from sklearn import cluster, mixture
from sklearn.neighbors import kneighbors_graph
from itertools import cycle, islice

In [3]:
# plotting import
# import matplotlib.plyplot as plt
# %matplotlib
# %matplotlib inline

# import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = [10, 8]

# from scipy.stats import norm
# import seaborn as sns


## Classes for Processing Categorical, Continuous and Text Data

In [4]:
# A class of storing all the statics of the continuous variables to make it comparable
class ContinuousStats:
    
    def __init__(self,data,con_name = '',normalize='robust',digits = 4):
       
        self.con_name = con_name
        
        data = self.get_cont_data(data)
        if len(data.shape) == 1:
            data = np.array(data).reshape(-1, 1)
        
        self.unique = [len(np.unique(data))]
        self.max = [np.round(np.max(data),decimals=digits)]
        self.min = [np.round(np.min(data),decimals=digits)]
        self.mean = [np.round(np.mean(data),decimals=digits)]
        self.std = [np.round(np.std(data),decimals=digits)]
        self.median = [np.round(np.median(data),decimals=digits)]
        self.digits = digits
        self.normalize = normalize
        self.normalizer = None
        
        self.__normalizeit(self.normalize,data,digits)
        
    def __normalizeit(self,normalize,data,digits):

        if self.unique[0] != 1:
            if normalize == 'minmax':
                self.normalizer = MinMaxScaler()
                self.normalizer.fit(data)
            elif normalize == 'robust':
                self.normalizer = RobustScaler(quantile_range=(25, 75))
                self.normalizer.fit(data)
            elif normalize == 'uni_quantile':
                self.normalizer = QuantileTransformer(output_distribution='uniform')
                self.normalizer.fit(data)
            elif normalize == 'norm_l1':
                self.normalizer = Normalizer(norm = 'l1')
                self.normalizer.fit(data)
            elif normalize == 'norm_l2':
                self.normalizer = Normalizer()
                self.normalizer.fit(data)
            elif normalize == 'gauss_quantile':
                self.normalizer = QuantileTransformer(output_distribution='normal')
                self.normalizer.fit(data)
            else:
                print('Normalizer not defined! Using basic StandardScalar normalizer!')
                self.normalizer = StandardScaler()
                self.normalizer.fit(data)
        else:
            self.normalize = 'identity'
            self.normalizer = None
        
        if self.normalizer is not None:
            trns = self.normalizer.transform(data)
        else:
            print('Identity transformation!')
            trns = data
            
        self.unique.append(len(np.unique(trns)))
        self.max.append(np.round(np.max(trns),decimals=digits))
        self.min.append(np.round(np.min(trns),decimals=digits))
        self.mean.append(np.round(np.mean(trns),decimals=digits))
        self.std.append(np.round(np.std(trns),decimals=digits))
        self.median.append(np.round(np.median(trns),decimals=digits))
        

    def __eq__(self, other):
        """Override the default Equals behavior"""
        
        return  self.unique == other.unique and self.max == other.max and \
                self.min == other.min and self.mean == other.mean and \
                self.std == other.std and self.median == other.median
 
    def __ne__(self, other):
        """Override the default Unequal behavior"""
        
        return self.unique != other.unique and self.max != other.max and \
                self.min != other.min and self.mean != other.mean and \
                self.std != other.std and self.median != other.median
    
    def print_stats(self):
        s ='Continuous Statistics \[without,with\] \'{}\' Transformation \
        \n Unique: {} Max: {} Min: {} Mean: {} SD: {} Median: {}\n'.format(
                self.normalize,self.unique,self.max,self.min,self.mean,self.std,self.median)
        print(s)
        
    def get_cont_data(self,data):
        vector = data[self.con_name]
        return np.array(vector).reshape(-1, 1)
    
    def transform_data(self,data):
        vector = self.get_cont_data(data)
        try:
            if self.normalizer is None:
                return vector
            return self.normalizer.transform(vector)
        except:
            print('There is something fishy while transforming! Check it!!!')
    
    def transformer(self):
        return self.transform_data
    
    def get_features_num(self):
        return 1
    
    def get_parameters(self):
        return {}

In [5]:
# A class of storing all the statics of the Categorical variables to make it comparable
class CategoryStats:
    
    def __init__(self,data,cat_name = '',numCat = 10, prob_thresh = 0.02,data_thresh=0.005,print_sub_df = False):
        
        self.cat_name = cat_name
        self.num_cat = numCat
        self.prob_thresh = prob_thresh
        self.data_thresh = data_thresh
        self.print_sub_df = print_sub_df
        
        self.cat_report, self.categories = self.categorical_binner(data)
        
        # Encoding label to integer and then doing one hot encoding to get the feature space
        self.labelencoder = LabelEncoder()
        self.onehotencoder = OneHotEncoder(dtype=np.int8)
        # fitting label encoder first and then fitting one hot encoder on it
        self.onehotencoder.fit(self.get_cat_data(self.labelencoder.fit_transform(self.get_cat_data(data))))
    
    def get_freq_table(self, data, return_cats = False):
        
        # Setting local values for the functions
        numCat = self.num_cat
        prob_thresh = self.prob_thresh
        data_thresh = self.data_thresh
        
        shp = data.shape
        thresh_val = data_thresh*shp[0]

        if len(shp)==1:
            df_cat = data.value_counts()
            probs = np.round(df_cat.values/shp[0]*100,2)
            thres = df_cat.values >= thresh_val
            df = pd.DataFrame({'index' : df_cat.index,
                               'freq'  : df_cat.values,
                               'prob'  : probs,
                               'prob_t': probs > prob_thresh*100,
                               'thres' : thres})
            shp = df.shape
            prob_cat = sum(df.prob_t)
            thres_cat = sum(df.thres)
            if shp[0] <= numCat:
                df['suggested_cat'] = True
            elif (prob_cat/shp[0]) > (thres_cat/shp[0]) and prob_cat >= numCat:
                df['suggested_cat'] = df.prob_t
            elif (prob_cat/shp[0]) <= (thres_cat/shp[0])and thres_cat >= numCat:
                df['suggested_cat'] = df.thres
            else:
                df['suggested_cat'] = True

            print(' Freq table Shape: {} Prob Thresh :{} Threshold Filter:{}'.format(
                df.shape[0],sum(df.prob_t),sum(df.thres)))

            df = df[['index','freq','prob','prob_t','thres','suggested_cat']]
            
            if return_cats:
                return df,df[df.suggested_cat]['index'].values
            else:
                return df
        
        else:
            print('Only single pandas array is acceptable. Retruning None!')
            return None

    def categorical_binner(self,data,print_sub_df=False):
        if self.cat_name not in data.columns:
            print('{} is not in data hence cannot make categorical features. Retruning None!')
            return None
        else:
            print('For Categorical Column: {}'.format(self.cat_name))
            if len(data[self.cat_name].unique()) >= np.ceil(0.8*data.shape[0]):
                print('Too many categories to process! Probably you should not do categorical feature processing on it!')
            df_cat = data[self.cat_name]
            df , cats = self.get_freq_table(df_cat, return_cats=True)
            if print_sub_df:
                print(df)
            return df, cats
    
    def print_stats(self):
        print('Frequency information for Category: {}'.format(self.cat_name))
        print(self.cat_report)
    
    def __eq__(self, other):
        """Override the default Equals behavior"""
        pass
    
    def __ne__(self,other):
        """Override the default Not Equals behavior"""
        pass
    
    def get_cat_data(self,data):
        
        if len(data.shape) == 1:
            return np.array(data).reshape(-1, 1)
        
        elif self.cat_name not in data.columns:
            print('{} is not in data hence cannot make categorical features. Retruning None!')
            return None
        
        else:
            vector = data[self.cat_name]
           # making data compactible to selected categories
            return vector.where(vector.isin(self.categories),'others')
    
    def transform_data(self,data):
    #         # making data compactible to selected categories
    #         transformed_cat = self.get_cat_data(data)
        
        # Transforming data with on hot encoder
        return self.onehotencoder.transform(self.get_cat_data(
                        self.labelencoder.transform(self.get_cat_data(data)))).toarray()

    def transformer(self):
        return self.transform_data
    
    def get_features_num(self):
        return self.onehotencoder.n_values_[0]
    
    def get_parameters(self):
        return {}

In [63]:
# A class of storing all the statics of the Text variables to make it comparable
class TextStats:
    
    def __init__(self, data, text_name, transformer='cvt', method='lda', n_components=25, max_features=750):
        
        self.text_name = text_name
        self.text_transformer = transformer
        self.method = method
        self.n_components = n_components
        self.max_features = max_features
        self.vectorize = None
        self.model = None
        self.get_word_vec(data[text_name])
            
    def get_word_vec(self,text_data):
    
        # Getting data from Data frame
        text = text_data

        # Setting transformer
        if self.text_transformer == 'tfidf':
            self.vectorize = TfidfVectorizer(analyzer="word",
                                       ngram_range=(1,2),
                                       min_df=2,
                                       max_df = 0.9,
                                       max_features=self.max_features,
                                       stop_words='english'
                                            ).fit(text)
            features = self.vectorize.transform(text)

        elif self.text_transformer == 'cvt':
            self.vectorize = CountVectorizer(analyzer="word"
                                       , ngram_range=(1,2)
                                       , preprocessor=None
                                       , stop_words='english'
                                       , min_df=2
                                       , max_df = 0.9
                                       , max_features=self.max_features
                                      ).fit(text) 
            features = self.vectorize.transform(text)
        else:
            pass

        # Making models
        if self.method == 'lda':
            self.model = LDA(n_components=self.n_components, 
                          max_iter=100,
                          learning_method='batch',
                          batch_size=256,
                          learning_offset=50.,
                          evaluate_every=10,
                          random_state=143,
                          n_jobs=-1,verbose=True).fit(features)

        elif self.method == 'nmf_fn':
            self.model = NMF(n_components=self.n_components, 
                      random_state=1,
                      alpha=.1, 
                      l1_ratio=.5).fit(features)

        elif self.method == 'nmf_kl':
            self.model = NMF(n_components=self.n_components, 
                      random_state=1,
                      beta_loss='kullback-leibler', 
                      solver='mu', max_iter=1000, alpha=.1,
                      l1_ratio=.5).fit(features)

        elif self.method == 'kmeans':
            pass

        else:
            print('Default is set to LDA model!')
            self.model = LDA(n_components=self.n_components, 
                          max_iter=20,
                          learning_method='batch',
                          batch_size=256,
                          learning_offset=50.,
                          evaluate_every=10,
                          random_state=143,
                          n_jobs=4).fit(features)
        
        # TODO: Add features flag to store data with class
        self.features = features
        
    def print_stats(self, plotit=False, print_topics=25):
        # printing topics
        if print_topics is not None and print_topics >0:
            feature_names = self.vectorize.get_feature_names()
            for topic_idx, topic in enumerate(self.model.components_):
                message = "Topic #%d: " % topic_idx
                message += " ".join([feature_names[i]
                                     for i in topic.argsort()[:-print_topics - 1:-1]])
                print(message)

        # plotting it
        if plotit:
            arr = self.features.toarray()
            plt.imshow(arr,aspect=0.25,cmap='binary')
            plt.show()
            plt.hist(x = np.sum(arr,axis=0),
                 bins = np.histogram(np.sum(arr,axis=0),bins=30)[1])
            plt.show()
            plt.hist(x = np.sum(arr,axis=1),
                 bins = np.histogram(np.sum(arr,axis=1),bins=10)[1])
            plt.show()
            plt.imshow(self.model.transform(self.features).T,
               aspect=50,
               cmap='BrBG')
            plt.show()
            
    def __eq__(self, other):
        """Override the default Equals behavior"""
        pass
    
    def __ne__(self,other):
        """Override the default Not Equals behavior"""
        pass
        
    def transform_data(self, data):
        text = data[self.text_name]
        return self.model.transform(self.vectorize.transform(text))
    
    def transformer(self):
        return self.transform_data
    
    def get_features_num(self):
        return self.n_components
    
    def get_parameters(self):
        return {}

## Data loading from ResourceDB

In [7]:
from bt_ai.stable.data_input.dataframe import MultiDataFrameLoader, DataFrameTarget
from bt_ai.stable.data_input.resources import ResourcesDb

from notebook_utils.logging import setup_logging
from notebook_utils.luigi import run_luigi_tasks

In [8]:
logging_overrides = {
    'luigi-interface': logging.INFO,
    'bt_candidates': logging.INFO
}
setup_logging(level=logging.DEBUG, overrides=logging_overrides)
LOG = logging.getLogger('contrec')

# site_id = 'zt2bt-pvh-calvin-klein'
site_id = 'thompson-cigar-prod'


local_data_path = 'resource_data'

In [9]:
if os.path.exists(local_data_path) and os.path.isdir(local_data_path):
    shutil.rmtree(local_data_path)
elif not os.path.exists(local_data_path):
    os.makedirs(local_data_path)
    

In [10]:
from bt_site_configuration.types import FieldType

db = ResourcesDb(chunk_size=5000)

schema = db.client.get_schema(site_id=site_id)
print(schema.field_types)

input_fields = list(schema.field_types.keys())
date_fields = [f for f,t in schema.field_types.items() if t == FieldType.DATETIME]

print(input_fields)
print(date_fields)

resource_ids = db.get_known_resources(site_id)
LOG.info('{} resource ids loaded'.format(len(resource_ids)))


{'cat2': TEXT, 'resource-type': TAGSET, 'thumbnail_ALT1': TEXT, 'cat_full': TEXT, 'thumbnail_ALT4': TEXT, 'thumbnail': TEXT, 'availability': TEXT, 'thumbnail_ALT2': TEXT, 'shipping_info': TEXT, 'title': TEXT, 'resource-id': TEXT, 'price_full': USD, 'isBlacklisted': FLAG, 'price_sale': USD, 'isUnavailable': FLAG, 'modDate': DATETIME, 'price_currency': TEXT, 'returns_info': TEXT, 'keywords': TAGSET, 'cat1': TEXT, 'REPLACE': TEXT, 'pubDate': DATETIME, 'url': TEXT, 'thumbnail_ALT3': TEXT, 'body': TEXT, 'description': TEXT}
['cat2', 'resource-type', 'thumbnail_ALT1', 'cat_full', 'thumbnail_ALT4', 'thumbnail', 'availability', 'thumbnail_ALT2', 'shipping_info', 'title', 'resource-id', 'price_full', 'isBlacklisted', 'price_sale', 'isUnavailable', 'modDate', 'price_currency', 'returns_info', 'keywords', 'cat1', 'REPLACE', 'pubDate', 'url', 'thumbnail_ALT3', 'body', 'description']
['modDate', 'pubDate']


2018-04-24 18:51:13 [32mINFO    [0m contrec: [34m1811 resource ids loaded[0m


In [11]:
props_iter = db.get_props(site_id=site_id,
                        resources=resource_ids,
                        fields=input_fields
                        )

rows = []
columns = ['resource', ] + input_fields
for resource_id, fields in props_iter:
#     print(type(fields))
    row = [resource_id, ]
    row.extend(fields)
    rows.append(row)


In [12]:
resources_df = pd.DataFrame(rows, columns=columns)

LOG.info('{} resources with {} fields loaded'.format(resources_df.shape[0], resources_df.shape[1]))

2018-04-24 18:51:23 [32mINFO    [0m contrec: [34m1811 resources with 27 fields loaded[0m


In [None]:
resources_df.to_csv('~/Documents/thompson_cigar.csv')

In [13]:
resources_df.cat_full.unique()

array(["Women's Fragrance", 'Jeans', 'Sweatshirts + Sweaters', 'Wallets',
       'bras', 'Handbags', 'panties', 'Tops', 'Suiting + Blazers',
       'Activewear', 'Tees + Tanks', 'Outerwear',
       '3 for $33 Panty Essentials', 'Hats', 'Swim', 'Bags', 'Pants',
       'Shirts + Blouses', 'Wallets + Small Goods', 'Dresses',
       'Dresses + Skirts', 'Watches + Jewelry', 'Belts', 'Dinnerware',
       'Shoes', 'Bottoms', 'Multipacks', 'Dress Shirts',
       'Suits + Dress Shirts Sizes 8-20', 'Skirts', 'Sunglasses', 'Briefs',
       'Duvet Covers + Sheets', 'Polos', 'Suiting + Jackets', 'Loungewear',
       'Flatware', 'Casual Shirts', 'T-Shirts', 'Underwear + Lounge',
       'Shorts', 'sleepwear + loungewear', 'Ties', 'Drinkware',
       'Baby 0-24 Months', 'Socks', 'Bath Towels', 'Boxer Briefs',
       '40% Off Sale', 'Hats, Gloves + Scarves', 'Trunks',
       '80% Off Final Sale', "Men's Fragrance", 'bralettes + triangles'], dtype=object)

In [14]:
print(resources_df.head())

           resource      cat2 resource-type  \
0  product|44017100    womens     {product}   
1  product|25017436     jeans     {product}   
2  product|23103237  sweaters     {product}   
3  product|47114014   wallets     {product}   
4  product|18536579     jeans     {product}   

                                      thumbnail_ALT1                cat_full  \
0  https://calvinklein.scene7.com/is/image/Calvin...       Women's Fragrance   
1                                               NULL                   Jeans   
2  https://calvinklein.scene7.com/is/image/Calvin...  Sweatshirts + Sweaters   
3                                               NULL                 Wallets   
4  https://calvinklein.scene7.com/is/image/Calvin...                   Jeans   

                                      thumbnail_ALT4  \
0                                               NULL   
1                                               NULL   
2  https://calvinklein.scene7.com/is/image/Calvin...   
3           

In [15]:
# # loading data and doing some preprocessing
# df_orig = pd.read_csv('ck_resources.csv')

#creating a copy on which we will do changes
df = resources_df.copy(deep=True)

# formatting columns
print(resources_df.columns)
df.columns = [i.replace(' ','_').replace('-','_').lower() for i in df.columns]
print(df.columns)

# Processing numeric varables
print(resources_df.price_full.unique()[:5])
# df.price_full = df.price_full.apply(lambda x: float(x.replace('$','')))
df.price_full = df.price_full.apply(lambda x: float(x))
print(df.price_full.unique()[:5])

print(resources_df.price_sale.unique()[:5])
df.price_sale = df.price_sale.apply(lambda x: float(x))
print(df.price_sale.unique()[:5])

Index(['resource', 'cat2', 'resource-type', 'thumbnail_ALT1', 'cat_full',
       'thumbnail_ALT4', 'thumbnail', 'availability', 'thumbnail_ALT2',
       'shipping_info', 'title', 'resource-id', 'price_full', 'isBlacklisted',
       'price_sale', 'isUnavailable', 'modDate', 'price_currency',
       'returns_info', 'keywords', 'cat1', 'REPLACE', 'pubDate', 'url',
       'thumbnail_ALT3', 'body', 'description'],
      dtype='object')
Index(['resource', 'cat2', 'resource_type', 'thumbnail_alt1', 'cat_full',
       'thumbnail_alt4', 'thumbnail', 'availability', 'thumbnail_alt2',
       'shipping_info', 'title', 'resource_id', 'price_full', 'isblacklisted',
       'price_sale', 'isunavailable', 'moddate', 'price_currency',
       'returns_info', 'keywords', 'cat1', 'replace', 'pubdate', 'url',
       'thumbnail_alt3', 'body', 'description'],
      dtype='object')
[$44.00 $108.00 $98.00 $68.00 $118.00]
[  44.  108.   98.   68.  118.]
[$44.00 $108.00 $98.00 $68.00 $118.00]
[  44.  108.   98.  

In [16]:
df.body[5]

'a seductive comfort add-a-size plunge bra designed with scalloped lace trim, extra padded push-up cups and a smooth stitch-free construction.'

In [17]:
df.head()

Unnamed: 0,resource,cat2,resource_type,thumbnail_alt1,cat_full,thumbnail_alt4,thumbnail,availability,thumbnail_alt2,shipping_info,...,price_currency,returns_info,keywords,cat1,replace,pubdate,url,thumbnail_alt3,body,description
0,product|44017100,womens,{product},https://calvinklein.scene7.com/is/image/Calvin...,Women's Fragrance,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{NULL},fragrance,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/womens-clothing/f...,,Romantic. Timeless. Feminine. ETERNITY was ins...,Romantic. Timeless. Feminine. ETERNITY was ins...
1,product|25017436,jeans,{product},,Jeans,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{jeans},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,a pair of skinny leg jeans made with cotton st...,a pair of skinny leg jeans made with cotton st...
2,product|23103237,sweaters,{product},https://calvinklein.scene7.com/is/image/Calvin...,Sweatshirts + Sweaters,https://calvinklein.scene7.com/is/image/Calvin...,https://calvinklein.scene7.com/is/image/Calvin...,instock,https://calvinklein.scene7.com/is/image/Calvin...,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{whitelabel},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,https://calvinklein.scene7.com/is/image/Calvin...,a soft sweater essential to any men's collecti...,a soft sweater essential to any men's collecti...
3,product|47114014,wallets,{product},,Wallets,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{jeans},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,designed with an allover grid logo print. this...,designed with an allover grid logo print. this...
4,product|18536579,jeans,{product},https://calvinklein.scene7.com/is/image/Calvin...,Jeans,https://calvinklein.scene7.com/is/image/Calvin...,https://calvinklein.scene7.com/is/image/Calvin...,instock,https://calvinklein.scene7.com/is/image/Calvin...,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{jeans},womens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/womens-clothing/w...,https://calvinklein.scene7.com/is/image/Calvin...,"featuring soft cotton denim, these jeans are c...","featuring soft cotton denim, these jeans are c..."


In [18]:
df.describe()

Unnamed: 0,price_full,price_sale
count,1811.0,1811.0
mean,111.194644,111.194644
std,94.129301,94.129301
min,10.0,10.0
25%,58.0,58.0
50%,88.0,88.0
75%,134.0,134.0
max,795.0,795.0


## Class for doing all data processing on the resources dump

In [21]:
class DataAdapterAndTransformer:
    
    def __init__(self,data):
        
        self.processors = {}
        self.processor_column = []
        self.process_data(data)
        #todo set parameters for each processors
    
    def process_data(self,data):
        
        for col_ in data.columns:
            try:
                unq = len(pd.unique(df[col_]))
            except TypeError:
                print('\tGot Type Error for {}. Converting to string!'.format(col_))
                df[col_] = df[col_].apply(lambda x: str(x))
                unq = len(pd.unique(df[col_]))
                
            # unq = len(data[col_].unique())
            nas = sum(pd.isnull(data[col_]))
            dtype = data[col_].dtype
            shp = data.shape
            
            #for debugging and logging
            print('Processing: {} having Uniques: {} NAs: {} Dtype: {}'.format(
                    col_,unq,nas,dtype))
            # thresholding for na values in data so that can be handled seperately
            if nas < np.ceil(0.3*shp[0]):
                
                # Right now all object are divided based on number of possible categories
                # If unique values are less than 50% of data then data is category else text
                # todo: add how this thing will bheave for datatime features
                
                if dtype == 'O':
                    if unq < np.ceil(0.5*shp[0]):
                        self.processor_column.append(col_)
                        processor = CategoryStats(data,col_)
                        self.processors[col_] ={'Uniques' : unq, 'NAs': nas, 'Dtype:': dtype,
                                               'processor': processor, 
                                               'features': processor.get_features_num(),
                                               'col_type' : 'Category'}
                    else:
                        self.processor_column.append(col_)
                        processor = TextStats(data,col_)
                        self.processors[col_] ={'Uniques' : unq, 'NAs': nas, 'Dtype:': dtype,
                                               'processor': processor,
                                               'features': processor.get_features_num(),
                                               'col_type' : 'Text'}
                elif dtype in [np.number]:
                    self.processor_column.append(col_)
                    processor = ContinuousStats(data,col_)
                    self.processors[col_] ={'Uniques' : unq, 'NAs': nas, 'Dtype:': dtype,
                                           'processor': processor, 
                                           'features': processor.get_features_num(),
                                           'col_type' : 'Numeric'}
                else: # be fail proof
                    self.processors[col_] = {'Uniques' : unq, 'NAs': nas, 'Dtype:': dtype,
                                           'processor': None, 
                                           'col_type' : 'Category'}
                    
            else:
                self.processors[col_] = {'Uniques' : unq, 'NAs': nas, 'Dtype:': dtype,
                                           'processor': None, 
                                           'col_type' : 'Category'}   
                
    def get_processors(self):
        processor = []
        for col in self.processor_column:
            processor.append(self.processors[col]['processor'])
        return processor
    
    def print_stats(self,plotit=False):
        for col in self.processor_column:
            print()
            # todo : make plot for each stats
            if self.processors[col]['col_type'] != 'Text':
                self.processors[col]['processor'].print_stats()
            else:
                self.processors[col]['processor'].print_stats(plotit=plotit)
    
    def get_transformed(self,data,col_type='All'):
        if col_type == 'All':
            processor = []
            for col in self.processor_column:
                processor.append(self.processors[col]['processor'].transform_data(data))
            return processor
        else:
            arr = None
            for col in self.processor_column:
                if self.processors[col]['col_type'] == col_type:
                    if arr is None:
                        arr = self.processors[col]['processor'].transform_data(data)
                    else:
                        arr = np.concatenate((arr,self.processors[col]['processor'].transform_data(data)),axis = 1)
                    print('Array Shape',arr.shape)

            return arr

    def get_features_num(self,col_type='All'):
        feature = [] 
        for col in self.processor_column:
            if col_type == 'All':
                feature.append(self.processors[col]['features'])
            elif self.processors[col]['col_type'] == col_type:
                feature.append(self.processors[col]['features'])
        return feature
    
    def get_feature_type(self,unique=False):
        feature = []
        for col in self.processor_column:
            feature.append(self.processors[col]['col_type'])
        if unique:
            return np.unique(feature)
        return feature
    


In [22]:
data_adapt = DataAdapterAndTransformer(df[['cat1','cat2','cat_full','keywords','title','description','price_full']])

Processing: cat1 having Uniques: 10 NAs: 1 Dtype: object
For Categorical Column: cat1
 Freq table Shape: 9 Prob Thresh :6 Threshold Filter:7
Processing: cat2 having Uniques: 61 NAs: 1 Dtype: object
For Categorical Column: cat2
 Freq table Shape: 60 Prob Thresh :11 Threshold Filter:46
Processing: cat_full having Uniques: 54 NAs: 0 Dtype: object
For Categorical Column: cat_full
 Freq table Shape: 54 Prob Thresh :11 Threshold Filter:45
Processing: keywords having Uniques: 8 NAs: 0 Dtype: object
For Categorical Column: keywords
 Freq table Shape: 8 Prob Thresh :5 Threshold Filter:8
Processing: title having Uniques: 1705 NAs: 0 Dtype: object
iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 384.1793
iteration: 11 of max_iter: 100
i

In [23]:
data_adapt.print_stats()


Frequency information for Category: cat1
       index  freq   prob  prob_t  thres  suggested_cat
0     womens  1105  61.02    True   True           True
1       mens   437  24.13    True   True           True
2       home    74   4.09    True   True           True
3       boys    59   3.26    True   True           True
4      girls    56   3.09    True   True           True
5      women    51   2.82    True   True           True
6  fragrance    19   1.05   False   True           True
7     towels     5   0.28   False  False           True
8  underwear     4   0.22   False  False           True

Frequency information for Category: cat2
                        index  freq  prob  prob_t  thres  suggested_cat
0                       jeans   156  8.61    True   True           True
1                    handbags   147  8.12    True   True           True
2              blouses-shirts   146  8.06    True   True           True
3                     dresses   138  7.62    True   True           T

In [24]:
data_adapt.get_transformed(df)

[array([[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int8), array([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int8), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int8), array([[1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        ..., 
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int8), array([[ 0.01333333,  0.01333333,  0.01333333, ...,  0.01333333,
          0.01333333,  0.01333333]

In [25]:
data_adapt.get_feature_type(unique=True)

array(['Category', 'Numeric', 'Text'],
      dtype='<U8')

In [26]:
for i in zip(data_adapt.get_features_num(),data_adapt.get_feature_type()):
    print(i)

(10, 'Category')
(47, 'Category')
(46, 'Category')
(8, 'Category')
(25, 'Text')
(25, 'Text')
(1, 'Numeric')


#### To implement Data adator for whole data and make features on all data in one go
#### Also to include following in above code:
* Shape of each feature
* Keras layer generator for each feature
* Feature to \[list set\] convertor for merging and training

## Now we will define the neural networks and train them on these features

We will use keras for neural networks

### Using hidden features generated by RBM for Clustering (Not working)
### Switching to VAE for dimentionality reductions and doing clustring ( Will implement parts from this paper : https://arxiv.org/pdf/1801.07648.pdf )

In [27]:
# from keras.layers.merge import Add, Concatenate
from keras.layers import Dense, Input, Lambda
from keras.models import Sequential, Model
from keras.optimizers import SGD,Adam
from keras.layers.merge import Concatenate
from keras import backend as K
from keras import metrics

Using TensorFlow backend.


We have tried lot of ways to train VAE on merge data set bt essentially the model is not getting optimised properly.

As per the [Microsoft paper](https://arxiv.org/ftp/arxiv/papers/1611/1611.00384.pdf) the tags/categories are used with single FC layer.

Essentially the problem with there approach is, as they are feeding a binary vector of tage to a FC layer, the possiblity of the single layer understanding the global behaviour is very less. Also this does not get any relative behavior of data from overall features. Also they havent mention how these FC are getting trained.

To tackle this we are using RBM to extract relative features of joint behaviour on global data and use these features as input to VAE model to generate the overall reduced features from the data. 

##### Note: We can expand the scope of RBMS to all the categories possible. Hence our task of Binning the Categorical feature will be Reduced.

We will generate cluster on these reduced features and try to see how good or bad they are with resptect to the differnt categories. We are using Gaussian Mixture based Clustering to generate clusters.


In [31]:
def cat_feature_from_rbm(data,components = 'auto',plot_it=False):
    if components == 'auto':
        comp = int(data.shape[1]/9)
    else:
        comp = components
    
    rbm = BernoulliRBM(n_components = comp, 
                       batch_size = 100,
                       n_iter=20,
                       learning_rate=0.01,
                       random_state=123, verbose=True)
    tr=rbm.fit_transform(data)
    if plot_it:
        print('\nPlotting features!\n')
        sns.set(style="ticks")
        df_plot = pd.DataFrame(rbm.transform(data))
        # sns.pairplot(df_plot,diag_kind="kde",).fig.suptitle('Plots of Reduced dimentions!')
        g = sns.PairGrid(df_plot)
        g = g.map_upper(plt.scatter)
        g = g.map_lower(sns.kdeplot, cmap="Blues_d")
        g = g.map_diag(sns.kdeplot, lw=3, legend=False)
        g
    return tr, comp, rbm

# cat_feature_from_rbm(data_adapt.get_transformed(df,'Category'),5)

Another thought on this is using VAE to reduces the features and then use those clustered features. Exploring this dimentions


In [32]:
def cat_feature_from_vae(data,components = 'auto',plot_it=False):
    
    original_dim = data.shape[1]
    intermediate_dim = int(np.ceil(original_dim/2.5))
    
    if components == 'auto':
        latent_dim = int(np.ceil(intermediate_dim/9))
    else:
        latent_dim = components
    
    batch_size = 200
    epochs = 250
    epsilon_std = 1.0
    
    x = Input(shape = (original_dim,))
    h = Dense(intermediate_dim, activation='elu')(x)
    z_mean = Dense(latent_dim)(h)
    z_log_var = Dense(latent_dim)(h)

    def sampling(args):
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0.,
                                  stddev=epsilon_std)
        return z_mean + K.exp(z_log_var / 2) * epsilon

    # note that "output_shape" isn't necessary with the TensorFlow backend
    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

    # we instantiate these layers separately so as to reuse them later
    decoder_h = Dense(intermediate_dim, activation='elu')
    decoder_mean = Dense(original_dim, activation='sigmoid')
    h_decoded = decoder_h(z)
    x_decoded_mean = decoder_mean(h_decoded)
    
    # instantiate VAE model
    vae = Model(inputs=x, outputs=x_decoded_mean)

    # Compute VAE loss
    xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    vae_loss = K.mean(xent_loss + kl_loss)

    vae.add_loss(vae_loss)
    opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)
    vae.compile(optimizer=opt)
    vae.summary()
    
    # fitting our generated features on the data
    vae.fit(data,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size)
    
    # Encoder model for feature generation
    encoder = Model(inputs=x, outputs = z_mean)

    # display a 2D plot of the digit classes in the latent space
    tr = encoder.predict(data, batch_size=batch_size)
    if plot_it:
        print('\nPlotting features!\n')
        # Plotting with SNS
        sns.set(style="ticks")
        df_plot = pd.DataFrame(tr)
        g = sns.PairGrid(df_plot)
        g = g.map_upper(plt.scatter)
        g = g.map_lower(sns.kdeplot, cmap="Blues_d")
        g = g.map_diag(sns.kdeplot, lw=3, legend=False)
        g
        
    return tr, latent_dim, encoder

# cat_feature_from_vae(data_adapt.get_transformed(df,'Category'))

#### Testing Merge layer behaviour

In [33]:
# Defining all features input and training set
# X_train = data_adapt.get_transformed(df)
# print([i.shape[1] for i in X_train])

# function for generating mutiple input feature based on type of feature
def generate_input_training(adaptor,data,cat_ext = 'rbm'):
    train = []
    inputs = []
    for t in adaptor.get_feature_type(unique=True):
        print('\nProcessing for {}'.format(t))
        if t == 'Category':
            tr = adaptor.get_transformed(data,col_type=t)
            if cat_ext == 'rbm':
                tr, comp, mod = cat_feature_from_rbm(tr)
            elif cat_ext == 'vae':
                tr, comp, mod = cat_feature_from_vae(tr)
            train.append(tr)
            inputs.append(Input(shape = (comp,)))
        else:
            tr = adaptor.get_transformed(data,col_type=t)
            train.append(tr)
            inputs.append(Input(shape = (tr.shape[1],),))
    return train,inputs,mod

# X_input = [generate_input(i[0],i[1]) for i in zip(data_adapt.get_features_num(),data_adapt.get_feature_type())]

# todo take rbm to adapter or find another way for easier scoring
X_train,X_input,rbm = generate_input_training(data_adapt,df,cat_ext='vae')


# Conctenating all inputs
# merged = Concatenate()([titler, pricer, keyworder])
merged = Concatenate()(X_input)


Processing for Category
Array Shape (1811, 10)
Array Shape (1811, 57)
Array Shape (1811, 103)
Array Shape (1811, 111)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 111)          0                                            
__________________________________________________________________________________________________
dense_11 (Dense)                (None, 45)           5040        input_6[0][0]                    
__________________________________________________________________________________________________
dense_12 (Dense)                (None, 5)            230         dense_11[0][0]                   
__________________________________________________________________________________________________
dense_13 (Dense)                (None, 5)            230         dense_11[0][0]          



Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

#### Implementing VAE from example in kears

In [34]:
def vae_feature_generation(X_train,X_input,components='auto',funnel=(2.5,9),
                           batch=200,epoch = 250,epsilon=1.0,opt= None,
                           validation=False, plot_it=False):
    
    original_dim = sum([i.shape[1] for i in X_train])

    if components == 'auto':
        intermediate_dim = int(np.ceil(original_dim/2.5))
        latent_dim = int(np.ceil(intermediate_dim/9))
    elif components == 'ratio':
        intermediate_dim = int(np.ceil(original_dim/funnel[0]))
        latent_dim = int(np.ceil(intermediate_dim/funnel[1]))
    elif components == 'values':
        intermediate_dim = funnel[0]
        latent_dim = funnel[1]
    
    batch_size = batch
    epochs = epoch
    epsilon_std = epsilon
    
    x = Concatenate()(X_input)
    h = Dense(intermediate_dim, activation='elu')(x)
    z_mean = Dense(latent_dim)(h)
    z_log_var = Dense(latent_dim)(h)


    def sampling(args):
        z_mean, z_log_var = args
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0.,
                                  stddev=epsilon_std)
        return z_mean + K.exp(z_log_var / 2) * epsilon

    # note that "output_shape" isn't necessary with the TensorFlow backend
    z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

    # we instantiate these layers separately so as to reuse them later
    decoder_h = Dense(intermediate_dim, activation='elu')
    decoder_mean = Dense(original_dim, activation='sigmoid')
    h_decoded = decoder_h(z)
    x_decoded_mean = decoder_mean(h_decoded)
    
    # instantiate VAE model
    vae = Model(inputs=X_input, outputs=x_decoded_mean)

    # Compute VAE loss
    xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    vae_loss = K.mean(xent_loss + kl_loss)

    vae.add_loss(vae_loss)
    if opt is None:
        opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)
    vae.compile(optimizer=opt)
    vae.summary()
    
    # fitting our generated features on the data
    vae.fit(X_train,
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size
    #         validation_data=(x_test, None)
           )
    
    # Encoder model for feature generation
    encoder = Model(inputs=X_input, outputs = z_mean)

    # display a 2D plot of the digit classes in the latent space
    tr = encoder.predict(X_train, batch_size=batch_size)
    if plot_it:
        print('\nPlotting features!\n')
        # Plotting with SNS
        sns.set(style="ticks")
        df_plot = pd.DataFrame(tr)
        g = sns.PairGrid(df_plot)
        g = g.map_upper(plt.scatter)
        g = g.map_lower(sns.kdeplot, cmap="Blues_d")
        g = g.map_diag(sns.kdeplot, lw=3, legend=False)
        g
        
    return tr, latent_dim, encoder


In [35]:
x_encoded,_,_ = vae_feature_generation(X_train,X_input,components='values',funnel=(20,5))



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 56)           0           input_7[0][0]                    
                                                                 input_8[0][0]                    
          

In [36]:
x_encoded

array([[ 0.6381529 , -0.34997982,  1.6095823 ,  0.25520888,  0.6398437 ],
       [-1.63176036, -0.07498963,  0.75491416, -0.21561816, -1.00081027],
       [-1.77263021,  0.34519193, -0.55770314, -0.46109262,  0.91103792],
       ..., 
       [-1.17863512, -0.46446615,  2.49844003,  0.09482893, -0.15387008],
       [ 1.18074477,  0.2008355 , -0.64397943,  0.34267092, -1.55484998],
       [-1.90580106, -0.2067298 ,  0.70348847, -0.15968969, -1.03720844]], dtype=float32)

## Now we'll use the encoded data for clustering
### We'll use different clustring methods and plot them to see which looks better

In [37]:
default_base = {'quantile': .1,
                'eps': .3,
                'damping': .7,
                'preference': -200,
                'n_neighbors': 10,
                'n_clusters': 10}

params = default_base.copy()

# Data for clustering
X = x_encoded

# normalize dataset for easier parameter selection
# X = StandardScaler().fit_transform(X)

# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
# print(bandwidth)

# connectivity matrix for structured Ward
connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False)

# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# print(connectivity)

# ============
# Create cluster objects
# ============

ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])

ward = cluster.AgglomerativeClustering(
    n_clusters=params['n_clusters'], linkage='ward',
    connectivity=connectivity)

spectral = cluster.SpectralClustering(
    n_clusters=params['n_clusters'], eigen_solver='arpack',
    affinity="nearest_neighbors")

dbscan = cluster.DBSCAN(eps=params['eps'], min_samples=5)

affinity_propagation = cluster.AffinityPropagation(
    damping=params['damping'], preference=params['preference'])

average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock",
    n_clusters=params['n_clusters'], connectivity=connectivity)

birch = cluster.Birch(n_clusters=params['n_clusters'])

gmm = mixture.GaussianMixture( n_components=params['n_clusters'], covariance_type='full')

clustering_algorithms = (
    ('MiniBatchKMeans', two_means),
#     ('AffinityPropagation', affinity_propagation),
    ('MeanShift', ms),
#     ('SpectralClustering', spectral),
    ('Ward', ward),
#     ('AgglomerativeClustering', average_linkage),
#     ('DBSCAN', dbscan),
    ('Birch', birch),
    ('GaussianMixture', gmm)
)

clusters = pd.DataFrame(index = df.index,columns=[i[0] for i in clustering_algorithms])

t0 = time.time()
for name, algorithm in clustering_algorithms:
#     plt.figure(figsize = (12,14))
    

    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the " +
            "connectivity matrix is [0-9]{1,2}" +
            " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning)
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding" +
            " may not work as expected.",
            category=UserWarning)
        algorithm.fit(X)

    
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(np.int)
    else:
        y_pred = algorithm.predict(X)

#     df_plot = pd.DataFrame(X)
#     df_plot['cluster'] = y_pred
    clusters[name] = y_pred
#     sns.pairplot(df_plot, hue='cluster',vars=df_plot.columns.drop('cluster'),
#                  diag_kind="kde").fig.suptitle(name)
    
#     g = sns.PairGrid(df_plot,hue='cluster',vars=df_plot.columns.drop('cluster'))
#     g = g.map_upper(plt.scatter)
#     g = g.map_lower(sns.kdeplot)
#     g = g.map_diag(sns.kdeplot, lw=3)
#     g

print('Total time to fit all clusters {}'.format(time.time()-t0))

Total time to fit all clusters 0.6573200225830078


In [38]:
clusters.head()

Unnamed: 0,MiniBatchKMeans,MeanShift,Ward,Birch,GaussianMixture
0,4,2,1,3,8
1,6,3,5,2,3
2,3,1,4,1,1
3,0,1,0,1,8
4,6,3,5,2,3


In [39]:
# Mapping Y pred and seeing what all is common in them
df.columns

Index(['resource', 'cat2', 'resource_type', 'thumbnail_alt1', 'cat_full',
       'thumbnail_alt4', 'thumbnail', 'availability', 'thumbnail_alt2',
       'shipping_info', 'title', 'resource_id', 'price_full', 'isblacklisted',
       'price_sale', 'isunavailable', 'moddate', 'price_currency',
       'returns_info', 'keywords', 'cat1', 'replace', 'pubdate', 'url',
       'thumbnail_alt3', 'body', 'description'],
      dtype='object')

In [40]:
def crosstab_sparsity(data,var1,var2,print_stats=True):
    if not isinstance(var1,str) and not isinstance(var2,str):
        cross = pd.crosstab(var1,var2,dropna=False)
    elif isinstance(var2,str):
        cross = pd.crosstab(data[var1],df[var2],dropna=False)
    else:
        cross = pd.crosstab(data[var1],var2,dropna=False)
    if print_stats:
        print('Crosstab:\n')
        print(cross)
        hist = np.histogram(cross.values)
        print('\nMedian: {} , Mean: {}, StDev: {}, Max: {}, Min: {}, \n'\
              'Sum of True Values on Hypothesis: {} \nSparsity: {}\n'.format(np.median(cross.values),
                np.mean(cross.values),np.std(cross.values),np.max(cross.values),
                np.min(cross.values),np.sum(cross.values>np.median(cross.values)),
                np.sum(cross.values>np.median(cross.values))/(cross.values.shape[0]*cross.values.shape[1])))
    return np.sum(cross.values>np.median(cross.values))/(cross.values.shape[0]*cross.values.shape[1])
    

In [41]:
sparsity = []
for col_ in clusters.columns:
    print('Corsstab with \'{}\' for cluster \'{}\''.format('keywords',col_))
    sparsity.append(crosstab_sparsity(df,'keywords',clusters[col_]))

Corsstab with 'keywords' for cluster 'MiniBatchKMeans'
Crosstab:

MiniBatchKMeans        0    1    2    3    4    5    6    7   8   9
keywords                                                           
{'NULL'}               5    3    8    5   22    7    5   15   4  11
{'calvinklein-home'}   0    8    0    0   10    0    0    0   0   0
{'collection-home'}    0    9    0    0   29    0    0    2   0   0
{'home'}               0    3    0    0    7    0    0    0   0   0
{'jeans'}             68  117   76   44   19    8  211    1   3  21
{'platinum'}           1    0    0    0    0    0    0    0   0  11
{'underwear'}          0    0    0    0  189    0    0    0   0   0
{'whitelabel'}        48    0  156  128   22  215    0  160  91  69

Median: 0.0 , Mean: 22.6375, StDev: 49.602732724619116, Max: 215, Min: 0, 
Sum of True Values on Hypothesis: 38 
Sparsity: 0.475

Corsstab with 'keywords' for cluster 'MeanShift'
Crosstab:

MeanShift               0    1    2    3
keywords              

In [42]:
sparsity = []
for col_ in clusters.columns:
    print('Corsstab with \'{}\' for cluster \'{}\''.format('cat1',col_))
    sparsity.append(crosstab_sparsity(df,'cat1',clusters[col_]))

Corsstab with 'cat1' for cluster 'MiniBatchKMeans'
Crosstab:

MiniBatchKMeans   0    1    2    3   4    5    6    7   8   9
cat1                                                         
boys             46    0    2    1   8    0    0    0   2   0
fragrance         0    0    0    0  16    0    0    0   0   3
girls            14    0   12    0  30    0    0    0   0   0
home              0   21    0    0  51    0    0    2   0   0
mens             61    0    0  151  69    0   49    0  78  29
towels            0    0    0    0   4    0    0    1   0   0
underwear         0    0    0    0   4    0    0    0   0   0
women             0    0    0    0  51    0    0    0   0   0
womens            0  119  226   25  65  230  167  175  18  80

Median: 0.0 , Mean: 20.11111111111111, StDev: 47.35291717974826, Max: 230, Min: 0, 
Sum of True Values on Hypothesis: 32 
Sparsity: 0.35555555555555557

Corsstab with 'cat1' for cluster 'MeanShift'
Crosstab:

MeanShift    0    1    2    3
cat1            

In [43]:
sparsity = []
for col_ in clusters.columns:
    print('Corsstab with \'{}\' for cluster \'{}\''.format('cat2',col_))
    sparsity.append(crosstab_sparsity(df,'cat2',clusters[col_]))

Corsstab with 'cat2' for cluster 'MiniBatchKMeans'
Crosstab:

MiniBatchKMeans             0   1    2   3   4    5    6    7   8   9
cat2                                                                 
baby-0-to-24-months        12   0    0   0   0    0    0    0   0   0
bags                        0   0    0  18   0    0    0    0   1   0
bathrobes                   0   0    0   0   4    0    0    1   0   0
belts                       0   0    0   0   0    0    0    0  10   0
blouses-shirts              0   0    0   0   0  134   12    0   0   0
boxer-briefs                0   0    0   0   5    0    0    0   0   0
bralettes                   0   0    0   0   1    0    0    0   0   0
bras                        0   0    0   0  40    0    0    0   0   0
briefs                      0   0    0   0  12    0    0    0   0   0
clothing-bottoms            4   0   12   0   0    0    0    0   0   0
clothing-dresses-skirts     0   0    0   0  15    0    0    0   0   0
clothing-outerwear          

In [44]:
sparsity = []
for col_ in clusters.columns:
    print('Corsstab with \'{}\' for cluster \'{}\''.format('cat_full',col_))
    sparsity.append(crosstab_sparsity(df,'cat_full',clusters[col_]))

Corsstab with 'cat_full' for cluster 'MiniBatchKMeans'
Crosstab:

MiniBatchKMeans                   0   1    2   3   4    5    6    7   8   9
cat_full                                                                   
3 for $33 Panty Essentials        0   0    0   0  25    0    0    0   0   0
40% Off Sale                      0   2    0   0  12    0    0    1   0   0
80% Off Final Sale                0  11    0   0   5    0    0    0   0   0
Activewear                        0   0   13   0   0    0    0    0   0   0
Baby 0-24 Months                 13   0    0   0   0    0    0    0   0   0
Bags                              0   0    0  18   0    0    0    0   1   0
Bath Towels                       0   0    0   0   4    0    0    1   0   0
Belts                             0   0    0   0   0    0    0    0  10   0
Bottoms                           4   0   12   0   0    0    0    0   0   0
Boxer Briefs                      0   0    0   0   5    0    0    0   0   0
Briefs                

In [45]:
sparsity = []
for col_ in clusters.columns:
    print('Corsstab with \'{}\' for cluster \'{}\''.format('cat_full',col_))
    sparsity.append(crosstab_sparsity(df,'cat_full',clusters[col_]))
    

Corsstab with 'cat_full' for cluster 'MiniBatchKMeans'
Crosstab:

MiniBatchKMeans                   0   1    2   3   4    5    6    7   8   9
cat_full                                                                   
3 for $33 Panty Essentials        0   0    0   0  25    0    0    0   0   0
40% Off Sale                      0   2    0   0  12    0    0    1   0   0
80% Off Final Sale                0  11    0   0   5    0    0    0   0   0
Activewear                        0   0   13   0   0    0    0    0   0   0
Baby 0-24 Months                 13   0    0   0   0    0    0    0   0   0
Bags                              0   0    0  18   0    0    0    0   1   0
Bath Towels                       0   0    0   0   4    0    0    1   0   0
Belts                             0   0    0   0   0    0    0    0  10   0
Bottoms                           4   0   12   0   0    0    0    0   0   0
Boxer Briefs                      0   0    0   0   5    0    0    0   0   0
Briefs                

In [46]:
sns.heatmap(pd.crosstab(clusters.Ward,clusters.GaussianMixture))

NameError: name 'sns' is not defined

## Merging the resourcesid with encoded x and cluster to find the best scoring resources similar to given resourcesid

In [47]:
df_scored = pd.DataFrame({'resource_id':df.resource_id})
df_scored.shape

(1811, 1)

In [48]:
x_encoded.shape

(1811, 5)

In [49]:
range(x_encoded.shape[1])

range(0, 5)

In [50]:
df_scored = pd.DataFrame({'resource':df.resource,
                          'resource_id':df.resource_id})
df_scored['resource_id'] = df_scored['resource_id'].apply(lambda x: str(x))
for i in range(x_encoded.shape[1]):
    df_scored['Feature_{}'.format(i+1)] = x_encoded[:,i]
    
for i in clusters.columns:
    df_scored[i] = clusters[i]
df_scored.head()

Unnamed: 0,resource,resource_id,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,MiniBatchKMeans,MeanShift,Ward,Birch,GaussianMixture
0,product|44017100,44017100,0.638153,-0.34998,1.609582,0.255209,0.639844,4,2,1,3,8
1,product|25017436,25017436,-1.63176,-0.07499,0.754914,-0.215618,-1.00081,6,3,5,2,3
2,product|23103237,23103237,-1.77263,0.345192,-0.557703,-0.461093,0.911038,3,1,4,1,1
3,product|47114014,47114014,-1.001859,0.99673,0.976865,-1.531329,-0.041816,0,1,0,1,8
4,product|18536579,18536579,-0.660602,-0.237507,0.056829,0.026566,-2.058557,6,3,5,2,3


In [66]:
def score(resource,data,cluster_type='Ward'):
#     data = df_scored
    cluster = data[data.resource.isin([resource])][cluster_type].unique()
    print('Cluster in whihc the item fall {}'.format(cluster))
    
    cluster = data[data[cluster_type].isin(cluster)]
    feature_space = [i for i in df_scored.columns.tolist() if i.startswith('Feature')]
    score = []
    
    base_vec = data[data.resource.isin([resource])][feature_space].values
    for resource in cluster.resource.unique():
        vec = data[data.resource == resource][feature_space].values
        score.append((resource,np.sqrt(np.sum((vec-base_vec)**2, axis=1))[0]))
    score = sorted(score, key=lambda x: x[1])
    return score

In [76]:
_ = score('product|23103237',df_scored)

Cluster in whihc the item fall [4]


In [70]:
def recommend(resource,scored_data,actual_data,num_recommend=5):
    scores = score(resource,scored_data)[1:(num_recommend+1)]
    scored_id = [x[0] for x in scores]
    actual = df[df.resource == resource]
    recommend = df[df.resource.isin(scored_id)]
    return actual,recommend

In [75]:
act, rec = recommend('product|44017100',df_scored,df)
print(act)
rec

Cluster in whihc the item fall [1]
           resource    cat2 resource_type  \
0  product|44017100  womens     {product}   

                                      thumbnail_alt1           cat_full  \
0  https://calvinklein.scene7.com/is/image/Calvin...  Women's Fragrance   

  thumbnail_alt4                                          thumbnail  \
0           NULL  https://calvinklein.scene7.com/is/image/Calvin...   

  availability thumbnail_alt2  \
0      instock           NULL   

                                       shipping_info  \
0  Most items leave warehouse within 1-2 full bus...   

                         ...                         price_currency  \
0                        ...                                    USD   

                                        returns_info  keywords       cat1  \
0  Returns must be made within 30 days of the shi...  {'NULL'}  fragrance   

   replace                   pubdate  \
0     None 2018-01-03 11:01:58+00:00   

                     

Unnamed: 0,resource,cat2,resource_type,thumbnail_alt1,cat_full,thumbnail_alt4,thumbnail,availability,thumbnail_alt2,shipping_info,...,price_currency,returns_info,keywords,cat1,replace,pubdate,url,thumbnail_alt3,body,description
55,product|44010045,womens,{product},https://calvinklein.scene7.com/is/image/Calvin...,Women's Fragrance,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'NULL'},fragrance,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/womens-clothing/f...,,expression. individuality. warmth. close your ...,expression. individuality. warmth. close your ...
751,product|44013005,womens,{product},https://calvinklein.scene7.com/is/image/Calvin...,Women's Fragrance,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'NULL'},fragrance,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/womens-clothing/f...,,mysterious. provocative. exciting. the fantasy...,mysterious. provocative. exciting. the fantasy...
902,product|44010016,womens,{product},https://calvinklein.scene7.com/is/image/Calvin...,Women's Fragrance,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'NULL'},fragrance,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/womens-clothing/f...,,luxurious. romantic. timeless. inspired by a s...,luxurious. romantic. timeless. inspired by a s...
1517,product|44010044,womens,{product},https://calvinklein.scene7.com/is/image/Calvin...,Women's Fragrance,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'NULL'},fragrance,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/womens-clothing/f...,,purity. unity. sensuality. the revolutionary f...,purity. unity. sensuality. the revolutionary f...
1711,product|44019000,womens,{product},https://calvinklein.scene7.com/is/image/Calvin...,Women's Fragrance,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'NULL'},fragrance,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/womens-clothing/f...,,deep euphoria is a new expression of euphoric ...,deep euphoria is a new expression of euphoric ...


In [74]:
act,rec = recommend('product|47114014',df_scored,df)
print(act)
rec

Cluster in whihc the item fall [0]
           resource     cat2 resource_type thumbnail_alt1 cat_full  \
3  product|47114014  wallets     {product}           NULL  Wallets   

  thumbnail_alt4                                          thumbnail  \
3           NULL  https://calvinklein.scene7.com/is/image/Calvin...   

  availability thumbnail_alt2  \
3      instock           NULL   

                                       shipping_info  \
3  Most items leave warehouse within 1-2 full bus...   

                         ...                         price_currency  \
3                        ...                                    USD   

                                        returns_info   keywords  cat1  \
3  Returns must be made within 30 days of the shi...  {'jeans'}  mens   

   replace                   pubdate  \
3     None 2018-01-03 11:01:58+00:00   

                                                 url thumbnail_alt3  \
3  http://www.calvinklein.us/en/mens-clothing/men...       

Unnamed: 0,resource,cat2,resource_type,thumbnail_alt1,cat_full,thumbnail_alt4,thumbnail,availability,thumbnail_alt2,shipping_info,...,price_currency,returns_info,keywords,cat1,replace,pubdate,url,thumbnail_alt3,body,description
374,product|47203491,wallets,{product},,Wallets,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,"crafted with smooth spazzolato leather, this w...","crafted with smooth spazzolato leather, this w..."
473,product|47119064,wallets,{product},,Wallets,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,cut in a square shape and designed with a ckj ...,cut in a square shape and designed with a ckj ...
951,product|47203475,wallets,{product},,Wallets,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,featuring a ck logo on the front and smooth le...,featuring a ck logo on the front and smooth le...
1065,product|47119066,wallets,{product},https://calvinklein.scene7.com/is/image/Calvin...,Wallets,,https://calvinklein.scene7.com/is/image/Calvin...,instock,https://calvinklein.scene7.com/is/image/Calvin...,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,"designed with a ckj logo stripe on the front, ...","designed with a ckj logo stripe on the front, ..."
1595,product|47114015,wallets,{product},,Wallets,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,cut in a square shape and designed with an all...,cut in a square shape and designed with an all...


In [77]:
act,rec = recommend('product|25017436',df_scored,df)
print(act)
rec

Cluster in whihc the item fall [5]
           resource   cat2 resource_type thumbnail_alt1 cat_full  \
1  product|25017436  jeans     {product}           NULL    Jeans   

  thumbnail_alt4                                          thumbnail  \
1           NULL  https://calvinklein.scene7.com/is/image/Calvin...   

  availability thumbnail_alt2  \
1      instock           NULL   

                                       shipping_info  \
1  Most items leave warehouse within 1-2 full bus...   

                         ...                         price_currency  \
1                        ...                                    USD   

                                        returns_info   keywords  cat1  \
1  Returns must be made within 30 days of the shi...  {'jeans'}  mens   

   replace                   pubdate  \
1     None 2018-01-03 11:01:58+00:00   

                                                 url thumbnail_alt3  \
1  http://www.calvinklein.us/en/mens-clothing/men...           

Unnamed: 0,resource,cat2,resource_type,thumbnail_alt1,cat_full,thumbnail_alt4,thumbnail,availability,thumbnail_alt2,shipping_info,...,price_currency,returns_info,keywords,cat1,replace,pubdate,url,thumbnail_alt3,body,description
333,product|25017465,jeans,{product},,Jeans,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,a pair of skinny leg jeans made with cotton st...,a pair of skinny leg jeans made with cotton st...
1154,product|25017186,jeans,{product},,Jeans,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,"crafted with italian rigid denim, these straig...","crafted with italian rigid denim, these straig..."
1168,product|25016876,jeans,{product},,Jeans,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,a pair of straight tapered jeans made with a w...,a pair of straight tapered jeans made with a w...
1311,product|25563531,jeans,{product},,Jeans,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,a pair of sculpted slim jeans designed with po...,a pair of sculpted slim jeans designed with po...
1741,product|25017165,jeans,{product},,Jeans,,https://calvinklein.scene7.com/is/image/Calvin...,instock,,Most items leave warehouse within 1-2 full bus...,...,USD,Returns must be made within 30 days of the shi...,{'jeans'},mens,,2018-01-03 11:01:58+00:00,http://www.calvinklein.us/en/mens-clothing/men...,,a pair of straight leg jeans made with rigid c...,a pair of straight leg jeans made with rigid c...
