In [52]:
import numpy as np
import pandas as pd
import math
import statsmodels.api as sm
from scipy.stats import chisquare


import settings
import itertools
from sklearn.preprocessing import (LabelBinarizer, LabelEncoder, MinMaxScaler,
                                   OneHotEncoder, StandardScaler, RobustScaler)




def get_encoders(le_name,ohe_name,scaler_name):
    """Load three previously saved encoder objects from ``settings.models_path``.

    Each file is read from ``settings.models_path + <name> + '.npy'``.

    Arguments:
        le_name {str} -- File stem (without ``.npy``) of the label-encoder object
        ohe_name {str} -- File stem (without ``.npy``) of the one-hot-encoder object
        scaler_name {str} -- File stem (without ``.npy``) of the scaler object

    Returns:
        tuple -- ``(le_encoder, ohe_encoder, scaler)``, in that order

    Raises:
        OSError -- If one of the files cannot be opened (raised by ``np.load``)
    """

    def _load_object(name):
        # `.item()` unwraps the 0-d object array that np.load returns for a
        # saved Python object. Such arrays are pickled, and NumPy >= 1.16.3
        # refuses to unpickle unless allow_pickle=True is passed explicitly.
        # SECURITY: unpickling can execute arbitrary code -- only load files
        # from a trusted models directory.
        return np.load(settings.models_path + name + '.npy', allow_pickle=True).item()

    return _load_object(le_name), _load_object(ohe_name), _load_object(scaler_name)


def create_encoder(df, le_name = None, ohe_name = None, scaler_name=None, categorical_features=None, numeric_features=None):
    """Fit label, one-hot and min-max encoders on the columns of a dataframe.

    Arguments:
        df {Dataframe} -- The Pandas Dataframe to encode

    Keyword Arguments:
        le_name {str} -- Currently unused: persistence is disabled, see the
            commented-out block at the end of the function (default: {None})
        ohe_name {str} -- Currently unused, same reason (default: {None})
        scaler_name {str} -- Currently unused, same reason (default: {None})
        categorical_features {list} -- The columns to label-encode. When None,
            every column of df that is not a numeric feature is used, sorted by
            name. Columns that the caller also lists in numeric_features are
            dropped from this list (default: {None})
        numeric_features {list} -- The columns to min-max scale. When None or
            empty, the float64 columns of df are used (default: {None})

    Returns:
        tuple -- (labeled_df, le_dict, ohe_encoder, scalers,
            categorical_features, numeric_features) where
            labeled_df is the label-encoded categorical columns (sorted by column name),
            le_dict maps each categorical column name to its fitted LabelEncoder,
            ohe_encoder is a OneHotEncoder fitted on labeled_df,
            scalers maps each numeric column name to its fitted MinMaxScaler,
            and the last two items are the resolved feature lists.
    """

    # Resolve the numeric columns first so that the inferred categorical
    # columns can never contain a column that is about to be scaled.
    explicit_numeric = numeric_features is not None and len(numeric_features) > 0
    if not explicit_numeric:
        # select_dtypes returns an empty selection (instead of raising) when
        # the frame has no float64 column.
        numeric_features = list(df.select_dtypes(include=["float64"]).columns)

    if categorical_features is None:
        categorical_features = sorted(c for c in df.columns if c not in numeric_features)
    elif explicit_numeric:
        # A column requested as both categorical and numeric is treated as
        # numeric only; previously this combination failed with a KeyError.
        categorical_features = [c for c in categorical_features if c not in numeric_features]

    # One LabelEncoder per categorical column, visited in column-name order.
    cat_frame = df[categorical_features].sort_index(axis=1)
    le_dict = {name: LabelEncoder().fit(col) for name, col in cat_frame.items()}

    labeled_df = cat_frame.apply(lambda col: le_dict[col.name].transform(col))
    ohe_encoder = OneHotEncoder(categories="auto").fit(labeled_df)

    # One MinMaxScaler per numeric column; df[[f]] keeps the 2-D shape that
    # the scaler is fitted on.
    scalers = {f: MinMaxScaler().fit(df[[f]].values) for f in numeric_features}

    # Persistence is intentionally disabled for now:
    # if le_name is not None:
    #     np.save(settings.models_path + le_name + '.npy', le_dict)
    # if ohe_name is not None:
    #     np.save(settings.models_path + ohe_name + '.npy', ohe_encoder)
    # if scaler_name is not None:
    #     np.save(settings.models_path + scaler_name + '.npy', scalers)

    return labeled_df, le_dict, ohe_encoder, scalers, categorical_features, numeric_features
    


In [66]:
# Project-local loader for the feature table used in the cells below.
from data.preprocessing import load_file

# Load the "clf_features" table with "Product" as the index column.
# NOTE(review): the meaning of type_="P" is defined by load_file -- confirm there.
df = load_file(
    "clf_features",
    type_="P",
    index=["Product"],
)

# Columns that are label / one-hot encoded.
categorical_features = [
    "Color",
    "Size",
    "Age Group",
    "Ldate",
    "Person",
    "Pname",
    "Ptype",
    "Currency",
    "Sales Season",
]
# Columns that are min-max scaled.
numeric_features = [
    "Tprice",
    "s1",
    "s2",
    "s3",
    "s4",
    "s5",
]
df.head()

Unnamed: 0_level_0,Color,Size,Ldate,Age Group,Person,Pname,Ptype,Tprice,Currency,Sales Season,s1,s2,s3,s4,s5
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3E+101_2,Blue,Thick,45,4-6,Girls,One Internal Pants,Thick,39.0,$,Winter,101.0,261.0,309.0,297.0,323.0
3E+201_2,Red,Thick,45,4-6,Girls,One Internal Pants,Thick,39.0,$,Winter,81.0,266.0,297.0,270.0,257.0
3E+301_2,Blue,Thick,45,4-6,Girls,One Internal Pants,Thick,39.0,$,Winter,49.0,179.0,190.0,192.0,179.0
30E000400_2,Black,Thick,45,4-6,Girls,One Internal Pants,Thick,39.0,$,Winter,55.0,222.0,261.0,275.0,279.0
30E823101_2,Grey,No Size,39,4-6,Girls,One Internal Pants,Thick,39.0,$,Winter,3.0,15.0,18.0,30.0,30.0


In [68]:
%timeit labeled_df, le_dict, ohe_encoder, scalers, categorical_features, num_features = create_encoder(df, numeric_features=numeric_features)

23.3 ms ± 3.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [76]:
# Scratch check: a 1x1 array of zeros is a plain numpy.ndarray.
t = np.zeros(shape=(1, 1))

type(t)

numpy.ndarray