In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from onehot import OneHotDummy

## Load Demo Data
The data comes from the [Kaggle House Prices competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques).

In [2]:
# Load the Kaggle training set from the repository's data directory (path is relative to this notebook).
df = pd.read_csv("../data/train.csv")

In [3]:
# Use the single categorical column `GarageType` as the demo input for the encoder.
X = df['GarageType']

## Default Case

In [4]:
# Encoder with all default options; fitting learns the index -> label mapping
# that appears in the repr displayed by this cell.
obj = OneHotDummy()
obj.fit(X)

OneHotDummy(droprule=None,
      mapping={0: 'CarPort', 1: 'Attchd', 2: '2Types', 3: 'Detchd', 4: 'Basment', 5: 'BuiltIn'},
      nametyp=None, nastate=False, prefix='col', sep='_', sparse=True)

In [5]:
# With the default sparse=True the result is a sparse matrix with one column per mapped label.
Z = obj.transform(X)
Z

<1460x6 sparse matrix of type '<class 'numpy.int64'>'
	with 1379 stored elements in LInked List format>

In [6]:
# Column names for the dummy variables; with the defaults prefix='col' and sep='_'
# they come out as 'col_<index>'.
colnam = obj.get_feature_names()
colnam

['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5']

In [7]:
# Densify the sparse result so it can be wrapped in a labelled DataFrame for inspection.
tmp = pd.DataFrame(Z.toarray(), columns=colnam)
tmp.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
0,0,1,0,0,0,0
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,1,0,0,0,0


In [8]:
# Each column holds 0/1 indicators, so the column sums are the number of rows per label.
tmp.sum()

col_0      9
col_1    870
col_2      6
col_3    387
col_4     19
col_5     88
dtype: int64

## Drop the least frequent dummy variable 
Dropping one dummy column avoids the dummy variable trap (perfect multicollinearity).

In [9]:
# Encoder that leaves out the rarest label (here `2Types`, 6 rows).
obj = OneHotDummy(droprule='least')
obj.fit(X)

# Transform first, then read the column names, and count the rows per remaining label.
Z = obj.transform(X).toarray()
feature_names = obj.get_feature_names()
tmp = pd.DataFrame(data=Z, columns=feature_names)
tmp.sum(axis=0)

col_0      9
col_1    870
col_2    387
col_3     19
col_4     88
dtype: int64

## Drop the most frequent dummy variable
This rule exists because it was easy to implement. 
Maybe the most frequent label is the least important label (similar to the TF-IDF rationale).

In [10]:
# Encoder that leaves out the most frequent label (here `Attchd`, 870 rows).
obj = OneHotDummy(droprule='most')
obj.fit(X)

# Transform first, then read the column names, and count the rows per remaining label.
Z = obj.transform(X).toarray()
feature_names = obj.get_feature_names()
tmp = pd.DataFrame(data=Z, columns=feature_names)
tmp.sum(axis=0)

col_0      9
col_1      6
col_2    387
col_3     19
col_4     88
dtype: int64

## Add dummy variable for missing values (NA)
An extra column is added after conducting the one-hot transformation!

In [11]:
# Encoder that appends a dedicated indicator column for missing values.
obj = OneHotDummy(nastate=True)
obj.fit(X)

# Transform first, then read the column names; the trailing `col_na` column counts the NA rows.
Z = obj.transform(X).toarray()
feature_names = obj.get_feature_names()
tmp = pd.DataFrame(data=Z, columns=feature_names)
tmp.sum(axis=0)

col_0       9
col_1     870
col_2       6
col_3     387
col_4      19
col_5      88
col_na     81
dtype: int64

## Don't mix droprule and nastate
If you drop a label, it is automatically treated as NA. 
For example, the 6 existing observations of `2Types` would be included in the additional `col_na` dummy variable.

In [12]:
# Combine both options to show the pitfall: rows of the dropped label end up in `col_na`.
obj = OneHotDummy(droprule='least', nastate=True)
obj.fit(X)

# Transform first, then read the column names, and count the rows per column.
Z = obj.transform(X).toarray()
feature_names = obj.get_feature_names()
tmp = pd.DataFrame(data=Z, columns=feature_names)
tmp.sum(axis=0)

col_0       9
col_1     870
col_2     387
col_3      19
col_4      88
col_na     87
dtype: int64

## Custom Mapping
The fitted mapping is a Python dictionary.

In [13]:
# `obj` is still the encoder created with droprule='least' and nastate=True,
# so the dropped label `2Types` does not appear in its mapping.
obj.mapping

{0: 'CarPort', 1: 'Attchd', 2: 'Detchd', 3: 'Basment', 4: 'BuiltIn'}

We can provide a custom mapping at object instantiation.
A separate `.fit` step is then no longer necessary.

In [14]:
# Hand-written index -> label mapping; labels that are left out get no dummy column.
mymap = dict(enumerate(['BuiltIn', 'Attchd', 'Basment', 'Detchd']))

# No fit() call here: the encoder works directly from the supplied mapping.
obj = OneHotDummy(mapping=mymap, nametyp='withlabel')

# Transform first, then read the column names (nametyp='withlabel' appends the label).
dense = obj.transform(X).toarray()
feature_names = obj.get_feature_names()
tmp = pd.DataFrame(data=dense, columns=feature_names)

tmp.head()

Unnamed: 0,col_0_builtin,col_1_attchd,col_2_basment,col_3_detchd
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,0,0,1
4,0,1,0,0


However, calling the `.fit` method overwrites an existing mapping.

In [15]:
# Refitting replaces the custom mapping with one learned from the data.
obj.fit(X)

# Transform first, then read the column names, so the preview reflects the refitted mapping.
dense = obj.transform(X).toarray()
feature_names = obj.get_feature_names()
tmp = pd.DataFrame(data=dense, columns=feature_names)

tmp.head()

Unnamed: 0,col_0_carport,col_1_attchd,col_2_2types,col_3_detchd,col_4_basment,col_5_builtin
0,0,1,0,0,0,0
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,1,0,0,0,0


## No Sparse Matrix
A sparse matrix is returned by default. 
This can be disabled with `sparse=False`.

In [16]:
# Turn off sparse output so transform() hands back a dense array instead.
obj = OneHotDummy(sparse=False)
obj.fit(X)

# Show the container type of the transformed data.
Z = obj.transform(X)
type(Z)

numpy.ndarray