In [1]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from grouplabelencode import grouplabelencode

## Load Demo Data
from [kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

In [2]:
df = pd.read_csv("../data/train.csv")

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Nominal- and Ordinal-scale Variables
Both nominal- and ordinal-scale variables can be OneHot encoded

In [4]:
# values are strings or some other alphanumeric mix
nom_str = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 
    'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 
    'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

# nominal values are already encoded
nom_int = ['MSSubClass', 'MoSold']

# ordinal values encoded as integers
ord_int = ['OverallQual', 'OverallCond']

In [5]:
df[nom_str].describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


In [6]:
df[nom_int].astype(str).describe()

Unnamed: 0,MSSubClass,MoSold
count,1460,1460
unique,15,12
top,20,6
freq,536,253


In [7]:
df[ord_int].astype(str).describe()

Unnamed: 0,OverallQual,OverallCond
count,1460,1460
unique,10,9
top,5,5
freq,397,821


## How to encode 1 variable
Let's pick `GarageType`. The variable has missing values (None, Nan, etc) and 6 distince values

In [8]:
x = df['GarageType']

### Label Encoding
Store the unique values in list named `mapping`

In [9]:
mapping = [e for e in set(x) if pd.notnull(e)]
mapping = dict(enumerate(mapping))  # this will be useful lateron
mapping

{0: 'Basment',
 1: '2Types',
 2: 'Attchd',
 3: 'BuiltIn',
 4: 'CarPort',
 5: 'Detchd'}

Encode mapped values. Everything else is encoded as `None`
(The same can be done with sklearn's `LabelEncoder`)

In [10]:
xencoded = grouplabelencode(x, mapping, nastate=False)

In [11]:
pd.DataFrame(np.c_[x, xencoded])[1447:1452]

Unnamed: 0,0,1
1447,Attchd,2.0
1448,Detchd,5.0
1449,,
1450,,
1451,Attchd,2.0


### One-Hot Encoding
For each encoding create 1 column, or resp. 1 new dummy variable, or resp. 1 new one-hot feature.
(The same can be done with `pd.get_dummies(xencoded, prefix='x')` or sklearn's `OneHotEncoder`)

In [12]:
def onehotencode(xencoded, mapping):
    import scipy.sparse
    out = scipy.sparse.lil_matrix((len(xencoded), len(mapping)), dtype=int)
    enc = list(mapping.keys())
    for i, j in enumerate(xencoded):
        if j in enc:
            out[i, j] = 1
    return out

In [13]:
xonehot = onehotencode(xencoded, mapping)
xonehot.toarray()

array([[0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]])

In [14]:
import scipy.sparse
test = scipy.sparse.hstack([xonehot, xonehot.sum(axis=1)])
test

<1460x7 sparse matrix of type '<class 'numpy.int64'>'
	with 2758 stored elements in COOrdinate format>

In [15]:
np.sum(xonehot.toarray(), axis=0)

array([ 19,   6, 870,  88,   9, 387])

### Make it look good
If that's even possible ...

In [16]:
colnam = list(mapping.keys())
pd.DataFrame(xonehot.toarray(), columns=colnam)[1447:1452]

Unnamed: 0,0,1,2,3,4,5
1447,0,0,1,0,0,0
1448,0,0,0,0,0,1
1449,0,0,0,0,0,0
1450,0,0,0,0,0,0
1451,0,0,1,0,0,0


In [17]:
prefix = 'cool'
sep = '_'
colnam = [prefix + sep + str(k) for k in mapping.keys()]
pd.DataFrame(xonehot.toarray(), columns=colnam)[1447:1452]

Unnamed: 0,cool_0,cool_1,cool_2,cool_3,cool_4,cool_5
1447,0,0,1,0,0,0
1448,0,0,0,0,0,1
1449,0,0,0,0,0,0
1450,0,0,0,0,0,0
1451,0,0,1,0,0,0


In [18]:
colnam = list(mapping.values())
pd.DataFrame(xonehot.toarray(), columns=colnam)[1447:1452]

Unnamed: 0,Basment,2Types,Attchd,BuiltIn,CarPort,Detchd
1447,0,0,1,0,0,0
1448,0,0,0,0,0,1
1449,0,0,0,0,0,0
1450,0,0,0,0,0,0
1451,0,0,1,0,0,0


In [19]:
prefix = 'cool'
sep = '_'
colnam = [prefix + sep + str(v) for v in mapping.values()]
pd.DataFrame(xonehot.toarray(), columns=colnam)[1447:1452]

Unnamed: 0,cool_Basment,cool_2Types,cool_Attchd,cool_BuiltIn,cool_CarPort,cool_Detchd
1447,0,0,1,0,0,0
1448,0,0,0,0,0,1
1449,0,0,0,0,0,0
1450,0,0,0,0,0,0
1451,0,0,1,0,0,0


In [20]:
tostr = lambda s: ''.join(str(s).split()).lower()
prefix = 'cool'
sep = '_'
colnam = [prefix + sep + tostr(k) + sep + tostr(v) for k, v in mapping.items()]
pd.DataFrame(xonehot.toarray(), columns=colnam)[1447:1452]

Unnamed: 0,cool_0_basment,cool_1_2types,cool_2_attchd,cool_3_builtin,cool_4_carport,cool_5_detchd
1447,0,0,1,0,0,0
1448,0,0,0,0,0,1
1449,0,0,0,0,0,0
1450,0,0,0,0,0,0
1451,0,0,1,0,0,0
