## Label Encoding

In [13]:
# Hand-written label encoding for ord_2: each label receives the integer
# code of its position in this list. The codes are plain labels and do not
# follow temperature order (e.g. "Warm" is 1 while "Cold" is 2).
mapping = {
    label: code
    for code, label in enumerate(
        ["Freezing", "Warm", "Cold", "Boiling Hot", "Hot", "Lava Hot"]
    )
}

In [3]:
import pandas as pd

# Read the training data from the local resources folder (path is relative
# to the notebook's working directory) and preview the first rows.
df = pd.read_csv("./resources/cat_train.csv")
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [4]:
# Swap every ord_2 label for its integer code from `mapping`.
# Values that are not keys of the dict (including missing values) become NaN.
df["ord_2"] = df.ord_2.map(mapping)

In [6]:
# Preview the frame again: ord_2 now shows numeric codes instead of labels.
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,4.0,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,1.0,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,0.0,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,5.0,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,2.0,h,C,OZ,5.0,12.0,0


In [12]:
# Frequency of each encoded ord_2 value (missing values are left out by default).
df.ord_2.value_counts()

ord_2
0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: count, dtype: int64

## LabelEncoder from scikit-learn

In [14]:
# The alias must be `pd`: the cells below call pd.read_csv.
import pandas as pd
from sklearn import preprocessing

In [15]:
# Reload the CSV so ord_2 holds its original string labels again
# (the previous section overwrote the column with mapped codes).
df = pd.read_csv("./resources/cat_train.csv")

In [17]:
# LabelEncoder from scikit-learn does not handle NaN values, so missing
# entries are first replaced with the placeholder category "NONE" using
# pandas fillna; the filled column is then encoded to integer codes.
lbl_enc = preprocessing.LabelEncoder()
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].fillna("NONE").values)

In [18]:
# Preview the frame: ord_2 now holds the integer codes assigned by LabelEncoder.
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,3,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,6,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,2,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,4,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,1,h,C,OZ,5.0,12.0,0


This type of encoding cannot be used in linear models, support vector machines or neural networks as they expect data to be normalized (or standardized).

For these types of models, we can binarize the data.

## Sparse Matrix
Definition: A sparse format is nothing but a representation or way of storing
data in memory in which you do not store all the values but only the values that
matter.

Any numpy array can be converted to a sparse matrix by simple python code.

In [22]:
import numpy as np
from scipy import sparse

In [23]:
# Small 3x3 binary matrix used to demonstrate the sparse representation.
example = np.array([[1, 0, 1], [1, 0, 0], [1, 0, 1]])

In [24]:
# Convert the dense numpy array to scipy's compressed sparse row (CSR) format.
sparse_example = sparse.csr_matrix(example)

## One Hot Encoding

In [25]:
# The alias must be `np`: the next cell calls np.random.randint.
import numpy as np
from sklearn import preprocessing

In [29]:
# draw one million random integers in [0, 1000), i.e. at most 1000
# different categories
example = np.random.randint(0, 1000, size=1_000_000)

# dense encoder: sparse_output=False makes scikit-learn return a
# regular (dense) numpy array
ohe = preprocessing.OneHotEncoder(sparse_output=False)

# the encoder expects a 2-d input, so pass the values as a single column
ohe_example = ohe.fit_transform(example[:, np.newaxis])

# report how many bytes the dense array occupies
print(f"Size of dense array: {ohe_example.nbytes}")

Size of dense array: 8000000000


In [34]:
# initialize OneHotEncoder from scikit-learn
# keep sparse_output=True to get a sparse matrix instead of a dense array
ohe = preprocessing.OneHotEncoder(sparse_output=True)

# fit and transform data with the sparse one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1,1))


# print size in bytes of the stored values of the sparse array
# (only the .data buffer, without the index arrays)
print(f"Size of sparse array: {ohe_example.data.nbytes}")

# Calculate total memory usage: stored values + column indices + row pointers
memory_usage = (ohe_example.data.nbytes + ohe_example.indices.nbytes + ohe_example.indptr.nbytes)
print(f"Memory usage of csr_matrix: {memory_usage} bytes")

Size of dense array: 8000000
Memory usage of csr_matrix: 16000004 bytes


## Convert categorical variables to numeric variables

In [35]:
# Number of non-null ids in each ord_2 category.
df.groupby("ord_2")["id"].count()

ord_2
0     84790
1     97822
2    142726
3     67508
4     64840
5     18075
6    124239
Name: id, dtype: int64

In [36]:
# Same per-category counts, but broadcast back onto every row so the count
# can be used as a numeric replacement for the category.
df.groupby("ord_2")["id"].transform("count")

0          67508
1         124239
2         142726
3          64840
4          97822
           ...  
599995    142726
599996     84790
599997    142726
599998    124239
599999     84790
Name: id, Length: 600000, dtype: int64

In [40]:
# Count the ids for every (ord_1, ord_2) combination and flatten the result
# into a regular table with a "count" column.
(
    df.groupby(by=["ord_1", "ord_2"])["id"]
    .count()
    .reset_index(name="count")
)

Unnamed: 0,ord_1,ord_2,count
0,Contributor,0,15634
1,Contributor,1,17734
2,Contributor,2,26082
3,Contributor,3,12428
4,Contributor,4,11919
5,Contributor,5,3250
6,Contributor,6,22774
7,Expert,0,19477
8,Expert,1,22956
9,Expert,2,33249


In [41]:
# Per-row count of the (ord_1, ord_2) combination the row belongs to.
# Rows whose ord_1 is missing fall into no group and show up as NaN.
(
    df.groupby(by=["ord_1", "ord_2"])["id"]
    .transform("count")
)

0         12428.0
1         19899.0
2             NaN
3         17373.0
4         15464.0
           ...   
599995    38233.0
599996    22718.0
599997    26082.0
599998    15734.0
599999    15634.0
Name: id, Length: 600000, dtype: float64

## Create new features from the existing categorical variables

In [47]:
# Build a combined feature from two categorical columns. Casting to str
# first turns missing values into the literal text "nan" instead of
# propagating NaN through the concatenation.
(
    df.ord_1.astype(str)
    + "_"
    + df.ord_2.astype(str)
)

0         Contributor_3
1         Grandmaster_6
2                 nan_2
3              Novice_4
4         Grandmaster_1
              ...      
599995         Novice_2
599996         Novice_0
599997    Contributor_2
599998         Master_6
599999    Contributor_0
Length: 600000, dtype: object

In [49]:
# Without astype(str), rows where ord_1 is missing stay NaN in the result.
# NOTE(review): both operands are ord_1 — possibly meant to combine ord_1
# with another column (e.g. ord_2); confirm the intent.
df["ord_1"] + "_" + df["ord_1"]

0         Contributor_Contributor
1         Grandmaster_Grandmaster
2                             NaN
3                   Novice_Novice
4         Grandmaster_Grandmaster
                   ...           
599995              Novice_Novice
599996              Novice_Novice
599997    Contributor_Contributor
599998              Master_Master
599999    Contributor_Contributor
Name: ord_1, Length: 600000, dtype: object

# Handling NaN values
Note: Treating NaN as a new category is usually better than dropping those rows directly.

In [50]:
# Reload the CSV so ord_2 holds its original string labels (and NaN values)
# again; earlier cells overwrote the column with LabelEncoder codes.
df = pd.read_csv("./resources/cat_train.csv")

In [53]:
# Counts per ord_2 label; missing values are not counted by default.
df.ord_2.value_counts()

ord_2
Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: count, dtype: int64

In [56]:
# Treat missing values as their own "NONE" category so they appear in the counts.
df.ord_2.fillna("NONE").value_counts()

ord_2
Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: count, dtype: int64