In [1]:
import pandas as pd
# Read the Cat in the Dat training file into `df` and preview the first rows.
train_csv = "G:/Databases/Python Database/cat_in_the_dat/train.csv"
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [2]:
# How often each ord_2 label occurs (value_counts skips missing values by default).
df["ord_2"].value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

<b>Label Encoding: Mapping the Ordinal Categorical Values</b>

In [3]:
# Hand-written label encoding for ord_2: every category label is replaced by
# an integer code.
# NOTE(review): the codes below follow the frequency order printed by
# value_counts() above, not the temperature order (Freezing < Cold < Warm <
# Hot < Boiling Hot < Lava Hot). Confirm this is intended before treating the
# encoded column as ordinal.
mapping = {
    "Freezing": 0,
    "Warm": 1,
    "Cold": 2,
    "Boiling Hot": 3,
    "Hot": 4,
    "Lava Hot": 5,
}
# Assign the column directly rather than through df.loc[:, "ord_2"]: since
# pandas 2.0 the .loc form writes into the existing object-dtype column, so
# the codes would stay dtype=object instead of becoming numeric.
# Labels that are not keys of `mapping` (including missing values) become NaN.
# Re-running this cell would map the codes themselves and produce all-NaN;
# reload the data before running it again.
df["ord_2"] = df["ord_2"].map(mapping)

In [4]:
df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

<b> Label Encoding with Scikit Learn</b>

In [5]:
from sklearn import preprocessing
# Load a fresh copy of the raw data into `df_1` and show ord_2 before encoding.
df_1 = pd.read_csv("G:/Databases/Python Database/cat_in_the_dat/train.csv")
print(df_1["ord_2"].head())
# Give missing values their own "NONE" label so the encoder only sees strings.
# Direct column assignment is used instead of df_1.loc[:, "ord_2"] = ...:
# since pandas 2.0 the .loc form writes into the existing object-dtype column,
# which would leave the encoded values as dtype=object rather than integers.
df_1["ord_2"] = df_1["ord_2"].fillna("NONE")
lbl_enc = preprocessing.LabelEncoder()
# LabelEncoder numbers the labels in sorted order (e.g. Cold -> 1,
# Freezing -> 2, Hot -> 3, Lava Hot -> 4, Warm -> 6 in the output below), so
# these codes do not express the temperature ordering either.
df_1["ord_2"] = lbl_enc.fit_transform(df_1["ord_2"].values)
print(df_1["ord_2"].head())

0         Hot
1        Warm
2    Freezing
3    Lava Hot
4        Cold
Name: ord_2, dtype: object
0    3
1    6
2    2
3    4
4    1
Name: ord_2, dtype: int32


<b>Label Encoding</b> can be used for <b>Decision Trees</b>, <b>Random Forest</b>, <b>Extra Trees</b>, <b>XGBoost</b>, <b>GBM</b>, and <b>LightGBM</b>, but it cannot be used for <b>Support Vector Machines</b> or <b>Neural Networks</b>. For these models we can binarize the data (one-hot encoding).

<b>Understanding Sparse Matrix and One-Hot Encoding </b>

In [6]:
import numpy as np
from scipy import sparse

In [None]:
# Compare the memory footprint of a large, mostly-zero binary matrix stored
# densely (NumPy array) versus as a SciPy compressed sparse row (CSR) matrix.
# WARNING: the dense array has 10,000 x 100,000 = 1e9 elements; the recorded
# run reported 4,000,000,000 bytes for it, so make sure that much RAM is free.
# NOTE(review): the saved output stops after the dense size, so the sparse
# conversion may not have completed in that run; confirm before relying on it.
# NOTE(review): no random seed is set, so the sparse sizes vary between runs.
# number of rows
n_rows = 10000
# number of columns
n_cols = 100000
# create random binary matrix with only 5% values as 1s
example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))
# print size in bytes
print(f"Size of dense array: {example.nbytes}")
# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)
# print size of this sparse matrix
# (.data holds only the stored non-zero values, not the index arrays)
print(f"Size of sparse array: {sparse_example.data.nbytes}")
# total footprint = stored values + row pointers + column indices
full_size = (
sparse_example.data.nbytes +
sparse_example.indptr.nbytes +
sparse_example.indices.nbytes
)
# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")


Size of dense array: 4000000000


In [None]:

# Tiny dense binary matrix: three rows of six columns, one non-zero per row.
example = np.array(
    [
        [0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
    ]
)
# Memory used by the dense representation, in bytes.
print(f"Size of dense array: {example.nbytes}")

# Same matrix in compressed sparse row (CSR) form.
sparse_example = sparse.csr_matrix(example)
# Bytes used by the stored non-zero values alone.
print(f"Size of sparse array: {sparse_example.data.nbytes}")

# A CSR matrix also carries row pointers and column indices; add them up to
# get the real footprint.
full_size = sum(
    part.nbytes
    for part in (
        sparse_example.data,
        sparse_example.indptr,
        sparse_example.indices,
    )
)
print(f"Full size of sparse array: {full_size}")


<b>One-hot Encoding using Scikit-Learn</b>

In [None]:
import inspect

# Compare dense and sparse one-hot encodings of the same categorical column.
# WARNING: the dense result has 1,000,000 x 1,000 entries; with the encoder's
# default float64 dtype that is about 8 GB of RAM.

# create random 1-d array with 1000 different categories (ints 0..999)
example = np.random.randint(1000, size=1000000)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Pick whichever keyword this
# installation accepts so the cell runs on both old and new versions.
ohe_params = inspect.signature(preprocessing.OneHotEncoder).parameters
sparse_kwarg = "sparse_output" if "sparse_output" in ohe_params else "sparse"

# initialize OneHotEncoder from scikit-learn
# sparse output disabled to get a dense array
ohe = preprocessing.OneHotEncoder(**{sparse_kwarg: False})
# fit and transform data with dense one hot encoder
# (the encoder expects a 2-d input, hence the reshape to a single column)
ohe_example = ohe.fit_transform(example.reshape(-1, 1))
# print size in bytes for dense array
print(f"Size of dense array: {ohe_example.nbytes}")

# initialize OneHotEncoder from scikit-learn
# sparse output enabled to get a sparse matrix
ohe = preprocessing.OneHotEncoder(**{sparse_kwarg: True})
# fit and transform data with sparse one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1))
# print size of this sparse matrix (stored values only)
print(f"Size of sparse array: {ohe_example.data.nbytes}")
# total footprint = stored values + row pointers + column indices
full_size = (
    ohe_example.data.nbytes
    + ohe_example.indptr.nbytes
    + ohe_example.indices.nbytes
)
# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")


Back to <b>Cat in the Dat</b> dataset
The following code cells show how you can group various attributes and values depending on their quantity, category or class.

In [None]:
# Reload the raw file: ord_2 in the earlier `df` was overwritten with integer
# codes, and the cells below work with the original string labels.
df = pd.read_csv("G:/Databases/Python Database/cat_in_the_dat/train.csv")
# (rows, columns) of the subset whose ord_2 equals "Boiling Hot"
is_boiling_hot = df["ord_2"] == "Boiling Hot"
df[is_boiling_hot].shape

In [None]:
# Number of non-null ids per ord_2 category (one row per category).
df.groupby(["ord_2"])["id"].agg("count")

In [None]:
# Same per-category count, but broadcast back onto the rows of `df` so it can
# be used as a feature column.
ids_by_ord_2 = df.groupby(["ord_2"])["id"]
ids_by_ord_2.transform("count")

In [None]:
# Count ids for every (ord_1, ord_2) combination and return the result as a
# flat table with a "count" column.
pair_counts = df.groupby(["ord_1", "ord_2"])["id"].count()
pair_counts.reset_index(name="count")

<b>Creating new features from these categorical variables</b><br>
<i>Combining ord_1 and ord_2</i>

In [None]:
# Build a combined categorical feature of the form "<ord_1>_<ord_2>".
# astype(str) turns missing values into the literal text "nan", so rows with
# a missing part still get a (distinct) combined label.
ord_1_text = df["ord_1"].astype(str)
ord_2_text = df["ord_2"].astype(str)
df["new_features"] = ord_1_text + "_" + ord_2_text
df["new_features"]

<b>Handling NaN Values</b>

In [None]:
# Counts with the NaN values still in the column: value_counts() leaves
# missing entries out by default, so they do not appear in the result.
df["ord_2"].value_counts()

In [None]:
# After filling the NaN cells with the placeholder "NONE", the missing values
# show up as their own category in the counts. `df` itself is not modified.
ord_2_filled = df["ord_2"].fillna("NONE")
ord_2_filled.value_counts()

In [None]:
# Start again from the raw file, then count ord_2 with missing values
# reported under the placeholder label "NONE".
raw_csv = "G:/Databases/Python Database/cat_in_the_dat/train.csv"
df = pd.read_csv(raw_csv)
df["ord_2"].fillna("NONE").value_counts()

In [None]:
# Treat missing ord_4 values as their own "NONE" category.
df["ord_4"] = df["ord_4"].fillna("NONE")
# Look up, for every row, how often that row's ord_4 value occurs overall.
ord_4_freq = df["ord_4"].map(df["ord_4"].value_counts())
# Collapse every category seen fewer than 2000 times into a single "RARE" label.
is_rare = (ord_4_freq < 2000).values
df.loc[is_rare, "ord_4"] = "RARE"
df["ord_4"].value_counts()