## Ordinal Encoder - Issue Handling

#### A label that appears in the validation set but not in the training set makes the encoder raise an error. A simple way to handle this is to find the columns where that happens.

In [None]:
# Categorical (object-dtype) columns of the training frame.
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
len(object_cols)

# Columns that are safe to encode: every value seen in validation also
# occurs in training, so an encoder fitted on X_train knows all of them.
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]

# Remaining columns contain validation-only labels. Filtering object_cols
# (instead of taking a set difference) keeps the original column order, so
# the result is reproducible from run to run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

### Find the number of unique values in each object column

In [None]:
# Distinct-value count for each categorical column, in object_cols order.
object_nunique = [X_train[name].nunique() for name in object_cols]
object_nunique

In [None]:
# Show the counts keyed by column name.
{name: count for name, count in zip(object_cols, object_nunique)}

### High Cardinality (>= 10 unique values) finder

In [None]:
# Columns that will be one-hot encoded: fewer than 10 distinct values.
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset: every other object column.
# Filtering object_cols (instead of taking a set difference) keeps the
# original column order, so the list and the printed output are stable
# from run to run.
high_cardinality_cols = [col for col in object_cols
                         if col not in low_cardinality_cols]

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

## OneHot encode

In [None]:
# Dense output is needed so the encoded array can be wrapped in a DataFrame.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
# old keyword, so try the new name first and fall back for older releases.
try:
    OHEnc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OHEnc = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training data only, then reuse the fitted encoder on validation.
OH_Cols_train = pd.DataFrame(OHEnc.fit_transform(X_train[low_cardinality_cols]))
OH_Cols_valid = pd.DataFrame(OHEnc.transform(X_valid[low_cardinality_cols]))

In [None]:
# The encoded frames do not carry the original row labels, so copy the
# index back from the source frames before they are joined column-wise.
OH_Cols_train.index, OH_Cols_valid.index = X_train.index, X_valid.index

In [None]:
# Keep only the non-object columns; the categorical ones are replaced by
# their one-hot encoded counterparts.
# NOTE(review): this drops from the *_bkup frames while the index above is
# taken from X_train / X_valid -- confirm both share the same rows and index.
t1 = X_train_bkup.drop(columns=object_cols)
v1 = X_valid_bkup.drop(columns=object_cols)

In [None]:
# Join the non-object columns with the one-hot encoded columns, side by side.
X_train_new = pd.concat((t1, OH_Cols_train), axis="columns")
X_valid_new = pd.concat((v1, OH_Cols_valid), axis="columns")