# Imputing categorical values


In [60]:
#import packages
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt

from fancyimpute import IterativeImputer
from fancyimpute import KNN

from sklearn.preprocessing import OrdinalEncoder

In [65]:
# Categorical profile attributes that this notebook works with
df_co = [
    'smoker',
    'drink_level',
    'dress_preference',
    'ambience',
    'hijos',
    'activity',
    'budget',
]

# Read the user profiles and keep only the columns listed above
users = pd.read_csv("userprofile.csv")[df_co]

# Preview the first rows
users.head()

Unnamed: 0,smoker,drink_level,dress_preference,ambience,hijos,activity,budget
0,False,abstemious,informal,family,independent,student,medium
1,False,abstemious,informal,family,independent,student,low
2,False,social drinker,formal,family,independent,student,low
3,False,abstemious,informal,family,independent,professional,medium
4,False,abstemious,no preference,family,independent,student,medium


## Ordinal Encoding

In [72]:
# Create the ordinal encoder that will be fitted on the 'ambience' column
ambience_ord_enc = OrdinalEncoder()

In [73]:
# Take the ambience column and keep only its observed (non-missing) entries
ambience = users['ambience']
ambience_not_null = ambience.dropna()

# The encoder expects 2-D input: a single column with one row per value
reshaped_vals = ambience_not_null.to_numpy().reshape(-1, 1)

In [74]:
# Fit the encoder on the non-null ambience values and transform them in one
# step; the result has the same (n, 1) shape as reshaped_vals
encoded_vals = ambience_ord_enc.fit_transform(reshaped_vals)

In [75]:
# Write the ordinal codes back into the rows where ambience was observed;
# rows that were missing keep their NaN so they can be imputed later
ambience_observed = ambience.notna()
users.loc[ambience_observed, 'ambience'] = encoded_vals.squeeze()

In [77]:
# Start again from the raw categorical data. An earlier cell has already
# replaced 'ambience' with ordinal codes, and fitting an encoder on those codes
# (or simply re-running this cell) would make it learn the numbers instead of
# the category labels, so inverse_transform could no longer restore the labels.
users = pd.read_csv("userprofile.csv")[df_co]

# Create dictionary for Ordinal encoders, keyed by column name, so the same
# fitted encoder can later be used to map imputed codes back to labels
ordinal_enc_dict = {}

# Loop over columns to encode
for col_name in users:
    # Create ordinal encoder for the column
    ordinal_enc_dict[col_name] = OrdinalEncoder()
    col = users[col_name]

    # Select the non-null values in the column and reshape them to the
    # single-column 2-D layout the encoder expects
    col_not_null = col[col.notnull()]
    reshaped_vals = col_not_null.values.reshape(-1, 1)

    # Encode the non-null values of the column
    encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)

    # Replace the values in the column with ordinal values; missing entries
    # are left untouched so they remain available for imputation
    users.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)

## Imputing with KNN


In [79]:
# Impute on a deep copy so the encoded `users` frame itself is not modified
users_KNN_imputed = users.copy(deep=True)

# Create KNN imputer
KNN_imputer = KNN()

# Fill in the missing entries, then round the result so every value is a
# whole-number ordinal code before writing it back into the frame
imputed_vals = KNN_imputer.fit_transform(users_KNN_imputed)
users_KNN_imputed.iloc[:, :] = np.round(imputed_vals)

# Translate each column's codes back through the encoder stored for it
for col in users_KNN_imputed:
    col_encoder = ordinal_enc_dict[col]
    reshaped_col = users_KNN_imputed[col].values.reshape(-1, 1)
    users_KNN_imputed[col] = col_encoder.inverse_transform(reshaped_col)

Imputing row 1/138 with 0 missing, elapsed time: 0.005
Imputing row 101/138 with 0 missing, elapsed time: 0.005




## Exercise


### Ordinal encoding of a categorical column


In [None]:
# Create Ordinal encoder
ambience_ord_enc = OrdinalEncoder()

# Observed (non-missing) entries of the ambience column in users
ambience = users['ambience']
ambience_observed = ambience.notna()
ambience_not_null = ambience[ambience_observed]

# Reshape to (-1, 1): one column, one row per observed value
reshaped_vals = ambience_not_null.to_numpy().reshape(-1, 1)

# Fit the encoder and ordinally encode reshaped_vals in one step
encoded_vals = ambience_ord_enc.fit_transform(reshaped_vals)

# Put the codes back into the rows of users where ambience was observed
users.loc[ambience_observed, 'ambience'] = encoded_vals.squeeze()

### Ordinal encoding of a DataFrame


In [80]:
# Start again from the raw categorical data. By this point `users` has already
# been ordinally encoded by earlier cells; fitting fresh encoders on those
# numeric codes (or re-running this cell) would make them learn the numbers
# instead of the category labels, so the inverse transform after imputation
# could not restore the original labels.
users = pd.read_csv("userprofile.csv")[df_co]

# Create an empty dictionary ordinal_enc_dict; it maps each column name to the
# encoder fitted on that column
ordinal_enc_dict = {}

for col_name in users:
    # Create Ordinal encoder for col
    ordinal_enc_dict[col_name] = OrdinalEncoder()
    col = users[col_name]

    # Select non-null values of col and reshape them to the single-column
    # 2-D layout the encoder expects
    col_not_null = col[col.notnull()]
    reshaped_vals = col_not_null.values.reshape(-1, 1)
    encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)

    # Store the values to non-null values of the column in users; missing
    # entries are left as they are so they can be imputed afterwards
    users.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)

### KNN imputation of categorical values


In [81]:
# Create KNN imputer
KNN_imputer = KNN()

# Impute the users DataFrame, then round so each entry is a whole-number
# ordinal code before it is written back in place
imputed_vals = KNN_imputer.fit_transform(users)
users.iloc[:, :] = np.round(imputed_vals)

# Loop over the column names in users
for col_name in users:
    # Encoder that was stored for this column
    col_encoder = ordinal_enc_dict[col_name]

    # Reshape the column to the (-1, 1) layout the encoder works with
    reshaped = users[col_name].values.reshape(-1, 1)

    # Map the codes back through the encoder's inverse transform
    users[col_name] = col_encoder.inverse_transform(reshaped)

Imputing row 1/138 with 0 missing, elapsed time: 0.005
Imputing row 101/138 with 0 missing, elapsed time: 0.006


