# Categorical Variable Embeddings

### Load the required libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score

from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Concatenate, Flatten, Input, Reshape, Dropout

### Read the data

In [None]:
# The CMC data file ships without a header row, so the column names are
# supplied explicitly, in file order.
column_names = [
    'Age', 'Education', 'H_education',
    'num_child', 'Religion', 'Employ',
    'H_occupation', 'living_standard',
    'Media_exposure', 'contraceptive',
]
df = pd.read_csv('cmc.data', header=None, names=column_names)
df.head()

Unnamed: 0,Age,Education,H_education,num_child,Religion,Employ,H_occupation,living_standard,Media_exposure,contraceptive
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [None]:
# Inspect the dtype pandas inferred for every column of the raw frame.
df.dtypes

Age                int64
Education          int64
H_education        int64
num_child          int64
Religion           int64
Employ             int64
H_occupation       int64
living_standard    int64
Media_exposure     int64
contraceptive      int64
dtype: object

### Null check: no missing record in this data

In [None]:
# Count missing values per column (a non-zero entry would flag a column with nulls).
df.isnull().sum()

Age                0
Education          0
H_education        0
num_child          0
Religion           0
Employ             0
H_occupation       0
living_standard    0
Media_exposure     0
contraceptive      0
dtype: int64

### Check for bad records: this data has no invalid values such as a negative age or a negative num_child

In [None]:
# Summary statistics per column; the min/max rows are what reveal out-of-range values.
df.describe()

Unnamed: 0,Age,Education,H_education,num_child,Religion,Employ,H_occupation,living_standard,Media_exposure,contraceptive
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.538357,2.958588,3.429735,3.261371,0.850645,0.749491,2.137814,3.133741,0.073999,1.919891
std,8.227245,1.014994,0.816349,2.358549,0.356559,0.433453,0.864857,0.976161,0.261858,0.876376
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,26.0,2.0,3.0,1.0,1.0,0.0,1.0,3.0,0.0,1.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,2.0,3.0,0.0,2.0
75%,39.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,0.0,3.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,4.0,1.0,3.0


### `contraceptive` is the target attribute and it has three levels, so this is a multiclass classification problem.

In [None]:
# Class distribution of the target: number of rows for each contraceptive level.
df['contraceptive'].value_counts()

1    629
3    511
2    333
Name: contraceptive, dtype: int64

In [None]:
# Keep an independent copy of the integer class labels; the evaluation cells
# score predictions against `target`, looked up by row index.
target = df['contraceptive'].copy()
target.head()

0    1
1    1
2    1
3    1
4    1
Name: contraceptive, dtype: int64

### One-hot encoder on the target variable

In [None]:
# Replace the single 'contraceptive' column with one indicator column per class
# (contraceptive_1, contraceptive_2, contraceptive_3); no level is dropped.
df = pd.get_dummies(
    df,
    columns=['contraceptive'],
    prefix_sep="_",
    drop_first=False,
)
df.head()

Unnamed: 0,Age,Education,H_education,num_child,Religion,Employ,H_occupation,living_standard,Media_exposure,contraceptive_1,contraceptive_2,contraceptive_3
0,24,2,3,3,1,1,2,3,0,1,0,0
1,45,1,3,10,1,1,3,4,0,1,0,0
2,43,2,3,7,1,1,3,4,0,1,0,0
3,42,3,2,9,1,1,3,3,0,1,0,0
4,36,3,3,8,1,1,3,2,0,1,0,0


### Education, H_education, Religion, Employ, H_occupation, living_standard, Media_exposure and contraceptive each take a fixed set of values. Even though they are stored as numbers, they are categorical attributes.

In [None]:
# Numerically coded columns with a fixed set of levels; each one is fed to the
# network through its own embedding.
categorical_vars = [
    'Education',
    'H_education',
    'Religion',
    'Employ',
    'H_occupation',
    'living_standard',
    'Media_exposure',
]
# Columns treated as plain numeric inputs.
integral_vars = ['Age', 'num_child']
# One-hot target columns, one per contraceptive class (1, 2, 3).
target_vars = ['contraceptive_{}'.format(level) for level in (1, 2, 3)]

In [None]:
# Show the distinct codes present in each categorical column.
for column_name in categorical_vars:
    distinct_values = df[column_name].unique()
    print(column_name, distinct_values)

Education [2 1 3 4]
H_education [3 2 4 1]
Religion [1 0]
Employ [1 0]
H_occupation [2 3 1 4]
living_standard [3 4 2 1]
Media_exposure [0 1]


### Train-test split

In [None]:
# Separate the features from the one-hot targets, then hold out 30% of the
# rows for testing (fixed seed so the split is reproducible).
feature_columns = categorical_vars + integral_vars
X_df = df.loc[:, feature_columns]
y_df = df.loc[:, target_vars]
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, test_size=0.3, random_state=123
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1031, 9) (1031, 3) (442, 9) (442, 3)


### Categorical embeddings + numerical inputs:

In [None]:
inputs = []      # one Keras Input per network input, in feeding order
embeddings = []  # the feature tensor derived from each of those inputs

for categorical_var in categorical_vars:
    n_levels = X_train[categorical_var].nunique()
    # Embedding width: ceil(sqrt(number of levels)), capped at 30.
    emb_dim = int(min(np.ceil(np.sqrt(n_levels)), 30))

    cat_input = Input(shape=(1,))
    # The lookup table is given n_levels + 1 rows, i.e. one more than the
    # number of levels seen in the training split.
    embedded = Embedding(n_levels + 1, emb_dim, input_length=1)(cat_input)
    # Drop the length-1 sequence axis: (1, emb_dim) -> (emb_dim,)
    embedded = Reshape(target_shape=(emb_dim,))(embedded)

    inputs.append(cat_input)
    embeddings.append(embedded)

# All numeric columns share a single input, projected to 5 units by a Dense
# layer with no activation specified.
input_numeric = Input(shape=(len(integral_vars),))
embedding_numeric = Dense(5)(input_numeric)
inputs.append(input_numeric)
embeddings.append(embedding_numeric)

### Converting data to list format to match the network structure

In [None]:
# Build the per-input arrays fed to the network: one integer-coded array per
# categorical column (same order as categorical_vars), followed by one 2-D
# array holding the scaled numeric columns.
input_list_train = []
input_list_test = []

# Categorical columns to be embedded: re-index every level to an integer in
# [0, # levels), using only the levels observed in the training split.
for categorical_var in categorical_vars:
    levels = X_train[categorical_var].unique()
    level_map = {level: index for index, level in enumerate(levels)}
    # A level that occurs only in the test split has no entry in level_map, so
    # Series.map would turn it into NaN. Send it to the spare index
    # len(levels) instead; that index is valid because the Embedding layers
    # are created with no_of_levels + 1 rows.
    unknown_index = len(levels)
    input_list_train.append(X_train[categorical_var].map(level_map).values)
    input_list_test.append(
        X_test[categorical_var]
        .map(level_map)
        .fillna(unknown_index)
        .astype('int64')
        .values
    )

# Rescale numerical columns to [0, 1] with min-max scaling; the scaler is
# fitted on the training split only and then applied to both splits.
scale = MinMaxScaler()
scale.fit(X_train[integral_vars])
input_list_train.append(scale.transform(X_train[integral_vars]))
input_list_test.append(scale.transform(X_test[integral_vars]))

### Perceptron Model Building: Multi-Class Cross-Entropy Loss and 'softmax' activation in order to predict the probability for each class.

In [None]:
# Single-layer model: concatenate every embedding / numeric feature tensor and
# map it straight to one softmax unit per target class.
merged_features = Concatenate()(embeddings)
n_classes = len(target_vars)
outputs = Dense(n_classes, activation='softmax')(merged_features)

perceptron_model = Model(inputs, outputs)
perceptron_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
)

In [None]:
# Print the layer-by-layer structure of the perceptron model.
perceptron_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_9 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_10 (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
input_11 (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
input_12 (InputLayer)            (None, 1)             0                                            
___________________________________________________________________________________________

In [None]:
# Train for 100 epochs. input_list_train is a list of arrays (one per model
# input); the targets are the one-hot columns as a NumPy array.
perceptron_model.fit(input_list_train, y_train.values, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f1b73ac97f0>

#### Make predictions and evaluate the predictions by using accuracy, recall and F1 score

In [None]:
# Class probabilities for both splits.
test_pred = perceptron_model.predict(input_list_test)
train_pred = perceptron_model.predict(input_list_train)

# argmax yields the 0-based column of the most likely class; adding 1 converts
# it back to the original 1..3 labels stored in `target`.
y_test_pred = np.argmax(test_pred, axis=1) + 1
y_train_pred = np.argmax(train_pred, axis=1) + 1

# Ground-truth labels, looked up by the row index of each split.
y_test_true = target.loc[y_test.index]
y_train_true = target.loc[y_train.index]

print('Accuracy on testing data: {}'.format(accuracy_score(y_test_true, y_test_pred)))
print('Recall on testing data: {}'.format(recall_score(y_test_true, y_test_pred, average='macro')))
print('F1 score on testing data: {}'.format(f1_score(y_test_true, y_test_pred, average='macro')))

print('Accuracy on training data: {}'.format(accuracy_score(y_train_true, y_train_pred)))
print('Recall on training data: {}'.format(recall_score(y_train_true, y_train_pred, average='macro')))
print('F1 score on training data: {}'.format(f1_score(y_train_true, y_train_pred, average='macro')))

Accuracy on testing data: 0.5271493212669683
Recall on testing data: 0.48971082164273305
F1 score on testing data: 0.49101335295504206
Accuracy on training data: 0.5169738118331717
Recall on training data: 0.487140673517078
F1 score on training data: 0.4916821562459728


### Add one hidden layer

In [None]:
# Same inputs as before, but with a 100-unit ReLU hidden layer between the
# concatenated features and the softmax output, and dropout before and after it.
# NOTE(review): `embeddings` is the same list of tensors perceptron_model was
# built on, so the layers producing them appear to be shared between the two
# models (including anything already learned) - confirm this is intended.
merged = Concatenate()(embeddings)
merged = Dropout(0.1)(merged)
hidden = Dense(100, activation='relu')(merged)
hidden = Dropout(0.2)(hidden)
outputs_with_1_hidden_layer = Dense(len(target_vars), activation='softmax')(hidden)

one_hidden_layer_model = Model(inputs, outputs_with_1_hidden_layer)
one_hidden_layer_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
)

In [None]:
# Train the hidden-layer model for 100 epochs on the same input list and
# one-hot targets used for the perceptron.
one_hidden_layer_model.fit(input_list_train, y_train.values, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f1b71786358>

#### Evaluate predictions

In [None]:
# Class probabilities for both splits from the hidden-layer model.
test_pred = one_hidden_layer_model.predict(input_list_test)
train_pred = one_hidden_layer_model.predict(input_list_train)

# argmax yields the 0-based column of the most likely class; adding 1 converts
# it back to the original 1..3 labels stored in `target`.
y_test_pred = np.argmax(test_pred, axis=1) + 1
y_train_pred = np.argmax(train_pred, axis=1) + 1

# Ground-truth labels, looked up by the row index of each split.
y_test_true = target.loc[y_test.index]
y_train_true = target.loc[y_train.index]

print('Accuracy on testing data: {}'.format(accuracy_score(y_test_true, y_test_pred)))
print('Recall on testing data: {}'.format(recall_score(y_test_true, y_test_pred, average='macro')))
print('F1 score on testing data: {}'.format(f1_score(y_test_true, y_test_pred, average='macro')))

print('Accuracy on training data: {}'.format(accuracy_score(y_train_true, y_train_pred)))
print('Recall on training data: {}'.format(recall_score(y_train_true, y_train_pred, average='macro')))
print('F1 score on training data: {}'.format(f1_score(y_train_true, y_train_pred, average='macro')))

Accuracy on testing data: 0.581447963800905
Recall on testing data: 0.5609240476959919
F1 score on testing data: 0.562087052226994
Accuracy on training data: 0.6488845780795345
Recall on training data: 0.6306758888219562
F1 score on training data: 0.6311072166758441
