#  Ordinal Encoding
   In ordinal encoding, each unique category value is assigned an integer value.

For example, “red” is 1, “green” is 2, and “blue” is 3.

This is called an ordinal encoding or an integer encoding and is easily reversible. Often, integer values starting at zero are used.

For some variables, an ordinal encoding may be enough. The integer values have a natural ordered relationship between each other and machine learning algorithms may be able to understand and harness this relationship.

It is a natural encoding for ordinal variables. For categorical variables, it imposes an ordinal relationship where no such relationship may exist. This can cause problems and a one-hot encoding may be used instead.

In [199]:
import numpy as np
import pandas as pd
df=pd.read_csv(r'C:\Users\logic\Documents\datascience\IT Salary Survey EU 2018.csv')
ds=df
df=df.dropna()
df.sample(5)


Unnamed: 0,Timestamp,Age,Gender,City,Position,Years of experience,Your level,Current Salary,Salary one year ago,Salary two years ago,Are you getting any Stock Options?,Main language at work,Company size,Company type
408,16/12/2018 21:28:14,31.0,M,Nürnberg,Software Engineer,6.0,Middle,59000.0,50000.0,50000.0,No,Deutsch,1000+,Consulting
692,29/12/2018 00:06:13,29.0,M,München,.net developer,8.0,Senior,77000.0,55000.0,44000.0,No,English,1000+,Product
129,14/12/2018 15:09:01,28.0,F,Berlin,Frontend Developer,4.0,Middle,56000.0,51000.0,45000.0,No,English,1000+,Product
197,14/12/2018 18:22:41,32.0,M,Berlin,Machine Learning Engineer,8.0,Senior,95000.0,88000.0,80000.0,Yes,English,50-100,Product
619,21/12/2018 14:10:43,30.0,M,München,iOS Developer,7.0,Senior,68000.0,68000.0,60000.0,No,English,100-1000,Product


In [2]:
df=df.iloc[ :, [2,6,10]]
df

Unnamed: 0,Gender,Your level,Are you getting any Stock Options?
0,M,Senior,No
1,F,Senior,No
2,M,Senior,No
3,M,Senior,Yes
4,M,Senior,No
...,...,...,...
732,M,Senior,Yes
735,M,Senior,No
746,M,Senior,No
756,M,Middle,No


In [89]:
# train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,[0,1]],df.iloc[:,[2]],test_size=0.2,random_state=2)
X_train.head(10)

Unnamed: 0,Gender,Your level
296,M,Senior
330,M,Middle
432,M,Senior
45,M,Senior
452,M,Senior
418,M,Senior
389,F,Middle
621,M,Senior
299,M,Middle
632,M,Senior


In [84]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[['F','M'],['Junior','Middle','Senior']])


In [86]:
oe.fit(X_train)


OrdinalEncoder(categories=[['F', 'M'], ['Junior', 'Middle', 'Senior']])

In [87]:
X_train = oe.transform(X_train)
X_test = oe.transform(X_test)
X_train

array([[1., 2.],
       [1., 1.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [0., 1.],
       [1., 2.],
       [1., 1.],
       [1., 2.],
       [0., 0.],
       [0., 1.],
       [1., 1.],
       [1., 1.],
       [0., 1.],
       [1., 2.],
       [1., 2.],
       [1., 1.],
       [1., 2.],
       [1., 2.],
       [1., 1.],
       [1., 1.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 1.],
       [0., 1.],
       [1., 2.],
       [1., 1.],
       [1., 1.],
       [1., 2.],
       [1., 1.],
       [1., 2.],
       [1., 1.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 1.],
       [1., 2.],
       [1., 2.],
       [1., 1.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [0., 1.],
       [0., 1.],
       [1., 1.],
       [0., 2.],
       [0., 1.],
       [1., 2.],
       [1., 1.

In [88]:
y_train

Unnamed: 0,Are you getting any Stock Options?
296,Yes
330,No
432,No
45,No
452,No
...,...
607,Yes
31,No
147,No
20,Yes


# Label Encoding
Label Encoding refers to converting the labels into a numeric form so as to convert them into the machine-readable form. Machine learning algorithms can then decide in a better way how those labels must be operated. It is an important pre-processing step for the structured dataset in supervised learning.
 ## We apply Label Encoding on  dataset on the target column 

In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_train)


LabelEncoder()

In [31]:
le.classes_

array([0, 1], dtype=int64)

In [32]:
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [26]:
y_train

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0], dtype=int64)

In [27]:
y_test

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0], dtype=int64)

# One Hot Encoding
 using Column Transformer

In [200]:

ds=ds.dropna()
ds=ds.iloc[ :, [2,6,10,11]]
ds

Unnamed: 0,Gender,Your level,Are you getting any Stock Options?,Main language at work
0,M,Senior,No,Deutsch
1,F,Senior,No,Deutsch
2,M,Senior,No,Deutsch
3,M,Senior,Yes,English
4,M,Senior,No,English
...,...,...,...,...
732,M,Senior,Yes,English
735,M,Senior,No,English
746,M,Senior,No,Deutsch
756,M,Middle,No,English


In [203]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(ds.drop(columns=['Are you getting any Stock Options?']),ds['Are you getting any Stock Options?'],
                                                test_size=0.2)

In [204]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [205]:
from sklearn.compose import ColumnTransformer


# applied OHE to Gender and Main Language at work
  NOW gender has 2-1=1 columns and Language has 5-1=4 columns

# applied Ordinal encoding to Your Levels 

In [208]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(sparse=False,drop='first'),['Main language at work']),
    ('tnf2',OrdinalEncoder(categories=[['Junior','Middle','Senior']]),['Your level']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['Gender'])
],remainder='passthrough')

In [209]:
transformer.fit_transform(X_train)

array([[0., 1., 0., 0., 2., 1.],
       [0., 1., 0., 0., 2., 1.],
       [0., 1., 0., 0., 2., 1.],
       ...,
       [0., 0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 2., 0.],
       [0., 1., 0., 0., 2., 1.]])

In [215]:
transformer.transform(X_test).shape

(74, 6)

In [216]:
X_train

Unnamed: 0,Gender,Your level,Main language at work
207,M,Senior,English
509,M,Senior,English
621,M,Senior,English
275,F,Senior,English
154,F,Middle,English
...,...,...,...
646,M,Senior,Deutsch
171,M,Senior,Deutsch
595,M,Middle,Deutsch
694,F,Senior,English


In [217]:
y_train

207    Yes
509     No
621     No
275     No
154     No
      ... 
646     No
171     No
595     No
694     No
337    Yes
Name: Are you getting any Stock Options?, Length: 294, dtype: object

In [226]:
ds['Main language at work'].value_counts()

English             280
Deutsch              70
Russian              15
Polish                2
Deutsch/Englisch      1
Name: Main language at work, dtype: int64